summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig29
-rw-r--r--mm/Kconfig.debug48
-rw-r--r--mm/Makefile13
-rw-r--r--mm/backing-dev.c178
-rw-r--r--mm/cma.c44
-rw-r--r--mm/cma.h7
-rw-r--r--mm/cma_debug.c7
-rw-r--r--mm/compaction.c286
-rw-r--r--mm/debug.c99
-rw-r--r--mm/debug_vm_pgtable.c1049
-rw-r--r--mm/fadvise.c6
-rw-r--r--mm/filemap.c473
-rw-r--r--mm/frame_vector.c13
-rw-r--r--mm/frontswap.c16
-rw-r--r--mm/gup.c592
-rw-r--r--mm/hmm.c204
-rw-r--r--mm/huge_memory.c414
-rw-r--r--mm/hugetlb.c418
-rw-r--r--mm/hugetlb_cgroup.c4
-rw-r--r--mm/init-mm.c4
-rw-r--r--mm/internal.h56
-rw-r--r--mm/ioremap.c289
-rw-r--r--mm/kasan/Makefile21
-rw-r--r--mm/kasan/common.c60
-rw-r--r--mm/kasan/generic.c43
-rw-r--r--mm/kasan/generic_report.c1
-rw-r--r--mm/kasan/init.c11
-rw-r--r--mm/kasan/kasan.h23
-rw-r--r--mm/kasan/quarantine.c1
-rw-r--r--mm/kasan/report.c76
-rw-r--r--mm/kasan/tags.c37
-rw-r--r--mm/khugepaged.c439
-rw-r--r--mm/kmemleak.c2
-rw-r--r--mm/ksm.c92
-rw-r--r--mm/list_lru.c10
-rw-r--r--mm/maccess.c295
-rw-r--r--mm/madvise.c42
-rw-r--r--mm/memblock.c78
-rw-r--r--mm/memcontrol.c1539
-rw-r--r--mm/memory-failure.c68
-rw-r--r--mm/memory.c712
-rw-r--r--mm/memory_hotplug.c330
-rw-r--r--mm/mempolicy.c84
-rw-r--r--mm/mempool.c2
-rw-r--r--mm/memremap.c2
-rw-r--r--mm/migrate.c247
-rw-r--r--mm/mincore.c6
-rw-r--r--mm/mlock.c51
-rw-r--r--mm/mm_init.c38
-rw-r--r--mm/mmap.c142
-rw-r--r--mm/mmu_context.c64
-rw-r--r--mm/mmu_gather.c2
-rw-r--r--mm/mmu_notifier.c31
-rw-r--r--mm/mprotect.c22
-rw-r--r--mm/mremap.c58
-rw-r--r--mm/msync.c8
-rw-r--r--mm/nommu.c97
-rw-r--r--mm/oom_kill.c50
-rw-r--r--mm/page-writeback.c86
-rw-r--r--mm/page_alloc.c933
-rw-r--r--mm/page_counter.c19
-rw-r--r--mm/page_idle.c7
-rw-r--r--mm/page_io.c32
-rw-r--r--mm/page_isolation.c22
-rw-r--r--mm/page_owner.c7
-rw-r--r--mm/page_reporting.h2
-rw-r--r--mm/page_vma_mapped.c6
-rw-r--r--mm/pagewalk.c12
-rw-r--r--mm/percpu-internal.h55
-rw-r--r--mm/percpu-km.c5
-rw-r--r--mm/percpu-stats.c36
-rw-r--r--mm/percpu-vm.c5
-rw-r--r--mm/percpu.c214
-rw-r--r--mm/pgalloc-track.h51
-rw-r--r--mm/pgtable-generic.c8
-rw-r--r--mm/process_vm_access.c6
-rw-r--r--mm/ptdump.c21
-rw-r--r--mm/readahead.c275
-rw-r--r--mm/rmap.c86
-rw-r--r--mm/rodata_test.c3
-rw-r--r--mm/shmem.c261
-rw-r--r--mm/shuffle.c46
-rw-r--r--mm/shuffle.h17
-rw-r--r--mm/slab.c115
-rw-r--r--mm/slab.h395
-rw-r--r--mm/slab_common.c687
-rw-r--r--mm/slob.c14
-rw-r--r--mm/slub.c658
-rw-r--r--mm/sparse-vmemmap.c57
-rw-r--r--mm/sparse.c32
-rw-r--r--mm/swap.c274
-rw-r--r--mm/swap_cgroup.c6
-rw-r--r--mm/swap_slots.c45
-rw-r--r--mm/swap_state.c196
-rw-r--r--mm/swapfile.c255
-rw-r--r--mm/usercopy.c2
-rw-r--r--mm/userfaultfd.c33
-rw-r--r--mm/util.c98
-rw-r--r--mm/vmacache.c5
-rw-r--r--mm/vmalloc.c564
-rw-r--r--mm/vmscan.c378
-rw-r--r--mm/vmstat.c117
-rw-r--r--mm/workingset.c88
-rw-r--r--mm/zbud.c2
-rw-r--r--mm/zpool.c8
-rw-r--r--mm/zsmalloc.c16
106 files changed, 8691 insertions, 6602 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index c1acc34c1c35..6c974888f86f 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -88,13 +88,9 @@ config NEED_MULTIPLE_NODES
def_bool y
depends on DISCONTIGMEM || NUMA
-config HAVE_MEMORY_PRESENT
- def_bool y
- depends on ARCH_HAVE_MEMORY_PRESENT || SPARSEMEM
-
#
# SPARSEMEM_EXTREME (which is the default) does some bootmem
-# allocations when memory_present() is called. If this cannot
+# allocations when sparse_init() is called. If this cannot
# be done on your architecture, select this option. However,
# statically allocating the mem_section[] array can potentially
# consume vast quantities of .bss, so be careful.
@@ -126,9 +122,6 @@ config SPARSEMEM_VMEMMAP
pfn_to_page and page_to_pfn operations. This is the most
efficient option when sufficient kernel resources are available.
-config HAVE_MEMBLOCK_NODE_MAP
- bool
-
config HAVE_MEMBLOCK_PHYS_MAP
bool
@@ -136,6 +129,9 @@ config HAVE_FAST_GUP
depends on MMU
bool
+# Don't discard allocated memory used to track "memory" and "reserved" memblocks
+# after early boot, so it can still be used to test for validity of memory.
+# Also, memblocks are updated with memory hot(un)plug.
config ARCH_KEEP_MEMBLOCK
bool
@@ -158,6 +154,7 @@ config MEMORY_HOTPLUG
bool "Allow for memory hot-add"
depends on SPARSEMEM || X86_64_ACPI_NUMA
depends on ARCH_ENABLE_MEMORY_HOTPLUG
+ depends on 64BIT || BROKEN
select NUMA_KEEP_MEMINFO if NUMA
config MEMORY_HOTPLUG_SPARSE
@@ -192,6 +189,9 @@ config MEMORY_HOTREMOVE
# Default to 4 for wider testing, though 8 might be more appropriate.
# ARM's adjust_pte (unused if VIPT) depends on mm-wide page_table_lock.
# PA-RISC 7xxx's spinlock_t would enlarge struct page from 32 to 44 bytes.
+# SPARC32 allocates multiple pte tables within a single page, and therefore
+# a per-page lock leads to problems when multiple tables need to be locked
+# at the same time (e.g. copy_page_range()).
# DEBUG_SPINLOCK and DEBUG_LOCK_ALLOC spinlock_t also enlarge struct page.
#
config SPLIT_PTLOCK_CPUS
@@ -199,6 +199,7 @@ config SPLIT_PTLOCK_CPUS
default "999999" if !MMU
default "999999" if ARM && !CPU_CACHE_VIPT
default "999999" if PARISC && !PA20
+ default "999999" if SPARC32
default "4"
config ARCH_ENABLE_SPLIT_PMD_PTLOCK
@@ -382,7 +383,7 @@ config NOMMU_INITIAL_TRIM_EXCESS
This option specifies the initial value of this option. The default
of 1 says that all excess pages should be trimmed.
- See Documentation/nommu-mmap.txt for more information.
+ See Documentation/mm/nommu-mmap.rst for more information.
config TRANSPARENT_HUGEPAGE
bool "Transparent Hugepage Support"
@@ -705,9 +706,9 @@ config ZSMALLOC
returned by an alloc(). This handle must be mapped in order to
access the allocated space.
-config PGTABLE_MAPPING
+config ZSMALLOC_PGTABLE_MAPPING
bool "Use page table mapping to access object in zsmalloc"
- depends on ZSMALLOC
+ depends on ZSMALLOC=y
help
By default, zsmalloc uses a copy-based object mapping method to
access allocations that span two pages. However, if a particular
@@ -750,13 +751,13 @@ config DEFERRED_STRUCT_PAGE_INIT
depends on SPARSEMEM
depends on !NEED_PER_CPU_KM
depends on 64BIT
+ select PADATA
help
Ordinarily all struct pages are initialised during early boot in a
single thread. On very large machines this can take a considerable
amount of time. If this option is set, large machines will bring up
- a subset of memmap at boot and then initialise the rest in parallel
- by starting one-off "pgdatinitX" kernel thread for each node X. This
- has a potential performance impact on processes running early in the
+ a subset of memmap at boot and then initialise the rest in parallel.
+ This has a potential performance impact on tasks running early in the
lifetime of the system until these kthreads finish the
initialisation.
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 0271b22e063f..864f129f1937 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -1,7 +1,7 @@
# SPDX-License-Identifier: GPL-2.0-only
config PAGE_EXTENSION
bool "Extend memmap on extra space for more information on page"
- ---help---
+ help
Extend memmap on extra space for more information on page. This
could be used for debugging features that need to insert extra
field for every page. This extension enables us to save memory
@@ -13,7 +13,7 @@ config DEBUG_PAGEALLOC
depends on DEBUG_KERNEL
depends on !HIBERNATION || ARCH_SUPPORTS_DEBUG_PAGEALLOC && !PPC && !SPARC
select PAGE_POISONING if !ARCH_SUPPORTS_DEBUG_PAGEALLOC
- ---help---
+ help
Unmap pages from the kernel linear mapping after free_pages().
Depending on runtime enablement, this results in a small or large
slowdown, but helps to find certain types of memory corruption.
@@ -41,7 +41,7 @@ config DEBUG_PAGEALLOC
config DEBUG_PAGEALLOC_ENABLE_DEFAULT
bool "Enable debug page memory allocations by default?"
depends on DEBUG_PAGEALLOC
- ---help---
+ help
Enable debug page memory allocations by default? This value
can be overridden by debug_pagealloc=off|on.
@@ -65,7 +65,7 @@ config PAGE_OWNER
config PAGE_POISONING
bool "Poison pages after freeing"
select PAGE_POISONING_NO_SANITY if HIBERNATION
- ---help---
+ help
Fill the pages with poison patterns after free_pages() and verify
the patterns before alloc_pages. The filling of the memory helps
reduce the risk of information leaks from freed data. This does
@@ -80,7 +80,7 @@ config PAGE_POISONING
config PAGE_POISONING_NO_SANITY
depends on PAGE_POISONING
bool "Only poison, don't sanity check"
- ---help---
+ help
Skip the sanity checking on alloc, only fill the pages with
poison on free. This reduces some of the overhead of the
poisoning feature.
@@ -91,7 +91,7 @@ config PAGE_POISONING_NO_SANITY
config PAGE_POISONING_ZERO
bool "Use zero for poisoning instead of debugging value"
depends on PAGE_POISONING
- ---help---
+ help
Instead of using the existing poison value, fill the pages with
zeros. This makes it harder to detect when errors are occurring
due to sanitization but the zeroing at free means that it is
@@ -104,7 +104,7 @@ config DEBUG_PAGE_REF
bool "Enable tracepoint to track down page reference manipulation"
depends on DEBUG_KERNEL
depends on TRACEPOINTS
- ---help---
+ help
This is a feature to add tracepoint for tracking down page reference
manipulation. This tracking is useful to diagnose functional failure
due to migration failures caused by page reference mismatches. Be
@@ -115,9 +115,41 @@ config DEBUG_PAGE_REF
config DEBUG_RODATA_TEST
bool "Testcase for the marking rodata read-only"
depends on STRICT_KERNEL_RWX
- ---help---
+ help
This option enables a testcase for the setting rodata read-only.
+config ARCH_HAS_DEBUG_WX
+ bool
+
+config DEBUG_WX
+ bool "Warn on W+X mappings at boot"
+ depends on ARCH_HAS_DEBUG_WX
+ depends on MMU
+ select PTDUMP_CORE
+ help
+ Generate a warning if any W+X mappings are found at boot.
+
+ This is useful for discovering cases where the kernel is leaving W+X
+ mappings after applying NX, as such mappings are a security risk.
+
+ Look for a message in dmesg output like this:
+
+ <arch>/mm: Checked W+X mappings: passed, no W+X pages found.
+
+ or like this, if the check failed:
+
+ <arch>/mm: Checked W+X mappings: failed, <N> W+X pages found.
+
+ Note that even if the check fails, your kernel is possibly
+ still fine, as W+X mappings are not a security hole in
+ themselves; what they do is make the exploitation
+ of other unfixed kernel bugs easier.
+
+ There is no runtime or memory usage effect of this option
+ once the kernel has booted up - it's a one time check.
+
+ If in doubt, say "Y".
+
config GENERIC_PTDUMP
bool
diff --git a/mm/Makefile b/mm/Makefile
index fccd3756b25f..d5649f1c12c0 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -8,6 +8,14 @@ KASAN_SANITIZE_slab.o := n
KASAN_SANITIZE_slub.o := n
KCSAN_SANITIZE_kmemleak.o := n
+# These produce frequent data race reports: most of them are due to races on
+# the same word but accesses to different bits of that word. Re-enable KCSAN
+# for these when we have more consensus on what to do about them.
+KCSAN_SANITIZE_slab_common.o := n
+KCSAN_SANITIZE_slab.o := n
+KCSAN_SANITIZE_slub.o := n
+KCSAN_SANITIZE_page_alloc.o := n
+
# These files are disabled because they produce non-interesting and/or
# flaky coverage that is not a function of syscall inputs. E.g. slab is out of
# free pages, or a task is migrated between nodes.
@@ -30,7 +38,7 @@ mmu-y := nommu.o
mmu-$(CONFIG_MMU) := highmem.o memory.o mincore.o \
mlock.o mmap.o mmu_gather.o mprotect.o mremap.o \
msync.o page_vma_mapped.o pagewalk.o \
- pgtable-generic.o rmap.o vmalloc.o
+ pgtable-generic.o rmap.o vmalloc.o ioremap.o
ifdef CONFIG_CROSS_MEMORY_ATTACH
@@ -41,7 +49,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
maccess.o page-writeback.o \
readahead.o swap.o truncate.o vmscan.o shmem.o \
util.o mmzone.o vmstat.o backing-dev.o \
- mm_init.o mmu_context.o percpu.o slab_common.o \
+ mm_init.o percpu.o slab_common.o \
compaction.o vmacache.o \
interval_tree.o list_lru.o workingset.o \
debug.o gup.o $(mmu-y)
@@ -88,6 +96,7 @@ obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
obj-$(CONFIG_DEBUG_KMEMLEAK_TEST) += kmemleak-test.o
obj-$(CONFIG_DEBUG_RODATA_TEST) += rodata_test.o
+obj-$(CONFIG_DEBUG_VM_PGTABLE) += debug_vm_pgtable.o
obj-$(CONFIG_PAGE_OWNER) += page_owner.o
obj-$(CONFIG_CLEANCACHE) += cleancache.o
obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index efc5b83acd2d..8e8b00627bb2 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -15,7 +15,6 @@
#include <trace/events/writeback.h>
struct backing_dev_info noop_backing_dev_info = {
- .name = "noop",
.capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK,
};
EXPORT_SYMBOL_GPL(noop_backing_dev_info);
@@ -282,7 +281,7 @@ void wb_wakeup_delayed(struct bdi_writeback *wb)
#define INIT_BW (100 << (20 - PAGE_SHIFT))
static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
- int blkcg_id, gfp_t gfp)
+ gfp_t gfp)
{
int i, err;
@@ -309,15 +308,9 @@ static int wb_init(struct bdi_writeback *wb, struct backing_dev_info *bdi,
INIT_DELAYED_WORK(&wb->dwork, wb_workfn);
wb->dirty_sleep = jiffies;
- wb->congested = wb_congested_get_create(bdi, blkcg_id, gfp);
- if (!wb->congested) {
- err = -ENOMEM;
- goto out_put_bdi;
- }
-
err = fprop_local_init_percpu(&wb->completions, gfp);
if (err)
- goto out_put_cong;
+ goto out_put_bdi;
for (i = 0; i < NR_WB_STAT_ITEMS; i++) {
err = percpu_counter_init(&wb->stat[i], 0, gfp);
@@ -331,8 +324,6 @@ out_destroy_stat:
while (i--)
percpu_counter_destroy(&wb->stat[i]);
fprop_local_destroy_percpu(&wb->completions);
-out_put_cong:
- wb_congested_put(wb->congested);
out_put_bdi:
if (wb != &bdi->wb)
bdi_put(bdi);
@@ -375,7 +366,6 @@ static void wb_exit(struct bdi_writeback *wb)
percpu_counter_destroy(&wb->stat[i]);
fprop_local_destroy_percpu(&wb->completions);
- wb_congested_put(wb->congested);
if (wb != &wb->bdi->wb)
bdi_put(wb->bdi);
}
@@ -385,99 +375,12 @@ static void wb_exit(struct bdi_writeback *wb)
#include <linux/memcontrol.h>
/*
- * cgwb_lock protects bdi->cgwb_tree, bdi->cgwb_congested_tree,
- * blkcg->cgwb_list, and memcg->cgwb_list. bdi->cgwb_tree is also RCU
- * protected.
+ * cgwb_lock protects bdi->cgwb_tree, blkcg->cgwb_list, and memcg->cgwb_list.
+ * bdi->cgwb_tree is also RCU protected.
*/
static DEFINE_SPINLOCK(cgwb_lock);
static struct workqueue_struct *cgwb_release_wq;
-/**
- * wb_congested_get_create - get or create a wb_congested
- * @bdi: associated bdi
- * @blkcg_id: ID of the associated blkcg
- * @gfp: allocation mask
- *
- * Look up the wb_congested for @blkcg_id on @bdi. If missing, create one.
- * The returned wb_congested has its reference count incremented. Returns
- * NULL on failure.
- */
-struct bdi_writeback_congested *
-wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp)
-{
- struct bdi_writeback_congested *new_congested = NULL, *congested;
- struct rb_node **node, *parent;
- unsigned long flags;
-retry:
- spin_lock_irqsave(&cgwb_lock, flags);
-
- node = &bdi->cgwb_congested_tree.rb_node;
- parent = NULL;
-
- while (*node != NULL) {
- parent = *node;
- congested = rb_entry(parent, struct bdi_writeback_congested,
- rb_node);
- if (congested->blkcg_id < blkcg_id)
- node = &parent->rb_left;
- else if (congested->blkcg_id > blkcg_id)
- node = &parent->rb_right;
- else
- goto found;
- }
-
- if (new_congested) {
- /* !found and storage for new one already allocated, insert */
- congested = new_congested;
- rb_link_node(&congested->rb_node, parent, node);
- rb_insert_color(&congested->rb_node, &bdi->cgwb_congested_tree);
- spin_unlock_irqrestore(&cgwb_lock, flags);
- return congested;
- }
-
- spin_unlock_irqrestore(&cgwb_lock, flags);
-
- /* allocate storage for new one and retry */
- new_congested = kzalloc(sizeof(*new_congested), gfp);
- if (!new_congested)
- return NULL;
-
- refcount_set(&new_congested->refcnt, 1);
- new_congested->__bdi = bdi;
- new_congested->blkcg_id = blkcg_id;
- goto retry;
-
-found:
- refcount_inc(&congested->refcnt);
- spin_unlock_irqrestore(&cgwb_lock, flags);
- kfree(new_congested);
- return congested;
-}
-
-/**
- * wb_congested_put - put a wb_congested
- * @congested: wb_congested to put
- *
- * Put @congested and destroy it if the refcnt reaches zero.
- */
-void wb_congested_put(struct bdi_writeback_congested *congested)
-{
- unsigned long flags;
-
- if (!refcount_dec_and_lock_irqsave(&congested->refcnt, &cgwb_lock, &flags))
- return;
-
- /* bdi might already have been destroyed leaving @congested unlinked */
- if (congested->__bdi) {
- rb_erase(&congested->rb_node,
- &congested->__bdi->cgwb_congested_tree);
- congested->__bdi = NULL;
- }
-
- spin_unlock_irqrestore(&cgwb_lock, flags);
- kfree(congested);
-}
-
static void cgwb_release_workfn(struct work_struct *work)
{
struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
@@ -559,7 +462,7 @@ static int cgwb_create(struct backing_dev_info *bdi,
goto out_put;
}
- ret = wb_init(wb, bdi, blkcg_css->id, gfp);
+ ret = wb_init(wb, bdi, gfp);
if (ret)
goto err_free;
@@ -697,11 +600,10 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi)
int ret;
INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
- bdi->cgwb_congested_tree = RB_ROOT;
mutex_init(&bdi->cgwb_release_mutex);
init_rwsem(&bdi->wb_switch_rwsem);
- ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL);
+ ret = wb_init(&bdi->wb, bdi, GFP_KERNEL);
if (!ret) {
bdi->wb.memcg_css = &root_mem_cgroup->css;
bdi->wb.blkcg_css = blkcg_root_css;
@@ -770,21 +672,6 @@ void wb_blkcg_offline(struct blkcg *blkcg)
spin_unlock_irq(&cgwb_lock);
}
-static void cgwb_bdi_exit(struct backing_dev_info *bdi)
-{
- struct rb_node *rbn;
-
- spin_lock_irq(&cgwb_lock);
- while ((rbn = rb_first(&bdi->cgwb_congested_tree))) {
- struct bdi_writeback_congested *congested =
- rb_entry(rbn, struct bdi_writeback_congested, rb_node);
-
- rb_erase(rbn, &bdi->cgwb_congested_tree);
- congested->__bdi = NULL; /* mark @congested unlinked */
- }
- spin_unlock_irq(&cgwb_lock);
-}
-
static void cgwb_bdi_register(struct backing_dev_info *bdi)
{
spin_lock_irq(&cgwb_lock);
@@ -811,29 +698,11 @@ subsys_initcall(cgwb_init);
static int cgwb_bdi_init(struct backing_dev_info *bdi)
{
- int err;
-
- bdi->wb_congested = kzalloc(sizeof(*bdi->wb_congested), GFP_KERNEL);
- if (!bdi->wb_congested)
- return -ENOMEM;
-
- refcount_set(&bdi->wb_congested->refcnt, 1);
-
- err = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL);
- if (err) {
- wb_congested_put(bdi->wb_congested);
- return err;
- }
- return 0;
+ return wb_init(&bdi->wb, bdi, GFP_KERNEL);
}
static void cgwb_bdi_unregister(struct backing_dev_info *bdi) { }
-static void cgwb_bdi_exit(struct backing_dev_info *bdi)
-{
- wb_congested_put(bdi->wb_congested);
-}
-
static void cgwb_bdi_register(struct backing_dev_info *bdi)
{
list_add_tail_rcu(&bdi->wb.bdi_node, &bdi->wb_list);
@@ -865,12 +734,11 @@ static int bdi_init(struct backing_dev_info *bdi)
return ret;
}
-struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id)
+struct backing_dev_info *bdi_alloc(int node_id)
{
struct backing_dev_info *bdi;
- bdi = kmalloc_node(sizeof(struct backing_dev_info),
- gfp_mask | __GFP_ZERO, node_id);
+ bdi = kzalloc_node(sizeof(*bdi), GFP_KERNEL, node_id);
if (!bdi)
return NULL;
@@ -880,7 +748,7 @@ struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id)
}
return bdi;
}
-EXPORT_SYMBOL(bdi_alloc_node);
+EXPORT_SYMBOL(bdi_alloc);
static struct rb_node **bdi_lookup_rb_node(u64 id, struct rb_node **parentp)
{
@@ -964,7 +832,6 @@ int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args)
trace_writeback_bdi_register(bdi);
return 0;
}
-EXPORT_SYMBOL(bdi_register_va);
int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...)
{
@@ -978,20 +845,12 @@ int bdi_register(struct backing_dev_info *bdi, const char *fmt, ...)
}
EXPORT_SYMBOL(bdi_register);
-int bdi_register_owner(struct backing_dev_info *bdi, struct device *owner)
+void bdi_set_owner(struct backing_dev_info *bdi, struct device *owner)
{
- int rc;
-
- rc = bdi_register(bdi, "%u:%u", MAJOR(owner->devt), MINOR(owner->devt));
- if (rc)
- return rc;
- /* Leaking owner reference... */
- WARN_ON(bdi->owner);
+ WARN_ON_ONCE(bdi->owner);
bdi->owner = owner;
get_device(owner);
- return 0;
}
-EXPORT_SYMBOL(bdi_register_owner);
/*
* Remove bdi from bdi_list, and ensure that it is no longer visible
@@ -1034,7 +893,6 @@ static void release_bdi(struct kref *ref)
bdi_unregister(bdi);
WARN_ON_ONCE(bdi->dev);
wb_exit(&bdi->wb);
- cgwb_bdi_exit(bdi);
kfree(bdi);
}
@@ -1058,29 +916,29 @@ static wait_queue_head_t congestion_wqh[2] = {
};
static atomic_t nr_wb_congested[2];
-void clear_wb_congested(struct bdi_writeback_congested *congested, int sync)
+void clear_bdi_congested(struct backing_dev_info *bdi, int sync)
{
wait_queue_head_t *wqh = &congestion_wqh[sync];
enum wb_congested_state bit;
bit = sync ? WB_sync_congested : WB_async_congested;
- if (test_and_clear_bit(bit, &congested->state))
+ if (test_and_clear_bit(bit, &bdi->wb.congested))
atomic_dec(&nr_wb_congested[sync]);
smp_mb__after_atomic();
if (waitqueue_active(wqh))
wake_up(wqh);
}
-EXPORT_SYMBOL(clear_wb_congested);
+EXPORT_SYMBOL(clear_bdi_congested);
-void set_wb_congested(struct bdi_writeback_congested *congested, int sync)
+void set_bdi_congested(struct backing_dev_info *bdi, int sync)
{
enum wb_congested_state bit;
bit = sync ? WB_sync_congested : WB_async_congested;
- if (!test_and_set_bit(bit, &congested->state))
+ if (!test_and_set_bit(bit, &bdi->wb.congested))
atomic_inc(&nr_wb_congested[sync]);
}
-EXPORT_SYMBOL(set_wb_congested);
+EXPORT_SYMBOL(set_bdi_congested);
/**
* congestion_wait - wait for a backing_dev to become uncongested
diff --git a/mm/cma.c b/mm/cma.c
index 0463ad2ce06b..7f415d7cda9f 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -52,7 +52,7 @@ unsigned long cma_get_size(const struct cma *cma)
const char *cma_get_name(const struct cma *cma)
{
- return cma->name ? cma->name : "(undefined)";
+ return cma->name;
}
static unsigned long cma_bitmap_aligned_mask(const struct cma *cma,
@@ -93,17 +93,15 @@ static void cma_clear_bitmap(struct cma *cma, unsigned long pfn,
mutex_unlock(&cma->lock);
}
-static int __init cma_activate_area(struct cma *cma)
+static void __init cma_activate_area(struct cma *cma)
{
unsigned long base_pfn = cma->base_pfn, pfn = base_pfn;
unsigned i = cma->count >> pageblock_order;
struct zone *zone;
cma->bitmap = bitmap_zalloc(cma_bitmap_maxno(cma), GFP_KERNEL);
- if (!cma->bitmap) {
- cma->count = 0;
- return -ENOMEM;
- }
+ if (!cma->bitmap)
+ goto out_error;
WARN_ON_ONCE(!pfn_valid(pfn));
zone = page_zone(pfn_to_page(pfn));
@@ -133,25 +131,22 @@ static int __init cma_activate_area(struct cma *cma)
spin_lock_init(&cma->mem_head_lock);
#endif
- return 0;
+ return;
not_in_zone:
- pr_err("CMA area %s could not be activated\n", cma->name);
bitmap_free(cma->bitmap);
+out_error:
cma->count = 0;
- return -EINVAL;
+ pr_err("CMA area %s could not be activated\n", cma->name);
+ return;
}
static int __init cma_init_reserved_areas(void)
{
int i;
- for (i = 0; i < cma_area_count; i++) {
- int ret = cma_activate_area(&cma_areas[i]);
-
- if (ret)
- return ret;
- }
+ for (i = 0; i < cma_area_count; i++)
+ cma_activate_area(&cma_areas[i]);
return 0;
}
@@ -202,13 +197,12 @@ int __init cma_init_reserved_mem(phys_addr_t base, phys_addr_t size,
* subsystems (like slab allocator) are available.
*/
cma = &cma_areas[cma_area_count];
- if (name) {
- cma->name = name;
- } else {
- cma->name = kasprintf(GFP_KERNEL, "cma%d\n", cma_area_count);
- if (!cma->name)
- return -ENOMEM;
- }
+
+ if (name)
+ snprintf(cma->name, CMA_MAX_NAME, name);
+ else
+ snprintf(cma->name, CMA_MAX_NAME, "cma%d\n", cma_area_count);
+
cma->base_pfn = PFN_DOWN(base);
cma->count = size >> PAGE_SHIFT;
cma->order_per_bit = order_per_bit;
@@ -339,13 +333,13 @@ int __init cma_declare_contiguous_nid(phys_addr_t base,
*/
if (base < highmem_start && limit > highmem_start) {
addr = memblock_alloc_range_nid(size, alignment,
- highmem_start, limit, nid, false);
+ highmem_start, limit, nid, true);
limit = highmem_start;
}
if (!addr) {
addr = memblock_alloc_range_nid(size, alignment, base,
- limit, nid, false);
+ limit, nid, true);
if (!addr) {
ret = -ENOMEM;
goto err;
@@ -425,7 +419,7 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
struct page *page = NULL;
int ret = -ENOMEM;
- if (!cma || !cma->count)
+ if (!cma || !cma->count || !cma->bitmap)
return NULL;
pr_debug("%s(cma %p, count %zu, align %d)\n", __func__, (void *)cma,
diff --git a/mm/cma.h b/mm/cma.h
index 33c0b517733c..20f6e24bc477 100644
--- a/mm/cma.h
+++ b/mm/cma.h
@@ -2,6 +2,10 @@
#ifndef __MM_CMA_H__
#define __MM_CMA_H__
+#include <linux/debugfs.h>
+
+#define CMA_MAX_NAME 64
+
struct cma {
unsigned long base_pfn;
unsigned long count;
@@ -11,8 +15,9 @@ struct cma {
#ifdef CONFIG_CMA_DEBUGFS
struct hlist_head mem_head;
spinlock_t mem_head_lock;
+ struct debugfs_u32_array dfs_bitmap;
#endif
- const char *name;
+ char name[CMA_MAX_NAME];
};
extern struct cma cma_areas[MAX_CMA_AREAS];
diff --git a/mm/cma_debug.c b/mm/cma_debug.c
index 4e6cbe2f586e..d5bf8aa34fdc 100644
--- a/mm/cma_debug.c
+++ b/mm/cma_debug.c
@@ -164,7 +164,6 @@ static void cma_debugfs_add_one(struct cma *cma, struct dentry *root_dentry)
{
struct dentry *tmp;
char name[16];
- int u32s;
scnprintf(name, sizeof(name), "cma-%s", cma->name);
@@ -180,8 +179,10 @@ static void cma_debugfs_add_one(struct cma *cma, struct dentry *root_dentry)
debugfs_create_file("used", 0444, tmp, cma, &cma_used_fops);
debugfs_create_file("maxchunk", 0444, tmp, cma, &cma_maxchunk_fops);
- u32s = DIV_ROUND_UP(cma_bitmap_maxno(cma), BITS_PER_BYTE * sizeof(u32));
- debugfs_create_u32_array("bitmap", 0444, tmp, (u32 *)cma->bitmap, u32s);
+ cma->dfs_bitmap.array = (u32 *)cma->bitmap;
+ cma->dfs_bitmap.n_elements = DIV_ROUND_UP(cma_bitmap_maxno(cma),
+ BITS_PER_BYTE * sizeof(u32));
+ debugfs_create_u32_array("bitmap", 0444, tmp, &cma->dfs_bitmap);
}
static int __init cma_debugfs_init(void)
diff --git a/mm/compaction.c b/mm/compaction.c
index 46f0fcc93081..176dcded298e 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -50,6 +50,24 @@ static inline void count_compact_events(enum vm_event_item item, long delta)
#define pageblock_start_pfn(pfn) block_start_pfn(pfn, pageblock_order)
#define pageblock_end_pfn(pfn) block_end_pfn(pfn, pageblock_order)
+/*
+ * Fragmentation score check interval for proactive compaction purposes.
+ */
+static const unsigned int HPAGE_FRAG_CHECK_INTERVAL_MSEC = 500;
+
+/*
+ * Page order with-respect-to which proactive compaction
+ * calculates external fragmentation, which is used as
+ * the "fragmentation score" of a node/zone.
+ */
+#if defined CONFIG_TRANSPARENT_HUGEPAGE
+#define COMPACTION_HPAGE_ORDER HPAGE_PMD_ORDER
+#elif defined CONFIG_HUGETLBFS
+#define COMPACTION_HPAGE_ORDER HUGETLB_PAGE_ORDER
+#else
+#define COMPACTION_HPAGE_ORDER (PMD_SHIFT - PAGE_SHIFT)
+#endif
+
static unsigned long release_freepages(struct list_head *freelist)
{
struct page *page, *next;
@@ -136,7 +154,7 @@ EXPORT_SYMBOL(__ClearPageMovable);
/*
* Compaction is deferred when compaction fails to result in a page
- * allocation success. 1 << compact_defer_limit compactions are skipped up
+ * allocation success. 1 << compact_defer_shift compactions are skipped up
* to a limit of 1 << COMPACT_MAX_DEFER_SHIFT
*/
void defer_compaction(struct zone *zone, int order)
@@ -991,7 +1009,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
del_page_from_lru_list(page, lruvec, page_lru(page));
mod_node_page_state(page_pgdat(page),
NR_ISOLATED_ANON + page_is_file_lru(page),
- hpage_nr_pages(page));
+ thp_nr_pages(page));
isolate_success:
list_add(&page->lru, &cc->migratepages);
@@ -1401,7 +1419,7 @@ fast_isolate_freepages(struct compact_control *cc)
if (scan_start) {
/*
* Use the highest PFN found above min. If one was
- * not found, be pessemistic for direct compaction
+ * not found, be pessimistic for direct compaction
* and use the min mark.
*/
if (highest) {
@@ -1409,7 +1427,9 @@ fast_isolate_freepages(struct compact_control *cc)
cc->free_pfn = highest;
} else {
if (cc->direct_compaction && pfn_valid(min_pfn)) {
- page = pfn_to_page(min_pfn);
+ page = pageblock_pfn_to_page(min_pfn,
+ pageblock_end_pfn(min_pfn),
+ cc->zone);
cc->free_pfn = min_pfn;
}
}
@@ -1457,7 +1477,7 @@ static void isolate_freepages(struct compact_control *cc)
* this pfn aligned down to the pageblock boundary, because we do
* block_start_pfn -= pageblock_nr_pages in the for loop.
* For ending point, take care when isolating in last pageblock of a
- * a zone which ends in the middle of a pageblock.
+ * zone which ends in the middle of a pageblock.
* The low boundary is the end of the pageblock the migration scanner
* is using.
*/
@@ -1855,6 +1875,76 @@ static inline bool is_via_compact_memory(int order)
return order == -1;
}
+static bool kswapd_is_running(pg_data_t *pgdat)
+{
+ return pgdat->kswapd && (pgdat->kswapd->state == TASK_RUNNING);
+}
+
+/*
+ * A zone's fragmentation score is the external fragmentation wrt the
+ * COMPACTION_HPAGE_ORDER scaled by the zone's size. It returns a value
+ * in the range [0, 100].
+ *
+ * The scaling factor ensures that proactive compaction focuses on larger
+ * zones like ZONE_NORMAL, rather than smaller, specialized zones like
+ * ZONE_DMA32. For smaller zones, the score value remains close to zero,
+ * and thus never exceeds the high threshold for proactive compaction.
+ */
+static unsigned int fragmentation_score_zone(struct zone *zone)
+{
+ unsigned long score;
+
+ score = zone->present_pages *
+ extfrag_for_order(zone, COMPACTION_HPAGE_ORDER);
+ return div64_ul(score, zone->zone_pgdat->node_present_pages + 1);
+}
+
+/*
+ * The per-node proactive (background) compaction process is started by its
+ * corresponding kcompactd thread when the node's fragmentation score
+ * exceeds the high threshold. The compaction process remains active till
+ * the node's score falls below the low threshold, or one of the back-off
+ * conditions is met.
+ */
+static unsigned int fragmentation_score_node(pg_data_t *pgdat)
+{
+ unsigned int score = 0;
+ int zoneid;
+
+ for (zoneid = 0; zoneid < MAX_NR_ZONES; zoneid++) {
+ struct zone *zone;
+
+ zone = &pgdat->node_zones[zoneid];
+ score += fragmentation_score_zone(zone);
+ }
+
+ return score;
+}
+
+static unsigned int fragmentation_score_wmark(pg_data_t *pgdat, bool low)
+{
+ unsigned int wmark_low;
+
+ /*
+ * Cap the low watermark to avoid excessive compaction
+ * activity in case a user sets the proactiveness tunable
+ * close to 100 (maximum).
+ */
+ wmark_low = max(100U - sysctl_compaction_proactiveness, 5U);
+ return low ? wmark_low : min(wmark_low + 10, 100U);
+}
+
+static bool should_proactive_compact_node(pg_data_t *pgdat)
+{
+ int wmark_high;
+
+ if (!sysctl_compaction_proactiveness || kswapd_is_running(pgdat))
+ return false;
+
+ wmark_high = fragmentation_score_wmark(pgdat, false);
+ return fragmentation_score_node(pgdat) > wmark_high;
+}
+
static enum compact_result __compact_finished(struct compact_control *cc)
{
unsigned int order;
@@ -1881,6 +1971,25 @@ static enum compact_result __compact_finished(struct compact_control *cc)
return COMPACT_PARTIAL_SKIPPED;
}
+ if (cc->proactive_compaction) {
+ int score, wmark_low;
+ pg_data_t *pgdat;
+
+ pgdat = cc->zone->zone_pgdat;
+ if (kswapd_is_running(pgdat))
+ return COMPACT_PARTIAL_SKIPPED;
+
+ score = fragmentation_score_zone(cc->zone);
+ wmark_low = fragmentation_score_wmark(pgdat, true);
+
+ if (score > wmark_low)
+ ret = COMPACT_CONTINUE;
+ else
+ ret = COMPACT_SUCCESS;
+
+ goto out;
+ }
+
if (is_via_compact_memory(cc->order))
return COMPACT_CONTINUE;
@@ -1939,6 +2048,7 @@ static enum compact_result __compact_finished(struct compact_control *cc)
}
}
+out:
if (cc->contended || fatal_signal_pending(current))
ret = COMPACT_CONTENDED;
@@ -1966,7 +2076,7 @@ static enum compact_result compact_finished(struct compact_control *cc)
*/
static enum compact_result __compaction_suitable(struct zone *zone, int order,
unsigned int alloc_flags,
- int classzone_idx,
+ int highest_zoneidx,
unsigned long wmark_target)
{
unsigned long watermark;
@@ -1979,7 +2089,7 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order,
* If watermarks for high-order allocation are already met, there
* should be no need for compaction at all.
*/
- if (zone_watermark_ok(zone, order, watermark, classzone_idx,
+ if (zone_watermark_ok(zone, order, watermark, highest_zoneidx,
alloc_flags))
return COMPACT_SUCCESS;
@@ -1989,9 +2099,9 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order,
* watermark and alloc_flags have to match, or be more pessimistic than
* the check in __isolate_free_page(). We don't use the direct
* compactor's alloc_flags, as they are not relevant for freepage
- * isolation. We however do use the direct compactor's classzone_idx to
- * skip over zones where lowmem reserves would prevent allocation even
- * if compaction succeeds.
+ * isolation. We however do use the direct compactor's highest_zoneidx
+ * to skip over zones where lowmem reserves would prevent allocation
+ * even if compaction succeeds.
* For costly orders, we require low watermark instead of min for
* compaction to proceed to increase its chances.
* ALLOC_CMA is used, as pages in CMA pageblocks are considered
@@ -2000,7 +2110,7 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order,
watermark = (order > PAGE_ALLOC_COSTLY_ORDER) ?
low_wmark_pages(zone) : min_wmark_pages(zone);
watermark += compact_gap(order);
- if (!__zone_watermark_ok(zone, 0, watermark, classzone_idx,
+ if (!__zone_watermark_ok(zone, 0, watermark, highest_zoneidx,
ALLOC_CMA, wmark_target))
return COMPACT_SKIPPED;
@@ -2009,12 +2119,12 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order,
enum compact_result compaction_suitable(struct zone *zone, int order,
unsigned int alloc_flags,
- int classzone_idx)
+ int highest_zoneidx)
{
enum compact_result ret;
int fragindex;
- ret = __compaction_suitable(zone, order, alloc_flags, classzone_idx,
+ ret = __compaction_suitable(zone, order, alloc_flags, highest_zoneidx,
zone_page_state(zone, NR_FREE_PAGES));
/*
* fragmentation index determines if allocation failures are due to
@@ -2055,8 +2165,8 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
* Make sure at least one zone would pass __compaction_suitable if we continue
* retrying the reclaim.
*/
- for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
- ac->nodemask) {
+ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
+ ac->highest_zoneidx, ac->nodemask) {
unsigned long available;
enum compact_result compact_result;
@@ -2069,7 +2179,7 @@ bool compaction_zonelist_suitable(struct alloc_context *ac, int order,
available = zone_reclaimable_pages(zone) / order;
available += zone_page_state_snapshot(zone, NR_FREE_PAGES);
compact_result = __compaction_suitable(zone, order, alloc_flags,
- ac_classzone_idx(ac), available);
+ ac->highest_zoneidx, available);
if (compact_result != COMPACT_SKIPPED)
return true;
}
@@ -2098,9 +2208,9 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
INIT_LIST_HEAD(&cc->freepages);
INIT_LIST_HEAD(&cc->migratepages);
- cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask);
+ cc->migratetype = gfp_migratetype(cc->gfp_mask);
ret = compaction_suitable(cc->zone, cc->order, cc->alloc_flags,
- cc->classzone_idx);
+ cc->highest_zoneidx);
/* Compaction is likely to fail */
if (ret == COMPACT_SUCCESS || ret == COMPACT_SKIPPED)
return ret;
@@ -2243,15 +2353,11 @@ check_drain:
* would succeed.
*/
if (cc->order > 0 && last_migrated_pfn) {
- int cpu;
unsigned long current_block_start =
block_start_pfn(cc->migrate_pfn, cc->order);
if (last_migrated_pfn < current_block_start) {
- cpu = get_cpu();
- lru_add_drain_cpu(cpu);
- drain_local_pages(cc->zone);
- put_cpu();
+ lru_add_drain_cpu_zone(cc->zone);
/* No more flushing until we migrate again */
last_migrated_pfn = 0;
}
@@ -2295,7 +2401,7 @@ out:
static enum compact_result compact_zone_order(struct zone *zone, int order,
gfp_t gfp_mask, enum compact_priority prio,
- unsigned int alloc_flags, int classzone_idx,
+ unsigned int alloc_flags, int highest_zoneidx,
struct page **capture)
{
enum compact_result ret;
@@ -2307,7 +2413,7 @@ static enum compact_result compact_zone_order(struct zone *zone, int order,
.mode = (prio == COMPACT_PRIO_ASYNC) ?
MIGRATE_ASYNC : MIGRATE_SYNC_LIGHT,
.alloc_flags = alloc_flags,
- .classzone_idx = classzone_idx,
+ .highest_zoneidx = highest_zoneidx,
.direct_compaction = true,
.whole_zone = (prio == MIN_COMPACT_PRIORITY),
.ignore_skip_hint = (prio == MIN_COMPACT_PRIORITY),
@@ -2318,15 +2424,26 @@ static enum compact_result compact_zone_order(struct zone *zone, int order,
.page = NULL,
};
- current->capture_control = &capc;
+ /*
+ * Make sure the structs are really initialized before we expose the
+ * capture control, in case we are interrupted and the interrupt handler
+ * frees a page.
+ */
+ barrier();
+ WRITE_ONCE(current->capture_control, &capc);
ret = compact_zone(&cc, &capc);
VM_BUG_ON(!list_empty(&cc.freepages));
VM_BUG_ON(!list_empty(&cc.migratepages));
- *capture = capc.page;
- current->capture_control = NULL;
+ /*
+ * Make sure we hide capture control first before we read the captured
+ * page pointer, otherwise an interrupt could free and capture a page
+ * and we would leak it.
+ */
+ WRITE_ONCE(current->capture_control, NULL);
+ *capture = READ_ONCE(capc.page);
return ret;
}
@@ -2363,8 +2480,8 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
trace_mm_compaction_try_to_compact_pages(order, gfp_mask, prio);
/* Compact each zone in the list */
- for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
- ac->nodemask) {
+ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
+ ac->highest_zoneidx, ac->nodemask) {
enum compact_result status;
if (prio > MIN_COMPACT_PRIORITY
@@ -2374,7 +2491,7 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
}
status = compact_zone_order(zone, order, gfp_mask, prio,
- alloc_flags, ac_classzone_idx(ac), capture);
+ alloc_flags, ac->highest_zoneidx, capture);
rc = max(status, rc);
/* The allocation should succeed, stop compacting */
@@ -2412,6 +2529,41 @@ enum compact_result try_to_compact_pages(gfp_t gfp_mask, unsigned int order,
return rc;
}
+/*
+ * Proactively compact every populated zone of a node, aiming to bring each
+ * zone's fragmentation score within the thresholds derived from the
+ * proactiveness tunable.
+ *
+ * The function may return before the score targets are reached because of
+ * back-off conditions such as contention on per-node or per-zone locks.
+ */
+static void proactive_compact_node(pg_data_t *pgdat)
+{
+	struct compact_control cc = {
+		.order = -1,
+		.mode = MIGRATE_SYNC_LIGHT,
+		.ignore_skip_hint = true,
+		.whole_zone = true,
+		.gfp_mask = GFP_KERNEL,
+		.proactive_compaction = true,
+	};
+	int idx;
+
+	for (idx = 0; idx < MAX_NR_ZONES; idx++) {
+		struct zone *z = &pgdat->node_zones[idx];
+
+		if (populated_zone(z)) {
+			cc.zone = z;
+
+			compact_zone(&cc, NULL);
+
+			/* Both work lists must be empty once a zone is done. */
+			VM_BUG_ON(!list_empty(&cc.freepages));
+			VM_BUG_ON(!list_empty(&cc.migratepages));
+		}
+	}
+}
/* Compact all zones within a node */
static void compact_node(int nid)
@@ -2459,11 +2611,18 @@ static void compact_nodes(void)
int sysctl_compact_memory;
/*
+ * Tunable for proactive compaction. It determines how
+ * aggressively the kernel should compact memory in the
+ * background. It takes values in the range [0, 100].
+ */
+unsigned int __read_mostly sysctl_compaction_proactiveness = 20;
+
+/*
* This is the entry point for compacting all nodes via
* /proc/sys/vm/compact_memory
*/
int sysctl_compaction_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos)
+ void *buffer, size_t *length, loff_t *ppos)
{
if (write)
compact_nodes();
@@ -2509,16 +2668,16 @@ static bool kcompactd_node_suitable(pg_data_t *pgdat)
{
int zoneid;
struct zone *zone;
- enum zone_type classzone_idx = pgdat->kcompactd_classzone_idx;
+ enum zone_type highest_zoneidx = pgdat->kcompactd_highest_zoneidx;
- for (zoneid = 0; zoneid <= classzone_idx; zoneid++) {
+ for (zoneid = 0; zoneid <= highest_zoneidx; zoneid++) {
zone = &pgdat->node_zones[zoneid];
if (!populated_zone(zone))
continue;
if (compaction_suitable(zone, pgdat->kcompactd_max_order, 0,
- classzone_idx) == COMPACT_CONTINUE)
+ highest_zoneidx) == COMPACT_CONTINUE)
return true;
}
@@ -2536,16 +2695,16 @@ static void kcompactd_do_work(pg_data_t *pgdat)
struct compact_control cc = {
.order = pgdat->kcompactd_max_order,
.search_order = pgdat->kcompactd_max_order,
- .classzone_idx = pgdat->kcompactd_classzone_idx,
+ .highest_zoneidx = pgdat->kcompactd_highest_zoneidx,
.mode = MIGRATE_SYNC_LIGHT,
.ignore_skip_hint = false,
.gfp_mask = GFP_KERNEL,
};
trace_mm_compaction_kcompactd_wake(pgdat->node_id, cc.order,
- cc.classzone_idx);
+ cc.highest_zoneidx);
count_compact_event(KCOMPACTD_WAKE);
- for (zoneid = 0; zoneid <= cc.classzone_idx; zoneid++) {
+ for (zoneid = 0; zoneid <= cc.highest_zoneidx; zoneid++) {
int status;
zone = &pgdat->node_zones[zoneid];
@@ -2594,16 +2753,16 @@ static void kcompactd_do_work(pg_data_t *pgdat)
/*
* Regardless of success, we are done until woken up next. But remember
- * the requested order/classzone_idx in case it was higher/tighter than
- * our current ones
+ * the requested order/highest_zoneidx in case it was higher/tighter
+ * than our current ones
*/
if (pgdat->kcompactd_max_order <= cc.order)
pgdat->kcompactd_max_order = 0;
- if (pgdat->kcompactd_classzone_idx >= cc.classzone_idx)
- pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1;
+ if (pgdat->kcompactd_highest_zoneidx >= cc.highest_zoneidx)
+ pgdat->kcompactd_highest_zoneidx = pgdat->nr_zones - 1;
}
-void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx)
+void wakeup_kcompactd(pg_data_t *pgdat, int order, int highest_zoneidx)
{
if (!order)
return;
@@ -2611,8 +2770,8 @@ void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx)
if (pgdat->kcompactd_max_order < order)
pgdat->kcompactd_max_order = order;
- if (pgdat->kcompactd_classzone_idx > classzone_idx)
- pgdat->kcompactd_classzone_idx = classzone_idx;
+ if (pgdat->kcompactd_highest_zoneidx > highest_zoneidx)
+ pgdat->kcompactd_highest_zoneidx = highest_zoneidx;
/*
* Pairs with implicit barrier in wait_event_freezable()
@@ -2625,7 +2784,7 @@ void wakeup_kcompactd(pg_data_t *pgdat, int order, int classzone_idx)
return;
trace_mm_compaction_wakeup_kcompactd(pgdat->node_id, order,
- classzone_idx);
+ highest_zoneidx);
wake_up_interruptible(&pgdat->kcompactd_wait);
}
@@ -2637,6 +2796,7 @@ static int kcompactd(void *p)
{
pg_data_t *pgdat = (pg_data_t*)p;
struct task_struct *tsk = current;
+ unsigned int proactive_defer = 0;
const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
@@ -2646,18 +2806,40 @@ static int kcompactd(void *p)
set_freezable();
pgdat->kcompactd_max_order = 0;
- pgdat->kcompactd_classzone_idx = pgdat->nr_zones - 1;
+ pgdat->kcompactd_highest_zoneidx = pgdat->nr_zones - 1;
while (!kthread_should_stop()) {
unsigned long pflags;
trace_mm_compaction_kcompactd_sleep(pgdat->node_id);
- wait_event_freezable(pgdat->kcompactd_wait,
- kcompactd_work_requested(pgdat));
+ if (wait_event_freezable_timeout(pgdat->kcompactd_wait,
+ kcompactd_work_requested(pgdat),
+ msecs_to_jiffies(HPAGE_FRAG_CHECK_INTERVAL_MSEC))) {
+
+ psi_memstall_enter(&pflags);
+ kcompactd_do_work(pgdat);
+ psi_memstall_leave(&pflags);
+ continue;
+ }
- psi_memstall_enter(&pflags);
- kcompactd_do_work(pgdat);
- psi_memstall_leave(&pflags);
+ /* kcompactd wait timeout */
+ if (should_proactive_compact_node(pgdat)) {
+ unsigned int prev_score, score;
+
+ if (proactive_defer) {
+ proactive_defer--;
+ continue;
+ }
+ prev_score = fragmentation_score_node(pgdat);
+ proactive_compact_node(pgdat);
+ score = fragmentation_score_node(pgdat);
+ /*
+ * Defer proactive compaction if the fragmentation
+ * score did not go down i.e. no progress made.
+ */
+ proactive_defer = score < prev_score ?
+ 0 : 1 << COMPACT_MAX_DEFER_SHIFT;
+ }
}
return 0;
diff --git a/mm/debug.c b/mm/debug.c
index 2189357f0987..ca8d1cacdecc 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -69,8 +69,19 @@ void __dump_page(struct page *page, const char *reason)
}
if (page < head || (page >= head + MAX_ORDER_NR_PAGES)) {
- /* Corrupt page, cannot call page_mapping */
- mapping = page->mapping;
+ /*
+ * Corrupt page, so we cannot call page_mapping. Instead, do a
+ * safe subset of the steps that page_mapping() does. Caution:
+ * this will be misleading for tail pages, PageSwapCache pages,
+ * and potentially other situations. (See the page_mapping()
+ * implementation for what's missing here.)
+ */
+ unsigned long tmp = (unsigned long)page->mapping;
+
+ if (tmp & PAGE_MAPPING_ANON)
+ mapping = NULL;
+ else
+ mapping = (void *)(tmp & ~PAGE_MAPPING_FLAGS);
head = page;
compound = false;
} else {
@@ -84,42 +95,76 @@ void __dump_page(struct page *page, const char *reason)
*/
mapcount = PageSlab(head) ? 0 : page_mapcount(page);
- if (compound)
+ pr_warn("page:%p refcount:%d mapcount:%d mapping:%p index:%#lx pfn:%#lx\n",
+ page, page_ref_count(head), mapcount, mapping,
+ page_to_pgoff(page), page_to_pfn(page));
+ if (compound) {
if (hpage_pincount_available(page)) {
- pr_warn("page:%px refcount:%d mapcount:%d mapping:%p "
- "index:%#lx head:%px order:%u "
- "compound_mapcount:%d compound_pincount:%d\n",
- page, page_ref_count(head), mapcount,
- mapping, page_to_pgoff(page), head,
- compound_order(head), compound_mapcount(page),
- compound_pincount(page));
+ pr_warn("head:%p order:%u compound_mapcount:%d compound_pincount:%d\n",
+ head, compound_order(head),
+ head_mapcount(head),
+ head_pincount(head));
} else {
- pr_warn("page:%px refcount:%d mapcount:%d mapping:%p "
- "index:%#lx head:%px order:%u "
- "compound_mapcount:%d\n",
- page, page_ref_count(head), mapcount,
- mapping, page_to_pgoff(page), head,
- compound_order(head), compound_mapcount(page));
+ pr_warn("head:%p order:%u compound_mapcount:%d\n",
+ head, compound_order(head),
+ head_mapcount(head));
}
- else
- pr_warn("page:%px refcount:%d mapcount:%d mapping:%p index:%#lx\n",
- page, page_ref_count(page), mapcount,
- mapping, page_to_pgoff(page));
+ }
if (PageKsm(page))
type = "ksm ";
else if (PageAnon(page))
type = "anon ";
else if (mapping) {
- if (mapping->host && mapping->host->i_dentry.first) {
- struct dentry *dentry;
- dentry = container_of(mapping->host->i_dentry.first, struct dentry, d_u.d_alias);
- pr_warn("%ps name:\"%pd\"\n", mapping->a_ops, dentry);
- } else
- pr_warn("%ps\n", mapping->a_ops);
+ struct inode *host;
+ const struct address_space_operations *a_ops;
+ struct hlist_node *dentry_first;
+ struct dentry *dentry_ptr;
+ struct dentry dentry;
+
+ /*
+ * mapping can be invalid pointer and we don't want to crash
+ * accessing it, so probe everything depending on it carefully
+ */
+ if (get_kernel_nofault(host, &mapping->host) ||
+ get_kernel_nofault(a_ops, &mapping->a_ops)) {
+ pr_warn("failed to read mapping contents, not a valid kernel address?\n");
+ goto out_mapping;
+ }
+
+ if (!host) {
+ pr_warn("aops:%ps\n", a_ops);
+ goto out_mapping;
+ }
+
+ if (get_kernel_nofault(dentry_first, &host->i_dentry.first)) {
+ pr_warn("aops:%ps with invalid host inode %px\n",
+ a_ops, host);
+ goto out_mapping;
+ }
+
+ if (!dentry_first) {
+ pr_warn("aops:%ps ino:%lx\n", a_ops, host->i_ino);
+ goto out_mapping;
+ }
+
+ dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias);
+ if (get_kernel_nofault(dentry, dentry_ptr)) {
+ pr_warn("aops:%ps with invalid dentry %px\n", a_ops,
+ dentry_ptr);
+ } else {
+ /*
+ * if dentry is corrupted, the %pd handler may still
+ * crash, but it's unlikely that we reach here with a
+ * corrupted struct page
+ */
+ pr_warn("aops:%ps ino:%lx dentry name:\"%pd\"\n",
+ a_ops, host->i_ino, &dentry);
+ }
}
+out_mapping:
BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1);
- pr_warn("%sflags: %#lx(%pGp)%s\n", type, page->flags, &page->flags,
+ pr_warn("%sflags: %#lx(%pGp)%s\n", type, head->flags, &head->flags,
page_cma ? " CMA" : "");
hex_only:
diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c
new file mode 100644
index 000000000000..086309fb9b6f
--- /dev/null
+++ b/mm/debug_vm_pgtable.c
@@ -0,0 +1,1049 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * This kernel test validates architecture page table helpers and
+ * accessors and helps in verifying their continued compliance with
+ * expected generic MM semantics.
+ *
+ * Copyright (C) 2019 ARM Ltd.
+ *
+ * Author: Anshuman Khandual <anshuman.khandual@arm.com>
+ */
+#define pr_fmt(fmt) "debug_vm_pgtable: [%-25s]: " fmt, __func__
+
+#include <linux/gfp.h>
+#include <linux/highmem.h>
+#include <linux/hugetlb.h>
+#include <linux/kernel.h>
+#include <linux/kconfig.h>
+#include <linux/mm.h>
+#include <linux/mman.h>
+#include <linux/mm_types.h>
+#include <linux/module.h>
+#include <linux/pfn_t.h>
+#include <linux/printk.h>
+#include <linux/pgtable.h>
+#include <linux/random.h>
+#include <linux/spinlock.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
+#include <linux/start_kernel.h>
+#include <linux/sched/mm.h>
+#include <asm/pgalloc.h>
+#include <asm/tlbflush.h>
+
+/*
+ * Please refer to Documentation/vm/arch_pgtable_helpers.rst for the semantic
+ * expectations that are being validated here. All future changes in here
+ * or the documentation need to be in sync.
+ */
+
+#define VMFLAGS (VM_READ|VM_WRITE|VM_EXEC)
+
+/*
+ * On s390 platform, the lower 4 bits are used to identify given page table
+ * entry type. But these bits might affect the ability to clear entries with
+ * pxx_clear() because of how dynamic page table folding works on s390. So
+ * while loading up the entries do not change the lower 4 bits. This does not
+ * affect any other platform.
+ */
+#define S390_MASK_BITS 4
+#define RANDOM_ORVALUE GENMASK(BITS_PER_LONG - 1, S390_MASK_BITS)
+#define RANDOM_NZVALUE GENMASK(7, 0)
+
+/*
+ * Check the basic PTE flag helpers on an entry built from @pfn and @prot:
+ * whichever of a set/clear pair is applied last decides what the matching
+ * predicate reports.
+ */
+static void __init pte_basic_tests(unsigned long pfn, pgprot_t prot)
+{
+	pte_t entry = pfn_pte(pfn, prot);
+	pte_t tmp;
+
+	pr_debug("Validating PTE basic\n");
+	WARN_ON(!pte_same(entry, entry));
+
+	/* Flag set last: the predicate must be true. */
+	tmp = pte_mkyoung(pte_mkold(entry));
+	WARN_ON(!pte_young(tmp));
+	tmp = pte_mkdirty(pte_mkclean(entry));
+	WARN_ON(!pte_dirty(tmp));
+	tmp = pte_mkwrite(pte_wrprotect(entry));
+	WARN_ON(!pte_write(tmp));
+
+	/* Flag cleared last: the predicate must be false. */
+	tmp = pte_mkold(pte_mkyoung(entry));
+	WARN_ON(pte_young(tmp));
+	tmp = pte_mkclean(pte_mkdirty(entry));
+	WARN_ON(pte_dirty(tmp));
+	tmp = pte_wrprotect(pte_mkwrite(entry));
+	WARN_ON(pte_write(tmp));
+}
+
+/*
+ * Exercise the PTE helpers that operate on a page table slot.
+ *
+ * Each step installs an entry for @pfn/@prot at @ptep (address @vaddr in
+ * @mm), applies one helper to the slot and checks the entry read back.
+ * @vma is passed to the helpers that take a VMA instead of an mm.
+ */
+static void __init pte_advanced_tests(struct mm_struct *mm,
+		struct vm_area_struct *vma, pte_t *ptep,
+		unsigned long pfn, unsigned long vaddr,
+		pgprot_t prot)
+{
+	pte_t pte;
+
+	pr_debug("Validating PTE advanced\n");
+	/* ptep_set_wrprotect() must leave the entry non-writable. */
+	pte = pfn_pte(pfn, prot);
+	set_pte_at(mm, vaddr, ptep, pte);
+	ptep_set_wrprotect(mm, vaddr, ptep);
+	pte = ptep_get(ptep);
+	WARN_ON(pte_write(pte));
+
+	/* ptep_get_and_clear() must leave the slot empty. */
+	pte = pfn_pte(pfn, prot);
+	set_pte_at(mm, vaddr, ptep, pte);
+	ptep_get_and_clear(mm, vaddr, ptep);
+	pte = ptep_get(ptep);
+	WARN_ON(!pte_none(pte));
+
+	/*
+	 * Install a clean, write-protected entry, then ask for a writable
+	 * and dirty one through ptep_set_access_flags().
+	 */
+	pte = pfn_pte(pfn, prot);
+	pte = pte_wrprotect(pte);
+	pte = pte_mkclean(pte);
+	set_pte_at(mm, vaddr, ptep, pte);
+	pte = pte_mkwrite(pte);
+	pte = pte_mkdirty(pte);
+	ptep_set_access_flags(vma, vaddr, ptep, pte, 1);
+	pte = ptep_get(ptep);
+	WARN_ON(!(pte_write(pte) && pte_dirty(pte)));
+
+	/* ptep_get_and_clear_full() must leave the slot empty as well. */
+	pte = pfn_pte(pfn, prot);
+	set_pte_at(mm, vaddr, ptep, pte);
+	ptep_get_and_clear_full(mm, vaddr, ptep, 1);
+	pte = ptep_get(ptep);
+	WARN_ON(!pte_none(pte));
+
+	/*
+	 * Rebuild the entry before the young test. The slot was just
+	 * cleared, so the value read back above is an empty entry; marking
+	 * that one young would not exercise a real mapping.
+	 */
+	pte = pfn_pte(pfn, prot);
+	pte = pte_mkyoung(pte);
+	set_pte_at(mm, vaddr, ptep, pte);
+	ptep_test_and_clear_young(vma, vaddr, ptep);
+	pte = ptep_get(ptep);
+	WARN_ON(pte_young(pte));
+}
+
+/*
+ * Check that the saved-write helpers behave as a set/clear pair on a PTE
+ * built from @pfn and @prot.
+ */
+static void __init pte_savedwrite_tests(unsigned long pfn, pgprot_t prot)
+{
+	pte_t entry = pfn_pte(pfn, prot);
+	pte_t tmp;
+
+	pr_debug("Validating PTE saved write\n");
+
+	tmp = pte_mk_savedwrite(pte_clear_savedwrite(entry));
+	WARN_ON(!pte_savedwrite(tmp));
+
+	tmp = pte_clear_savedwrite(pte_mk_savedwrite(entry));
+	WARN_ON(pte_savedwrite(tmp));
+}
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/*
+ * Check the basic PMD flag helpers on an entry built from @pfn and @prot.
+ * Nothing is checked when has_transparent_hugepage() returns false.
+ */
+static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot)
+{
+	pmd_t entry = pfn_pmd(pfn, prot);
+	pmd_t tmp;
+
+	if (!has_transparent_hugepage())
+		return;
+
+	pr_debug("Validating PMD basic\n");
+	WARN_ON(!pmd_same(entry, entry));
+
+	/* Flag set last: the predicate must be true. */
+	tmp = pmd_mkyoung(pmd_mkold(entry));
+	WARN_ON(!pmd_young(tmp));
+	tmp = pmd_mkdirty(pmd_mkclean(entry));
+	WARN_ON(!pmd_dirty(tmp));
+	tmp = pmd_mkwrite(pmd_wrprotect(entry));
+	WARN_ON(!pmd_write(tmp));
+
+	/* Flag cleared last: the predicate must be false. */
+	tmp = pmd_mkold(pmd_mkyoung(entry));
+	WARN_ON(pmd_young(tmp));
+	tmp = pmd_mkclean(pmd_mkdirty(entry));
+	WARN_ON(pmd_dirty(tmp));
+	tmp = pmd_wrprotect(pmd_mkwrite(entry));
+	WARN_ON(pmd_write(tmp));
+
+	/*
+	 * A huge entry maps memory directly instead of pointing to a next
+	 * level page table, so it has to be reported by pmd_bad().
+	 */
+	tmp = pmd_mkhuge(entry);
+	WARN_ON(!pmd_bad(tmp));
+}
+
+/*
+ * Exercise the PMD helpers that operate on a page table slot.
+ *
+ * Each step installs an entry for @pfn/@prot at @pmdp (at a PMD-aligned
+ * address derived from @vaddr in @mm), applies one helper to the slot and
+ * checks the entry read back. Nothing is checked when
+ * has_transparent_hugepage() returns false.
+ */
+static void __init pmd_advanced_tests(struct mm_struct *mm,
+		struct vm_area_struct *vma, pmd_t *pmdp,
+		unsigned long pfn, unsigned long vaddr,
+		pgprot_t prot)
+{
+	pmd_t pmd;
+
+	if (!has_transparent_hugepage())
+		return;
+
+	pr_debug("Validating PMD advanced\n");
+	/* Align the address wrt HPAGE_PMD_SIZE */
+	vaddr = (vaddr & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE;
+
+	/* pmdp_set_wrprotect() must leave the entry non-writable. */
+	pmd = pfn_pmd(pfn, prot);
+	set_pmd_at(mm, vaddr, pmdp, pmd);
+	pmdp_set_wrprotect(mm, vaddr, pmdp);
+	pmd = READ_ONCE(*pmdp);
+	WARN_ON(pmd_write(pmd));
+
+	/* pmdp_huge_get_and_clear() must leave the slot empty. */
+	pmd = pfn_pmd(pfn, prot);
+	set_pmd_at(mm, vaddr, pmdp, pmd);
+	pmdp_huge_get_and_clear(mm, vaddr, pmdp);
+	pmd = READ_ONCE(*pmdp);
+	WARN_ON(!pmd_none(pmd));
+
+	/*
+	 * Install a clean, write-protected entry, then ask for a writable
+	 * and dirty one through pmdp_set_access_flags().
+	 */
+	pmd = pfn_pmd(pfn, prot);
+	pmd = pmd_wrprotect(pmd);
+	pmd = pmd_mkclean(pmd);
+	set_pmd_at(mm, vaddr, pmdp, pmd);
+	pmd = pmd_mkwrite(pmd);
+	pmd = pmd_mkdirty(pmd);
+	pmdp_set_access_flags(vma, vaddr, pmdp, pmd, 1);
+	pmd = READ_ONCE(*pmdp);
+	WARN_ON(!(pmd_write(pmd) && pmd_dirty(pmd)));
+
+	/* pmdp_huge_get_and_clear_full() must leave the slot empty as well. */
+	pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
+	set_pmd_at(mm, vaddr, pmdp, pmd);
+	pmdp_huge_get_and_clear_full(vma, vaddr, pmdp, 1);
+	pmd = READ_ONCE(*pmdp);
+	WARN_ON(!pmd_none(pmd));
+
+	/*
+	 * Rebuild a huge entry before the young test. The slot was just
+	 * cleared, so the value read back above is an empty entry; marking
+	 * that one young would not exercise a real mapping.
+	 */
+	pmd = pmd_mkhuge(pfn_pmd(pfn, prot));
+	pmd = pmd_mkyoung(pmd);
+	set_pmd_at(mm, vaddr, pmdp, pmd);
+	pmdp_test_and_clear_young(vma, vaddr, pmdp);
+	pmd = READ_ONCE(*pmdp);
+	WARN_ON(pmd_young(pmd));
+}
+
+/* Check that a huge PMD built from @pfn and @prot is reported as a leaf. */
+static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot)
+{
+	pmd_t base = pfn_pmd(pfn, prot);
+	pmd_t huge;
+
+	pr_debug("Validating PMD leaf\n");
+
+	/* A PMD based THP mapping is a leaf entry. */
+	huge = pmd_mkhuge(base);
+	WARN_ON(!pmd_leaf(huge));
+}
+
+/*
+ * Check pmd_set_huge()/pmd_clear_huge() on the slot at @pmdp. Only runs
+ * when CONFIG_HAVE_ARCH_HUGE_VMAP is enabled.
+ */
+static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot)
+{
+	pmd_t after;
+
+	if (IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP)) {
+		pr_debug("Validating PMD huge\n");
+		/*
+		 * The x86 pmd_set_huge() checks that the PMD is not a
+		 * populated non-leaf entry, so start from an empty slot.
+		 */
+		WRITE_ONCE(*pmdp, __pmd(0));
+		WARN_ON(!pmd_set_huge(pmdp, __pfn_to_phys(pfn), prot));
+		WARN_ON(!pmd_clear_huge(pmdp));
+		after = READ_ONCE(*pmdp);
+		WARN_ON(!pmd_none(after));
+	}
+}
+
+/*
+ * Check that the saved-write helpers behave as a set/clear pair on a PMD
+ * built from @pfn and @prot.
+ */
+static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot)
+{
+	pmd_t entry = pfn_pmd(pfn, prot);
+	pmd_t tmp;
+
+	pr_debug("Validating PMD saved write\n");
+
+	tmp = pmd_mk_savedwrite(pmd_clear_savedwrite(entry));
+	WARN_ON(!pmd_savedwrite(tmp));
+
+	tmp = pmd_clear_savedwrite(pmd_mk_savedwrite(entry));
+	WARN_ON(pmd_savedwrite(tmp));
+}
+
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+/*
+ * Check the basic PUD flag helpers on an entry built from @pfn and @prot.
+ * Nothing is checked when has_transparent_hugepage() returns false.
+ */
+static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot)
+{
+	pud_t entry = pfn_pud(pfn, prot);
+	pud_t tmp;
+
+	if (!has_transparent_hugepage())
+		return;
+
+	pr_debug("Validating PUD basic\n");
+	WARN_ON(!pud_same(entry, entry));
+
+	tmp = pud_mkyoung(pud_mkold(entry));
+	WARN_ON(!pud_young(tmp));
+	tmp = pud_mkwrite(pud_wrprotect(entry));
+	WARN_ON(!pud_write(tmp));
+	tmp = pud_wrprotect(pud_mkwrite(entry));
+	WARN_ON(pud_write(tmp));
+	tmp = pud_mkold(pud_mkyoung(entry));
+	WARN_ON(pud_young(tmp));
+
+	/*
+	 * NOTE(review): no "mm" is declared in this function; this presumably
+	 * builds only where mm_pmd_folded() is a macro that ignores its
+	 * argument - confirm before enabling this on other architectures.
+	 */
+	if (mm_pmd_folded(mm))
+		return;
+
+	/*
+	 * A huge entry maps memory directly instead of pointing to a next
+	 * level page table, so it has to be reported by pud_bad().
+	 */
+	tmp = pud_mkhuge(entry);
+	WARN_ON(!pud_bad(tmp));
+}
+
+/*
+ * Exercise the PUD helpers that operate on a page table slot.
+ *
+ * Each step installs an entry at @pudp (at a PUD-aligned address derived
+ * from @vaddr in @mm), applies one helper to the slot and checks the entry
+ * read back. Nothing is checked when has_transparent_hugepage() returns
+ * false.
+ */
+static void __init pud_advanced_tests(struct mm_struct *mm,
+		struct vm_area_struct *vma, pud_t *pudp,
+		unsigned long pfn, unsigned long vaddr,
+		pgprot_t prot)
+{
+	pud_t entry = pfn_pud(pfn, prot);
+
+	if (!has_transparent_hugepage())
+		return;
+
+	pr_debug("Validating PUD advanced\n");
+	/* Align the address wrt HPAGE_PUD_SIZE */
+	vaddr &= HPAGE_PUD_MASK;
+	vaddr += HPAGE_PUD_SIZE;
+
+	/* pudp_set_wrprotect() must leave the entry non-writable. */
+	set_pud_at(mm, vaddr, pudp, entry);
+	pudp_set_wrprotect(mm, vaddr, pudp);
+	entry = READ_ONCE(*pudp);
+	WARN_ON(pud_write(entry));
+
+#ifndef __PAGETABLE_PMD_FOLDED
+	/* Both get-and-clear variants must leave the slot empty. */
+	entry = pfn_pud(pfn, prot);
+	set_pud_at(mm, vaddr, pudp, entry);
+	pudp_huge_get_and_clear(mm, vaddr, pudp);
+	entry = READ_ONCE(*pudp);
+	WARN_ON(!pud_none(entry));
+
+	entry = pfn_pud(pfn, prot);
+	set_pud_at(mm, vaddr, pudp, entry);
+	pudp_huge_get_and_clear_full(mm, vaddr, pudp, 1);
+	entry = READ_ONCE(*pudp);
+	WARN_ON(!pud_none(entry));
+#endif /* __PAGETABLE_PMD_FOLDED */
+	/*
+	 * Install a clean, write-protected entry, then ask for a writable
+	 * and dirty one through pudp_set_access_flags().
+	 */
+	entry = pud_mkclean(pud_wrprotect(pfn_pud(pfn, prot)));
+	set_pud_at(mm, vaddr, pudp, entry);
+	entry = pud_mkdirty(pud_mkwrite(entry));
+	pudp_set_access_flags(vma, vaddr, pudp, entry, 1);
+	entry = READ_ONCE(*pudp);
+	WARN_ON(!(pud_write(entry) && pud_dirty(entry)));
+
+	/* pudp_test_and_clear_young() must leave the entry old. */
+	entry = pud_mkyoung(entry);
+	set_pud_at(mm, vaddr, pudp, entry);
+	pudp_test_and_clear_young(vma, vaddr, pudp);
+	entry = READ_ONCE(*pudp);
+	WARN_ON(pud_young(entry));
+}
+
+/* Check that a huge PUD built from @pfn and @prot is reported as a leaf. */
+static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot)
+{
+	pud_t base = pfn_pud(pfn, prot);
+	pud_t huge;
+
+	pr_debug("Validating PUD leaf\n");
+
+	/* A PUD based THP mapping is a leaf entry. */
+	huge = pud_mkhuge(base);
+	WARN_ON(!pud_leaf(huge));
+}
+
+/*
+ * Check pud_set_huge()/pud_clear_huge() on the slot at @pudp. Only runs
+ * when CONFIG_HAVE_ARCH_HUGE_VMAP is enabled.
+ */
+static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
+{
+	pud_t after;
+
+	if (IS_ENABLED(CONFIG_HAVE_ARCH_HUGE_VMAP)) {
+		pr_debug("Validating PUD huge\n");
+		/*
+		 * The x86 pud_set_huge() checks that the PUD is not a
+		 * populated non-leaf entry, so start from an empty slot.
+		 */
+		WRITE_ONCE(*pudp, __pud(0));
+		WARN_ON(!pud_set_huge(pudp, __pfn_to_phys(pfn), prot));
+		WARN_ON(!pud_clear_huge(pudp));
+		after = READ_ONCE(*pudp);
+		WARN_ON(!pud_none(after));
+	}
+}
+#else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
+/* Without PUD level THP support the PUD tests are empty stubs. */
+static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot)
+{
+}
+
+static void __init pud_advanced_tests(struct mm_struct *mm,
+		struct vm_area_struct *vma, pud_t *pudp,
+		unsigned long pfn, unsigned long vaddr,
+		pgprot_t prot)
+{
+}
+
+static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot)
+{
+}
+
+static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
+{
+}
+#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
+#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
+/* Without CONFIG_TRANSPARENT_HUGEPAGE the PMD and PUD tests are empty stubs. */
+static void __init pmd_basic_tests(unsigned long pfn, pgprot_t prot)
+{
+}
+
+static void __init pud_basic_tests(unsigned long pfn, pgprot_t prot)
+{
+}
+
+static void __init pmd_advanced_tests(struct mm_struct *mm,
+		struct vm_area_struct *vma, pmd_t *pmdp,
+		unsigned long pfn, unsigned long vaddr,
+		pgprot_t prot)
+{
+}
+
+static void __init pud_advanced_tests(struct mm_struct *mm,
+		struct vm_area_struct *vma, pud_t *pudp,
+		unsigned long pfn, unsigned long vaddr,
+		pgprot_t prot)
+{
+}
+
+static void __init pmd_leaf_tests(unsigned long pfn, pgprot_t prot)
+{
+}
+
+static void __init pud_leaf_tests(unsigned long pfn, pgprot_t prot)
+{
+}
+
+static void __init pmd_huge_tests(pmd_t *pmdp, unsigned long pfn, pgprot_t prot)
+{
+}
+
+static void __init pud_huge_tests(pud_t *pudp, unsigned long pfn, pgprot_t prot)
+{
+}
+
+static void __init pmd_savedwrite_tests(unsigned long pfn, pgprot_t prot)
+{
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+/*
+ * Check that a P4D entry compares equal to itself. The entry is filled with
+ * the RANDOM_NZVALUE byte pattern; @pfn and @prot are not used.
+ */
+static void __init p4d_basic_tests(unsigned long pfn, pgprot_t prot)
+{
+	p4d_t entry;
+
+	pr_debug("Validating P4D basic\n");
+	memset(&entry, RANDOM_NZVALUE, sizeof(entry));
+	WARN_ON(!p4d_same(entry, entry));
+}
+
+/*
+ * Check that a PGD entry compares equal to itself. The entry is filled with
+ * the RANDOM_NZVALUE byte pattern; @pfn and @prot are not used.
+ */
+static void __init pgd_basic_tests(unsigned long pfn, pgprot_t prot)
+{
+	pgd_t entry;
+
+	pr_debug("Validating PGD basic\n");
+	memset(&entry, RANDOM_NZVALUE, sizeof(entry));
+	WARN_ON(!pgd_same(entry, entry));
+}
+
+#ifndef __PAGETABLE_PUD_FOLDED
+/*
+ * Check that pud_clear() empties the slot at @pudp after it has been loaded
+ * with RANDOM_ORVALUE bits. Skipped when the PMD level is folded for @mm.
+ */
+static void __init pud_clear_tests(struct mm_struct *mm, pud_t *pudp)
+{
+	pud_t seen = READ_ONCE(*pudp);
+	pud_t scribbled;
+
+	if (mm_pmd_folded(mm))
+		return;
+
+	pr_debug("Validating PUD clear\n");
+	scribbled = __pud(pud_val(seen) | RANDOM_ORVALUE);
+	WRITE_ONCE(*pudp, scribbled);
+	pud_clear(pudp);
+	seen = READ_ONCE(*pudp);
+	WARN_ON(!pud_none(seen));
+}
+
+/*
+ * Check that a PUD populated with the PMD table at @pmdp is not reported by
+ * pud_bad(). Skipped when the PMD level is folded for @mm.
+ */
+static void __init pud_populate_tests(struct mm_struct *mm, pud_t *pudp,
+		pmd_t *pmdp)
+{
+	pud_t populated;
+
+	if (mm_pmd_folded(mm))
+		return;
+
+	pr_debug("Validating PUD populate\n");
+	/*
+	 * Once populated, the entry points to a next level page table
+	 * page, so pud_bad() must not flag it.
+	 */
+	pmd_clear(pmdp);
+	pud_clear(pudp);
+	pud_populate(mm, pudp, pmdp);
+	populated = READ_ONCE(*pudp);
+	WARN_ON(pud_bad(populated));
+}
+#else /* !__PAGETABLE_PUD_FOLDED */
+/* With the PUD level folded the PUD clear/populate tests are empty stubs. */
+static void __init pud_clear_tests(struct mm_struct *mm, pud_t *pudp)
+{
+}
+
+static void __init pud_populate_tests(struct mm_struct *mm, pud_t *pudp,
+		pmd_t *pmdp)
+{
+}
+#endif /* PAGETABLE_PUD_FOLDED */
+
+#ifndef __PAGETABLE_P4D_FOLDED
+/*
+ * Check that p4d_clear() empties the slot at @p4dp after it has been loaded
+ * with RANDOM_ORVALUE bits. Skipped when the PUD level is folded for @mm.
+ */
+static void __init p4d_clear_tests(struct mm_struct *mm, p4d_t *p4dp)
+{
+	p4d_t seen = READ_ONCE(*p4dp);
+	p4d_t scribbled;
+
+	if (mm_pud_folded(mm))
+		return;
+
+	pr_debug("Validating P4D clear\n");
+	scribbled = __p4d(p4d_val(seen) | RANDOM_ORVALUE);
+	WRITE_ONCE(*p4dp, scribbled);
+	p4d_clear(p4dp);
+	seen = READ_ONCE(*p4dp);
+	WARN_ON(!p4d_none(seen));
+}
+
+/*
+ * Check that a P4D populated with the PUD table at @pudp is not reported by
+ * p4d_bad(). Skipped when the PUD level is folded for @mm.
+ */
+static void __init p4d_populate_tests(struct mm_struct *mm, p4d_t *p4dp,
+		pud_t *pudp)
+{
+	p4d_t populated;
+
+	if (mm_pud_folded(mm))
+		return;
+
+	pr_debug("Validating P4D populate\n");
+	/*
+	 * Once populated, the entry points to a next level page table
+	 * page, so p4d_bad() must not flag it.
+	 */
+	pud_clear(pudp);
+	p4d_clear(p4dp);
+	p4d_populate(mm, p4dp, pudp);
+	populated = READ_ONCE(*p4dp);
+	WARN_ON(p4d_bad(populated));
+}
+
+/*
+ * Check that pgd_clear() empties the slot at @pgdp after it has been loaded
+ * with RANDOM_ORVALUE bits. Skipped when the P4D level is folded for @mm.
+ */
+static void __init pgd_clear_tests(struct mm_struct *mm, pgd_t *pgdp)
+{
+	pgd_t seen = READ_ONCE(*pgdp);
+	pgd_t scribbled;
+
+	if (mm_p4d_folded(mm))
+		return;
+
+	pr_debug("Validating PGD clear\n");
+	scribbled = __pgd(pgd_val(seen) | RANDOM_ORVALUE);
+	WRITE_ONCE(*pgdp, scribbled);
+	pgd_clear(pgdp);
+	seen = READ_ONCE(*pgdp);
+	WARN_ON(!pgd_none(seen));
+}
+
+/*
+ * Check that a PGD populated with the P4D table at @p4dp is not reported by
+ * pgd_bad(). Skipped when the P4D level is folded for @mm.
+ */
+static void __init pgd_populate_tests(struct mm_struct *mm, pgd_t *pgdp,
+		p4d_t *p4dp)
+{
+	pgd_t populated;
+
+	if (mm_p4d_folded(mm))
+		return;
+
+	pr_debug("Validating PGD populate\n");
+	/*
+	 * Once populated, the entry points to a next level page table
+	 * page, so pgd_bad() must not flag it.
+	 */
+	p4d_clear(p4dp);
+	pgd_clear(pgdp);
+	pgd_populate(mm, pgdp, p4dp);
+	populated = READ_ONCE(*pgdp);
+	WARN_ON(pgd_bad(populated));
+}
+#else /* !__PAGETABLE_P4D_FOLDED */
+/* With the P4D level folded the P4D and PGD tests are empty stubs. */
+static void __init p4d_clear_tests(struct mm_struct *mm, p4d_t *p4dp)
+{
+}
+
+static void __init pgd_clear_tests(struct mm_struct *mm, pgd_t *pgdp)
+{
+}
+
+static void __init p4d_populate_tests(struct mm_struct *mm, p4d_t *p4dp,
+		pud_t *pudp)
+{
+}
+
+static void __init pgd_populate_tests(struct mm_struct *mm, pgd_t *pgdp,
+		p4d_t *p4dp)
+{
+}
+#endif /* PAGETABLE_P4D_FOLDED */
+
+/*
+ * Check that pte_clear() empties the slot at @ptep (address @vaddr in @mm)
+ * after it has been loaded with RANDOM_ORVALUE bits.
+ */
+static void __init pte_clear_tests(struct mm_struct *mm, pte_t *ptep,
+		unsigned long vaddr)
+{
+	pte_t seen = ptep_get(ptep);
+	pte_t scribbled;
+
+	pr_debug("Validating PTE clear\n");
+	scribbled = __pte(pte_val(seen) | RANDOM_ORVALUE);
+	set_pte_at(mm, vaddr, ptep, scribbled);
+	barrier();
+	pte_clear(mm, vaddr, ptep);
+	seen = ptep_get(ptep);
+	WARN_ON(!pte_none(seen));
+}
+
+/*
+ * Check that pmd_clear() empties the slot at @pmdp after it has been loaded
+ * with RANDOM_ORVALUE bits.
+ */
+static void __init pmd_clear_tests(struct mm_struct *mm, pmd_t *pmdp)
+{
+	pmd_t seen = READ_ONCE(*pmdp);
+	pmd_t scribbled;
+
+	pr_debug("Validating PMD clear\n");
+	scribbled = __pmd(pmd_val(seen) | RANDOM_ORVALUE);
+	WRITE_ONCE(*pmdp, scribbled);
+	pmd_clear(pmdp);
+	seen = READ_ONCE(*pmdp);
+	WARN_ON(!pmd_none(seen));
+}
+
+/*
+ * Check that a PMD populated with @pgtable is not reported by pmd_bad().
+ */
+static void __init pmd_populate_tests(struct mm_struct *mm, pmd_t *pmdp,
+		pgtable_t pgtable)
+{
+	pmd_t populated;
+
+	pr_debug("Validating PMD populate\n");
+	/*
+	 * Once populated, the entry points to a next level page table
+	 * page, so pmd_bad() must not flag it.
+	 */
+	pmd_clear(pmdp);
+	pmd_populate(mm, pmdp, pgtable);
+	populated = READ_ONCE(*pmdp);
+	WARN_ON(pmd_bad(populated));
+}
+
+/*
+ * Check that pte_mkspecial() produces an entry pte_special() recognises.
+ * Only validated when CONFIG_ARCH_HAS_PTE_SPECIAL is enabled.
+ */
+static void __init pte_special_tests(unsigned long pfn, pgprot_t prot)
+{
+	pte_t base = pfn_pte(pfn, prot);
+	pte_t special;
+
+	if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) {
+		pr_debug("Validating PTE special\n");
+		special = pte_mkspecial(base);
+		WARN_ON(!pte_special(special));
+	}
+}
+
+/*
+ * Check that a PTE built from @prot is both pte_protnone() and
+ * pte_present(). The caller in this file passes __P000 as @prot.
+ * Only validated when CONFIG_NUMA_BALANCING is enabled.
+ */
+static void __init pte_protnone_tests(unsigned long pfn, pgprot_t prot)
+{
+	pte_t entry = pfn_pte(pfn, prot);
+
+	if (IS_ENABLED(CONFIG_NUMA_BALANCING)) {
+		pr_debug("Validating PTE protnone\n");
+		WARN_ON(!pte_protnone(entry));
+		WARN_ON(!pte_present(entry));
+	}
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/*
+ * Check that a huge PMD built from @prot is both pmd_protnone() and
+ * pmd_present(). The caller in this file passes __P000 as @prot.
+ * Only validated when CONFIG_NUMA_BALANCING is enabled.
+ */
+static void __init pmd_protnone_tests(unsigned long pfn, pgprot_t prot)
+{
+	pmd_t entry = pmd_mkhuge(pfn_pmd(pfn, prot));
+
+	if (IS_ENABLED(CONFIG_NUMA_BALANCING)) {
+		pr_debug("Validating PMD protnone\n");
+		WARN_ON(!pmd_protnone(entry));
+		WARN_ON(!pmd_present(entry));
+	}
+}
+#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
+static void __init pmd_protnone_tests(unsigned long pfn, pgprot_t prot) { } /* no-op stub without THP */
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+#ifdef CONFIG_ARCH_HAS_PTE_DEVMAP
+/* Check that pte_mkdevmap() produces an entry pte_devmap() recognises. */
+static void __init pte_devmap_tests(unsigned long pfn, pgprot_t prot)
+{
+	pte_t base = pfn_pte(pfn, prot);
+	pte_t devmap;
+
+	pr_debug("Validating PTE devmap\n");
+	devmap = pte_mkdevmap(base);
+	WARN_ON(!pte_devmap(devmap));
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/* Check that pmd_mkdevmap() produces an entry pmd_devmap() recognises. */
+static void __init pmd_devmap_tests(unsigned long pfn, pgprot_t prot)
+{
+	pmd_t base = pfn_pmd(pfn, prot);
+	pmd_t devmap;
+
+	pr_debug("Validating PMD devmap\n");
+	devmap = pmd_mkdevmap(base);
+	WARN_ON(!pmd_devmap(devmap));
+}
+
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+/* Check that pud_mkdevmap() produces an entry pud_devmap() recognises. */
+static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot)
+{
+	pud_t base = pfn_pud(pfn, prot);
+	pud_t devmap;
+
+	pr_debug("Validating PUD devmap\n");
+	devmap = pud_mkdevmap(base);
+	WARN_ON(!pud_devmap(devmap));
+}
+#else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
+static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot) { } /* no-op stub without PUD THP */
+#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
+#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
+static void __init pmd_devmap_tests(unsigned long pfn, pgprot_t prot) { } /* no-op stub without THP */
+static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot) { } /* no-op stub without THP */
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#else
+static void __init pte_devmap_tests(unsigned long pfn, pgprot_t prot) { } /* no-op stub without ARCH_HAS_PTE_DEVMAP */
+static void __init pmd_devmap_tests(unsigned long pfn, pgprot_t prot) { } /* no-op stub without ARCH_HAS_PTE_DEVMAP */
+static void __init pud_devmap_tests(unsigned long pfn, pgprot_t prot) { } /* no-op stub without ARCH_HAS_PTE_DEVMAP */
+#endif /* CONFIG_ARCH_HAS_PTE_DEVMAP */
+
+/*
+ * Check that pte_mksoft_dirty() and pte_clear_soft_dirty() are reflected
+ * by pte_soft_dirty(). Only validated when CONFIG_MEM_SOFT_DIRTY is
+ * enabled.
+ */
+static void __init pte_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
+{
+	pte_t base = pfn_pte(pfn, prot);
+	pte_t marked, cleared;
+
+	if (IS_ENABLED(CONFIG_MEM_SOFT_DIRTY)) {
+		pr_debug("Validating PTE soft dirty\n");
+		marked = pte_mksoft_dirty(base);
+		WARN_ON(!pte_soft_dirty(marked));
+		cleared = pte_clear_soft_dirty(base);
+		WARN_ON(pte_soft_dirty(cleared));
+	}
+}
+
+/*
+ * Check that pte_swp_mksoft_dirty() and pte_swp_clear_soft_dirty() are
+ * reflected by pte_swp_soft_dirty(). Only validated when
+ * CONFIG_MEM_SOFT_DIRTY is enabled.
+ */
+static void __init pte_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
+{
+	pte_t base = pfn_pte(pfn, prot);
+	pte_t marked, cleared;
+
+	if (IS_ENABLED(CONFIG_MEM_SOFT_DIRTY)) {
+		pr_debug("Validating PTE swap soft dirty\n");
+		marked = pte_swp_mksoft_dirty(base);
+		WARN_ON(!pte_swp_soft_dirty(marked));
+		cleared = pte_swp_clear_soft_dirty(base);
+		WARN_ON(pte_swp_soft_dirty(cleared));
+	}
+}
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/*
+ * Check that pmd_mksoft_dirty() and pmd_clear_soft_dirty() are reflected
+ * by pmd_soft_dirty(). Only validated when CONFIG_MEM_SOFT_DIRTY is
+ * enabled.
+ */
+static void __init pmd_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
+{
+	pmd_t base = pfn_pmd(pfn, prot);
+	pmd_t marked, cleared;
+
+	if (IS_ENABLED(CONFIG_MEM_SOFT_DIRTY)) {
+		pr_debug("Validating PMD soft dirty\n");
+		marked = pmd_mksoft_dirty(base);
+		WARN_ON(!pmd_soft_dirty(marked));
+		cleared = pmd_clear_soft_dirty(base);
+		WARN_ON(pmd_soft_dirty(cleared));
+	}
+}
+
+/*
+ * Check that pmd_swp_mksoft_dirty() and pmd_swp_clear_soft_dirty() are
+ * reflected by pmd_swp_soft_dirty(). Only validated when both
+ * CONFIG_MEM_SOFT_DIRTY and CONFIG_ARCH_ENABLE_THP_MIGRATION are enabled.
+ */
+static void __init pmd_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
+{
+	pmd_t base = pfn_pmd(pfn, prot);
+	pmd_t marked, cleared;
+
+	if (IS_ENABLED(CONFIG_MEM_SOFT_DIRTY) &&
+	    IS_ENABLED(CONFIG_ARCH_ENABLE_THP_MIGRATION)) {
+		pr_debug("Validating PMD swap soft dirty\n");
+		marked = pmd_swp_mksoft_dirty(base);
+		WARN_ON(!pmd_swp_soft_dirty(marked));
+		cleared = pmd_swp_clear_soft_dirty(base);
+		WARN_ON(pmd_swp_soft_dirty(cleared));
+	}
+}
+#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
+static void __init pmd_soft_dirty_tests(unsigned long pfn, pgprot_t prot) { } /* no-op stub without THP */
+static void __init pmd_swap_soft_dirty_tests(unsigned long pfn, pgprot_t prot)
+{ /* no-op stub without THP */
+}
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+/*
+ * Check that the pfn survives a PTE -> swap entry -> PTE round trip
+ * through __pte_to_swp_entry() and __swp_entry_to_pte().
+ */
+static void __init pte_swap_tests(unsigned long pfn, pgprot_t prot)
+{
+	swp_entry_t entry;
+	pte_t converted;
+
+	pr_debug("Validating PTE swap\n");
+	converted = pfn_pte(pfn, prot);
+	entry = __pte_to_swp_entry(converted);
+	converted = __swp_entry_to_pte(entry);
+	WARN_ON(pte_pfn(converted) != pfn);
+}
+
+#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION
+/*
+ * Check that the pfn survives a PMD -> swap entry -> PMD round trip
+ * through __pmd_to_swp_entry() and __swp_entry_to_pmd().
+ */
+static void __init pmd_swap_tests(unsigned long pfn, pgprot_t prot)
+{
+	swp_entry_t entry;
+	pmd_t converted;
+
+	pr_debug("Validating PMD swap\n");
+	converted = pfn_pmd(pfn, prot);
+	entry = __pmd_to_swp_entry(converted);
+	converted = __swp_entry_to_pmd(entry);
+	WARN_ON(pmd_pfn(converted) != pfn);
+}
+#else /* !CONFIG_ARCH_ENABLE_THP_MIGRATION */
+static void __init pmd_swap_tests(unsigned long pfn, pgprot_t prot) { } /* no-op stub without THP migration */
+#endif /* CONFIG_ARCH_ENABLE_THP_MIGRATION */
+
+/*
+ * Check the migration swap entry helpers: make_migration_entry(),
+ * make_migration_entry_read(), is_migration_entry() and
+ * is_write_migration_entry().
+ *
+ * Only validated when CONFIG_MIGRATION is enabled. A page allocation
+ * failure is reported with pr_err() and the test is abandoned.
+ */
+static void __init swap_migration_tests(void)
+{
+	struct page *pg;
+	swp_entry_t entry;
+
+	if (!IS_ENABLED(CONFIG_MIGRATION))
+		return;
+
+	pr_debug("Validating swap migration\n");
+	/*
+	 * The page has to be locked before a migration entry is created
+	 * from it. Locking the page that maps kernel text ('start_kernel')
+	 * could be problematic, so use a dedicated page that is allocated
+	 * here and freed again before returning.
+	 */
+	pg = alloc_page(GFP_KERNEL);
+	if (!pg) {
+		pr_err("page allocation failed\n");
+		return;
+	}
+
+	/*
+	 * make_migration_entry() hits a BUG_ON() when it is handed a
+	 * page that is not locked.
+	 */
+	__SetPageLocked(pg);
+
+	/* A write migration entry ... */
+	entry = make_migration_entry(pg, 1);
+	WARN_ON(!is_migration_entry(entry));
+	WARN_ON(!is_write_migration_entry(entry));
+
+	/* ... downgraded in place to a read migration entry. */
+	make_migration_entry_read(&entry);
+	WARN_ON(!is_migration_entry(entry));
+	WARN_ON(is_write_migration_entry(entry));
+
+	/* A migration entry created as read-only from the start. */
+	entry = make_migration_entry(pg, 0);
+	WARN_ON(!is_migration_entry(entry));
+	WARN_ON(is_write_migration_entry(entry));
+
+	__ClearPageLocked(pg);
+	__free_page(pg);
+}
+
+#ifdef CONFIG_HUGETLB_PAGE
+/*
+ * Check the value-only HugeTLB helpers: dirty and write bit
+ * manipulation on an entry from mk_huge_pte() and, with
+ * CONFIG_ARCH_WANT_GENERAL_HUGETLB, pte_mkhuge()/pte_huge().
+ */
+static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot)
+{
+	struct page *pg;
+	pte_t entry;
+
+	pr_debug("Validating HugeTLB basic\n");
+	/*
+	 * The pfn was derived from a real kernel symbol, so looking up
+	 * its struct page is safe here.
+	 */
+	pg = pfn_to_page(pfn);
+	entry = mk_huge_pte(pg, prot);
+
+	WARN_ON(!huge_pte_dirty(huge_pte_mkdirty(entry)));
+	WARN_ON(!huge_pte_write(huge_pte_mkwrite(huge_pte_wrprotect(entry))));
+	WARN_ON(huge_pte_write(huge_pte_wrprotect(huge_pte_mkwrite(entry))));
+
+#ifdef CONFIG_ARCH_WANT_GENERAL_HUGETLB
+	entry = pfn_pte(pfn, prot);
+
+	WARN_ON(!pte_huge(pte_mkhuge(entry)));
+#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
+}
+
+/*
+ * Check the HugeTLB helpers that operate on a live page table slot:
+ * set_huge_pte_at(), huge_pte_clear(), huge_ptep_set_wrprotect(),
+ * huge_ptep_get_and_clear() and huge_ptep_set_access_flags().
+ *
+ * Each step installs an entry at @ptep, applies one helper and then
+ * reads the slot back with huge_ptep_get() to validate the result.
+ */
+static void __init hugetlb_advanced_tests(struct mm_struct *mm,
+					  struct vm_area_struct *vma,
+					  pte_t *ptep, unsigned long pfn,
+					  unsigned long vaddr, pgprot_t prot)
+{
+	struct page *pg = pfn_to_page(pfn);
+	pte_t entry = ptep_get(ptep);
+	unsigned long paddr = __pfn_to_phys(pfn) & PMD_MASK;
+
+	pr_debug("Validating HugeTLB advanced\n");
+
+	/* Set followed by clear, using the PMD_MASK-aligned address. */
+	entry = pte_mkhuge(mk_pte(pfn_to_page(PHYS_PFN(paddr)), prot));
+	set_huge_pte_at(mm, vaddr, ptep, entry);
+	barrier();
+	WARN_ON(!pte_same(entry, huge_ptep_get(ptep)));
+	huge_pte_clear(mm, vaddr, ptep, PMD_SIZE);
+	entry = huge_ptep_get(ptep);
+	WARN_ON(!huge_pte_none(entry));
+
+	/* Write protecting the slot must drop the write bit. */
+	entry = mk_huge_pte(pg, prot);
+	set_huge_pte_at(mm, vaddr, ptep, entry);
+	barrier();
+	huge_ptep_set_wrprotect(mm, vaddr, ptep);
+	entry = huge_ptep_get(ptep);
+	WARN_ON(huge_pte_write(entry));
+
+	/* Get-and-clear must leave the slot empty. */
+	entry = mk_huge_pte(pg, prot);
+	set_huge_pte_at(mm, vaddr, ptep, entry);
+	barrier();
+	huge_ptep_get_and_clear(mm, vaddr, ptep);
+	entry = huge_ptep_get(ptep);
+	WARN_ON(!huge_pte_none(entry));
+
+	/*
+	 * Start from a write protected entry and upgrade it to writable
+	 * and dirty through huge_ptep_set_access_flags().
+	 */
+	entry = mk_huge_pte(pg, prot);
+	entry = huge_pte_wrprotect(entry);
+	set_huge_pte_at(mm, vaddr, ptep, entry);
+	barrier();
+	entry = huge_pte_mkwrite(entry);
+	entry = huge_pte_mkdirty(entry);
+	huge_ptep_set_access_flags(vma, vaddr, ptep, entry, 1);
+	entry = huge_ptep_get(ptep);
+	WARN_ON(!(huge_pte_write(entry) && huge_pte_dirty(entry)));
+}
+#else /* !CONFIG_HUGETLB_PAGE */
+static void __init hugetlb_basic_tests(unsigned long pfn, pgprot_t prot) { } /* no-op stub without HUGETLB_PAGE */
+static void __init hugetlb_advanced_tests(struct mm_struct *mm,
+ struct vm_area_struct *vma,
+ pte_t *ptep, unsigned long pfn,
+ unsigned long vaddr, pgprot_t prot)
+{ /* no-op stub without HUGETLB_PAGE */
+}
+#endif /* CONFIG_HUGETLB_PAGE */
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+/*
+ * Check pmd_trans_huge() on a huge PMD and, unless the architecture
+ * defines __HAVE_ARCH_PMDP_INVALIDATE, that pmd_trans_huge() and
+ * pmd_present() still hold after pmd_mkinvalid().
+ *
+ * Nothing is validated when has_transparent_hugepage() returns false.
+ */
+static void __init pmd_thp_tests(unsigned long pfn, pgprot_t prot)
+{
+	pmd_t base, huge;
+
+	if (!has_transparent_hugepage())
+		return;
+
+	pr_debug("Validating PMD based THP\n");
+	/*
+	 * Both pmd_trans_huge() and pmd_present() have to stay true after
+	 * an MMU invalidation done with pmd_mkinvalid(). This is an
+	 * optimization for transparent huge pages: pmd_trans_huge() must
+	 * be true whenever pmd_page() returns a valid THP, so that others
+	 * walking over non transhuge pmds (i.e. no THP allocated) need
+	 * not take the pmd_lock. In particular, while a THP is being
+	 * split and the present bit is removed from the pmd,
+	 * pmd_trans_huge() still has to return true. pmd_present() should
+	 * be true whenever pmd_trans_huge() is.
+	 */
+	base = pfn_pmd(pfn, prot);
+	huge = pmd_mkhuge(base);
+	WARN_ON(!pmd_trans_huge(huge));
+
+#ifndef __HAVE_ARCH_PMDP_INVALIDATE
+	WARN_ON(!pmd_trans_huge(pmd_mkinvalid(pmd_mkhuge(base))));
+	WARN_ON(!pmd_present(pmd_mkinvalid(pmd_mkhuge(base))));
+#endif /* __HAVE_ARCH_PMDP_INVALIDATE */
+}
+
+#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
+/*
+ * Check that pud_trans_huge() recognises an entry from pud_mkhuge().
+ * Nothing is validated when has_transparent_hugepage() returns false.
+ */
+static void __init pud_thp_tests(unsigned long pfn, pgprot_t prot)
+{
+	pud_t base, huge;
+
+	if (!has_transparent_hugepage())
+		return;
+
+	pr_debug("Validating PUD based THP\n");
+	base = pfn_pud(pfn, prot);
+	huge = pud_mkhuge(base);
+	WARN_ON(!pud_trans_huge(huge));
+
+	/*
+	 * pud_mkinvalid() has been dropped for now. Re-enable the
+	 * checks below once it returns with a modified pud_present().
+	 *
+	 * WARN_ON(!pud_trans_huge(pud_mkinvalid(pud_mkhuge(pud))));
+	 * WARN_ON(!pud_present(pud_mkinvalid(pud_mkhuge(pud))));
+	 */
+}
+#else /* !CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
+static void __init pud_thp_tests(unsigned long pfn, pgprot_t prot) { } /* no-op stub without PUD THP */
+#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
+#else /* !CONFIG_TRANSPARENT_HUGEPAGE */
+static void __init pmd_thp_tests(unsigned long pfn, pgprot_t prot) { } /* no-op stub without THP */
+static void __init pud_thp_tests(unsigned long pfn, pgprot_t prot) { } /* no-op stub without THP */
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+
+/*
+ * Pick a random address between FIRST_USER_ADDRESS (inclusive) and
+ * TASK_SIZE (exclusive), at a whole multiple of PAGE_SIZE above
+ * FIRST_USER_ADDRESS.
+ */
+static unsigned long __init get_random_vaddr(void)
+{
+	unsigned long nr_user_pages, page_idx;
+
+	nr_user_pages = (TASK_SIZE - FIRST_USER_ADDRESS) / PAGE_SIZE;
+	page_idx = get_random_long() % nr_user_pages;
+
+	return FIRST_USER_ADDRESS + page_idx * PAGE_SIZE;
+}
+
+/*
+ * Entry point of the page table helper validation, run as a
+ * late_initcall.
+ *
+ * A private mm_struct and VMA are allocated, page table pages are set
+ * up for one random user address, and the individual pxx_*_tests()
+ * are run against them. All page table pages, the VMA and the
+ * mm_struct are released again before returning.
+ *
+ * Return: 0 when the tests were run, 1 when the mm_struct or the VMA
+ * could not be allocated.
+ */
+static int __init debug_vm_pgtable(void)
+{
+	struct vm_area_struct *vma;
+	struct mm_struct *mm;
+	pgd_t *pgdp;
+	p4d_t *p4dp, *saved_p4dp;
+	pud_t *pudp, *saved_pudp;
+	pmd_t *pmdp, *saved_pmdp, pmd;
+	pte_t *ptep;
+	pgtable_t saved_ptep;
+	pgprot_t prot, protnone;
+	phys_addr_t paddr;
+	unsigned long vaddr, pte_aligned, pmd_aligned;
+	unsigned long pud_aligned, p4d_aligned, pgd_aligned;
+	spinlock_t *ptl = NULL;
+
+	pr_info("Validating architecture page table helpers\n");
+	prot = vm_get_page_prot(VMFLAGS);
+	vaddr = get_random_vaddr();
+	mm = mm_alloc();
+	if (!mm) {
+		pr_err("mm_struct allocation failed\n");
+		return 1;
+	}
+
+	/*
+	 * __P000 (or even __S000) will help create page table entries with
+	 * PROT_NONE permission as required for pxx_protnone_tests().
+	 */
+	protnone = __P000;
+
+	vma = vm_area_alloc(mm);
+	if (!vma) {
+		pr_err("vma allocation failed\n");
+		/* Release the mm_struct allocated above instead of leaking it. */
+		mmdrop(mm);
+		return 1;
+	}
+
+	/*
+	 * PFN for mapping at PTE level is determined from a standard kernel
+	 * text symbol. But pfns for higher page table levels are derived by
+	 * masking lower bits of this real pfn. These derived pfns might not
+	 * exist on the platform but that does not really matter as pfn_pxx()
+	 * helpers will still create appropriate entries for the test. This
+	 * helps avoid large memory block allocations to be used for mapping
+	 * at higher page table levels.
+	 */
+	paddr = __pa_symbol(&start_kernel);
+
+	pte_aligned = (paddr & PAGE_MASK) >> PAGE_SHIFT;
+	pmd_aligned = (paddr & PMD_MASK) >> PAGE_SHIFT;
+	pud_aligned = (paddr & PUD_MASK) >> PAGE_SHIFT;
+	p4d_aligned = (paddr & P4D_MASK) >> PAGE_SHIFT;
+	pgd_aligned = (paddr & PGDIR_MASK) >> PAGE_SHIFT;
+	WARN_ON(!pfn_valid(pte_aligned));
+
+	/*
+	 * NOTE(review): the results of the page table allocations below
+	 * are used without being checked for failure - confirm whether
+	 * that is acceptable at late_initcall time.
+	 */
+	pgdp = pgd_offset(mm, vaddr);
+	p4dp = p4d_alloc(mm, pgdp, vaddr);
+	pudp = pud_alloc(mm, p4dp, vaddr);
+	pmdp = pmd_alloc(mm, pudp, vaddr);
+	ptep = pte_alloc_map_lock(mm, pmdp, vaddr, &ptl);
+
+	/*
+	 * Save all the page table page addresses as the page table
+	 * entries will be used for testing with random or garbage
+	 * values. These saved addresses will be used for freeing
+	 * page table pages.
+	 */
+	pmd = READ_ONCE(*pmdp);
+	saved_p4dp = p4d_offset(pgdp, 0UL);
+	saved_pudp = pud_offset(p4dp, 0UL);
+	saved_pmdp = pmd_offset(pudp, 0UL);
+	saved_ptep = pmd_pgtable(pmd);
+
+	pte_basic_tests(pte_aligned, prot);
+	pmd_basic_tests(pmd_aligned, prot);
+	pud_basic_tests(pud_aligned, prot);
+	p4d_basic_tests(p4d_aligned, prot);
+	pgd_basic_tests(pgd_aligned, prot);
+
+	pte_clear_tests(mm, ptep, vaddr);
+	pmd_clear_tests(mm, pmdp);
+	pud_clear_tests(mm, pudp);
+	p4d_clear_tests(mm, p4dp);
+	pgd_clear_tests(mm, pgdp);
+
+	pte_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
+	pmd_advanced_tests(mm, vma, pmdp, pmd_aligned, vaddr, prot);
+	pud_advanced_tests(mm, vma, pudp, pud_aligned, vaddr, prot);
+	hugetlb_advanced_tests(mm, vma, ptep, pte_aligned, vaddr, prot);
+
+	pmd_leaf_tests(pmd_aligned, prot);
+	pud_leaf_tests(pud_aligned, prot);
+
+	pmd_huge_tests(pmdp, pmd_aligned, prot);
+	pud_huge_tests(pudp, pud_aligned, prot);
+
+	pte_savedwrite_tests(pte_aligned, prot);
+	pmd_savedwrite_tests(pmd_aligned, prot);
+
+	/* The tests that write through ptep are done; drop the PTE lock. */
+	pte_unmap_unlock(ptep, ptl);
+
+	pmd_populate_tests(mm, pmdp, saved_ptep);
+	pud_populate_tests(mm, pudp, saved_pmdp);
+	p4d_populate_tests(mm, p4dp, saved_pudp);
+	pgd_populate_tests(mm, pgdp, saved_p4dp);
+
+	pte_special_tests(pte_aligned, prot);
+	pte_protnone_tests(pte_aligned, protnone);
+	pmd_protnone_tests(pmd_aligned, protnone);
+
+	pte_devmap_tests(pte_aligned, prot);
+	pmd_devmap_tests(pmd_aligned, prot);
+	pud_devmap_tests(pud_aligned, prot);
+
+	pte_soft_dirty_tests(pte_aligned, prot);
+	pmd_soft_dirty_tests(pmd_aligned, prot);
+	pte_swap_soft_dirty_tests(pte_aligned, prot);
+	pmd_swap_soft_dirty_tests(pmd_aligned, prot);
+
+	pte_swap_tests(pte_aligned, prot);
+	pmd_swap_tests(pmd_aligned, prot);
+
+	swap_migration_tests();
+	hugetlb_basic_tests(pte_aligned, prot);
+
+	pmd_thp_tests(pmd_aligned, prot);
+	pud_thp_tests(pud_aligned, prot);
+
+	/* Free the page table pages through the addresses saved earlier. */
+	p4d_free(mm, saved_p4dp);
+	pud_free(mm, saved_pudp);
+	pmd_free(mm, saved_pmdp);
+	pte_free(mm, saved_ptep);
+
+	vm_area_free(vma);
+	mm_dec_nr_puds(mm);
+	mm_dec_nr_pmds(mm);
+	mm_dec_nr_ptes(mm);
+	mmdrop(mm);
+	return 0;
+}
+late_initcall(debug_vm_pgtable);
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 4f17c83db575..0e66f2aaeea3 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -22,6 +22,8 @@
#include <asm/unistd.h>
+#include "internal.h"
+
/*
* POSIX_FADV_WILLNEED could set PG_Referenced, and POSIX_FADV_NOREUSE could
* deactivate the pages and clear PG_Referenced.
@@ -102,10 +104,6 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
if (!nrpages)
nrpages = ~0UL;
- /*
- * Ignore return value because fadvise() shall return
- * success even if filesystem can't retrieve a hint,
- */
force_page_cache_readahead(mapping, file, start_index, nrpages);
break;
case POSIX_FADV_NOREUSE:
diff --git a/mm/filemap.c b/mm/filemap.c
index 23a051a7ef0f..99c49eeae71b 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -41,6 +41,7 @@
#include <linux/delayacct.h>
#include <linux/psi.h>
#include <linux/ramfs.h>
+#include <linux/page_idle.h>
#include "internal.h"
#define CREATE_TRACE_POINTS
@@ -76,16 +77,16 @@
* ->i_mutex
* ->i_mmap_rwsem (truncate->unmap_mapping_range)
*
- * ->mmap_sem
+ * ->mmap_lock
* ->i_mmap_rwsem
* ->page_table_lock or pte_lock (various, mainly in memory.c)
* ->i_pages lock (arch-dependent flush_dcache_mmap_lock)
*
- * ->mmap_sem
+ * ->mmap_lock
* ->lock_page (access_process_vm)
*
* ->i_mutex (generic_perform_write)
- * ->mmap_sem (fault_in_pages_readable->do_page_fault)
+ * ->mmap_lock (fault_in_pages_readable->do_page_fault)
*
* bdi->wb.list_lock
* sb_lock (fs/fs-writeback.c)
@@ -197,11 +198,11 @@ static void unaccount_page_cache_page(struct address_space *mapping,
if (PageHuge(page))
return;
- nr = hpage_nr_pages(page);
+ nr = thp_nr_pages(page);
- __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);
+ __mod_lruvec_page_state(page, NR_FILE_PAGES, -nr);
if (PageSwapBacked(page)) {
- __mod_node_page_state(page_pgdat(page), NR_SHMEM, -nr);
+ __mod_lruvec_page_state(page, NR_SHMEM, -nr);
if (PageTransHuge(page))
__dec_node_page_state(page, NR_SHMEM_THPS);
} else if (PageTransHuge(page)) {
@@ -802,21 +803,22 @@ int replace_page_cache_page(struct page *old, struct page *new, gfp_t gfp_mask)
new->mapping = mapping;
new->index = offset;
+ mem_cgroup_migrate(old, new);
+
xas_lock_irqsave(&xas, flags);
xas_store(&xas, new);
old->mapping = NULL;
/* hugetlb pages do not participate in page cache accounting. */
if (!PageHuge(old))
- __dec_node_page_state(new, NR_FILE_PAGES);
+ __dec_lruvec_page_state(old, NR_FILE_PAGES);
if (!PageHuge(new))
- __inc_node_page_state(new, NR_FILE_PAGES);
+ __inc_lruvec_page_state(new, NR_FILE_PAGES);
if (PageSwapBacked(old))
- __dec_node_page_state(new, NR_SHMEM);
+ __dec_lruvec_page_state(old, NR_SHMEM);
if (PageSwapBacked(new))
- __inc_node_page_state(new, NR_SHMEM);
+ __inc_lruvec_page_state(new, NR_SHMEM);
xas_unlock_irqrestore(&xas, flags);
- mem_cgroup_migrate(old, new);
if (freepage)
freepage(old);
put_page(old);
@@ -832,7 +834,6 @@ static int __add_to_page_cache_locked(struct page *page,
{
XA_STATE(xas, &mapping->i_pages, offset);
int huge = PageHuge(page);
- struct mem_cgroup *memcg;
int error;
void *old;
@@ -840,17 +841,16 @@ static int __add_to_page_cache_locked(struct page *page,
VM_BUG_ON_PAGE(PageSwapBacked(page), page);
mapping_set_update(&xas, mapping);
- if (!huge) {
- error = mem_cgroup_try_charge(page, current->mm,
- gfp_mask, &memcg, false);
- if (error)
- return error;
- }
-
get_page(page);
page->mapping = mapping;
page->index = offset;
+ if (!huge) {
+ error = mem_cgroup_charge(page, current->mm, gfp_mask);
+ if (error)
+ goto error;
+ }
+
do {
xas_lock_irq(&xas);
old = xas_load(&xas);
@@ -869,25 +869,23 @@ static int __add_to_page_cache_locked(struct page *page,
/* hugetlb pages do not participate in page cache accounting */
if (!huge)
- __inc_node_page_state(page, NR_FILE_PAGES);
+ __inc_lruvec_page_state(page, NR_FILE_PAGES);
unlock:
xas_unlock_irq(&xas);
} while (xas_nomem(&xas, gfp_mask & GFP_RECLAIM_MASK));
- if (xas_error(&xas))
+ if (xas_error(&xas)) {
+ error = xas_error(&xas);
goto error;
+ }
- if (!huge)
- mem_cgroup_commit_charge(page, memcg, false, false);
trace_mm_filemap_add_to_page_cache(page);
return 0;
error:
page->mapping = NULL;
/* Leave page->index set: truncation relies upon it */
- if (!huge)
- mem_cgroup_cancel_charge(page, memcg, false);
put_page(page);
- return xas_error(&xas);
+ return error;
}
ALLOW_ERROR_INJECTION(__add_to_page_cache_locked, ERRNO);
@@ -990,44 +988,89 @@ void __init pagecache_init(void)
page_writeback_init();
}
-/* This has the same layout as wait_bit_key - see fs/cachefiles/rdwr.c */
-struct wait_page_key {
- struct page *page;
- int bit_nr;
- int page_match;
-};
-
-struct wait_page_queue {
- struct page *page;
- int bit_nr;
- wait_queue_entry_t wait;
-};
-
+/*
+ * The page wait code treats the "wait->flags" somewhat unusually, because
+ * we have multiple different kinds of waits, not just the usual "exclusive"
+ * one.
+ *
+ * We have:
+ *
+ * (a) no special bits set:
+ *
+ * We're just waiting for the bit to be released, and when a waker
+ * calls the wakeup function, we set WQ_FLAG_WOKEN and wake it up,
+ * and remove it from the wait queue.
+ *
+ * Simple and straightforward.
+ *
+ * (b) WQ_FLAG_EXCLUSIVE:
+ *
+ * The waiter is waiting to get the lock, and only one waiter should
+ * be woken up to avoid any thundering herd behavior. We'll set the
+ * WQ_FLAG_WOKEN bit, wake it up, and remove it from the wait queue.
+ *
+ * This is the traditional exclusive wait.
+ *
+ * (c) WQ_FLAG_EXCLUSIVE | WQ_FLAG_CUSTOM:
+ *
+ * The waiter is waiting to get the bit, and additionally wants the
+ * lock to be transferred to it for fair lock behavior. If the lock
+ * cannot be taken, we stop walking the wait queue without waking
+ * the waiter.
+ *
+ * This is the "fair lock handoff" case, and in addition to setting
+ * WQ_FLAG_WOKEN, we set WQ_FLAG_DONE to let the waiter easily see
+ * that it now has the lock.
+ */
static int wake_page_function(wait_queue_entry_t *wait, unsigned mode, int sync, void *arg)
{
+ unsigned int flags;
struct wait_page_key *key = arg;
struct wait_page_queue *wait_page
= container_of(wait, struct wait_page_queue, wait);
- if (wait_page->page != key->page)
- return 0;
- key->page_match = 1;
-
- if (wait_page->bit_nr != key->bit_nr)
+ if (!wake_page_match(wait_page, key))
return 0;
/*
- * Stop walking if it's locked.
- * Is this safe if put_and_wait_on_page_locked() is in use?
- * Yes: the waker must hold a reference to this page, and if PG_locked
- * has now already been set by another task, that task must also hold
- * a reference to the *same usage* of this page; so there is no need
- * to walk on to wake even the put_and_wait_on_page_locked() callers.
+ * If it's a lock handoff wait, we get the bit for it, and
+ * stop walking (and do not wake it up) if we can't.
+ */
+ flags = wait->flags;
+ if (flags & WQ_FLAG_EXCLUSIVE) {
+ if (test_bit(key->bit_nr, &key->page->flags))
+ return -1;
+ if (flags & WQ_FLAG_CUSTOM) {
+ if (test_and_set_bit(key->bit_nr, &key->page->flags))
+ return -1;
+ flags |= WQ_FLAG_DONE;
+ }
+ }
+
+ /*
+ * We are holding the wait-queue lock, but the waiter that
+ * is waiting for this will be checking the flags without
+ * any locking.
+ *
+ * So update the flags atomically, and wake up the waiter
+ * afterwards to avoid any races. This store-release pairs
+ * with the load-acquire in wait_on_page_bit_common().
*/
- if (test_bit(key->bit_nr, &key->page->flags))
- return -1;
+ smp_store_release(&wait->flags, flags | WQ_FLAG_WOKEN);
+ wake_up_state(wait->private, mode);
- return autoremove_wake_function(wait, mode, sync, key);
+ /*
+ * Ok, we have successfully done what we're waiting for,
+ * and we can unconditionally remove the wait entry.
+ *
+ * Note that this pairs with the "finish_wait()" in the
+ * waiter, and has to be the absolute last thing we do.
+ * After this list_del_init(&wait->entry) the wait entry
+ * might be de-allocated and the process might even have
+ * exited.
+ */
+ list_del_init_careful(&wait->entry);
+ return (flags & WQ_FLAG_EXCLUSIVE) != 0;
}
static void wake_up_page_bit(struct page *page, int bit_nr)
@@ -1106,16 +1149,35 @@ enum behavior {
*/
};
+/*
+ * Try to check the page bit (non-exclusive waiter) or to take it
+ * (WQ_FLAG_EXCLUSIVE waiter).
+ *
+ * On success WQ_FLAG_WOKEN and WQ_FLAG_DONE are set in wait->flags and
+ * true is returned. If the bit is found set, false is returned and
+ * wait->flags is left untouched.
+ */
+static inline bool trylock_page_bit_common(struct page *page, int bit_nr,
+					struct wait_queue_entry *wait)
+{
+	bool busy;
+
+	if (wait->flags & WQ_FLAG_EXCLUSIVE)
+		busy = test_and_set_bit(bit_nr, &page->flags);
+	else
+		busy = test_bit(bit_nr, &page->flags);
+
+	if (busy)
+		return false;
+
+	wait->flags |= WQ_FLAG_WOKEN | WQ_FLAG_DONE;
+	return true;
+}
+
+/* How many times do we accept lock stealing from under a waiter? */
+int sysctl_page_lock_unfairness = 5;
+
static inline int wait_on_page_bit_common(wait_queue_head_t *q,
struct page *page, int bit_nr, int state, enum behavior behavior)
{
+ int unfairness = sysctl_page_lock_unfairness;
struct wait_page_queue wait_page;
wait_queue_entry_t *wait = &wait_page.wait;
- bool bit_is_set;
bool thrashing = false;
bool delayacct = false;
unsigned long pflags;
- int ret = 0;
if (bit_nr == PG_locked &&
!PageUptodate(page) && PageWorkingset(page)) {
@@ -1128,55 +1190,97 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
}
init_wait(wait);
- wait->flags = behavior == EXCLUSIVE ? WQ_FLAG_EXCLUSIVE : 0;
wait->func = wake_page_function;
wait_page.page = page;
wait_page.bit_nr = bit_nr;
- for (;;) {
- spin_lock_irq(&q->lock);
-
- if (likely(list_empty(&wait->entry))) {
- __add_wait_queue_entry_tail(q, wait);
- SetPageWaiters(page);
- }
+repeat:
+ wait->flags = 0;
+ if (behavior == EXCLUSIVE) {
+ wait->flags = WQ_FLAG_EXCLUSIVE;
+ if (--unfairness < 0)
+ wait->flags |= WQ_FLAG_CUSTOM;
+ }
- set_current_state(state);
+ /*
+ * Do one last check whether we can get the
+ * page bit synchronously.
+ *
+ * Do the SetPageWaiters() marking before that
+ * to let any waker we _just_ missed know they
+ * need to wake us up (otherwise they'll never
+ * even go to the slow case that looks at the
+ * page queue), and add ourselves to the wait
+ * queue if we need to sleep.
+ *
+ * This part needs to be done under the queue
+ * lock to avoid races.
+ */
+ spin_lock_irq(&q->lock);
+ SetPageWaiters(page);
+ if (!trylock_page_bit_common(page, bit_nr, wait))
+ __add_wait_queue_entry_tail(q, wait);
+ spin_unlock_irq(&q->lock);
- spin_unlock_irq(&q->lock);
+ /*
+ * From now on, all the logic will be based on
+ * the WQ_FLAG_WOKEN and WQ_FLAG_DONE flag, to
+ * see whether the page bit testing has already
+ * been done by the wake function.
+ *
+ * We can drop our reference to the page.
+ */
+ if (behavior == DROP)
+ put_page(page);
- bit_is_set = test_bit(bit_nr, &page->flags);
- if (behavior == DROP)
- put_page(page);
+ /*
+ * Note that until the "finish_wait()", or until
+ * we see the WQ_FLAG_WOKEN flag, we need to
+ * be very careful with the 'wait->flags', because
+ * we may race with a waker that sets them.
+ */
+ for (;;) {
+ unsigned int flags;
- if (likely(bit_is_set))
- io_schedule();
+ set_current_state(state);
- if (behavior == EXCLUSIVE) {
- if (!test_and_set_bit_lock(bit_nr, &page->flags))
- break;
- } else if (behavior == SHARED) {
- if (!test_bit(bit_nr, &page->flags))
+ /* Loop until we've been woken or interrupted */
+ flags = smp_load_acquire(&wait->flags);
+ if (!(flags & WQ_FLAG_WOKEN)) {
+ if (signal_pending_state(state, current))
break;
+
+ io_schedule();
+ continue;
}
- if (signal_pending_state(state, current)) {
- ret = -EINTR;
+ /* If we were non-exclusive, we're done */
+ if (behavior != EXCLUSIVE)
break;
- }
- if (behavior == DROP) {
- /*
- * We can no longer safely access page->flags:
- * even if CONFIG_MEMORY_HOTREMOVE is not enabled,
- * there is a risk of waiting forever on a page reused
- * for something that keeps it locked indefinitely.
- * But best check for -EINTR above before breaking.
- */
+ /* If the waker got the lock for us, we're done */
+ if (flags & WQ_FLAG_DONE)
break;
- }
+
+ /*
+ * Otherwise, if we're getting the lock, we need to
+ * try to get it ourselves.
+ *
+ * And if that fails, we'll have to retry this all.
+ */
+ if (unlikely(test_and_set_bit(bit_nr, &page->flags)))
+ goto repeat;
+
+ wait->flags |= WQ_FLAG_DONE;
+ break;
}
+ /*
+ * If a signal happened, this 'finish_wait()' may remove the last
+ * waiter from the wait-queues, but the PageWaiters bit will remain
+ * set. That's ok. The next wakeup will take care of it, and trying
+ * to do it here would be difficult and prone to races.
+ */
finish_wait(q, wait);
if (thrashing) {
@@ -1186,14 +1290,22 @@ static inline int wait_on_page_bit_common(wait_queue_head_t *q,
}
/*
- * A signal could leave PageWaiters set. Clearing it here if
- * !waitqueue_active would be possible (by open-coding finish_wait),
- * but still fail to catch it in the case of wait hash collision. We
- * already can fail to clear wait hash collision cases, so don't
- * bother with signals either.
+ * NOTE! The wait->flags weren't stable until we've done the
+ * 'finish_wait()', and we could have exited the loop above due
+ * to a signal, and had a wakeup event happen after the signal
+ * test but before the 'finish_wait()'.
+ *
+ * So only after the finish_wait() can we reliably determine
+ * if we got woken up or not, so we can now figure out the final
+ * return value based on that state without races.
+ *
+ * Also note that WQ_FLAG_WOKEN is sufficient for a non-exclusive
+ * waiter, but an exclusive one requires WQ_FLAG_DONE.
*/
+ if (behavior == EXCLUSIVE)
+ return wait->flags & WQ_FLAG_DONE ? 0 : -EINTR;
- return ret;
+ return wait->flags & WQ_FLAG_WOKEN ? 0 : -EINTR;
}
void wait_on_page_bit(struct page *page, int bit_nr)
@@ -1210,6 +1322,44 @@ int wait_on_page_bit_killable(struct page *page, int bit_nr)
}
EXPORT_SYMBOL(wait_on_page_bit_killable);
+/*
+ * Queue @wait on the wait queue of @page for PG_locked, then either try
+ * to take the page lock (@set true) or just test it (@set false), all
+ * under the wait queue lock.
+ *
+ * Return: 0 if the lock was taken (@set) or the page was found unlocked
+ * (!@set); @wait has been removed from the queue again in that case.
+ * -EIOCBQUEUED if the page is still locked; @wait stays queued.
+ */
+static int __wait_on_page_locked_async(struct page *page,
+				       struct wait_page_queue *wait, bool set)
+{
+	struct wait_queue_head *head = page_waitqueue(page);
+	int still_locked;
+	int ret = 0;
+
+	wait->page = page;
+	wait->bit_nr = PG_locked;
+
+	spin_lock_irq(&head->lock);
+	__add_wait_queue_entry_tail(head, &wait->wait);
+	SetPageWaiters(page);
+	still_locked = set ? !trylock_page(page) : PageLocked(page);
+	if (still_locked) {
+		ret = -EIOCBQUEUED;
+	} else {
+		/*
+		 * We were successful, and since the queue lock is still
+		 * held we know the entry is still on the waitqueue and
+		 * the callback is not going to trigger. It is therefore
+		 * safe to remove the entry and report success.
+		 */
+		__remove_wait_queue(head, &wait->wait);
+	}
+	spin_unlock_irq(&head->lock);
+	return ret;
+}
+
+/*
+ * Return 0 straight away if @page is not locked. Otherwise queue @wait
+ * on the compound head of @page without trying to take the lock, and
+ * return the result of __wait_on_page_locked_async().
+ */
+static int wait_on_page_locked_async(struct page *page,
+				     struct wait_page_queue *wait)
+{
+	if (PageLocked(page))
+		return __wait_on_page_locked_async(compound_head(page),
+						   wait, false);
+
+	return 0;
+}
+
/**
* put_and_wait_on_page_locked - Drop a reference and wait for it to be unlocked
* @page: The page to wait for.
@@ -1259,7 +1409,7 @@ EXPORT_SYMBOL_GPL(add_page_wait_queue);
* instead.
*
* The read of PG_waiters has to be after (or concurrently with) PG_locked
- * being cleared, but a memory barrier should be unneccssary since it is
+ * being cleared, but a memory barrier should be unnecessary since it is
* in the same byte as PG_locked.
*/
static inline bool clear_bit_unlock_is_negative_byte(long nr, volatile void *mem)
@@ -1372,29 +1522,34 @@ int __lock_page_killable(struct page *__page)
}
EXPORT_SYMBOL_GPL(__lock_page_killable);
+/*
+ * Try to lock @page, queueing @wait if the lock cannot be taken.
+ * Returns whatever __wait_on_page_locked_async() returns when asked to
+ * take the lock.
+ */
+int __lock_page_async(struct page *page, struct wait_page_queue *wait)
+{
+	const bool take_lock = true;
+
+	return __wait_on_page_locked_async(page, wait, take_lock);
+}
+
/*
* Return values:
- * 1 - page is locked; mmap_sem is still held.
+ * 1 - page is locked; mmap_lock is still held.
* 0 - page is not locked.
- * mmap_sem has been released (up_read()), unless flags had both
+ * mmap_lock has been released (mmap_read_unlock()), unless flags had both
* FAULT_FLAG_ALLOW_RETRY and FAULT_FLAG_RETRY_NOWAIT set, in
- * which case mmap_sem is still held.
+ * which case mmap_lock is still held.
*
* If neither ALLOW_RETRY nor KILLABLE are set, will always return 1
- * with the page locked and the mmap_sem unperturbed.
+ * with the page locked and the mmap_lock unperturbed.
*/
int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
unsigned int flags)
{
if (fault_flag_allow_retry_first(flags)) {
/*
- * CAUTION! In this case, mmap_sem is not released
+ * CAUTION! In this case, mmap_lock is not released
* even though return 0.
*/
if (flags & FAULT_FLAG_RETRY_NOWAIT)
return 0;
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
if (flags & FAULT_FLAG_KILLABLE)
wait_on_page_locked_killable(page);
else
@@ -1406,7 +1561,7 @@ int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
ret = __lock_page_killable(page);
if (ret) {
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
return 0;
}
} else
@@ -1592,6 +1747,9 @@ EXPORT_SYMBOL(find_lock_entry);
* * %FGP_FOR_MMAP - The caller wants to do its own locking dance if the
* page is already in cache. If the page was allocated, unlock it before
* returning so the caller can do the same dance.
+ * * %FGP_WRITE - The page will be written
+ * * %FGP_NOFS - __GFP_FS will get cleared in gfp mask
+ * * %FGP_NOWAIT - Don't get blocked by page lock
*
* If %FGP_LOCK or %FGP_CREAT are specified then the function may sleep even
* if the %GFP flags specified for %FGP_CREAT are atomic.
@@ -1633,6 +1791,11 @@ repeat:
if (fgp_flags & FGP_ACCESSED)
mark_page_accessed(page);
+ else if (fgp_flags & FGP_WRITE) {
+ /* Clear idle flag for buffer write */
+ if (page_is_idle(page))
+ clear_page_idle(page);
+ }
no_page:
if (!page && (fgp_flags & FGP_CREAT)) {
@@ -1991,7 +2154,7 @@ static void shrink_readahead_size_eio(struct file_ra_state *ra)
* * total number of bytes copied, including those that were already @written
* * negative error code if nothing was copied
*/
-static ssize_t generic_file_buffered_read(struct kiocb *iocb,
+ssize_t generic_file_buffered_read(struct kiocb *iocb,
struct iov_iter *iter, ssize_t written)
{
struct file *filp = iocb->ki_filp;
@@ -2031,7 +2194,7 @@ find_page:
page = find_get_page(mapping, index);
if (!page) {
- if (iocb->ki_flags & IOCB_NOWAIT)
+ if (iocb->ki_flags & IOCB_NOIO)
goto would_block;
page_cache_sync_readahead(mapping,
ra, filp,
@@ -2041,22 +2204,34 @@ find_page:
goto no_cached_page;
}
if (PageReadahead(page)) {
+ if (iocb->ki_flags & IOCB_NOIO) {
+ put_page(page);
+ goto out;
+ }
page_cache_async_readahead(mapping,
ra, filp, page,
index, last_index - index);
}
if (!PageUptodate(page)) {
- if (iocb->ki_flags & IOCB_NOWAIT) {
- put_page(page);
- goto would_block;
- }
-
/*
* See comment in do_read_cache_page on why
* wait_on_page_locked is used to avoid unnecessary
* serialisations and why it's safe.
*/
- error = wait_on_page_locked_killable(page);
+ if (iocb->ki_flags & IOCB_WAITQ) {
+ if (written) {
+ put_page(page);
+ goto out;
+ }
+ error = wait_on_page_locked_async(page,
+ iocb->ki_waitq);
+ } else {
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ put_page(page);
+ goto would_block;
+ }
+ error = wait_on_page_locked_killable(page);
+ }
if (unlikely(error))
goto readpage_error;
if (PageUptodate(page))
@@ -2144,7 +2319,10 @@ page_ok:
page_not_up_to_date:
/* Get exclusive access to the page ... */
- error = lock_page_killable(page);
+ if (iocb->ki_flags & IOCB_WAITQ)
+ error = lock_page_async(page, iocb->ki_waitq);
+ else
+ error = lock_page_killable(page);
if (unlikely(error))
goto readpage_error;
@@ -2163,6 +2341,11 @@ page_not_up_to_date_locked:
}
readpage:
+ if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT)) {
+ unlock_page(page);
+ put_page(page);
+ goto would_block;
+ }
/*
* A previous I/O error may have been due to temporary
* failures, eg. multipath errors.
@@ -2182,7 +2365,11 @@ readpage:
}
if (!PageUptodate(page)) {
- error = lock_page_killable(page);
+ if (iocb->ki_flags & IOCB_WAITQ)
+ error = lock_page_async(page, iocb->ki_waitq);
+ else
+ error = lock_page_killable(page);
+
if (unlikely(error))
goto readpage_error;
if (!PageUptodate(page)) {
@@ -2243,6 +2430,7 @@ out:
file_accessed(filp);
return written ? written : error;
}
+EXPORT_SYMBOL_GPL(generic_file_buffered_read);
/**
* generic_file_read_iter - generic filesystem read routine
@@ -2251,9 +2439,19 @@ out:
*
* This is the "read_iter()" routine for all filesystems
* that can use the page cache directly.
+ *
+ * The IOCB_NOWAIT flag in iocb->ki_flags indicates that -EAGAIN shall
+ * be returned when no data can be read without waiting for I/O requests
+ * to complete; it doesn't prevent readahead.
+ *
+ * The IOCB_NOIO flag in iocb->ki_flags indicates that no new I/O
+ * requests shall be made for the read or for readahead. When no data
+ * can be read, -EAGAIN shall be returned. When readahead would be
+ * triggered, a partial, possibly empty read shall be returned.
+ *
* Return:
* * number of bytes copied, even for partial reads
- * * negative error code if nothing was read
+ * * negative error code (or 0 if IOCB_NOIO) if nothing was read
*/
ssize_t
generic_file_read_iter(struct kiocb *iocb, struct iov_iter *iter)
@@ -2315,14 +2513,14 @@ EXPORT_SYMBOL(generic_file_read_iter);
#ifdef CONFIG_MMU
#define MMAP_LOTSAMISS (100)
/*
- * lock_page_maybe_drop_mmap - lock the page, possibly dropping the mmap_sem
+ * lock_page_maybe_drop_mmap - lock the page, possibly dropping the mmap_lock
* @vmf - the vm_fault for this fault.
* @page - the page to lock.
* @fpin - the pointer to the file we may pin (or is already pinned).
*
- * This works similar to lock_page_or_retry in that it can drop the mmap_sem.
+ * This works similar to lock_page_or_retry in that it can drop the mmap_lock.
* It differs in that it actually returns the page locked if it returns 1 and 0
- * if it couldn't lock the page. If we did have to drop the mmap_sem then fpin
+ * if it couldn't lock the page. If we did have to drop the mmap_lock then fpin
* will point to the pinned file and needs to be fput()'ed at a later point.
*/
static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page,
@@ -2333,7 +2531,7 @@ static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page,
/*
* NOTE! This will make us return with VM_FAULT_RETRY, but with
- * the mmap_sem still held. That's how FAULT_FLAG_RETRY_NOWAIT
+ * the mmap_lock still held. That's how FAULT_FLAG_RETRY_NOWAIT
* is supposed to work. We have way too many special cases..
*/
if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT)
@@ -2343,13 +2541,13 @@ static int lock_page_maybe_drop_mmap(struct vm_fault *vmf, struct page *page,
if (vmf->flags & FAULT_FLAG_KILLABLE) {
if (__lock_page_killable(page)) {
/*
- * We didn't have the right flags to drop the mmap_sem,
+ * We didn't have the right flags to drop the mmap_lock,
* but all fault_handlers only check for fatal signals
* if we return VM_FAULT_RETRY, so we need to drop the
- * mmap_sem here and return 0 if we don't have a fpin.
+ * mmap_lock here and return 0 if we don't have a fpin.
*/
if (*fpin == NULL)
- up_read(&vmf->vma->vm_mm->mmap_sem);
+ mmap_read_unlock(vmf->vma->vm_mm);
return 0;
}
} else
@@ -2372,6 +2570,7 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
struct address_space *mapping = file->f_mapping;
struct file *fpin = NULL;
pgoff_t offset = vmf->pgoff;
+ unsigned int mmap_miss;
/* If we don't want any read-ahead, don't bother */
if (vmf->vma->vm_flags & VM_RAND_READ)
@@ -2387,14 +2586,15 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
}
/* Avoid banging the cache line if not needed */
- if (ra->mmap_miss < MMAP_LOTSAMISS * 10)
- ra->mmap_miss++;
+ mmap_miss = READ_ONCE(ra->mmap_miss);
+ if (mmap_miss < MMAP_LOTSAMISS * 10)
+ WRITE_ONCE(ra->mmap_miss, ++mmap_miss);
/*
* Do we miss much more than hit in this file? If so,
* stop bothering with read-ahead. It will only hurt.
*/
- if (ra->mmap_miss > MMAP_LOTSAMISS)
+ if (mmap_miss > MMAP_LOTSAMISS)
return fpin;
/*
@@ -2411,7 +2611,7 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
/*
* Asynchronous readahead happens when we find the page and PG_readahead,
* so we want to possibly extend the readahead further. We return the file that
- * was pinned if we have to drop the mmap_sem in order to do IO.
+ * was pinned if we have to drop the mmap_lock in order to do IO.
*/
static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
struct page *page)
@@ -2420,13 +2620,15 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
struct file_ra_state *ra = &file->f_ra;
struct address_space *mapping = file->f_mapping;
struct file *fpin = NULL;
+ unsigned int mmap_miss;
pgoff_t offset = vmf->pgoff;
/* If we don't want any read-ahead, don't bother */
if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages)
return fpin;
- if (ra->mmap_miss > 0)
- ra->mmap_miss--;
+ mmap_miss = READ_ONCE(ra->mmap_miss);
+ if (mmap_miss)
+ WRITE_ONCE(ra->mmap_miss, --mmap_miss);
if (PageReadahead(page)) {
fpin = maybe_unlock_mmap_for_io(vmf, fpin);
page_cache_async_readahead(mapping, ra, file,
@@ -2446,12 +2648,12 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
* it in the page cache, and handles the special cases reasonably without
* having a lot of duplicated code.
*
- * vma->vm_mm->mmap_sem must be held on entry.
+ * vma->vm_mm->mmap_lock must be held on entry.
*
- * If our return value has VM_FAULT_RETRY set, it's because the mmap_sem
+ * If our return value has VM_FAULT_RETRY set, it's because the mmap_lock
* may be dropped before doing I/O or by lock_page_maybe_drop_mmap().
*
- * If our return value does not have VM_FAULT_RETRY set, the mmap_sem
+ * If our return value does not have VM_FAULT_RETRY set, the mmap_lock
* has not been released.
*
* We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set.
@@ -2521,7 +2723,7 @@ retry_find:
goto page_not_uptodate;
/*
- * We've made it this far and we had to drop our mmap_sem, now is the
+ * We've made it this far and we had to drop our mmap_lock, now is the
* time to return to the upper layer and have it re-find the vma and
* redo the fault.
*/
@@ -2566,13 +2768,12 @@ page_not_uptodate:
if (!error || error == AOP_TRUNCATED_PAGE)
goto retry_find;
- /* Things didn't work out. Return zero to tell the mm layer so. */
shrink_readahead_size_eio(ra);
return VM_FAULT_SIGBUS;
out_retry:
/*
- * We dropped the mmap_sem, we need to return to the fault handler to
+ * We dropped the mmap_lock, we need to return to the fault handler to
* re-find the vma and come back and find our hopefully still populated
* page.
*/
@@ -2593,6 +2794,7 @@ void filemap_map_pages(struct vm_fault *vmf,
unsigned long max_idx;
XA_STATE(xas, &mapping->i_pages, start_pgoff);
struct page *page;
+ unsigned int mmap_miss = READ_ONCE(file->f_ra.mmap_miss);
rcu_read_lock();
xas_for_each(&xas, page, end_pgoff) {
@@ -2629,14 +2831,14 @@ void filemap_map_pages(struct vm_fault *vmf,
if (page->index >= max_idx)
goto unlock;
- if (file->f_ra.mmap_miss > 0)
- file->f_ra.mmap_miss--;
+ if (mmap_miss > 0)
+ mmap_miss--;
vmf->address += (xas.xa_index - last_pgoff) << PAGE_SHIFT;
if (vmf->pte)
vmf->pte += xas.xa_index - last_pgoff;
last_pgoff = xas.xa_index;
- if (alloc_set_pte(vmf, NULL, page))
+ if (alloc_set_pte(vmf, page))
goto unlock;
unlock_page(page);
goto next;
@@ -2650,6 +2852,7 @@ next:
break;
}
rcu_read_unlock();
+ WRITE_ONCE(file->f_ra.mmap_miss, mmap_miss);
}
EXPORT_SYMBOL(filemap_map_pages);
@@ -2790,7 +2993,7 @@ filler:
* Case a, the page will be up to date when the page is unlocked.
* There is no need to serialise on the page lock here as the page
* is pinned so the lock gives no additional protection. Even if the
- * the page is truncated, the data is still valid if PageUptodate as
+ * page is truncated, the data is still valid if PageUptodate as
* it's a read vs truncate race.
* Case b, the page will not be up to date
* Case c, the page may be truncated but in itself, the data may still
diff --git a/mm/frame_vector.c b/mm/frame_vector.c
index c431ca81dad5..10f82d5643b6 100644
--- a/mm/frame_vector.c
+++ b/mm/frame_vector.c
@@ -29,7 +29,7 @@
* different type underlying the specified range of virtual addresses.
* When the function isn't able to map a single page, it returns error.
*
- * This function takes care of grabbing mmap_sem as necessary.
+ * This function takes care of grabbing mmap_lock as necessary.
*/
int get_vaddr_frames(unsigned long start, unsigned int nr_frames,
unsigned int gup_flags, struct frame_vector *vec)
@@ -48,7 +48,7 @@ int get_vaddr_frames(unsigned long start, unsigned int nr_frames,
start = untagged_addr(start);
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
locked = 1;
vma = find_vma_intersection(mm, start, start + 1);
if (!vma) {
@@ -72,7 +72,7 @@ int get_vaddr_frames(unsigned long start, unsigned int nr_frames,
if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) {
vec->got_ref = true;
vec->is_pfns = false;
- ret = get_user_pages_locked(start, nr_frames,
+ ret = pin_user_pages_locked(start, nr_frames,
gup_flags, (struct page **)(vec->ptrs), &locked);
goto out;
}
@@ -102,7 +102,7 @@ int get_vaddr_frames(unsigned long start, unsigned int nr_frames,
} while (vma && vma->vm_flags & (VM_IO | VM_PFNMAP));
out:
if (locked)
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
if (!ret)
ret = -EFAULT;
if (ret > 0)
@@ -122,7 +122,6 @@ EXPORT_SYMBOL(get_vaddr_frames);
*/
void put_vaddr_frames(struct frame_vector *vec)
{
- int i;
struct page **pages;
if (!vec->got_ref)
@@ -135,8 +134,8 @@ void put_vaddr_frames(struct frame_vector *vec)
*/
if (WARN_ON(IS_ERR(pages)))
goto out;
- for (i = 0; i < vec->nr_frames; i++)
- put_page(pages[i]);
+
+ unpin_user_pages(pages, vec->nr_frames);
vec->got_ref = false;
out:
vec->nr_frames = 0;
diff --git a/mm/frontswap.c b/mm/frontswap.c
index 60bb20e8a951..2183a56c7874 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -61,16 +61,16 @@ static u64 frontswap_failed_stores;
static u64 frontswap_invalidates;
static inline void inc_frontswap_loads(void) {
- frontswap_loads++;
+ data_race(frontswap_loads++);
}
static inline void inc_frontswap_succ_stores(void) {
- frontswap_succ_stores++;
+ data_race(frontswap_succ_stores++);
}
static inline void inc_frontswap_failed_stores(void) {
- frontswap_failed_stores++;
+ data_race(frontswap_failed_stores++);
}
static inline void inc_frontswap_invalidates(void) {
- frontswap_invalidates++;
+ data_race(frontswap_invalidates++);
}
#else
static inline void inc_frontswap_loads(void) { }
@@ -87,7 +87,7 @@ static inline void inc_frontswap_invalidates(void) { }
*
* This would not guard us against the user deciding to call swapoff right as
* we are calling the backend to initialize (so swapon is in action).
- * Fortunatly for us, the swapon_mutex has been taked by the callee so we are
+ * Fortunately for us, the swapon_mutex has been taken by the callee so we are
* OK. The other scenario where calls to frontswap_store (called via
* swap_writepage) is racing with frontswap_invalidate_area (called via
* swapoff) is again guarded by the swap subsystem.
@@ -413,8 +413,8 @@ static int __frontswap_unuse_pages(unsigned long total, unsigned long *unused,
}
/*
- * Used to check if it's necessory and feasible to unuse pages.
- * Return 1 when nothing to do, 0 when need to shink pages,
+ * Used to check if it's necessary and feasible to unuse pages.
+ * Return 1 when nothing to do, 0 when need to shrink pages,
* error code when there is an error.
*/
static int __frontswap_shrink(unsigned long target_pages,
@@ -446,7 +446,7 @@ static int __frontswap_shrink(unsigned long target_pages,
void frontswap_shrink(unsigned long target_pages)
{
unsigned long pages_to_unuse = 0;
- int uninitialized_var(type), ret;
+ int type, ret;
/*
* we don't want to hold swap_lock while doing a very
diff --git a/mm/gup.c b/mm/gup.c
index 87a6a59fe667..e869c634cc9a 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -19,7 +19,6 @@
#include <linux/sched/mm.h>
#include <asm/mmu_context.h>
-#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include "internal.h"
@@ -584,7 +583,7 @@ retry:
pmdval = READ_ONCE(*pmd);
/*
* MADV_DONTNEED may convert the pmd to null because
- * mmap_sem is held in read mode
+ * mmap_lock is held in read mode
*/
if (pmd_none(pmdval))
return no_page_table(vma, flags);
@@ -835,7 +834,7 @@ static int get_gate_page(struct mm_struct *mm, unsigned long address,
goto unmap;
*page = pte_page(*pte);
}
- if (unlikely(!try_get_page(*page))) {
+ if (unlikely(!try_grab_page(*page, gup_flags))) {
ret = -ENOMEM;
goto unmap;
}
@@ -847,11 +846,11 @@ unmap:
}
/*
- * mmap_sem must be held on entry. If @locked != NULL and *@flags
- * does not include FOLL_NOWAIT, the mmap_sem may be released. If it
+ * mmap_lock must be held on entry. If @locked != NULL and *@flags
+ * does not include FOLL_NOWAIT, the mmap_lock may be released. If it
* is, *@locked will be set to 0 and -EBUSY returned.
*/
-static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
+static int faultin_page(struct vm_area_struct *vma,
unsigned long address, unsigned int *flags, int *locked)
{
unsigned int fault_flags = 0;
@@ -876,7 +875,7 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
fault_flags |= FAULT_FLAG_TRIED;
}
- ret = handle_mm_fault(vma, address, fault_flags);
+ ret = handle_mm_fault(vma, address, fault_flags, NULL);
if (ret & VM_FAULT_ERROR) {
int err = vm_fault_to_errno(ret, *flags);
@@ -885,13 +884,6 @@ static int faultin_page(struct task_struct *tsk, struct vm_area_struct *vma,
BUG();
}
- if (tsk) {
- if (ret & VM_FAULT_MAJOR)
- tsk->maj_flt++;
- else
- tsk->min_flt++;
- }
-
if (ret & VM_FAULT_RETRY) {
if (locked && !(fault_flags & FAULT_FLAG_RETRY_NOWAIT))
*locked = 0;
@@ -961,7 +953,6 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
/**
* __get_user_pages() - pin user pages in memory
- * @tsk: task_struct of target task
* @mm: mm_struct of target mm
* @start: starting user address
* @nr_pages: number of pages from start to pin
@@ -971,7 +962,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
* only intends to ensure the pages are faulted in.
* @vmas: array of pointers to vmas corresponding to each page.
* Or NULL if the caller does not require them.
- * @locked: whether we're still with the mmap_sem held
+ * @locked: whether we're still with the mmap_lock held
*
* Returns either number of pages pinned (which may be less than the
* number requested), or an error. Details about the return value:
@@ -980,12 +971,13 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
* -- If nr_pages is >0, but no pages were pinned, returns -errno.
* -- If nr_pages is >0, and some pages were pinned, returns the number of
* pages pinned. Again, this may be less than nr_pages.
+ * -- 0 return value is possible when the fault would need to be retried.
*
* The caller is responsible for releasing returned @pages, via put_page().
*
- * @vmas are valid only as long as mmap_sem is held.
+ * @vmas are valid only as long as mmap_lock is held.
*
- * Must be called with mmap_sem held. It may be released. See below.
+ * Must be called with mmap_lock held. It may be released. See below.
*
* __get_user_pages walks a process's page tables and takes a reference to
* each struct page that each user address corresponds to at a given
@@ -1006,12 +998,12 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
* appropriate) must be called after the page is finished with, and
* before put_page is called.
*
- * If @locked != NULL, *@locked will be set to 0 when mmap_sem is
+ * If @locked != NULL, *@locked will be set to 0 when mmap_lock is
* released by mmap_read_unlock(). That can happen if @gup_flags does not
* have FOLL_NOWAIT.
*
* A caller using such a combination of @locked and @gup_flags
- * must therefore hold the mmap_sem for reading only, and recognize
+ * must therefore hold the mmap_lock for reading only, and recognize
* when it's been released. Otherwise, it must be held for either
* reading or writing and will not be released.
*
@@ -1019,7 +1011,7 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
* instead of __get_user_pages. __get_user_pages should be used only if
* you need some special @gup_flags.
*/
-static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
+static long __get_user_pages(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas, int *locked)
@@ -1072,7 +1064,7 @@ static long __get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
if (locked && *locked == 0) {
/*
* We've got a VM_FAULT_RETRY
- * and we've lost mmap_sem.
+ * and we've lost mmap_lock.
* We must stop here.
*/
BUG_ON(gup_flags & FOLL_NOWAIT);
@@ -1095,8 +1087,7 @@ retry:
page = follow_page_mask(vma, start, foll_flags, &ctx);
if (!page) {
- ret = faultin_page(tsk, vma, start, &foll_flags,
- locked);
+ ret = faultin_page(vma, start, &foll_flags, locked);
switch (ret) {
case 0:
goto retry;
@@ -1168,15 +1159,14 @@ static bool vma_permits_fault(struct vm_area_struct *vma,
return true;
}
-/*
+/**
* fixup_user_fault() - manually resolve a user page fault
- * @tsk: the task_struct to use for page fault accounting, or
- * NULL if faults are not to be recorded.
* @mm: mm_struct of target mm
* @address: user address
* @fault_flags:flags to pass down to handle_mm_fault()
- * @unlocked: did we unlock the mmap_sem while retrying, maybe NULL if caller
- * does not allow retry
+ * @unlocked: did we unlock the mmap_lock while retrying, maybe NULL if caller
+ * does not allow retry. If NULL, the caller must guarantee
+ * that fault_flags does not contain FAULT_FLAG_ALLOW_RETRY.
*
* This is meant to be called in the specific scenario where for locking reasons
* we try to access user memory in atomic context (within a pagefault_disable()
@@ -1195,10 +1185,10 @@ static bool vma_permits_fault(struct vm_area_struct *vma,
* such architectures, gup() will not be enough to make a subsequent access
* succeed.
*
- * This function will not return with an unlocked mmap_sem. So it has not the
- * same semantics wrt the @mm->mmap_sem as does filemap_fault().
+ * This function will not return with an unlocked mmap_lock. So it has not the
+ * same semantics wrt the @mm->mmap_lock as does filemap_fault().
*/
-int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm,
+int fixup_user_fault(struct mm_struct *mm,
unsigned long address, unsigned int fault_flags,
bool *unlocked)
{
@@ -1222,7 +1212,7 @@ retry:
fatal_signal_pending(current))
return -EINTR;
- ret = handle_mm_fault(vma, address, fault_flags);
+ ret = handle_mm_fault(vma, address, fault_flags, NULL);
major |= ret & VM_FAULT_MAJOR;
if (ret & VM_FAULT_ERROR) {
int err = vm_fault_to_errno(ret, 0);
@@ -1233,24 +1223,21 @@ retry:
}
if (ret & VM_FAULT_RETRY) {
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
*unlocked = true;
fault_flags |= FAULT_FLAG_TRIED;
goto retry;
}
- if (tsk) {
- if (major)
- tsk->maj_flt++;
- else
- tsk->min_flt++;
- }
return 0;
}
EXPORT_SYMBOL_GPL(fixup_user_fault);
-static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
- struct mm_struct *mm,
+/*
+ * Please note that this function, unlike __get_user_pages, will not
+ * return 0 for nr_pages > 0 without FOLL_NOWAIT.
+ */
+static __always_inline long __get_user_pages_locked(struct mm_struct *mm,
unsigned long start,
unsigned long nr_pages,
struct page **pages,
@@ -1268,6 +1255,9 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
BUG_ON(*locked != 1);
}
+ if (flags & FOLL_PIN)
+ atomic_set(&mm->has_pinned, 1);
+
/*
* FOLL_PIN and FOLL_GET are mutually exclusive. Traditional behavior
* is to set FOLL_GET if the caller wants pages[] filled in (but has
@@ -1283,7 +1273,7 @@ static __always_inline long __get_user_pages_locked(struct task_struct *tsk,
pages_done = 0;
lock_dropped = false;
for (;;) {
- ret = __get_user_pages(tsk, mm, start, nr_pages, flags, pages,
+ ret = __get_user_pages(mm, start, nr_pages, flags, pages,
vmas, locked);
if (!locked)
/* VM_FAULT_RETRY couldn't trigger, bypass */
@@ -1334,7 +1324,7 @@ retry:
break;
}
- ret = down_read_killable(&mm->mmap_sem);
+ ret = mmap_read_lock_killable(mm);
if (ret) {
BUG_ON(ret > 0);
if (!pages_done)
@@ -1343,7 +1333,7 @@ retry:
}
*locked = 1;
- ret = __get_user_pages(tsk, mm, start, 1, flags | FOLL_TRIED,
+ ret = __get_user_pages(mm, start, 1, flags | FOLL_TRIED,
pages, NULL, locked);
if (!*locked) {
/* Continue to retry until we succeeded */
@@ -1369,7 +1359,7 @@ retry:
* We must let the caller know we temporarily dropped the lock
* and so the critical section protected by it was lost.
*/
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
*locked = 0;
}
return pages_done;
@@ -1380,13 +1370,14 @@ retry:
* @vma: target vma
* @start: start address
* @end: end address
- * @locked: whether the mmap_sem is still held
+ * @locked: whether the mmap_lock is still held
*
* This takes care of mlocking the pages too if VM_LOCKED is set.
*
- * return 0 on success, negative error code on error.
+ * Return either number of pages pinned in the vma, or a negative error
+ * code on error.
*
- * vma->vm_mm->mmap_sem must be held.
+ * vma->vm_mm->mmap_lock must be held.
*
* If @locked is NULL, it may be held for read or write and will
* be unperturbed.
@@ -1405,7 +1396,7 @@ long populate_vma_page_range(struct vm_area_struct *vma,
VM_BUG_ON(end & ~PAGE_MASK);
VM_BUG_ON_VMA(start < vma->vm_start, vma);
VM_BUG_ON_VMA(end > vma->vm_end, vma);
- VM_BUG_ON_MM(!rwsem_is_locked(&mm->mmap_sem), mm);
+ mmap_assert_locked(mm);
gup_flags = FOLL_TOUCH | FOLL_POPULATE | FOLL_MLOCK;
if (vma->vm_flags & VM_LOCKONFAULT)
@@ -1429,7 +1420,7 @@ long populate_vma_page_range(struct vm_area_struct *vma,
* We made sure addr is within a VMA, so the following will
* not result in a stack expansion that recurses back here.
*/
- return __get_user_pages(current, mm, start, nr_pages, gup_flags,
+ return __get_user_pages(mm, start, nr_pages, gup_flags,
NULL, NULL, locked);
}
@@ -1438,7 +1429,7 @@ long populate_vma_page_range(struct vm_area_struct *vma,
*
* This is used to implement mlock() and the MAP_POPULATE / MAP_LOCKED mmap
* flags. VMAs must be already marked with the desired vm_flags, and
- * mmap_sem must not be held.
+ * mmap_lock must not be held.
*/
int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
{
@@ -1457,7 +1448,7 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
*/
if (!locked) {
locked = 1;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
vma = find_vma(mm, nstart);
} else if (nstart >= vma->vm_end)
vma = vma->vm_next;
@@ -1489,7 +1480,7 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
ret = 0;
}
if (locked)
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
return ret; /* 0 or negative error code */
}
@@ -1505,7 +1496,7 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors)
* NULL wherever the ZERO_PAGE, or an anonymous pte_none, has been found -
* allowing a hole to be left in the corefile to save diskspace.
*
- * Called without mmap_sem, but after all other threads have been killed.
+ * Called without mmap_lock, but after all other threads have been killed.
*/
#ifdef CONFIG_ELF_CORE
struct page *get_dump_page(unsigned long addr)
@@ -1513,7 +1504,7 @@ struct page *get_dump_page(unsigned long addr)
struct vm_area_struct *vma;
struct page *page;
- if (__get_user_pages(current, current->mm, addr, 1,
+ if (__get_user_pages(current->mm, addr, 1,
FOLL_FORCE | FOLL_DUMP | FOLL_GET, &page, &vma,
NULL) < 1)
return NULL;
@@ -1522,8 +1513,7 @@ struct page *get_dump_page(unsigned long addr)
}
#endif /* CONFIG_ELF_CORE */
#else /* CONFIG_MMU */
-static long __get_user_pages_locked(struct task_struct *tsk,
- struct mm_struct *mm, unsigned long start,
+static long __get_user_pages_locked(struct mm_struct *mm, unsigned long start,
unsigned long nr_pages, struct page **pages,
struct vm_area_struct **vmas, int *locked,
unsigned int foll_flags)
@@ -1588,59 +1578,7 @@ static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages)
}
#ifdef CONFIG_CMA
-static struct page *new_non_cma_page(struct page *page, unsigned long private)
-{
- /*
- * We want to make sure we allocate the new page from the same node
- * as the source page.
- */
- int nid = page_to_nid(page);
- /*
- * Trying to allocate a page for migration. Ignore allocation
- * failure warnings. We don't force __GFP_THISNODE here because
- * this node here is the node where we have CMA reservation and
- * in some case these nodes will have really less non movable
- * allocation memory.
- */
- gfp_t gfp_mask = GFP_USER | __GFP_NOWARN;
-
- if (PageHighMem(page))
- gfp_mask |= __GFP_HIGHMEM;
-
-#ifdef CONFIG_HUGETLB_PAGE
- if (PageHuge(page)) {
- struct hstate *h = page_hstate(page);
- /*
- * We don't want to dequeue from the pool because pool pages will
- * mostly be from the CMA region.
- */
- return alloc_migrate_huge_page(h, gfp_mask, nid, NULL);
- }
-#endif
- if (PageTransHuge(page)) {
- struct page *thp;
- /*
- * ignore allocation failure warnings
- */
- gfp_t thp_gfpmask = GFP_TRANSHUGE | __GFP_NOWARN;
-
- /*
- * Remove the movable mask so that we don't allocate from
- * CMA area again.
- */
- thp_gfpmask &= ~__GFP_MOVABLE;
- thp = __alloc_pages_node(nid, thp_gfpmask, HPAGE_PMD_ORDER);
- if (!thp)
- return NULL;
- prep_transhuge_page(thp);
- return thp;
- }
-
- return __alloc_pages_node(nid, gfp_mask, 0);
-}
-
-static long check_and_migrate_cma_pages(struct task_struct *tsk,
- struct mm_struct *mm,
+static long check_and_migrate_cma_pages(struct mm_struct *mm,
unsigned long start,
unsigned long nr_pages,
struct page **pages,
@@ -1653,6 +1591,10 @@ static long check_and_migrate_cma_pages(struct task_struct *tsk,
bool migrate_allow = true;
LIST_HEAD(cma_page_list);
long ret = nr_pages;
+ struct migration_target_control mtc = {
+ .nid = NUMA_NO_NODE,
+ .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_NOWARN,
+ };
check_again:
for (i = 0; i < nr_pages;) {
@@ -1683,7 +1625,7 @@ check_again:
mod_node_page_state(page_pgdat(head),
NR_ISOLATED_ANON +
page_is_file_lru(head),
- hpage_nr_pages(head));
+ thp_nr_pages(head));
}
}
}
@@ -1698,8 +1640,8 @@ check_again:
for (i = 0; i < nr_pages; i++)
put_page(pages[i]);
- if (migrate_pages(&cma_page_list, new_non_cma_page,
- NULL, 0, MIGRATE_SYNC, MR_CONTIG_RANGE)) {
+ if (migrate_pages(&cma_page_list, alloc_migration_target, NULL,
+ (unsigned long)&mtc, MIGRATE_SYNC, MR_CONTIG_RANGE)) {
/*
* some of the pages failed migration. Do get_user_pages
* without migration.
@@ -1714,7 +1656,7 @@ check_again:
* again migrating any new CMA pages which we failed to isolate
* earlier.
*/
- ret = __get_user_pages_locked(tsk, mm, start, nr_pages,
+ ret = __get_user_pages_locked(mm, start, nr_pages,
pages, vmas, NULL,
gup_flags);
@@ -1728,8 +1670,7 @@ check_again:
return ret;
}
#else
-static long check_and_migrate_cma_pages(struct task_struct *tsk,
- struct mm_struct *mm,
+static long check_and_migrate_cma_pages(struct mm_struct *mm,
unsigned long start,
unsigned long nr_pages,
struct page **pages,
@@ -1744,8 +1685,7 @@ static long check_and_migrate_cma_pages(struct task_struct *tsk,
* __gup_longterm_locked() is a wrapper for __get_user_pages_locked which
* allows us to process the FOLL_LONGTERM flag.
*/
-static long __gup_longterm_locked(struct task_struct *tsk,
- struct mm_struct *mm,
+static long __gup_longterm_locked(struct mm_struct *mm,
unsigned long start,
unsigned long nr_pages,
struct page **pages,
@@ -1770,11 +1710,10 @@ static long __gup_longterm_locked(struct task_struct *tsk,
flags = memalloc_nocma_save();
}
- rc = __get_user_pages_locked(tsk, mm, start, nr_pages, pages,
+ rc = __get_user_pages_locked(mm, start, nr_pages, pages,
vmas_tmp, NULL, gup_flags);
if (gup_flags & FOLL_LONGTERM) {
- memalloc_nocma_restore(flags);
if (rc < 0)
goto out;
@@ -1785,32 +1724,31 @@ static long __gup_longterm_locked(struct task_struct *tsk,
goto out;
}
- rc = check_and_migrate_cma_pages(tsk, mm, start, rc, pages,
+ rc = check_and_migrate_cma_pages(mm, start, rc, pages,
vmas_tmp, gup_flags);
+out:
+ memalloc_nocma_restore(flags);
}
-out:
if (vmas_tmp != vmas)
kfree(vmas_tmp);
return rc;
}
#else /* !CONFIG_FS_DAX && !CONFIG_CMA */
-static __always_inline long __gup_longterm_locked(struct task_struct *tsk,
- struct mm_struct *mm,
+static __always_inline long __gup_longterm_locked(struct mm_struct *mm,
unsigned long start,
unsigned long nr_pages,
struct page **pages,
struct vm_area_struct **vmas,
unsigned int flags)
{
- return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
+ return __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
NULL, flags);
}
#endif /* CONFIG_FS_DAX || CONFIG_CMA */
#ifdef CONFIG_MMU
-static long __get_user_pages_remote(struct task_struct *tsk,
- struct mm_struct *mm,
+static long __get_user_pages_remote(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas, int *locked)
@@ -1829,20 +1767,18 @@ static long __get_user_pages_remote(struct task_struct *tsk,
* This will check the vmas (even if our vmas arg is NULL)
* and return -ENOTSUPP if DAX isn't allowed in this case:
*/
- return __gup_longterm_locked(tsk, mm, start, nr_pages, pages,
+ return __gup_longterm_locked(mm, start, nr_pages, pages,
vmas, gup_flags | FOLL_TOUCH |
FOLL_REMOTE);
}
- return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas,
+ return __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
locked,
gup_flags | FOLL_TOUCH | FOLL_REMOTE);
}
-/*
+/**
* get_user_pages_remote() - pin user pages in memory
- * @tsk: the task_struct to use for page fault accounting, or
- * NULL if faults are not to be recorded.
* @mm: mm_struct of target mm
* @start: starting user address
* @nr_pages: number of pages from start to pin
@@ -1866,17 +1802,17 @@ static long __get_user_pages_remote(struct task_struct *tsk,
*
* The caller is responsible for releasing returned @pages, via put_page().
*
- * @vmas are valid only as long as mmap_sem is held.
+ * @vmas are valid only as long as mmap_lock is held.
*
- * Must be called with mmap_sem held for read or write.
+ * Must be called with mmap_lock held for read or write.
*
- * get_user_pages walks a process's page tables and takes a reference to
- * each struct page that each user address corresponds to at a given
+ * get_user_pages_remote walks a process's page tables and takes a reference
+ * to each struct page that each user address corresponds to at a given
* instant. That is, it takes the page that would be accessed if a user
* thread accesses the given user virtual address at that instant.
*
* This does not guarantee that the page exists in the user mappings when
- * get_user_pages returns, and there may even be a completely different
+ * get_user_pages_remote returns, and there may even be a completely different
* page there in some cases (eg. if mmapped pagecache has been invalidated
* and subsequently re faulted). However it does guarantee that the page
* won't be freed completely. And mostly callers simply care that the page
@@ -1888,20 +1824,20 @@ static long __get_user_pages_remote(struct task_struct *tsk,
* is written to, set_page_dirty (or set_page_dirty_lock, as appropriate) must
* be called after the page is finished with, and before put_page is called.
*
- * get_user_pages is typically used for fewer-copy IO operations, to get a
- * handle on the memory by some means other than accesses via the user virtual
- * addresses. The pages may be submitted for DMA to devices or accessed via
- * their kernel linear mapping (via the kmap APIs). Care should be taken to
- * use the correct cache flushing APIs.
+ * get_user_pages_remote is typically used for fewer-copy IO operations,
+ * to get a handle on the memory by some means other than accesses
+ * via the user virtual addresses. The pages may be submitted for
+ * DMA to devices or accessed via their kernel linear mapping (via the
+ * kmap APIs). Care should be taken to use the correct cache flushing APIs.
*
* See also get_user_pages_fast, for performance critical applications.
*
- * get_user_pages should be phased out in favor of
+ * get_user_pages_remote should be phased out in favor of
* get_user_pages_locked|unlocked or get_user_pages_fast. Nothing
- * should use get_user_pages because it cannot pass
+ * should use get_user_pages_remote because it cannot pass
* FAULT_FLAG_ALLOW_RETRY to handle_mm_fault.
*/
-long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
+long get_user_pages_remote(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas, int *locked)
@@ -1913,13 +1849,13 @@ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
return -EINVAL;
- return __get_user_pages_remote(tsk, mm, start, nr_pages, gup_flags,
+ return __get_user_pages_remote(mm, start, nr_pages, gup_flags,
pages, vmas, locked);
}
EXPORT_SYMBOL(get_user_pages_remote);
#else /* CONFIG_MMU */
-long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
+long get_user_pages_remote(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas, int *locked)
@@ -1927,8 +1863,7 @@ long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
return 0;
}
-static long __get_user_pages_remote(struct task_struct *tsk,
- struct mm_struct *mm,
+static long __get_user_pages_remote(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas, int *locked)
@@ -1937,12 +1872,21 @@ static long __get_user_pages_remote(struct task_struct *tsk,
}
#endif /* !CONFIG_MMU */
-/*
- * This is the same as get_user_pages_remote(), just with a
- * less-flexible calling convention where we assume that the task
- * and mm being operated on are the current task's and don't allow
- * passing of a locked parameter. We also obviously don't pass
- * FOLL_REMOTE in here.
+/**
+ * get_user_pages() - pin user pages in memory
+ * @start: starting user address
+ * @nr_pages: number of pages from start to pin
+ * @gup_flags: flags modifying lookup behaviour
+ * @pages: array that receives pointers to the pages pinned.
+ * Should be at least nr_pages long. Or NULL, if caller
+ * only intends to ensure the pages are faulted in.
+ * @vmas: array of pointers to vmas corresponding to each page.
+ * Or NULL if the caller does not require them.
+ *
+ * This is the same as get_user_pages_remote(), just with a less-flexible
+ * calling convention where we assume that the mm being operated on belongs to
+ * the current task, and don't allow passing of a locked parameter. We also
+ * obviously don't pass FOLL_REMOTE in here.
*/
long get_user_pages(unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
@@ -1955,31 +1899,42 @@ long get_user_pages(unsigned long start, unsigned long nr_pages,
if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
return -EINVAL;
- return __gup_longterm_locked(current, current->mm, start, nr_pages,
+ return __gup_longterm_locked(current->mm, start, nr_pages,
pages, vmas, gup_flags | FOLL_TOUCH);
}
EXPORT_SYMBOL(get_user_pages);
-/*
- * We can leverage the VM_FAULT_RETRY functionality in the page fault
- * paths better by using either get_user_pages_locked() or
- * get_user_pages_unlocked().
- *
+/**
* get_user_pages_locked() is suitable to replace the form:
*
- * down_read(&mm->mmap_sem);
+ * mmap_read_lock(mm);
* do_something()
- * get_user_pages(tsk, mm, ..., pages, NULL);
- * up_read(&mm->mmap_sem);
+ * get_user_pages(..., pages, NULL);
+ * mmap_read_unlock(mm);
*
* to:
*
* int locked = 1;
- * down_read(&mm->mmap_sem);
+ * mmap_read_lock(mm);
* do_something()
- * get_user_pages_locked(tsk, mm, ..., pages, &locked);
+ * get_user_pages_locked(..., pages, &locked);
* if (locked)
- * up_read(&mm->mmap_sem);
+ * mmap_read_unlock(mm);
+ *
+ * @start: starting user address
+ * @nr_pages: number of pages from start to pin
+ * @gup_flags: flags modifying lookup behaviour
+ * @pages: array that receives pointers to the pages pinned.
+ * Should be at least nr_pages long. Or NULL, if caller
+ * only intends to ensure the pages are faulted in.
+ * @locked: pointer to lock flag indicating whether lock is held and
+ * subsequently whether VM_FAULT_RETRY functionality can be
+ * utilised. Lock must initially be held.
+ *
+ * We can leverage the VM_FAULT_RETRY functionality in the page fault
+ * paths better by using either get_user_pages_locked() or
+ * get_user_pages_unlocked().
+ *
*/
long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
@@ -1993,8 +1948,14 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages,
*/
if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
return -EINVAL;
+ /*
+ * FOLL_PIN must only be set internally by the pin_user_pages*() APIs,
+ * never directly by the caller, so enforce that:
+ */
+ if (WARN_ON_ONCE(gup_flags & FOLL_PIN))
+ return -EINVAL;
- return __get_user_pages_locked(current, current->mm, start, nr_pages,
+ return __get_user_pages_locked(current->mm, start, nr_pages,
pages, NULL, locked,
gup_flags | FOLL_TOUCH);
}
@@ -2003,13 +1964,13 @@ EXPORT_SYMBOL(get_user_pages_locked);
/*
* get_user_pages_unlocked() is suitable to replace the form:
*
- * down_read(&mm->mmap_sem);
- * get_user_pages(tsk, mm, ..., pages, NULL);
- * up_read(&mm->mmap_sem);
+ * mmap_read_lock(mm);
+ * get_user_pages(..., pages, NULL);
+ * mmap_read_unlock(mm);
*
* with:
*
- * get_user_pages_unlocked(tsk, mm, ..., pages);
+ * get_user_pages_unlocked(..., pages, gup_flags);
*
* It is functionally equivalent to get_user_pages_fast so
* get_user_pages_fast should be used instead if specific gup_flags
@@ -2031,11 +1992,11 @@ long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
return -EINVAL;
- down_read(&mm->mmap_sem);
- ret = __get_user_pages_locked(current, mm, start, nr_pages, pages, NULL,
+ mmap_read_lock(mm);
+ ret = __get_user_pages_locked(mm, start, nr_pages, pages, NULL,
&locked, gup_flags | FOLL_TOUCH);
if (locked)
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
return ret;
}
EXPORT_SYMBOL(get_user_pages_unlocked);
@@ -2149,7 +2110,7 @@ static inline pte_t gup_get_pte(pte_t *ptep)
*/
static inline pte_t gup_get_pte(pte_t *ptep)
{
- return READ_ONCE(*ptep);
+ return ptep_get(ptep);
}
#endif /* CONFIG_GUP_GET_PTE_LOW_HIGH */
@@ -2252,7 +2213,7 @@ pte_unmap:
* to be special.
*
* For a futex to be placed on a THP tail page, get_futex_key requires a
- * __get_user_pages_fast implementation that can pin pages. Thus it's still
+ * get_user_pages_fast_only implementation that can pin pages. Thus it's still
* useful to have gup_huge_pmd even if we can't operate on ptes.
*/
static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
@@ -2378,7 +2339,7 @@ static int gup_hugepte(pte_t *ptep, unsigned long sz, unsigned long addr,
if (pte_end < end)
end = pte_end;
- pte = READ_ONCE(*ptep);
+ pte = huge_ptep_get(ptep);
if (!pte_access_permitted(pte, flags & FOLL_WRITE))
return 0;
@@ -2527,13 +2488,13 @@ static int gup_huge_pgd(pgd_t orig, pgd_t *pgdp, unsigned long addr,
return 1;
}
-static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
+static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned long end,
unsigned int flags, struct page **pages, int *nr)
{
unsigned long next;
pmd_t *pmdp;
- pmdp = pmd_offset(&pud, addr);
+ pmdp = pmd_offset_lockless(pudp, pud, addr);
do {
pmd_t pmd = READ_ONCE(*pmdp);
@@ -2570,13 +2531,13 @@ static int gup_pmd_range(pud_t pud, unsigned long addr, unsigned long end,
return 1;
}
-static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
+static int gup_pud_range(p4d_t *p4dp, p4d_t p4d, unsigned long addr, unsigned long end,
unsigned int flags, struct page **pages, int *nr)
{
unsigned long next;
pud_t *pudp;
- pudp = pud_offset(&p4d, addr);
+ pudp = pud_offset_lockless(p4dp, p4d, addr);
do {
pud_t pud = READ_ONCE(*pudp);
@@ -2591,20 +2552,20 @@ static int gup_pud_range(p4d_t p4d, unsigned long addr, unsigned long end,
if (!gup_huge_pd(__hugepd(pud_val(pud)), addr,
PUD_SHIFT, next, flags, pages, nr))
return 0;
- } else if (!gup_pmd_range(pud, addr, next, flags, pages, nr))
+ } else if (!gup_pmd_range(pudp, pud, addr, next, flags, pages, nr))
return 0;
} while (pudp++, addr = next, addr != end);
return 1;
}
-static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end,
+static int gup_p4d_range(pgd_t *pgdp, pgd_t pgd, unsigned long addr, unsigned long end,
unsigned int flags, struct page **pages, int *nr)
{
unsigned long next;
p4d_t *p4dp;
- p4dp = p4d_offset(&pgd, addr);
+ p4dp = p4d_offset_lockless(pgdp, pgd, addr);
do {
p4d_t p4d = READ_ONCE(*p4dp);
@@ -2616,7 +2577,7 @@ static int gup_p4d_range(pgd_t pgd, unsigned long addr, unsigned long end,
if (!gup_huge_pd(__hugepd(p4d_val(p4d)), addr,
P4D_SHIFT, next, flags, pages, nr))
return 0;
- } else if (!gup_pud_range(p4d, addr, next, flags, pages, nr))
+ } else if (!gup_pud_range(p4dp, p4d, addr, next, flags, pages, nr))
return 0;
} while (p4dp++, addr = next, addr != end);
@@ -2644,7 +2605,7 @@ static void gup_pgd_range(unsigned long addr, unsigned long end,
if (!gup_huge_pd(__hugepd(pgd_val(pgd)), addr,
PGDIR_SHIFT, next, flags, pages, nr))
return;
- } else if (!gup_p4d_range(pgd, addr, next, flags, pages, nr))
+ } else if (!gup_p4d_range(pgdp, pgd, addr, next, flags, pages, nr))
return;
} while (pgdp++, addr = next, addr != end);
}
@@ -2657,7 +2618,7 @@ static inline void gup_pgd_range(unsigned long addr, unsigned long end,
#ifndef gup_fast_permitted
/*
- * Check if it's allowed to use __get_user_pages_fast() for the range, or
+ * Check if it's allowed to use get_user_pages_fast_only() for the range, or
* we need to fall back to the slow version:
*/
static bool gup_fast_permitted(unsigned long start, unsigned long end)
@@ -2666,62 +2627,6 @@ static bool gup_fast_permitted(unsigned long start, unsigned long end)
}
#endif
-/*
- * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to
- * the regular GUP.
- * Note a difference with get_user_pages_fast: this always returns the
- * number of pages pinned, 0 if no pages were pinned.
- *
- * If the architecture does not support this function, simply return with no
- * pages pinned.
- */
-int __get_user_pages_fast(unsigned long start, int nr_pages, int write,
- struct page **pages)
-{
- unsigned long len, end;
- unsigned long flags;
- int nr_pinned = 0;
- /*
- * Internally (within mm/gup.c), gup fast variants must set FOLL_GET,
- * because gup fast is always a "pin with a +1 page refcount" request.
- */
- unsigned int gup_flags = FOLL_GET;
-
- if (write)
- gup_flags |= FOLL_WRITE;
-
- start = untagged_addr(start) & PAGE_MASK;
- len = (unsigned long) nr_pages << PAGE_SHIFT;
- end = start + len;
-
- if (end <= start)
- return 0;
- if (unlikely(!access_ok((void __user *)start, len)))
- return 0;
-
- /*
- * Disable interrupts. We use the nested form as we can already have
- * interrupts disabled by get_futex_key.
- *
- * With interrupts disabled, we block page table pages from being
- * freed from under us. See struct mmu_table_batch comments in
- * include/asm-generic/tlb.h for more details.
- *
- * We do not adopt an rcu_read_lock(.) here as we also want to
- * block IPIs that come from THPs splitting.
- */
-
- if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) &&
- gup_fast_permitted(start, end)) {
- local_irq_save(flags);
- gup_pgd_range(start, end, gup_flags, pages, &nr_pinned);
- local_irq_restore(flags);
- }
-
- return nr_pinned;
-}
-EXPORT_SYMBOL_GPL(__get_user_pages_fast);
-
static int __gup_longterm_unlocked(unsigned long start, int nr_pages,
unsigned int gup_flags, struct page **pages)
{
@@ -2732,11 +2637,11 @@ static int __gup_longterm_unlocked(unsigned long start, int nr_pages,
* get_user_pages_unlocked() (see comments in that function)
*/
if (gup_flags & FOLL_LONGTERM) {
- down_read(&current->mm->mmap_sem);
- ret = __gup_longterm_locked(current, current->mm,
+ mmap_read_lock(current->mm);
+ ret = __gup_longterm_locked(current->mm,
start, nr_pages,
pages, NULL, gup_flags);
- up_read(&current->mm->mmap_sem);
+ mmap_read_unlock(current->mm);
} else {
ret = get_user_pages_unlocked(start, nr_pages,
pages, gup_flags);
@@ -2750,12 +2655,20 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages,
struct page **pages)
{
unsigned long addr, len, end;
+ unsigned long flags;
int nr_pinned = 0, ret = 0;
if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM |
- FOLL_FORCE | FOLL_PIN | FOLL_GET)))
+ FOLL_FORCE | FOLL_PIN | FOLL_GET |
+ FOLL_FAST_ONLY)))
return -EINVAL;
+ if (gup_flags & FOLL_PIN)
+ atomic_set(&current->mm->has_pinned, 1);
+
+ if (!(gup_flags & FOLL_FAST_ONLY))
+ might_lock_read(&current->mm->mmap_lock);
+
start = untagged_addr(start) & PAGE_MASK;
addr = start;
len = (unsigned long) nr_pages << PAGE_SHIFT;
@@ -2766,15 +2679,27 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages,
if (unlikely(!access_ok((void __user *)start, len)))
return -EFAULT;
- if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) &&
- gup_fast_permitted(start, end)) {
- local_irq_disable();
- gup_pgd_range(addr, end, gup_flags, pages, &nr_pinned);
- local_irq_enable();
+ /*
+ * Disable interrupts. The nested form is used, in order to allow
+ * full, general purpose use of this routine.
+ *
+ * With interrupts disabled, we block page table pages from being
+ * freed from under us. See struct mmu_table_batch comments in
+ * include/asm-generic/tlb.h for more details.
+ *
+ * We do not adopt an rcu_read_lock(.) here as we also want to
+ * block IPIs that come from THPs splitting.
+ */
+ if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) && gup_fast_permitted(start, end)) {
+ unsigned long fast_flags = gup_flags;
+
+ local_irq_save(flags);
+ gup_pgd_range(addr, end, fast_flags, pages, &nr_pinned);
+ local_irq_restore(flags);
ret = nr_pinned;
}
- if (nr_pinned < nr_pages) {
+ if (nr_pinned < nr_pages && !(gup_flags & FOLL_FAST_ONLY)) {
/* Try to get the remaining pages with get_user_pages */
start += nr_pinned << PAGE_SHIFT;
pages += nr_pinned;
@@ -2793,6 +2718,54 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages,
return ret;
}
+/**
+ * get_user_pages_fast_only() - pin user pages in memory
+ * @start: starting user address
+ * @nr_pages: number of pages from start to pin
+ * @gup_flags: flags modifying pin behaviour
+ * @pages: array that receives pointers to the pages pinned.
+ * Should be at least nr_pages long.
+ *
+ * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to
+ * the regular GUP.
+ * Note a difference with get_user_pages_fast: this always returns the
+ * number of pages pinned, 0 if no pages were pinned.
+ *
+ * If the architecture does not support this function, simply return with no
+ * pages pinned.
+ *
+ * Careful, careful! COW breaking can go either way, so a non-write access
+ * can get ambiguous page results. If you call this function without
+ * FOLL_WRITE in @gup_flags, be sure that you're ok with that ambiguity.
+ */
+int get_user_pages_fast_only(unsigned long start, int nr_pages,
+ unsigned int gup_flags, struct page **pages)
+{
+ int nr_pinned;
+ /*
+ * Internally (within mm/gup.c), gup fast variants must set FOLL_GET,
+ * because gup fast is always a "pin with a +1 page refcount" request.
+ *
+ * FOLL_FAST_ONLY is required in order to match the API description of
+ * this routine: no fall back to regular ("slow") GUP.
+ */
+ gup_flags |= FOLL_GET | FOLL_FAST_ONLY;
+
+ nr_pinned = internal_get_user_pages_fast(start, nr_pages, gup_flags,
+ pages);
+
+ /*
+ * As specified in the API description above, this routine is not
+ * allowed to return negative values. However, the common core
+ * routine internal_get_user_pages_fast() *can* return -errno.
+ * Therefore, correct for that here:
+ */
+ if (nr_pinned < 0)
+ nr_pinned = 0;
+
+ return nr_pinned;
+}
+EXPORT_SYMBOL_GPL(get_user_pages_fast_only);
/**
* get_user_pages_fast() - pin user pages in memory
@@ -2802,7 +2775,7 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages,
* @pages: array that receives pointers to the pages pinned.
* Should be at least nr_pages long.
*
- * Attempt to pin user pages in memory without taking mm->mmap_sem.
+ * Attempt to pin user pages in memory without taking mm->mmap_lock.
* If not successful, it will fall back to taking the lock and
* calling get_user_pages().
*
@@ -2845,10 +2818,7 @@ EXPORT_SYMBOL_GPL(get_user_pages_fast);
* the arguments here are identical.
*
* FOLL_PIN means that the pages must be released via unpin_user_page(). Please
- * see Documentation/vm/pin_user_pages.rst for further details.
- *
- * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It
- * is NOT intended for Case 2 (RDMA: long-term pins).
+ * see Documentation/core-api/pin_user_pages.rst for further details.
*/
int pin_user_pages_fast(unsigned long start, int nr_pages,
unsigned int gup_flags, struct page **pages)
@@ -2862,11 +2832,45 @@ int pin_user_pages_fast(unsigned long start, int nr_pages,
}
EXPORT_SYMBOL_GPL(pin_user_pages_fast);
+/*
+ * This is the FOLL_PIN equivalent of get_user_pages_fast_only(). Behavior
+ * is the same, except that this one sets FOLL_PIN instead of FOLL_GET.
+ *
+ * The API rules are the same, too: no negative values may be returned.
+ */
+int pin_user_pages_fast_only(unsigned long start, int nr_pages,
+ unsigned int gup_flags, struct page **pages)
+{
+ int nr_pinned;
+
+ /*
+ * FOLL_GET and FOLL_PIN are mutually exclusive. Note that the API
+ * rules require returning 0, rather than -errno:
+ */
+ if (WARN_ON_ONCE(gup_flags & FOLL_GET))
+ return 0;
+ /*
+ * FOLL_FAST_ONLY is required in order to match the API description of
+ * this routine: no fall back to regular ("slow") GUP.
+ */
+ gup_flags |= (FOLL_PIN | FOLL_FAST_ONLY);
+ nr_pinned = internal_get_user_pages_fast(start, nr_pages, gup_flags,
+ pages);
+ /*
+ * This routine is not allowed to return negative values. However,
+ * internal_get_user_pages_fast() *can* return -errno. Therefore,
+ * correct for that here:
+ */
+ if (nr_pinned < 0)
+ nr_pinned = 0;
+
+ return nr_pinned;
+}
+EXPORT_SYMBOL_GPL(pin_user_pages_fast_only);
+
/**
- * pin_user_pages_remote() - pin pages of a remote process (task != current)
+ * pin_user_pages_remote() - pin pages of a remote process
*
- * @tsk: the task_struct to use for page fault accounting, or
- * NULL if faults are not to be recorded.
* @mm: mm_struct of target mm
* @start: starting user address
* @nr_pages: number of pages from start to pin
@@ -2885,12 +2889,9 @@ EXPORT_SYMBOL_GPL(pin_user_pages_fast);
* the arguments here are identical.
*
* FOLL_PIN means that the pages must be released via unpin_user_page(). Please
- * see Documentation/vm/pin_user_pages.rst for details.
- *
- * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It
- * is NOT intended for Case 2 (RDMA: long-term pins).
+ * see Documentation/core-api/pin_user_pages.rst for details.
*/
-long pin_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
+long pin_user_pages_remote(struct mm_struct *mm,
unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
struct vm_area_struct **vmas, int *locked)
@@ -2900,7 +2901,7 @@ long pin_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm,
return -EINVAL;
gup_flags |= FOLL_PIN;
- return __get_user_pages_remote(tsk, mm, start, nr_pages, gup_flags,
+ return __get_user_pages_remote(mm, start, nr_pages, gup_flags,
pages, vmas, locked);
}
EXPORT_SYMBOL(pin_user_pages_remote);
@@ -2921,10 +2922,7 @@ EXPORT_SYMBOL(pin_user_pages_remote);
* FOLL_PIN is set.
*
* FOLL_PIN means that the pages must be released via unpin_user_page(). Please
- * see Documentation/vm/pin_user_pages.rst for details.
- *
- * This is intended for Case 1 (DIO) in Documentation/vm/pin_user_pages.rst. It
- * is NOT intended for Case 2 (RDMA: long-term pins).
+ * see Documentation/core-api/pin_user_pages.rst for details.
*/
long pin_user_pages(unsigned long start, unsigned long nr_pages,
unsigned int gup_flags, struct page **pages,
@@ -2935,7 +2933,53 @@ long pin_user_pages(unsigned long start, unsigned long nr_pages,
return -EINVAL;
gup_flags |= FOLL_PIN;
- return __gup_longterm_locked(current, current->mm, start, nr_pages,
+ return __gup_longterm_locked(current->mm, start, nr_pages,
pages, vmas, gup_flags);
}
EXPORT_SYMBOL(pin_user_pages);
+
+/*
+ * pin_user_pages_unlocked() is the FOLL_PIN variant of
+ * get_user_pages_unlocked(). Behavior is the same, except that this one sets
+ * FOLL_PIN and rejects FOLL_GET.
+ */
+long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages,
+ struct page **pages, unsigned int gup_flags)
+{
+ /* FOLL_GET and FOLL_PIN are mutually exclusive. */
+ if (WARN_ON_ONCE(gup_flags & FOLL_GET))
+ return -EINVAL;
+
+ gup_flags |= FOLL_PIN;
+ return get_user_pages_unlocked(start, nr_pages, pages, gup_flags);
+}
+EXPORT_SYMBOL(pin_user_pages_unlocked);
+
+/*
+ * pin_user_pages_locked() is the FOLL_PIN variant of get_user_pages_locked().
+ * Behavior is the same, except that this one sets FOLL_PIN and rejects
+ * FOLL_GET.
+ */
+long pin_user_pages_locked(unsigned long start, unsigned long nr_pages,
+ unsigned int gup_flags, struct page **pages,
+ int *locked)
+{
+ /*
+ * FIXME: Current FOLL_LONGTERM behavior is incompatible with
+ * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on
+ * vmas. As there are no users of this flag in this call we simply
+ * disallow this option for now.
+ */
+ if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM))
+ return -EINVAL;
+
+ /* FOLL_GET and FOLL_PIN are mutually exclusive. */
+ if (WARN_ON_ONCE(gup_flags & FOLL_GET))
+ return -EINVAL;
+
+ gup_flags |= FOLL_PIN;
+ return __get_user_pages_locked(current->mm, start, nr_pages,
+ pages, NULL, locked,
+ gup_flags | FOLL_TOUCH);
+}
+EXPORT_SYMBOL(pin_user_pages_locked);
diff --git a/mm/hmm.c b/mm/hmm.c
index 280585833adf..943cb2ba4442 100644
--- a/mm/hmm.c
+++ b/mm/hmm.c
@@ -37,28 +37,13 @@ enum {
HMM_NEED_ALL_BITS = HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT,
};
-/*
- * hmm_device_entry_from_pfn() - create a valid device entry value from pfn
- * @range: range use to encode HMM pfn value
- * @pfn: pfn value for which to create the device entry
- * Return: valid device entry for the pfn
- */
-static uint64_t hmm_device_entry_from_pfn(const struct hmm_range *range,
- unsigned long pfn)
-{
- return (pfn << range->pfn_shift) | range->flags[HMM_PFN_VALID];
-}
-
static int hmm_pfns_fill(unsigned long addr, unsigned long end,
- struct hmm_range *range, enum hmm_pfn_value_e value)
+ struct hmm_range *range, unsigned long cpu_flags)
{
- uint64_t *pfns = range->pfns;
- unsigned long i;
+ unsigned long i = (addr - range->start) >> PAGE_SHIFT;
- i = (addr - range->start) >> PAGE_SHIFT;
for (; addr < end; addr += PAGE_SIZE, i++)
- pfns[i] = range->values[value];
-
+ range->hmm_pfns[i] = cpu_flags;
return 0;
}
@@ -90,13 +75,15 @@ static int hmm_vma_fault(unsigned long addr, unsigned long end,
}
for (; addr < end; addr += PAGE_SIZE)
- if (handle_mm_fault(vma, addr, fault_flags) & VM_FAULT_ERROR)
+ if (handle_mm_fault(vma, addr, fault_flags, NULL) &
+ VM_FAULT_ERROR)
return -EFAULT;
return -EBUSY;
}
static unsigned int hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
- uint64_t pfns, uint64_t cpu_flags)
+ unsigned long pfn_req_flags,
+ unsigned long cpu_flags)
{
struct hmm_range *range = hmm_vma_walk->range;
@@ -110,27 +97,28 @@ static unsigned int hmm_pte_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
* waste to have the user pre-fill the pfn arrays with a default
* flags value.
*/
- pfns = (pfns & range->pfn_flags_mask) | range->default_flags;
+ pfn_req_flags &= range->pfn_flags_mask;
+ pfn_req_flags |= range->default_flags;
/* We aren't ask to do anything ... */
- if (!(pfns & range->flags[HMM_PFN_VALID]))
+ if (!(pfn_req_flags & HMM_PFN_REQ_FAULT))
return 0;
/* Need to write fault ? */
- if ((pfns & range->flags[HMM_PFN_WRITE]) &&
- !(cpu_flags & range->flags[HMM_PFN_WRITE]))
+ if ((pfn_req_flags & HMM_PFN_REQ_WRITE) &&
+ !(cpu_flags & HMM_PFN_WRITE))
return HMM_NEED_FAULT | HMM_NEED_WRITE_FAULT;
/* If CPU page table is not valid then we need to fault */
- if (!(cpu_flags & range->flags[HMM_PFN_VALID]))
+ if (!(cpu_flags & HMM_PFN_VALID))
return HMM_NEED_FAULT;
return 0;
}
static unsigned int
hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
- const uint64_t *pfns, unsigned long npages,
- uint64_t cpu_flags)
+ const unsigned long hmm_pfns[], unsigned long npages,
+ unsigned long cpu_flags)
{
struct hmm_range *range = hmm_vma_walk->range;
unsigned int required_fault = 0;
@@ -142,12 +130,12 @@ hmm_range_need_fault(const struct hmm_vma_walk *hmm_vma_walk,
* hmm_pte_need_fault() will always return 0.
*/
if (!((range->default_flags | range->pfn_flags_mask) &
- range->flags[HMM_PFN_VALID]))
+ HMM_PFN_REQ_FAULT))
return 0;
for (i = 0; i < npages; ++i) {
- required_fault |=
- hmm_pte_need_fault(hmm_vma_walk, pfns[i], cpu_flags);
+ required_fault |= hmm_pte_need_fault(hmm_vma_walk, hmm_pfns[i],
+ cpu_flags);
if (required_fault == HMM_NEED_ALL_BITS)
return required_fault;
}
@@ -161,12 +149,13 @@ static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
struct hmm_range *range = hmm_vma_walk->range;
unsigned int required_fault;
unsigned long i, npages;
- uint64_t *pfns;
+ unsigned long *hmm_pfns;
i = (addr - range->start) >> PAGE_SHIFT;
npages = (end - addr) >> PAGE_SHIFT;
- pfns = &range->pfns[i];
- required_fault = hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0);
+ hmm_pfns = &range->hmm_pfns[i];
+ required_fault =
+ hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0);
if (!walk->vma) {
if (required_fault)
return -EFAULT;
@@ -174,46 +163,51 @@ static int hmm_vma_walk_hole(unsigned long addr, unsigned long end,
}
if (required_fault)
return hmm_vma_fault(addr, end, required_fault, walk);
- hmm_vma_walk->last = addr;
- return hmm_pfns_fill(addr, end, range, HMM_PFN_NONE);
+ return hmm_pfns_fill(addr, end, range, 0);
}
-static inline uint64_t pmd_to_hmm_pfn_flags(struct hmm_range *range, pmd_t pmd)
+static inline unsigned long hmm_pfn_flags_order(unsigned long order)
+{
+ return order << HMM_PFN_ORDER_SHIFT;
+}
+
+static inline unsigned long pmd_to_hmm_pfn_flags(struct hmm_range *range,
+ pmd_t pmd)
{
if (pmd_protnone(pmd))
return 0;
- return pmd_write(pmd) ? range->flags[HMM_PFN_VALID] |
- range->flags[HMM_PFN_WRITE] :
- range->flags[HMM_PFN_VALID];
+ return (pmd_write(pmd) ? (HMM_PFN_VALID | HMM_PFN_WRITE) :
+ HMM_PFN_VALID) |
+ hmm_pfn_flags_order(PMD_SHIFT - PAGE_SHIFT);
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
- unsigned long end, uint64_t *pfns, pmd_t pmd)
+ unsigned long end, unsigned long hmm_pfns[],
+ pmd_t pmd)
{
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
unsigned long pfn, npages, i;
unsigned int required_fault;
- uint64_t cpu_flags;
+ unsigned long cpu_flags;
npages = (end - addr) >> PAGE_SHIFT;
cpu_flags = pmd_to_hmm_pfn_flags(range, pmd);
required_fault =
- hmm_range_need_fault(hmm_vma_walk, pfns, npages, cpu_flags);
+ hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, cpu_flags);
if (required_fault)
return hmm_vma_fault(addr, end, required_fault, walk);
pfn = pmd_pfn(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT);
for (i = 0; addr < end; addr += PAGE_SIZE, i++, pfn++)
- pfns[i] = hmm_device_entry_from_pfn(range, pfn) | cpu_flags;
- hmm_vma_walk->last = end;
+ hmm_pfns[i] = pfn | cpu_flags;
return 0;
}
#else /* CONFIG_TRANSPARENT_HUGEPAGE */
/* stub to allow the code below to compile */
int hmm_vma_handle_pmd(struct mm_walk *walk, unsigned long addr,
- unsigned long end, uint64_t *pfns, pmd_t pmd);
+ unsigned long end, unsigned long hmm_pfns[], pmd_t pmd);
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
static inline bool hmm_is_device_private_entry(struct hmm_range *range,
@@ -224,31 +218,31 @@ static inline bool hmm_is_device_private_entry(struct hmm_range *range,
range->dev_private_owner;
}
-static inline uint64_t pte_to_hmm_pfn_flags(struct hmm_range *range, pte_t pte)
+static inline unsigned long pte_to_hmm_pfn_flags(struct hmm_range *range,
+ pte_t pte)
{
if (pte_none(pte) || !pte_present(pte) || pte_protnone(pte))
return 0;
- return pte_write(pte) ? range->flags[HMM_PFN_VALID] |
- range->flags[HMM_PFN_WRITE] :
- range->flags[HMM_PFN_VALID];
+ return pte_write(pte) ? (HMM_PFN_VALID | HMM_PFN_WRITE) : HMM_PFN_VALID;
}
static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
unsigned long end, pmd_t *pmdp, pte_t *ptep,
- uint64_t *pfn)
+ unsigned long *hmm_pfn)
{
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
unsigned int required_fault;
- uint64_t cpu_flags;
+ unsigned long cpu_flags;
pte_t pte = *ptep;
- uint64_t orig_pfn = *pfn;
+ uint64_t pfn_req_flags = *hmm_pfn;
if (pte_none(pte)) {
- required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0);
+ required_fault =
+ hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0);
if (required_fault)
goto fault;
- *pfn = range->values[HMM_PFN_NONE];
+ *hmm_pfn = 0;
return 0;
}
@@ -256,21 +250,22 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
swp_entry_t entry = pte_to_swp_entry(pte);
/*
- * Never fault in device private pages pages, but just report
+ * Never fault in device private pages, but just report
* the PFN even if not present.
*/
if (hmm_is_device_private_entry(range, entry)) {
- *pfn = hmm_device_entry_from_pfn(range,
- device_private_entry_to_pfn(entry));
- *pfn |= range->flags[HMM_PFN_VALID];
+ cpu_flags = HMM_PFN_VALID;
if (is_write_device_private_entry(entry))
- *pfn |= range->flags[HMM_PFN_WRITE];
+ cpu_flags |= HMM_PFN_WRITE;
+ *hmm_pfn = device_private_entry_to_pfn(entry) |
+ cpu_flags;
return 0;
}
- required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0);
+ required_fault =
+ hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0);
if (!required_fault) {
- *pfn = range->values[HMM_PFN_NONE];
+ *hmm_pfn = 0;
return 0;
}
@@ -290,7 +285,8 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
}
cpu_flags = pte_to_hmm_pfn_flags(range, pte);
- required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags);
+ required_fault =
+ hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, cpu_flags);
if (required_fault)
goto fault;
@@ -299,15 +295,15 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
* fall through and treat it like a normal page.
*/
if (pte_special(pte) && !is_zero_pfn(pte_pfn(pte))) {
- if (hmm_pte_need_fault(hmm_vma_walk, orig_pfn, 0)) {
+ if (hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0)) {
pte_unmap(ptep);
return -EFAULT;
}
- *pfn = range->values[HMM_PFN_SPECIAL];
+ *hmm_pfn = HMM_PFN_ERROR;
return 0;
}
- *pfn = hmm_device_entry_from_pfn(range, pte_pfn(pte)) | cpu_flags;
+ *hmm_pfn = pte_pfn(pte) | cpu_flags;
return 0;
fault:
@@ -323,7 +319,8 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
{
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
- uint64_t *pfns = &range->pfns[(start - range->start) >> PAGE_SHIFT];
+ unsigned long *hmm_pfns =
+ &range->hmm_pfns[(start - range->start) >> PAGE_SHIFT];
unsigned long npages = (end - start) >> PAGE_SHIFT;
unsigned long addr = start;
pte_t *ptep;
@@ -335,16 +332,16 @@ again:
return hmm_vma_walk_hole(start, end, -1, walk);
if (thp_migration_supported() && is_pmd_migration_entry(pmd)) {
- if (hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0)) {
+ if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0)) {
hmm_vma_walk->last = addr;
pmd_migration_entry_wait(walk->mm, pmdp);
return -EBUSY;
}
- return hmm_pfns_fill(start, end, range, HMM_PFN_NONE);
+ return hmm_pfns_fill(start, end, range, 0);
}
if (!pmd_present(pmd)) {
- if (hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0))
+ if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0))
return -EFAULT;
return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
}
@@ -364,7 +361,7 @@ again:
if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
goto again;
- return hmm_vma_handle_pmd(walk, addr, end, pfns, pmd);
+ return hmm_vma_handle_pmd(walk, addr, end, hmm_pfns, pmd);
}
/*
@@ -374,37 +371,35 @@ again:
* recover.
*/
if (pmd_bad(pmd)) {
- if (hmm_range_need_fault(hmm_vma_walk, pfns, npages, 0))
+ if (hmm_range_need_fault(hmm_vma_walk, hmm_pfns, npages, 0))
return -EFAULT;
return hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
}
ptep = pte_offset_map(pmdp, addr);
- for (; addr < end; addr += PAGE_SIZE, ptep++, pfns++) {
+ for (; addr < end; addr += PAGE_SIZE, ptep++, hmm_pfns++) {
int r;
- r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, pfns);
+ r = hmm_vma_handle_pte(walk, addr, end, pmdp, ptep, hmm_pfns);
if (r) {
/* hmm_vma_handle_pte() did pte_unmap() */
- hmm_vma_walk->last = addr;
return r;
}
}
pte_unmap(ptep - 1);
-
- hmm_vma_walk->last = addr;
return 0;
}
#if defined(CONFIG_ARCH_HAS_PTE_DEVMAP) && \
defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
-static inline uint64_t pud_to_hmm_pfn_flags(struct hmm_range *range, pud_t pud)
+static inline unsigned long pud_to_hmm_pfn_flags(struct hmm_range *range,
+ pud_t pud)
{
if (!pud_present(pud))
return 0;
- return pud_write(pud) ? range->flags[HMM_PFN_VALID] |
- range->flags[HMM_PFN_WRITE] :
- range->flags[HMM_PFN_VALID];
+ return (pud_write(pud) ? (HMM_PFN_VALID | HMM_PFN_WRITE) :
+ HMM_PFN_VALID) |
+ hmm_pfn_flags_order(PUD_SHIFT - PAGE_SHIFT);
}
static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
@@ -432,7 +427,8 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
if (pud_huge(pud) && pud_devmap(pud)) {
unsigned long i, npages, pfn;
unsigned int required_fault;
- uint64_t *pfns, cpu_flags;
+ unsigned long *hmm_pfns;
+ unsigned long cpu_flags;
if (!pud_present(pud)) {
spin_unlock(ptl);
@@ -441,10 +437,10 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
i = (addr - range->start) >> PAGE_SHIFT;
npages = (end - addr) >> PAGE_SHIFT;
- pfns = &range->pfns[i];
+ hmm_pfns = &range->hmm_pfns[i];
cpu_flags = pud_to_hmm_pfn_flags(range, pud);
- required_fault = hmm_range_need_fault(hmm_vma_walk, pfns,
+ required_fault = hmm_range_need_fault(hmm_vma_walk, hmm_pfns,
npages, cpu_flags);
if (required_fault) {
spin_unlock(ptl);
@@ -453,9 +449,7 @@ static int hmm_vma_walk_pud(pud_t *pudp, unsigned long start, unsigned long end,
pfn = pud_pfn(pud) + ((addr & ~PUD_MASK) >> PAGE_SHIFT);
for (i = 0; i < npages; ++i, ++pfn)
- pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
- cpu_flags;
- hmm_vma_walk->last = end;
+ hmm_pfns[i] = pfn | cpu_flags;
goto out_unlock;
}
@@ -479,8 +473,9 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
struct hmm_vma_walk *hmm_vma_walk = walk->private;
struct hmm_range *range = hmm_vma_walk->range;
struct vm_area_struct *vma = walk->vma;
- uint64_t orig_pfn, cpu_flags;
unsigned int required_fault;
+ unsigned long pfn_req_flags;
+ unsigned long cpu_flags;
spinlock_t *ptl;
pte_t entry;
@@ -488,9 +483,11 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
entry = huge_ptep_get(pte);
i = (start - range->start) >> PAGE_SHIFT;
- orig_pfn = range->pfns[i];
- cpu_flags = pte_to_hmm_pfn_flags(range, entry);
- required_fault = hmm_pte_need_fault(hmm_vma_walk, orig_pfn, cpu_flags);
+ pfn_req_flags = range->hmm_pfns[i];
+ cpu_flags = pte_to_hmm_pfn_flags(range, entry) |
+ hmm_pfn_flags_order(huge_page_order(hstate_vma(vma)));
+ required_fault =
+ hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, cpu_flags);
if (required_fault) {
spin_unlock(ptl);
return hmm_vma_fault(addr, end, required_fault, walk);
@@ -498,9 +495,8 @@ static int hmm_vma_walk_hugetlb_entry(pte_t *pte, unsigned long hmask,
pfn = pte_pfn(entry) + ((start & ~hmask) >> PAGE_SHIFT);
for (; addr < end; addr += PAGE_SIZE, i++, pfn++)
- range->pfns[i] = hmm_device_entry_from_pfn(range, pfn) |
- cpu_flags;
- hmm_vma_walk->last = end;
+ range->hmm_pfns[i] = pfn | cpu_flags;
+
spin_unlock(ptl);
return 0;
}
@@ -531,13 +527,12 @@ static int hmm_vma_walk_test(unsigned long start, unsigned long end,
* failure.
*/
if (hmm_range_need_fault(hmm_vma_walk,
- range->pfns +
+ range->hmm_pfns +
((start - range->start) >> PAGE_SHIFT),
(end - start) >> PAGE_SHIFT, 0))
return -EFAULT;
hmm_pfns_fill(start, end, range, HMM_PFN_ERROR);
- hmm_vma_walk->last = end;
/* Skip this vma and continue processing the next vma. */
return 1;
@@ -555,9 +550,7 @@ static const struct mm_walk_ops hmm_walk_ops = {
* hmm_range_fault - try to fault some address in a virtual address range
* @range: argument structure
*
- * Return: the number of valid pages in range->pfns[] (from range start
- * address), which may be zero. On error one of the following status codes
- * can be returned:
+ * Returns 0 on success or one of the following error codes:
*
* -EINVAL: Invalid arguments or mm or virtual address is in an invalid vma
* (e.g., device file vma).
@@ -572,7 +565,7 @@ static const struct mm_walk_ops hmm_walk_ops = {
* This is similar to get_user_pages(), except that it can read the page tables
* without mutating them (ie causing faults).
*/
-long hmm_range_fault(struct hmm_range *range)
+int hmm_range_fault(struct hmm_range *range)
{
struct hmm_vma_walk hmm_vma_walk = {
.range = range,
@@ -581,7 +574,7 @@ long hmm_range_fault(struct hmm_range *range)
struct mm_struct *mm = range->notifier->mm;
int ret;
- lockdep_assert_held(&mm->mmap_sem);
+ mmap_assert_locked(mm);
do {
/* If range is no longer valid force retry. */
@@ -590,10 +583,13 @@ long hmm_range_fault(struct hmm_range *range)
return -EBUSY;
ret = walk_page_range(mm, hmm_vma_walk.last, range->end,
&hmm_walk_ops, &hmm_vma_walk);
+ /*
+ * When -EBUSY is returned the loop restarts with
+ * hmm_vma_walk.last set to an address that has not been stored
+ * in hmm_pfns. All entries < last in the hmm_pfns array are set to their
+ * output, and all >= are still at their input values.
+ */
} while (ret == -EBUSY);
-
- if (ret)
- return ret;
- return (hmm_vma_walk.last - range->start) >> PAGE_SHIFT;
+ return ret;
}
EXPORT_SYMBOL(hmm_range_fault);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 6ecd1045113b..da397779a6d4 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -303,24 +303,6 @@ static ssize_t hpage_pmd_size_show(struct kobject *kobj,
static struct kobj_attribute hpage_pmd_size_attr =
__ATTR_RO(hpage_pmd_size);
-#ifdef CONFIG_DEBUG_VM
-static ssize_t debug_cow_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
-{
- return single_hugepage_flag_show(kobj, attr, buf,
- TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
-}
-static ssize_t debug_cow_store(struct kobject *kobj,
- struct kobj_attribute *attr,
- const char *buf, size_t count)
-{
- return single_hugepage_flag_store(kobj, attr, buf, count,
- TRANSPARENT_HUGEPAGE_DEBUG_COW_FLAG);
-}
-static struct kobj_attribute debug_cow_attr =
- __ATTR(debug_cow, 0644, debug_cow_show, debug_cow_store);
-#endif /* CONFIG_DEBUG_VM */
-
static struct attribute *hugepage_attr[] = {
&enabled_attr.attr,
&defrag_attr.attr,
@@ -329,9 +311,6 @@ static struct attribute *hugepage_attr[] = {
#ifdef CONFIG_SHMEM
&shmem_enabled_attr.attr,
#endif
-#ifdef CONFIG_DEBUG_VM
- &debug_cow_attr.attr,
-#endif
NULL,
};
@@ -522,7 +501,7 @@ void prep_transhuge_page(struct page *page)
bool is_transparent_hugepage(struct page *page)
{
if (!PageCompound(page))
- return 0;
+ return false;
page = compound_head(page);
return is_huge_zero_page(page) ||
@@ -587,19 +566,19 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
struct page *page, gfp_t gfp)
{
struct vm_area_struct *vma = vmf->vma;
- struct mem_cgroup *memcg;
pgtable_t pgtable;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
vm_fault_t ret = 0;
VM_BUG_ON_PAGE(!PageCompound(page), page);
- if (mem_cgroup_try_charge_delay(page, vma->vm_mm, gfp, &memcg, true)) {
+ if (mem_cgroup_charge(page, vma->vm_mm, gfp)) {
put_page(page);
count_vm_event(THP_FAULT_FALLBACK);
count_vm_event(THP_FAULT_FALLBACK_CHARGE);
return VM_FAULT_FALLBACK;
}
+ cgroup_throttle_swaprate(page, gfp);
pgtable = pte_alloc_one(vma->vm_mm);
if (unlikely(!pgtable)) {
@@ -630,7 +609,6 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
vm_fault_t ret2;
spin_unlock(vmf->ptl);
- mem_cgroup_cancel_charge(page, memcg, true);
put_page(page);
pte_free(vma->vm_mm, pgtable);
ret2 = handle_userfault(vmf, VM_UFFD_MISSING);
@@ -641,15 +619,14 @@ static vm_fault_t __do_huge_pmd_anonymous_page(struct vm_fault *vmf,
entry = mk_huge_pmd(page, vma->vm_page_prot);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
page_add_new_anon_rmap(page, vma, haddr, true);
- mem_cgroup_commit_charge(page, memcg, false, true);
- lru_cache_add_active_or_unevictable(page, vma);
+ lru_cache_add_inactive_or_unevictable(page, vma);
pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable);
set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
mm_inc_nr_ptes(vma->vm_mm);
spin_unlock(vmf->ptl);
count_vm_event(THP_FAULT_ALLOC);
- count_memcg_events(memcg, THP_FAULT_ALLOC, 1);
+ count_memcg_event_mm(vma->vm_mm, THP_FAULT_ALLOC);
}
return 0;
@@ -658,7 +635,6 @@ unlock_release:
release:
if (pgtable)
pte_free(vma->vm_mm, pgtable);
- mem_cgroup_cancel_charge(page, memcg, true);
put_page(page);
return ret;
@@ -1098,6 +1074,24 @@ int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
src_page = pmd_page(pmd);
VM_BUG_ON_PAGE(!PageHead(src_page), src_page);
+
+ /*
+ * If this page is potentially pinned, split it and retry the fault
+ * with a smaller page size. Normally this should not happen because
+ * userspace should use MADV_DONTFORK on pinned regions. This is a
+ * best-effort attempt to keep pinned pages from being replaced by
+ * another random page during the coming copy-on-write.
+ */
+ if (unlikely(is_cow_mapping(vma->vm_flags) &&
+ atomic_read(&src_mm->has_pinned) &&
+ page_maybe_dma_pinned(src_page))) {
+ pte_free(dst_mm, pgtable);
+ spin_unlock(src_ptl);
+ spin_unlock(dst_ptl);
+ __split_huge_pmd(vma, src_pmd, addr, false, NULL);
+ return -EAGAIN;
+ }
+
get_page(src_page);
page_dup_rmap(src_page, true);
add_mm_counter(dst_mm, MM_ANONPAGES, HPAGE_PMD_NR);
@@ -1201,6 +1195,16 @@ int copy_huge_pud(struct mm_struct *dst_mm, struct mm_struct *src_mm,
/* No huge zero pud yet */
}
+ /* Please refer to comments in copy_huge_pmd() */
+ if (unlikely(is_cow_mapping(vma->vm_flags) &&
+ atomic_read(&src_mm->has_pinned) &&
+ page_maybe_dma_pinned(pud_page(pud)))) {
+ spin_unlock(src_ptl);
+ spin_unlock(dst_ptl);
+ __split_huge_pud(vma, src_pud, addr);
+ return -EAGAIN;
+ }
+
pudp_set_wrprotect(src_mm, addr, src_pud);
pud = pud_mkold(pud_wrprotect(pud));
set_pud_at(dst_mm, addr, dst_pud, pud);
@@ -1255,263 +1259,63 @@ unlock:
spin_unlock(vmf->ptl);
}
-static vm_fault_t do_huge_pmd_wp_page_fallback(struct vm_fault *vmf,
- pmd_t orig_pmd, struct page *page)
-{
- struct vm_area_struct *vma = vmf->vma;
- unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
- struct mem_cgroup *memcg;
- pgtable_t pgtable;
- pmd_t _pmd;
- int i;
- vm_fault_t ret = 0;
- struct page **pages;
- struct mmu_notifier_range range;
-
- pages = kmalloc_array(HPAGE_PMD_NR, sizeof(struct page *),
- GFP_KERNEL);
- if (unlikely(!pages)) {
- ret |= VM_FAULT_OOM;
- goto out;
- }
-
- for (i = 0; i < HPAGE_PMD_NR; i++) {
- pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE, vma,
- vmf->address, page_to_nid(page));
- if (unlikely(!pages[i] ||
- mem_cgroup_try_charge_delay(pages[i], vma->vm_mm,
- GFP_KERNEL, &memcg, false))) {
- if (pages[i])
- put_page(pages[i]);
- while (--i >= 0) {
- memcg = (void *)page_private(pages[i]);
- set_page_private(pages[i], 0);
- mem_cgroup_cancel_charge(pages[i], memcg,
- false);
- put_page(pages[i]);
- }
- kfree(pages);
- ret |= VM_FAULT_OOM;
- goto out;
- }
- set_page_private(pages[i], (unsigned long)memcg);
- }
-
- for (i = 0; i < HPAGE_PMD_NR; i++) {
- copy_user_highpage(pages[i], page + i,
- haddr + PAGE_SIZE * i, vma);
- __SetPageUptodate(pages[i]);
- cond_resched();
- }
-
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
- haddr, haddr + HPAGE_PMD_SIZE);
- mmu_notifier_invalidate_range_start(&range);
-
- vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd);
- if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
- goto out_free_pages;
- VM_BUG_ON_PAGE(!PageHead(page), page);
-
- /*
- * Leave pmd empty until pte is filled note we must notify here as
- * concurrent CPU thread might write to new page before the call to
- * mmu_notifier_invalidate_range_end() happens which can lead to a
- * device seeing memory write in different order than CPU.
- *
- * See Documentation/vm/mmu_notifier.rst
- */
- pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
-
- pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd);
- pmd_populate(vma->vm_mm, &_pmd, pgtable);
-
- for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) {
- pte_t entry;
- entry = mk_pte(pages[i], vma->vm_page_prot);
- entry = maybe_mkwrite(pte_mkdirty(entry), vma);
- memcg = (void *)page_private(pages[i]);
- set_page_private(pages[i], 0);
- page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false);
- mem_cgroup_commit_charge(pages[i], memcg, false, false);
- lru_cache_add_active_or_unevictable(pages[i], vma);
- vmf->pte = pte_offset_map(&_pmd, haddr);
- VM_BUG_ON(!pte_none(*vmf->pte));
- set_pte_at(vma->vm_mm, haddr, vmf->pte, entry);
- pte_unmap(vmf->pte);
- }
- kfree(pages);
-
- smp_wmb(); /* make pte visible before pmd */
- pmd_populate(vma->vm_mm, vmf->pmd, pgtable);
- page_remove_rmap(page, true);
- spin_unlock(vmf->ptl);
-
- /*
- * No need to double call mmu_notifier->invalidate_range() callback as
- * the above pmdp_huge_clear_flush_notify() did already call it.
- */
- mmu_notifier_invalidate_range_only_end(&range);
-
- ret |= VM_FAULT_WRITE;
- put_page(page);
-
-out:
- return ret;
-
-out_free_pages:
- spin_unlock(vmf->ptl);
- mmu_notifier_invalidate_range_end(&range);
- for (i = 0; i < HPAGE_PMD_NR; i++) {
- memcg = (void *)page_private(pages[i]);
- set_page_private(pages[i], 0);
- mem_cgroup_cancel_charge(pages[i], memcg, false);
- put_page(pages[i]);
- }
- kfree(pages);
- goto out;
-}
-
vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd)
{
struct vm_area_struct *vma = vmf->vma;
- struct page *page = NULL, *new_page;
- struct mem_cgroup *memcg;
+ struct page *page;
unsigned long haddr = vmf->address & HPAGE_PMD_MASK;
- struct mmu_notifier_range range;
- gfp_t huge_gfp; /* for allocation and charge */
- vm_fault_t ret = 0;
vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd);
VM_BUG_ON_VMA(!vma->anon_vma, vma);
+
if (is_huge_zero_pmd(orig_pmd))
- goto alloc;
+ goto fallback;
+
spin_lock(vmf->ptl);
- if (unlikely(!pmd_same(*vmf->pmd, orig_pmd)))
- goto out_unlock;
+
+ if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
+ spin_unlock(vmf->ptl);
+ return 0;
+ }
page = pmd_page(orig_pmd);
VM_BUG_ON_PAGE(!PageCompound(page) || !PageHead(page), page);
- /*
- * We can only reuse the page if nobody else maps the huge page or it's
- * part.
- */
+
+ /* Lock page for reuse_swap_page() */
if (!trylock_page(page)) {
get_page(page);
spin_unlock(vmf->ptl);
lock_page(page);
spin_lock(vmf->ptl);
if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
+ spin_unlock(vmf->ptl);
unlock_page(page);
put_page(page);
- goto out_unlock;
+ return 0;
}
put_page(page);
}
+
+ /*
+ * We can only reuse the page if nobody else maps the huge page or it's
+ * part.
+ */
if (reuse_swap_page(page, NULL)) {
pmd_t entry;
entry = pmd_mkyoung(orig_pmd);
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
- if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
+ if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1))
update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
- ret |= VM_FAULT_WRITE;
unlock_page(page);
- goto out_unlock;
- }
- unlock_page(page);
- get_page(page);
- spin_unlock(vmf->ptl);
-alloc:
- if (__transparent_hugepage_enabled(vma) &&
- !transparent_hugepage_debug_cow()) {
- huge_gfp = alloc_hugepage_direct_gfpmask(vma);
- new_page = alloc_hugepage_vma(huge_gfp, vma, haddr, HPAGE_PMD_ORDER);
- } else
- new_page = NULL;
-
- if (likely(new_page)) {
- prep_transhuge_page(new_page);
- } else {
- if (!page) {
- split_huge_pmd(vma, vmf->pmd, vmf->address);
- ret |= VM_FAULT_FALLBACK;
- } else {
- ret = do_huge_pmd_wp_page_fallback(vmf, orig_pmd, page);
- if (ret & VM_FAULT_OOM) {
- split_huge_pmd(vma, vmf->pmd, vmf->address);
- ret |= VM_FAULT_FALLBACK;
- }
- put_page(page);
- }
- count_vm_event(THP_FAULT_FALLBACK);
- goto out;
- }
-
- if (unlikely(mem_cgroup_try_charge_delay(new_page, vma->vm_mm,
- huge_gfp, &memcg, true))) {
- put_page(new_page);
- split_huge_pmd(vma, vmf->pmd, vmf->address);
- if (page)
- put_page(page);
- ret |= VM_FAULT_FALLBACK;
- count_vm_event(THP_FAULT_FALLBACK);
- count_vm_event(THP_FAULT_FALLBACK_CHARGE);
- goto out;
- }
-
- count_vm_event(THP_FAULT_ALLOC);
- count_memcg_events(memcg, THP_FAULT_ALLOC, 1);
-
- if (!page)
- clear_huge_page(new_page, vmf->address, HPAGE_PMD_NR);
- else
- copy_user_huge_page(new_page, page, vmf->address,
- vma, HPAGE_PMD_NR);
- __SetPageUptodate(new_page);
-
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
- haddr, haddr + HPAGE_PMD_SIZE);
- mmu_notifier_invalidate_range_start(&range);
-
- spin_lock(vmf->ptl);
- if (page)
- put_page(page);
- if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) {
spin_unlock(vmf->ptl);
- mem_cgroup_cancel_charge(new_page, memcg, true);
- put_page(new_page);
- goto out_mn;
- } else {
- pmd_t entry;
- entry = mk_huge_pmd(new_page, vma->vm_page_prot);
- entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
- pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd);
- page_add_new_anon_rmap(new_page, vma, haddr, true);
- mem_cgroup_commit_charge(new_page, memcg, false, true);
- lru_cache_add_active_or_unevictable(new_page, vma);
- set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry);
- update_mmu_cache_pmd(vma, vmf->address, vmf->pmd);
- if (!page) {
- add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR);
- } else {
- VM_BUG_ON_PAGE(!PageHead(page), page);
- page_remove_rmap(page, true);
- put_page(page);
- }
- ret |= VM_FAULT_WRITE;
+ return VM_FAULT_WRITE;
}
+
+ unlock_page(page);
spin_unlock(vmf->ptl);
-out_mn:
- /*
- * No need to double call mmu_notifier->invalidate_range() callback as
- * the above pmdp_huge_clear_flush_notify() did already call it.
- */
- mmu_notifier_invalidate_range_only_end(&range);
-out:
- return ret;
-out_unlock:
- spin_unlock(vmf->ptl);
- return ret;
+fallback:
+ __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL);
+ return VM_FAULT_FALLBACK;
}
/*
@@ -1582,7 +1386,6 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
goto skip_mlock;
if (!trylock_page(page))
goto skip_mlock;
- lru_add_drain();
if (page->mapping && !PageDoubleMap(page))
mlock_vma_page(page);
unlock_page(page);
@@ -1852,8 +1655,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma,
* pgtable_trans_huge_withdraw after finishing pmdp related
* operations.
*/
- orig_pmd = pmdp_huge_get_and_clear_full(tlb->mm, addr, pmd,
- tlb->fullmm);
+ orig_pmd = pmdp_huge_get_and_clear_full(vma, addr, pmd,
+ tlb->fullmm);
tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
if (vma_is_special_huge(vma)) {
if (arch_needs_pgtable_deposit())
@@ -1927,19 +1730,13 @@ static pmd_t move_soft_dirty_pmd(pmd_t pmd)
}
bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
- unsigned long new_addr, unsigned long old_end,
- pmd_t *old_pmd, pmd_t *new_pmd)
+ unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
{
spinlock_t *old_ptl, *new_ptl;
pmd_t pmd;
struct mm_struct *mm = vma->vm_mm;
bool force_flush = false;
- if ((old_addr & ~HPAGE_PMD_MASK) ||
- (new_addr & ~HPAGE_PMD_MASK) ||
- old_end - old_addr < HPAGE_PMD_SIZE)
- return false;
-
/*
* The destination pmd shouldn't be established, free_pgtables()
* should have release it.
@@ -1951,7 +1748,7 @@ bool move_huge_pmd(struct vm_area_struct *vma, unsigned long old_addr,
/*
* We don't have to worry about the ordering of src and dst
- * ptlocks because exclusive mmap_sem prevents deadlock.
+ * ptlocks because exclusive mmap_lock prevents deadlock.
*/
old_ptl = __pmd_trans_huge_lock(old_pmd, vma);
if (old_ptl) {
@@ -2038,9 +1835,9 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
goto unlock;
/*
- * In case prot_numa, we are under down_read(mmap_sem). It's critical
+ * In case prot_numa, we are under mmap_read_lock(mm). It's critical
* to not clear pmd intermittently to avoid race with MADV_DONTNEED
- * which is also under down_read(mmap_sem):
+ * which is also under mmap_read_lock(mm):
*
* CPU0: CPU1:
* change_huge_pmd(prot_numa=1)
@@ -2253,7 +2050,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
put_page(page);
add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR);
return;
- } else if (is_huge_zero_pmd(*pmd)) {
+ } else if (pmd_trans_huge(*pmd) && is_huge_zero_pmd(*pmd)) {
/*
* FIXME: Do we want to invalidate secondary mmu by calling
* mmu_notifier_invalidate_range() see comments below inside
@@ -2274,8 +2071,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
* free), userland could trigger a small page size TLB miss on the
* small sized TLB while the hugepage TLB entry is still established in
* the huge TLB. Some CPU doesn't like that.
- * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum
- * 383 on page 93. Intel should be safe but is also warns that it's
+ * See http://support.amd.com/TechDocs/41322_10h_Rev_Gd.pdf, Erratum
+ * 383 on page 105. Intel should be safe but it also warns that it's
* only safe if the permission and cache attributes of the two entries
* loaded in the two TLB is identical (which should be the case here).
* But it is generally safer to never allow small and huge TLB entries
@@ -2347,27 +2144,33 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd,
pte = pte_offset_map(&_pmd, addr);
BUG_ON(!pte_none(*pte));
set_pte_at(mm, addr, pte, entry);
- atomic_inc(&page[i]._mapcount);
- pte_unmap(pte);
- }
-
- /*
- * Set PG_double_map before dropping compound_mapcount to avoid
- * false-negative page_mapped().
- */
- if (compound_mapcount(page) > 1 && !TestSetPageDoubleMap(page)) {
- for (i = 0; i < HPAGE_PMD_NR; i++)
+ if (!pmd_migration)
atomic_inc(&page[i]._mapcount);
+ pte_unmap(pte);
}
- if (atomic_add_negative(-1, compound_mapcount_ptr(page))) {
- /* Last compound_mapcount is gone. */
- __dec_node_page_state(page, NR_ANON_THPS);
- if (TestClearPageDoubleMap(page)) {
- /* No need in mapcount reference anymore */
+ if (!pmd_migration) {
+ /*
+ * Set PG_double_map before dropping compound_mapcount to avoid
+ * false-negative page_mapped().
+ */
+ if (compound_mapcount(page) > 1 &&
+ !TestSetPageDoubleMap(page)) {
for (i = 0; i < HPAGE_PMD_NR; i++)
- atomic_dec(&page[i]._mapcount);
+ atomic_inc(&page[i]._mapcount);
}
+
+ lock_page_memcg(page);
+ if (atomic_add_negative(-1, compound_mapcount_ptr(page))) {
+ /* Last compound_mapcount is gone. */
+ __dec_lruvec_page_state(page, NR_ANON_THPS);
+ if (TestClearPageDoubleMap(page)) {
+ /* No need in mapcount reference anymore */
+ for (i = 0; i < HPAGE_PMD_NR; i++)
+ atomic_dec(&page[i]._mapcount);
+ }
+ }
+ unlock_page_memcg(page);
}
smp_wmb(); /* make pte visible before pmd */
@@ -2386,6 +2189,8 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
{
spinlock_t *ptl;
struct mmu_notifier_range range;
+ bool was_locked = false;
+ pmd_t _pmd;
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm,
address & HPAGE_PMD_MASK,
@@ -2398,11 +2203,32 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
* pmd against. Otherwise we can end up replacing wrong page.
*/
VM_BUG_ON(freeze && !page);
- if (page && page != pmd_page(*pmd))
- goto out;
+ if (page) {
+ VM_WARN_ON_ONCE(!PageLocked(page));
+ was_locked = true;
+ if (page != pmd_page(*pmd))
+ goto out;
+ }
+repeat:
if (pmd_trans_huge(*pmd)) {
- page = pmd_page(*pmd);
+ if (!page) {
+ page = pmd_page(*pmd);
+ if (unlikely(!trylock_page(page))) {
+ get_page(page);
+ _pmd = *pmd;
+ spin_unlock(ptl);
+ lock_page(page);
+ spin_lock(ptl);
+ if (unlikely(!pmd_same(*pmd, _pmd))) {
+ unlock_page(page);
+ put_page(page);
+ page = NULL;
+ goto repeat;
+ }
+ put_page(page);
+ }
+ }
if (PageMlocked(page))
clear_page_mlock(page);
} else if (!(pmd_devmap(*pmd) || is_pmd_migration_entry(*pmd)))
@@ -2410,6 +2236,8 @@ void __split_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd,
__split_huge_pmd_locked(vma, pmd, range.start, freeze);
out:
spin_unlock(ptl);
+ if (!was_locked && page)
+ unlock_page(page);
/*
* No need to double call mmu_notifier->invalidate_range() callback.
* They are 3 cases to consider inside __split_huge_pmd_locked():
@@ -2784,7 +2612,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
struct anon_vma *anon_vma = NULL;
struct address_space *mapping = NULL;
int count, mapcount, extra_pins, ret;
- bool mlocked;
unsigned long flags;
pgoff_t end;
@@ -2797,7 +2624,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
if (PageAnon(head)) {
/*
- * The caller does not necessarily hold an mmap_sem that would
+ * The caller does not necessarily hold an mmap_lock that would
* prevent the anon_vma disappearing so we first we take a
* reference to it and then lock the anon_vma for write. This
* is similar to page_lock_anon_vma_read except the write lock
@@ -2843,14 +2670,9 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
goto out_unlock;
}
- mlocked = PageMlocked(head);
unmap_page(head);
VM_BUG_ON_PAGE(compound_mapcount(head), head);
- /* Make sure the page is not on per-CPU pagevec as it takes pin */
- if (mlocked)
- lru_add_drain();
-
/* prevent PageLRU to go away from under us, and freeze lru stats */
spin_lock_irqsave(&pgdata->lru_lock, flags);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index bcabbe02192b..67fc6383995b 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -19,6 +19,7 @@
#include <linux/memblock.h>
#include <linux/sysfs.h>
#include <linux/slab.h>
+#include <linux/sched/mm.h>
#include <linux/mmdebug.h>
#include <linux/sched/signal.h>
#include <linux/rmap.h>
@@ -31,7 +32,7 @@
#include <linux/cma.h>
#include <asm/page.h>
-#include <asm/pgtable.h>
+#include <asm/pgalloc.h>
#include <asm/tlb.h>
#include <linux/io.h>
@@ -46,7 +47,10 @@ int hugetlb_max_hstate __read_mostly;
unsigned int default_hstate_idx;
struct hstate hstates[HUGE_MAX_HSTATE];
+#ifdef CONFIG_CMA
static struct cma *hugetlb_cma[MAX_NUMNODES];
+#endif
+static unsigned long hugetlb_cma_size __initdata;
/*
* Minimum page order among possible hugepage sizes, set to a proper value
@@ -59,8 +63,8 @@ __initdata LIST_HEAD(huge_boot_pages);
/* for command line parsing */
static struct hstate * __initdata parsed_hstate;
static unsigned long __initdata default_hstate_max_huge_pages;
-static unsigned long __initdata default_hstate_size;
static bool __initdata parsed_valid_hugepagesz = true;
+static bool __initdata parsed_default_hugepagesz;
/*
* Protects updates to hugepage_freelists, hugepage_activelist, nr_huge_pages,
@@ -85,7 +89,7 @@ static inline void unlock_or_release_subpool(struct hugepage_subpool *spool)
spin_unlock(&spool->lock);
/* If no pages are used, and no other handles to the subpool
- * remain, give up any reservations mased on minimum size and
+ * remain, give up any reservations based on minimum size and
* free the subpool */
if (free) {
if (spool->min_hpages != -1)
@@ -130,10 +134,10 @@ void hugepage_put_subpool(struct hugepage_subpool *spool)
/*
* Subpool accounting for allocating and reserving pages.
* Return -ENOMEM if there are not enough resources to satisfy the
- * the request. Otherwise, return the number of pages by which the
+ * request. Otherwise, return the number of pages by which the
* global pools must be adjusted (upward). The returned value may
* only be different than the passed value (delta) in the case where
- * a subpool minimum size must be manitained.
+ * a subpool minimum size must be maintained.
*/
static long hugepage_subpool_get_pages(struct hugepage_subpool *spool,
long delta)
@@ -473,7 +477,7 @@ out_of_memory:
*
* Return the number of new huge pages added to the map. This number is greater
* than or equal to zero. If file_region entries needed to be allocated for
- * this operation and we were not able to allocate, it ruturns -ENOMEM.
+ * this operation and we were not able to allocate, it returns -ENOMEM.
* region_add of regions of length 1 never allocate file_regions and cannot
* fail; region_chg will always allocate at least 1 entry and a region_add for
* 1 page will only require at most 1 entry.
@@ -988,7 +992,7 @@ static bool vma_has_reserves(struct vm_area_struct *vma, long chg)
* We know VM_NORESERVE is not set. Therefore, there SHOULD
* be a region map for all pages. The only situation where
* there is no region map is if a hole was punched via
- * fallocate. In this case, there really are no reverves to
+ * fallocate. In this case, there really are no reserves to
* use. This situation is indicated if chg != 0.
*/
if (chg)
@@ -1037,10 +1041,16 @@ static void enqueue_huge_page(struct hstate *h, struct page *page)
static struct page *dequeue_huge_page_node_exact(struct hstate *h, int nid)
{
struct page *page;
+ bool nocma = !!(current->flags & PF_MEMALLOC_NOCMA);
+
+ list_for_each_entry(page, &h->hugepage_freelists[nid], lru) {
+ if (nocma && is_migrate_cma_page(page))
+ continue;
- list_for_each_entry(page, &h->hugepage_freelists[nid], lru)
if (!PageHWPoison(page))
break;
+ }
+
/*
* if 'non-isolated free hugepage' not found on the list,
* the allocation fails.
@@ -1090,15 +1100,6 @@ retry_cpuset:
return NULL;
}
-/* Movability of hugepages depends on migration support. */
-static inline gfp_t htlb_alloc_mask(struct hstate *h)
-{
- if (hugepage_movable_supported(h))
- return GFP_HIGHUSER_MOVABLE;
- else
- return GFP_HIGHUSER;
-}
-
static struct page *dequeue_huge_page_vma(struct hstate *h,
struct vm_area_struct *vma,
unsigned long address, int avoid_reserve,
@@ -1236,9 +1237,10 @@ static void free_gigantic_page(struct page *page, unsigned int order)
* If the page isn't allocated using the cma allocator,
* cma_release() returns false.
*/
- if (IS_ENABLED(CONFIG_CMA) &&
- cma_release(hugetlb_cma[page_to_nid(page)], page, 1 << order))
+#ifdef CONFIG_CMA
+ if (cma_release(hugetlb_cma[page_to_nid(page)], page, 1 << order))
return;
+#endif
free_contig_range(page_to_pfn(page), 1 << order);
}
@@ -1248,21 +1250,34 @@ static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask,
int nid, nodemask_t *nodemask)
{
unsigned long nr_pages = 1UL << huge_page_order(h);
+ if (nid == NUMA_NO_NODE)
+ nid = numa_mem_id();
- if (IS_ENABLED(CONFIG_CMA)) {
+#ifdef CONFIG_CMA
+ {
struct page *page;
int node;
- for_each_node_mask(node, *nodemask) {
- if (!hugetlb_cma[node])
- continue;
-
- page = cma_alloc(hugetlb_cma[node], nr_pages,
- huge_page_order(h), true);
+ if (hugetlb_cma[nid]) {
+ page = cma_alloc(hugetlb_cma[nid], nr_pages,
+ huge_page_order(h), true);
if (page)
return page;
}
+
+ if (!(gfp_mask & __GFP_THISNODE)) {
+ for_each_node_mask(node, *nodemask) {
+ if (node == nid || !hugetlb_cma[node])
+ continue;
+
+ page = cma_alloc(hugetlb_cma[node], nr_pages,
+ huge_page_order(h), true);
+ if (page)
+ return page;
+ }
+ }
}
+#endif
return alloc_contig_pages(nr_pages, gfp_mask, nid, nodemask);
}
@@ -1519,7 +1534,7 @@ static void prep_compound_gigantic_page(struct page *page, unsigned int order)
* For gigantic hugepages allocated through bootmem at
* boot, it's safer to be consistent with the not-gigantic
* hugepages and clear the PG_reserved bit from all tail pages
- * too. Otherwse drivers using get_user_pages() to access tail
+ * too. Otherwise drivers using get_user_pages() to access tail
* pages may get the reference counting wrong if they see
* PG_reserved set on a tail page (despite the head page not
* having PG_reserved set). Enforcing this consistency between
@@ -1594,7 +1609,7 @@ static struct address_space *_get_hugetlb_page_mapping(struct page *hpage)
/* Use first found vma */
pgoff_start = page_to_pgoff(hpage);
- pgoff_end = pgoff_start + hpage_nr_pages(hpage) - 1;
+ pgoff_end = pgoff_start + pages_per_huge_page(page_hstate(hpage)) - 1;
anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
pgoff_start, pgoff_end) {
struct vm_area_struct *vma = avc->vma;
@@ -1938,7 +1953,7 @@ out_unlock:
return page;
}
-struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
+static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask,
int nid, nodemask_t *nmask)
{
struct page *page;
@@ -1980,31 +1995,9 @@ struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h,
}
/* page migration callback function */
-struct page *alloc_huge_page_node(struct hstate *h, int nid)
-{
- gfp_t gfp_mask = htlb_alloc_mask(h);
- struct page *page = NULL;
-
- if (nid != NUMA_NO_NODE)
- gfp_mask |= __GFP_THISNODE;
-
- spin_lock(&hugetlb_lock);
- if (h->free_huge_pages - h->resv_huge_pages > 0)
- page = dequeue_huge_page_nodemask(h, gfp_mask, nid, NULL);
- spin_unlock(&hugetlb_lock);
-
- if (!page)
- page = alloc_migrate_huge_page(h, gfp_mask, nid, NULL);
-
- return page;
-}
-
-/* page migration callback function */
struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid,
- nodemask_t *nmask)
+ nodemask_t *nmask, gfp_t gfp_mask)
{
- gfp_t gfp_mask = htlb_alloc_mask(h);
-
spin_lock(&hugetlb_lock);
if (h->free_huge_pages - h->resv_huge_pages > 0) {
struct page *page;
@@ -2032,7 +2025,7 @@ struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
gfp_mask = htlb_alloc_mask(h);
node = huge_node(vma, address, gfp_mask, &mpol, &nodemask);
- page = alloc_huge_page_nodemask(h, node, nodemask);
+ page = alloc_huge_page_nodemask(h, node, nodemask, gfp_mask);
mpol_cond_put(mpol);
return page;
@@ -2161,7 +2154,7 @@ static void return_unused_surplus_pages(struct hstate *h,
* evenly across all nodes with memory. Iterate across these nodes
* until we can no longer free unreserved surplus pages. This occurs
* when the nodes with surplus pages have no free pages.
- * free_pool_huge_page() will balance the the freed pages across the
+ * free_pool_huge_page() will balance the freed pages across the
* on-line nodes with memory and will handle the hstate accounting.
*
* Note that we decrement resv_huge_pages as we free the pages. If
@@ -2572,7 +2565,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h)
for (i = 0; i < h->max_huge_pages; ++i) {
if (hstate_is_gigantic(h)) {
- if (IS_ENABLED(CONFIG_CMA) && hugetlb_cma[0]) {
+ if (hugetlb_cma_size) {
pr_warn_once("HugeTLB: hugetlb_cma is enabled, skip boot time allocation\n");
break;
}
@@ -3060,7 +3053,7 @@ static void __init hugetlb_sysfs_init(void)
err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
hstate_kobjs, &hstate_attr_group);
if (err)
- pr_err("Hugetlb: Unable to add hstate %s", h->name);
+ pr_err("HugeTLB: Unable to add hstate %s", h->name);
}
}
@@ -3164,7 +3157,7 @@ static void hugetlb_register_node(struct node *node)
nhs->hstate_kobjs,
&per_node_hstate_attr_group);
if (err) {
- pr_err("Hugetlb: Unable to add hstate %s for node %d\n",
+ pr_err("HugeTLB: Unable to add hstate %s for node %d\n",
h->name, node->dev.id);
hugetlb_unregister_node(node);
break;
@@ -3212,23 +3205,41 @@ static int __init hugetlb_init(void)
{
int i;
- if (!hugepages_supported())
+ if (!hugepages_supported()) {
+ if (hugetlb_max_hstate || default_hstate_max_huge_pages)
+ pr_warn("HugeTLB: huge pages not supported, ignoring associated command-line parameters\n");
return 0;
+ }
- if (!size_to_hstate(default_hstate_size)) {
- if (default_hstate_size != 0) {
- pr_err("HugeTLB: unsupported default_hugepagesz %lu. Reverting to %lu\n",
- default_hstate_size, HPAGE_SIZE);
+ /*
+ * Make sure HPAGE_SIZE (HUGETLB_PAGE_ORDER) hstate exists. Some
+ * architectures depend on setup being done here.
+ */
+ hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
+ if (!parsed_default_hugepagesz) {
+ /*
+ * If we did not parse a default huge page size, set
+ * default_hstate_idx to HPAGE_SIZE hstate. And, if the
+ * number of huge pages for this default size was implicitly
+ * specified, set that here as well.
+ * Note that the implicit setting will overwrite an explicit
+ * setting. A warning will be printed in this case.
+ */
+ default_hstate_idx = hstate_index(size_to_hstate(HPAGE_SIZE));
+ if (default_hstate_max_huge_pages) {
+ if (default_hstate.max_huge_pages) {
+ char buf[32];
+
+ string_get_size(huge_page_size(&default_hstate),
+ 1, STRING_UNITS_2, buf, 32);
+ pr_warn("HugeTLB: Ignoring hugepages=%lu associated with %s page size\n",
+ default_hstate.max_huge_pages, buf);
+ pr_warn("HugeTLB: Using hugepages=%lu for number of default huge pages\n",
+ default_hstate_max_huge_pages);
+ }
+ default_hstate.max_huge_pages =
+ default_hstate_max_huge_pages;
}
-
- default_hstate_size = HPAGE_SIZE;
- if (!size_to_hstate(default_hstate_size))
- hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
- }
- default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size));
- if (default_hstate_max_huge_pages) {
- if (!default_hstate.max_huge_pages)
- default_hstate.max_huge_pages = default_hstate_max_huge_pages;
}
hugetlb_cma_check();
@@ -3256,10 +3267,10 @@ static int __init hugetlb_init(void)
}
subsys_initcall(hugetlb_init);
-/* Should be called on processing a hugepagesz=... option */
-void __init hugetlb_bad_size(void)
+/* Overridden by architectures with more huge page sizes */
+bool __init __attribute((weak)) arch_hugetlb_valid_size(unsigned long size)
{
- parsed_valid_hugepagesz = false;
+ return size == HPAGE_SIZE;
}
void __init hugetlb_add_hstate(unsigned int order)
@@ -3268,7 +3279,6 @@ void __init hugetlb_add_hstate(unsigned int order)
unsigned long i;
if (size_to_hstate(PAGE_SIZE << order)) {
- pr_warn("hugepagesz= specified twice, ignoring\n");
return;
}
BUG_ON(hugetlb_max_hstate >= HUGE_MAX_HSTATE);
@@ -3289,20 +3299,29 @@ void __init hugetlb_add_hstate(unsigned int order)
parsed_hstate = h;
}
-static int __init hugetlb_nrpages_setup(char *s)
+/*
+ * hugepages command line processing
+ * hugepages normally follows a valid hugepagesz or default_hugepagesz
+ * specification. If not, ignore the hugepages value. hugepages can also
+ * be the first huge page command line option in which case it implicitly
+ * specifies the number of huge pages for the default size.
+ */
+static int __init hugepages_setup(char *s)
{
unsigned long *mhp;
static unsigned long *last_mhp;
if (!parsed_valid_hugepagesz) {
- pr_warn("hugepages = %s preceded by "
- "an unsupported hugepagesz, ignoring\n", s);
+ pr_warn("HugeTLB: hugepages=%s does not follow a valid hugepagesz, ignoring\n", s);
parsed_valid_hugepagesz = true;
- return 1;
+ return 0;
}
+
/*
- * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter yet,
- * so this hugepages= parameter goes to the "default hstate".
+ * !hugetlb_max_hstate means we haven't parsed a hugepagesz= parameter
+ * yet, so this hugepages= parameter goes to the "default hstate".
+ * Otherwise, it goes with the previously parsed hugepagesz or
+ * default_hugepagesz.
*/
else if (!hugetlb_max_hstate)
mhp = &default_hstate_max_huge_pages;
@@ -3310,8 +3329,8 @@ static int __init hugetlb_nrpages_setup(char *s)
mhp = &parsed_hstate->max_huge_pages;
if (mhp == last_mhp) {
- pr_warn("hugepages= specified twice without interleaving hugepagesz=, ignoring\n");
- return 1;
+ pr_warn("HugeTLB: hugepages= specified twice without interleaving hugepagesz=, ignoring hugepages=%s\n", s);
+ return 0;
}
if (sscanf(s, "%lu", mhp) <= 0)
@@ -3329,30 +3348,142 @@ static int __init hugetlb_nrpages_setup(char *s)
return 1;
}
-__setup("hugepages=", hugetlb_nrpages_setup);
+__setup("hugepages=", hugepages_setup);
-static int __init hugetlb_default_setup(char *s)
+/*
+ * hugepagesz command line processing
+ * A specific huge page size can only be specified once with hugepagesz.
+ * hugepagesz is followed by hugepages on the command line. The global
+ * variable 'parsed_valid_hugepagesz' is used to determine if the prior
+ * hugepagesz argument was valid.
+ */
+static int __init hugepagesz_setup(char *s)
{
- default_hstate_size = memparse(s, &s);
+ unsigned long size;
+ struct hstate *h;
+
+ parsed_valid_hugepagesz = false;
+ size = (unsigned long)memparse(s, NULL);
+
+ if (!arch_hugetlb_valid_size(size)) {
+ pr_err("HugeTLB: unsupported hugepagesz=%s\n", s);
+ return 0;
+ }
+
+ h = size_to_hstate(size);
+ if (h) {
+ /*
+ * hstate for this size already exists. This is normally
+ * an error, but is allowed if the existing hstate is the
+ * default hstate. More specifically, it is only allowed if
+ * the number of huge pages for the default hstate was not
+ * previously specified.
+ */
+ if (!parsed_default_hugepagesz || h != &default_hstate ||
+ default_hstate.max_huge_pages) {
+ pr_warn("HugeTLB: hugepagesz=%s specified twice, ignoring\n", s);
+ return 0;
+ }
+
+ /*
+ * No need to call hugetlb_add_hstate() as hstate already
+ * exists. But, do set parsed_hstate so that a following
+ * hugepages= parameter will be applied to this hstate.
+ */
+ parsed_hstate = h;
+ parsed_valid_hugepagesz = true;
+ return 1;
+ }
+
+ hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
+ parsed_valid_hugepagesz = true;
return 1;
}
-__setup("default_hugepagesz=", hugetlb_default_setup);
+__setup("hugepagesz=", hugepagesz_setup);
-static unsigned int cpuset_mems_nr(unsigned int *array)
+/*
+ * default_hugepagesz command line input
+ * Only one instance of default_hugepagesz allowed on command line.
+ */
+static int __init default_hugepagesz_setup(char *s)
+{
+ unsigned long size;
+
+ parsed_valid_hugepagesz = false;
+ if (parsed_default_hugepagesz) {
+ pr_err("HugeTLB: default_hugepagesz previously specified, ignoring %s\n", s);
+ return 0;
+ }
+
+ size = (unsigned long)memparse(s, NULL);
+
+ if (!arch_hugetlb_valid_size(size)) {
+ pr_err("HugeTLB: unsupported default_hugepagesz=%s\n", s);
+ return 0;
+ }
+
+ hugetlb_add_hstate(ilog2(size) - PAGE_SHIFT);
+ parsed_valid_hugepagesz = true;
+ parsed_default_hugepagesz = true;
+ default_hstate_idx = hstate_index(size_to_hstate(size));
+
+ /*
+ * The number of default huge pages (for this size) could have been
+ * specified as the first hugetlb parameter: hugepages=X. If so,
+ * then default_hstate_max_huge_pages is set. If the default huge
+ * page size is gigantic (>= MAX_ORDER), then the pages must be
+ * allocated here from bootmem allocator.
+ */
+ if (default_hstate_max_huge_pages) {
+ default_hstate.max_huge_pages = default_hstate_max_huge_pages;
+ if (hstate_is_gigantic(&default_hstate))
+ hugetlb_hstate_alloc_pages(&default_hstate);
+ default_hstate_max_huge_pages = 0;
+ }
+
+ return 1;
+}
+__setup("default_hugepagesz=", default_hugepagesz_setup);
+
+static unsigned int allowed_mems_nr(struct hstate *h)
{
int node;
unsigned int nr = 0;
+ nodemask_t *mpol_allowed;
+ unsigned int *array = h->free_huge_pages_node;
+ gfp_t gfp_mask = htlb_alloc_mask(h);
- for_each_node_mask(node, cpuset_current_mems_allowed)
- nr += array[node];
+ mpol_allowed = policy_nodemask_current(gfp_mask);
+
+ for_each_node_mask(node, cpuset_current_mems_allowed) {
+ if (!mpol_allowed ||
+ (mpol_allowed && node_isset(node, *mpol_allowed)))
+ nr += array[node];
+ }
return nr;
}
#ifdef CONFIG_SYSCTL
+static int proc_hugetlb_doulongvec_minmax(struct ctl_table *table, int write,
+ void *buffer, size_t *length,
+ loff_t *ppos, unsigned long *out)
+{
+ struct ctl_table dup_table;
+
+ /*
+ * In order to avoid races with __do_proc_doulongvec_minmax(), we
+ * duplicate @table and alter only the duplicate.
+ */
+ dup_table = *table;
+ dup_table.data = out;
+
+ return proc_doulongvec_minmax(&dup_table, write, buffer, length, ppos);
+}
+
static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos)
+ void *buffer, size_t *length, loff_t *ppos)
{
struct hstate *h = &default_hstate;
unsigned long tmp = h->max_huge_pages;
@@ -3361,9 +3492,8 @@ static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
if (!hugepages_supported())
return -EOPNOTSUPP;
- table->data = &tmp;
- table->maxlen = sizeof(unsigned long);
- ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
+ ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos,
+ &tmp);
if (ret)
goto out;
@@ -3375,7 +3505,7 @@ out:
}
int hugetlb_sysctl_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos)
+ void *buffer, size_t *length, loff_t *ppos)
{
return hugetlb_sysctl_handler_common(false, table, write,
@@ -3384,7 +3514,7 @@ int hugetlb_sysctl_handler(struct ctl_table *table, int write,
#ifdef CONFIG_NUMA
int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos)
+ void *buffer, size_t *length, loff_t *ppos)
{
return hugetlb_sysctl_handler_common(true, table, write,
buffer, length, ppos);
@@ -3392,8 +3522,7 @@ int hugetlb_mempolicy_sysctl_handler(struct ctl_table *table, int write,
#endif /* CONFIG_NUMA */
int hugetlb_overcommit_handler(struct ctl_table *table, int write,
- void __user *buffer,
- size_t *length, loff_t *ppos)
+ void *buffer, size_t *length, loff_t *ppos)
{
struct hstate *h = &default_hstate;
unsigned long tmp;
@@ -3407,9 +3536,8 @@ int hugetlb_overcommit_handler(struct ctl_table *table, int write,
if (write && hstate_is_gigantic(h))
return -EINVAL;
- table->data = &tmp;
- table->maxlen = sizeof(unsigned long);
- ret = proc_doulongvec_minmax(table, write, buffer, length, ppos);
+ ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos,
+ &tmp);
if (ret)
goto out;
@@ -3524,12 +3652,18 @@ static int hugetlb_acct_memory(struct hstate *h, long delta)
* we fall back to check against current free page availability as
* a best attempt and hopefully to minimize the impact of changing
* semantics that cpuset has.
+ *
+ * Apart from cpuset, the memory policy mechanism also determines
+ * from which nodes the kernel will allocate memory in a NUMA
+ * system. So, as with cpuset, we should also consider the memory
+ * policy of the current task, in the same best-attempt manner as
+ * described above.
*/
if (delta > 0) {
if (gather_surplus_pages(h, delta) < 0)
goto out;
- if (delta > cpuset_mems_nr(h->free_huge_pages_node)) {
+ if (delta > allowed_mems_nr(h)) {
return_unused_surplus_pages(h, delta);
goto out;
}
@@ -3834,7 +3968,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
continue;
ptl = huge_pte_lock(h, mm, ptep);
- if (huge_pmd_unshare(mm, &address, ptep)) {
+ if (huge_pmd_unshare(mm, vma, &address, ptep)) {
spin_unlock(ptl);
/*
* We just unmapped a page of PMDs by clearing a PUD.
@@ -4421,10 +4555,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry)))
return VM_FAULT_HWPOISON_LARGE |
VM_FAULT_SET_HINDEX(hstate_index(h));
- } else {
- ptep = huge_pte_alloc(mm, haddr, huge_page_size(h));
- if (!ptep)
- return VM_FAULT_OOM;
}
/*
@@ -4466,9 +4596,9 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
/*
* entry could be a migration/hwpoison entry at this point, so this
* check prevents the kernel from going below assuming that we have
- * a active hugepage in pagecache. This goto expects the 2nd page fault,
- * and is_hugetlb_entry_(migration|hwpoisoned) check will properly
- * handle it.
+ * an active hugepage in pagecache. This goto expects the 2nd page
+ * fault, and is_hugetlb_entry_(migration|hwpoisoned) check will
+ * properly handle it.
*/
if (!pte_present(entry))
goto out_mutex;
@@ -4583,7 +4713,7 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm,
(const void __user *) src_addr,
pages_per_huge_page(h), false);
- /* fallback to copy_from_user outside mmap_sem */
+ /* fallback to copy_from_user outside mmap_lock */
if (unlikely(ret)) {
ret = -ENOENT;
*pagep = page;
@@ -4901,7 +5031,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma,
if (!ptep)
continue;
ptl = huge_pte_lock(h, mm, ptep);
- if (huge_pmd_unshare(mm, &address, ptep)) {
+ if (huge_pmd_unshare(mm, vma, &address, ptep)) {
pages++;
spin_unlock(ptl);
shared_pmd = true;
@@ -5195,25 +5325,21 @@ static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr)
void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma,
unsigned long *start, unsigned long *end)
{
- unsigned long check_addr;
+ unsigned long a_start, a_end;
if (!(vma->vm_flags & VM_MAYSHARE))
return;
- for (check_addr = *start; check_addr < *end; check_addr += PUD_SIZE) {
- unsigned long a_start = check_addr & PUD_MASK;
- unsigned long a_end = a_start + PUD_SIZE;
+ /* Extend the range to be PUD aligned for a worst case scenario */
+ a_start = ALIGN_DOWN(*start, PUD_SIZE);
+ a_end = ALIGN(*end, PUD_SIZE);
- /*
- * If sharing is possible, adjust start/end if necessary.
- */
- if (range_in_vma(vma, a_start, a_end)) {
- if (a_start < *start)
- *start = a_start;
- if (a_end > *end)
- *end = a_end;
- }
- }
+ /*
+ * Intersect the range with the vma range, since pmd sharing never
+ * extends across a vma boundary.
+ */
+ *start = max(vma->vm_start, a_start);
+ *end = min(vma->vm_end, a_end);
}
/*
@@ -5286,12 +5412,14 @@ out:
* returns: 1 successfully unmapped a shared pte page
* 0 the underlying pte page is not shared, or it is the last user
*/
-int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
+int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long *addr, pte_t *ptep)
{
pgd_t *pgd = pgd_offset(mm, *addr);
p4d_t *p4d = p4d_offset(pgd, *addr);
pud_t *pud = pud_offset(p4d, *addr);
+ i_mmap_assert_write_locked(vma->vm_file->f_mapping);
BUG_ON(page_count(virt_to_page(ptep)) == 0);
if (page_count(virt_to_page(ptep)) == 1)
return 0;
@@ -5309,7 +5437,8 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
return NULL;
}
-int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
+int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long *addr, pte_t *ptep)
{
return 0;
}
@@ -5355,8 +5484,8 @@ pte_t *huge_pte_alloc(struct mm_struct *mm,
* huge_pte_offset() - Walk the page table to resolve the hugepage
* entry at address @addr
*
- * Return: Pointer to page table or swap entry (PUD or PMD) for
- * address @addr, or NULL if a p*d_none() entry is encountered and the
+ * Return: Pointer to page table entry (PUD or PMD) for
+ * address @addr, or NULL if a !p*d_present() entry is encountered and the
* size @sz doesn't match the hugepage size at this level of the page
* table.
*/
@@ -5365,8 +5494,8 @@ pte_t *huge_pte_offset(struct mm_struct *mm,
{
pgd_t *pgd;
p4d_t *p4d;
- pud_t *pud, pud_entry;
- pmd_t *pmd, pmd_entry;
+ pud_t *pud;
+ pmd_t *pmd;
pgd = pgd_offset(mm, addr);
if (!pgd_present(*pgd))
@@ -5376,22 +5505,16 @@ pte_t *huge_pte_offset(struct mm_struct *mm,
return NULL;
pud = pud_offset(p4d, addr);
- pud_entry = READ_ONCE(*pud);
- if (sz != PUD_SIZE && pud_none(pud_entry))
- return NULL;
- /* hugepage or swap? */
- if (pud_huge(pud_entry) || !pud_present(pud_entry))
+ if (sz == PUD_SIZE)
+ /* must be pud huge, non-present or none */
return (pte_t *)pud;
-
- pmd = pmd_offset(pud, addr);
- pmd_entry = READ_ONCE(*pmd);
- if (sz != PMD_SIZE && pmd_none(pmd_entry))
+ if (!pud_present(*pud))
return NULL;
- /* hugepage or swap? */
- if (pmd_huge(pmd_entry) || !pmd_present(pmd_entry))
- return (pte_t *)pmd;
+ /* must have a valid entry and size to go further */
- return NULL;
+ pmd = pmd_offset(pud, addr);
+ /* must be pmd huge, non-present or none */
+ return (pte_t *)pmd;
}
#endif /* CONFIG_ARCH_WANT_GENERAL_HUGETLB */
@@ -5548,7 +5671,6 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason)
}
#ifdef CONFIG_CMA
-static unsigned long hugetlb_cma_size __initdata;
static bool cma_reserve_called __initdata;
static int __init cmdline_parse_hugetlb_cma(char *p)
@@ -5586,12 +5708,14 @@ void __init hugetlb_cma_reserve(int order)
reserved = 0;
for_each_node_state(nid, N_ONLINE) {
int res;
+ char name[20];
size = min(per_node, hugetlb_cma_size - reserved);
size = round_up(size, PAGE_SIZE << order);
+ snprintf(name, 20, "hugetlb%d", nid);
res = cma_declare_contiguous_nid(0, size, 0, PAGE_SIZE << order,
- 0, false, "hugetlb",
+ 0, false, name,
&hugetlb_cma[nid], nid);
if (res) {
pr_warn("hugetlb_cma: reservation failed: err %d, node %d",
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index aabf65d4d91b..1f87aec9ab5c 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -655,7 +655,7 @@ static void __init __hugetlb_cgroup_file_dfl_init(int idx)
snprintf(cft->name, MAX_CFTYPE_NAME, "%s.events", buf);
cft->private = MEMFILE_PRIVATE(idx, 0);
cft->seq_show = hugetlb_events_show;
- cft->file_offset = offsetof(struct hugetlb_cgroup, events_file[idx]),
+ cft->file_offset = offsetof(struct hugetlb_cgroup, events_file[idx]);
cft->flags = CFTYPE_NOT_ON_ROOT;
/* Add the events.local file */
@@ -664,7 +664,7 @@ static void __init __hugetlb_cgroup_file_dfl_init(int idx)
cft->private = MEMFILE_PRIVATE(idx, 0);
cft->seq_show = hugetlb_events_local_show;
cft->file_offset = offsetof(struct hugetlb_cgroup,
- events_local_file[idx]),
+ events_local_file[idx]);
cft->flags = CFTYPE_NOT_ON_ROOT;
/* NULL terminate the last cft */
diff --git a/mm/init-mm.c b/mm/init-mm.c
index 19603302a77f..3a613c85f9ed 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -6,10 +6,10 @@
#include <linux/list.h>
#include <linux/cpumask.h>
#include <linux/mman.h>
+#include <linux/pgtable.h>
#include <linux/atomic.h>
#include <linux/user_namespace.h>
-#include <asm/pgtable.h>
#include <asm/mmu.h>
#ifndef INIT_MM_CONTEXT
@@ -31,7 +31,7 @@ struct mm_struct init_mm = {
.pgd = swapper_pg_dir,
.mm_users = ATOMIC_INIT(2),
.mm_count = ATOMIC_INIT(1),
- .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem),
+ MMAP_LOCK_INITIALIZER(init_mm)
.page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
.arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
.mmlist = LIST_HEAD_INIT(init_mm.mmlist),
diff --git a/mm/internal.h b/mm/internal.h
index b5634e78f01d..10c677655912 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -49,18 +49,20 @@ void unmap_page_range(struct mmu_gather *tlb,
unsigned long addr, unsigned long end,
struct zap_details *details);
-extern unsigned int __do_page_cache_readahead(struct address_space *mapping,
- struct file *filp, pgoff_t offset, unsigned long nr_to_read,
+void force_page_cache_readahead(struct address_space *, struct file *,
+ pgoff_t index, unsigned long nr_to_read);
+void __do_page_cache_readahead(struct address_space *, struct file *,
+ pgoff_t index, unsigned long nr_to_read,
unsigned long lookahead_size);
/*
* Submit IO for the read-ahead request in file_ra_state.
*/
-static inline unsigned long ra_submit(struct file_ra_state *ra,
+static inline void ra_submit(struct file_ra_state *ra,
struct address_space *mapping, struct file *filp)
{
- return __do_page_cache_readahead(mapping, filp,
- ra->start, ra->size, ra->async_size);
+ __do_page_cache_readahead(mapping, filp,
+ ra->start, ra->size, ra->async_size);
}
/**
@@ -125,12 +127,12 @@ extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
* between functions involved in allocations, including the alloc_pages*
* family of functions.
*
- * nodemask, migratetype and high_zoneidx are initialized only once in
+ * nodemask, migratetype and highest_zoneidx are initialized only once in
* __alloc_pages_nodemask() and then never change.
*
- * zonelist, preferred_zone and classzone_idx are set first in
+ * zonelist, preferred_zone and highest_zoneidx are set first in
* __alloc_pages_nodemask() for the fast path, and might be later changed
- * in __alloc_pages_slowpath(). All other functions pass the whole strucure
+ * in __alloc_pages_slowpath(). All other functions pass the whole structure
* by a const pointer.
*/
struct alloc_context {
@@ -138,12 +140,21 @@ struct alloc_context {
nodemask_t *nodemask;
struct zoneref *preferred_zoneref;
int migratetype;
- enum zone_type high_zoneidx;
+
+ /*
+ * highest_zoneidx represents the highest usable zone index of
+ * the allocation request. Due to the nature of zones, memory
+ * in zones lower than highest_zoneidx will be protected by
+ * lowmem_reserve[highest_zoneidx].
+ *
+ * highest_zoneidx is also used by reclaim/compaction to limit
+ * the target zone, since zones higher than this index cannot
+ * be used for this allocation request.
+ */
+ enum zone_type highest_zoneidx;
bool spread_dirty_pages;
};
-#define ac_classzone_idx(ac) zonelist_zone_idx(ac->preferred_zoneref)
-
/*
* Locate the struct page for both the matching buddy in our
* pair (buddy1) and the combined O(n+1) page they form (page).
@@ -222,12 +233,13 @@ struct compact_control {
int order; /* order a direct compactor needs */
int migratetype; /* migratetype of direct compactor */
const unsigned int alloc_flags; /* alloc flags of a direct compactor */
- const int classzone_idx; /* zone index of a direct compactor */
+ const int highest_zoneidx; /* zone index of a direct compactor */
enum migrate_mode mode; /* Async or sync migration mode */
bool ignore_skip_hint; /* Scan blocks even if marked skip */
bool no_set_skip_hint; /* Don't mark blocks for skipping */
bool ignore_block_suitable; /* Scan blocks considered unsuitable */
bool direct_compaction; /* False from kcompactd or /proc/... */
+ bool proactive_compaction; /* kcompactd proactive compaction */
bool whole_zone; /* Whole zone should/has been scanned */
bool contended; /* Signal lock or sched contention */
bool rescan; /* Rescanning the same pageblock */
@@ -333,7 +345,7 @@ static inline void munlock_vma_pages_all(struct vm_area_struct *vma)
}
/*
- * must be called with vma's mmap_sem held for read or write, and page locked.
+ * must be called with vma's mmap_lock held for read or write, and page locked.
*/
extern void mlock_vma_page(struct page *page);
extern unsigned int munlock_vma_page(struct page *page);
@@ -357,7 +369,7 @@ extern void clear_page_mlock(struct page *page);
static inline void mlock_migrate_page(struct page *newpage, struct page *page)
{
if (TestClearPageMlocked(page)) {
- int nr_pages = hpage_nr_pages(page);
+ int nr_pages = thp_nr_pages(page);
/* Holding pmd lock, no change in irq context: __mod is safe */
__mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
@@ -384,7 +396,7 @@ vma_address(struct page *page, struct vm_area_struct *vma)
unsigned long start, end;
start = __vma_address(page, vma);
- end = start + PAGE_SIZE * (hpage_nr_pages(page) - 1);
+ end = start + thp_size(page) - PAGE_SIZE;
/* page should be within @vma mapping range */
VM_BUG_ON_VMA(end < vma->vm_start || start >= vma->vm_end, vma);
@@ -402,13 +414,13 @@ static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf,
/*
* FAULT_FLAG_RETRY_NOWAIT means we don't want to wait on page locks or
- * anything, so we only pin the file and drop the mmap_sem if only
+ * anything, so we only pin the file and drop the mmap_lock if only
* FAULT_FLAG_ALLOW_RETRY is set, while this is the first attempt.
*/
if (fault_flag_allow_retry_first(flags) &&
!(flags & FAULT_FLAG_RETRY_NOWAIT)) {
fpin = get_file(vmf->vma->vm_file);
- up_read(&vmf->vma->vm_mm->mmap_sem);
+ mmap_read_unlock(vmf->vma->vm_mm);
}
return fpin;
}
@@ -527,7 +539,7 @@ extern unsigned long __must_check vm_mmap_pgoff(struct file *, unsigned long,
unsigned long, unsigned long);
extern void set_pageblock_order(void);
-unsigned long reclaim_clean_pages_from_list(struct zone *zone,
+unsigned int reclaim_clean_pages_from_list(struct zone *zone,
struct list_head *page_list);
/* The ALLOC_WMARK bits are used as an index to zone->watermark */
#define ALLOC_WMARK_MIN WMARK_MIN
@@ -601,5 +613,11 @@ static inline bool is_migrate_highatomic_page(struct page *page)
}
void setup_zone_pageset(struct zone *zone);
-extern struct page *alloc_new_node_page(struct page *page, unsigned long node);
+
+struct migration_target_control {
+ int nid; /* preferred node id */
+ nodemask_t *nmask;
+ gfp_t gfp_mask;
+};
+
#endif /* __MM_INTERNAL_H */
diff --git a/mm/ioremap.c b/mm/ioremap.c
new file mode 100644
index 000000000000..5fa1ab41d152
--- /dev/null
+++ b/mm/ioremap.c
@@ -0,0 +1,289 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Re-map IO memory to kernel address space so that we can access it.
+ * This is needed for high PCI addresses that aren't mapped in the
+ * 640k-1MB IO memory area on PCs
+ *
+ * (C) Copyright 1995 1996 Linus Torvalds
+ */
+#include <linux/vmalloc.h>
+#include <linux/mm.h>
+#include <linux/sched.h>
+#include <linux/io.h>
+#include <linux/export.h>
+#include <asm/cacheflush.h>
+
+#include "pgalloc-track.h"
+
+#ifdef CONFIG_HAVE_ARCH_HUGE_VMAP
+static int __read_mostly ioremap_p4d_capable;
+static int __read_mostly ioremap_pud_capable;
+static int __read_mostly ioremap_pmd_capable;
+static int __read_mostly ioremap_huge_disabled;
+
+static int __init set_nohugeiomap(char *str)
+{
+ ioremap_huge_disabled = 1;
+ return 0;
+}
+early_param("nohugeiomap", set_nohugeiomap);
+
+void __init ioremap_huge_init(void)
+{
+ if (!ioremap_huge_disabled) {
+ if (arch_ioremap_p4d_supported())
+ ioremap_p4d_capable = 1;
+ if (arch_ioremap_pud_supported())
+ ioremap_pud_capable = 1;
+ if (arch_ioremap_pmd_supported())
+ ioremap_pmd_capable = 1;
+ }
+}
+
+static inline int ioremap_p4d_enabled(void)
+{
+ return ioremap_p4d_capable;
+}
+
+static inline int ioremap_pud_enabled(void)
+{
+ return ioremap_pud_capable;
+}
+
+static inline int ioremap_pmd_enabled(void)
+{
+ return ioremap_pmd_capable;
+}
+
+#else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
+static inline int ioremap_p4d_enabled(void) { return 0; }
+static inline int ioremap_pud_enabled(void) { return 0; }
+static inline int ioremap_pmd_enabled(void) { return 0; }
+#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
+
+static int ioremap_pte_range(pmd_t *pmd, unsigned long addr,
+ unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
+ pgtbl_mod_mask *mask)
+{
+ pte_t *pte;
+ u64 pfn;
+
+ pfn = phys_addr >> PAGE_SHIFT;
+ pte = pte_alloc_kernel_track(pmd, addr, mask);
+ if (!pte)
+ return -ENOMEM;
+ do {
+ BUG_ON(!pte_none(*pte));
+ set_pte_at(&init_mm, addr, pte, pfn_pte(pfn, prot));
+ pfn++;
+ } while (pte++, addr += PAGE_SIZE, addr != end);
+ *mask |= PGTBL_PTE_MODIFIED;
+ return 0;
+}
+
+static int ioremap_try_huge_pmd(pmd_t *pmd, unsigned long addr,
+ unsigned long end, phys_addr_t phys_addr,
+ pgprot_t prot)
+{
+ if (!ioremap_pmd_enabled())
+ return 0;
+
+ if ((end - addr) != PMD_SIZE)
+ return 0;
+
+ if (!IS_ALIGNED(addr, PMD_SIZE))
+ return 0;
+
+ if (!IS_ALIGNED(phys_addr, PMD_SIZE))
+ return 0;
+
+ if (pmd_present(*pmd) && !pmd_free_pte_page(pmd, addr))
+ return 0;
+
+ return pmd_set_huge(pmd, phys_addr, prot);
+}
+
+static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
+ unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
+ pgtbl_mod_mask *mask)
+{
+ pmd_t *pmd;
+ unsigned long next;
+
+ pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
+ if (!pmd)
+ return -ENOMEM;
+ do {
+ next = pmd_addr_end(addr, end);
+
+ if (ioremap_try_huge_pmd(pmd, addr, next, phys_addr, prot)) {
+ *mask |= PGTBL_PMD_MODIFIED;
+ continue;
+ }
+
+ if (ioremap_pte_range(pmd, addr, next, phys_addr, prot, mask))
+ return -ENOMEM;
+ } while (pmd++, phys_addr += (next - addr), addr = next, addr != end);
+ return 0;
+}
+
+static int ioremap_try_huge_pud(pud_t *pud, unsigned long addr,
+ unsigned long end, phys_addr_t phys_addr,
+ pgprot_t prot)
+{
+ if (!ioremap_pud_enabled())
+ return 0;
+
+ if ((end - addr) != PUD_SIZE)
+ return 0;
+
+ if (!IS_ALIGNED(addr, PUD_SIZE))
+ return 0;
+
+ if (!IS_ALIGNED(phys_addr, PUD_SIZE))
+ return 0;
+
+ if (pud_present(*pud) && !pud_free_pmd_page(pud, addr))
+ return 0;
+
+ return pud_set_huge(pud, phys_addr, prot);
+}
+
+static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr,
+ unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
+ pgtbl_mod_mask *mask)
+{
+ pud_t *pud;
+ unsigned long next;
+
+ pud = pud_alloc_track(&init_mm, p4d, addr, mask);
+ if (!pud)
+ return -ENOMEM;
+ do {
+ next = pud_addr_end(addr, end);
+
+ if (ioremap_try_huge_pud(pud, addr, next, phys_addr, prot)) {
+ *mask |= PGTBL_PUD_MODIFIED;
+ continue;
+ }
+
+ if (ioremap_pmd_range(pud, addr, next, phys_addr, prot, mask))
+ return -ENOMEM;
+ } while (pud++, phys_addr += (next - addr), addr = next, addr != end);
+ return 0;
+}
+
+static int ioremap_try_huge_p4d(p4d_t *p4d, unsigned long addr,
+ unsigned long end, phys_addr_t phys_addr,
+ pgprot_t prot)
+{
+ if (!ioremap_p4d_enabled())
+ return 0;
+
+ if ((end - addr) != P4D_SIZE)
+ return 0;
+
+ if (!IS_ALIGNED(addr, P4D_SIZE))
+ return 0;
+
+ if (!IS_ALIGNED(phys_addr, P4D_SIZE))
+ return 0;
+
+ if (p4d_present(*p4d) && !p4d_free_pud_page(p4d, addr))
+ return 0;
+
+ return p4d_set_huge(p4d, phys_addr, prot);
+}
+
+static inline int ioremap_p4d_range(pgd_t *pgd, unsigned long addr,
+ unsigned long end, phys_addr_t phys_addr, pgprot_t prot,
+ pgtbl_mod_mask *mask)
+{
+ p4d_t *p4d;
+ unsigned long next;
+
+ p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
+ if (!p4d)
+ return -ENOMEM;
+ do {
+ next = p4d_addr_end(addr, end);
+
+ if (ioremap_try_huge_p4d(p4d, addr, next, phys_addr, prot)) {
+ *mask |= PGTBL_P4D_MODIFIED;
+ continue;
+ }
+
+ if (ioremap_pud_range(p4d, addr, next, phys_addr, prot, mask))
+ return -ENOMEM;
+ } while (p4d++, phys_addr += (next - addr), addr = next, addr != end);
+ return 0;
+}
+
+int ioremap_page_range(unsigned long addr,
+ unsigned long end, phys_addr_t phys_addr, pgprot_t prot)
+{
+ pgd_t *pgd;
+ unsigned long start;
+ unsigned long next;
+ int err;
+ pgtbl_mod_mask mask = 0;
+
+ might_sleep();
+ BUG_ON(addr >= end);
+
+ start = addr;
+ pgd = pgd_offset_k(addr);
+ do {
+ next = pgd_addr_end(addr, end);
+ err = ioremap_p4d_range(pgd, addr, next, phys_addr, prot,
+ &mask);
+ if (err)
+ break;
+ } while (pgd++, phys_addr += (next - addr), addr = next, addr != end);
+
+ flush_cache_vmap(start, end);
+
+ if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
+ arch_sync_kernel_mappings(start, end);
+
+ return err;
+}
+
+#ifdef CONFIG_GENERIC_IOREMAP
+void __iomem *ioremap_prot(phys_addr_t addr, size_t size, unsigned long prot)
+{
+ unsigned long offset, vaddr;
+ phys_addr_t last_addr;
+ struct vm_struct *area;
+
+ /* Disallow wrap-around or zero size */
+ last_addr = addr + size - 1;
+ if (!size || last_addr < addr)
+ return NULL;
+
+ /* Page-align mappings */
+ offset = addr & (~PAGE_MASK);
+ addr -= offset;
+ size = PAGE_ALIGN(size + offset);
+
+ area = get_vm_area_caller(size, VM_IOREMAP,
+ __builtin_return_address(0));
+ if (!area)
+ return NULL;
+ vaddr = (unsigned long)area->addr;
+
+ if (ioremap_page_range(vaddr, vaddr + size, addr, __pgprot(prot))) {
+ free_vm_area(area);
+ return NULL;
+ }
+
+ return (void __iomem *)(vaddr + offset);
+}
+EXPORT_SYMBOL(ioremap_prot);
+
+void iounmap(volatile void __iomem *addr)
+{
+ vunmap((void *)((unsigned long)addr & PAGE_MASK));
+}
+EXPORT_SYMBOL(iounmap);
+#endif /* CONFIG_GENERIC_IOREMAP */
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
index de3121848ddf..370d970e5ab5 100644
--- a/mm/kasan/Makefile
+++ b/mm/kasan/Makefile
@@ -15,14 +15,19 @@ CFLAGS_REMOVE_tags_report.o = $(CC_FLAGS_FTRACE)
# Function splitter causes unnecessary splits in __asan_load1/__asan_store1
# see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533
-CFLAGS_common.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) -DDISABLE_BRANCH_PROFILING
-CFLAGS_generic.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) -DDISABLE_BRANCH_PROFILING
-CFLAGS_generic_report.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) -DDISABLE_BRANCH_PROFILING
-CFLAGS_init.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) -DDISABLE_BRANCH_PROFILING
-CFLAGS_quarantine.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) -DDISABLE_BRANCH_PROFILING
-CFLAGS_report.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) -DDISABLE_BRANCH_PROFILING
-CFLAGS_tags.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) -DDISABLE_BRANCH_PROFILING
-CFLAGS_tags_report.o := $(call cc-option, -fno-conserve-stack -fno-stack-protector) -DDISABLE_BRANCH_PROFILING
+CC_FLAGS_KASAN_RUNTIME := $(call cc-option, -fno-conserve-stack)
+CC_FLAGS_KASAN_RUNTIME += -fno-stack-protector
+# Disable branch tracing to avoid recursion.
+CC_FLAGS_KASAN_RUNTIME += -DDISABLE_BRANCH_PROFILING
+
+CFLAGS_common.o := $(CC_FLAGS_KASAN_RUNTIME)
+CFLAGS_generic.o := $(CC_FLAGS_KASAN_RUNTIME)
+CFLAGS_generic_report.o := $(CC_FLAGS_KASAN_RUNTIME)
+CFLAGS_init.o := $(CC_FLAGS_KASAN_RUNTIME)
+CFLAGS_quarantine.o := $(CC_FLAGS_KASAN_RUNTIME)
+CFLAGS_report.o := $(CC_FLAGS_KASAN_RUNTIME)
+CFLAGS_tags.o := $(CC_FLAGS_KASAN_RUNTIME)
+CFLAGS_tags_report.o := $(CC_FLAGS_KASAN_RUNTIME)
obj-$(CONFIG_KASAN) := common.o init.o report.o
obj-$(CONFIG_KASAN_GENERIC) += generic.o generic_report.o quarantine.o
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 2906358e42f0..950fd372a07e 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -33,7 +33,6 @@
#include <linux/types.h>
#include <linux/vmalloc.h>
#include <linux/bug.h>
-#include <linux/uaccess.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
@@ -41,7 +40,7 @@
#include "kasan.h"
#include "../slab.h"
-static inline depot_stack_handle_t save_stack(gfp_t flags)
+depot_stack_handle_t kasan_save_stack(gfp_t flags)
{
unsigned long entries[KASAN_STACK_DEPTH];
unsigned int nr_entries;
@@ -51,10 +50,10 @@ static inline depot_stack_handle_t save_stack(gfp_t flags)
return stack_depot_save(entries, nr_entries, flags);
}
-static inline void set_track(struct kasan_track *track, gfp_t flags)
+void kasan_set_track(struct kasan_track *track, gfp_t flags)
{
track->pid = current->pid;
- track->stack = save_stack(flags);
+ track->stack = kasan_save_stack(flags);
}
void kasan_enable_current(void)
@@ -181,21 +180,6 @@ asmlinkage void kasan_unpoison_task_stack_below(const void *watermark)
kasan_unpoison_shadow(base, watermark - base);
}
-/*
- * Clear all poison for the region between the current SP and a provided
- * watermark value, as is sometimes required prior to hand-crafted asm function
- * returns in the middle of functions.
- */
-void kasan_unpoison_stack_above_sp_to(const void *watermark)
-{
- const void *sp = __builtin_frame_address(0);
- size_t size = watermark - sp;
-
- if (WARN_ON(sp > watermark))
- return;
- kasan_unpoison_shadow(sp, size);
-}
-
void kasan_alloc_pages(struct page *page, unsigned int order)
{
u8 tag;
@@ -299,24 +283,6 @@ struct kasan_free_meta *get_free_info(struct kmem_cache *cache,
return (void *)object + cache->kasan_info.free_meta_offset;
}
-
-static void kasan_set_free_info(struct kmem_cache *cache,
- void *object, u8 tag)
-{
- struct kasan_alloc_meta *alloc_meta;
- u8 idx = 0;
-
- alloc_meta = get_alloc_info(cache, object);
-
-#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
- idx = alloc_meta->free_track_idx;
- alloc_meta->free_pointer_tag[idx] = tag;
- alloc_meta->free_track_idx = (idx + 1) % KASAN_NR_FREE_STACKS;
-#endif
-
- set_track(&alloc_meta->free_track[idx], GFP_NOWAIT);
-}
-
void kasan_poison_slab(struct page *page)
{
unsigned long i;
@@ -492,7 +458,7 @@ static void *__kasan_kmalloc(struct kmem_cache *cache, const void *object,
KASAN_KMALLOC_REDZONE);
if (cache->flags & SLAB_KASAN)
- set_track(&get_alloc_info(cache, object)->alloc_track, flags);
+ kasan_set_track(&get_alloc_info(cache, object)->alloc_track, flags);
return set_tag(object, tag);
}
@@ -613,24 +579,6 @@ void kasan_free_shadow(const struct vm_struct *vm)
}
#endif
-extern void __kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip);
-extern bool report_enabled(void);
-
-bool kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip)
-{
- unsigned long flags = user_access_save();
- bool ret = false;
-
- if (likely(report_enabled())) {
- __kasan_report(addr, size, is_write, ip);
- ret = true;
- }
-
- user_access_restore(flags);
-
- return ret;
-}
-
#ifdef CONFIG_MEMORY_HOTPLUG
static bool shadow_mapped(unsigned long addr)
{
diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c
index 098a7dbaced6..248264b9cb76 100644
--- a/mm/kasan/generic.c
+++ b/mm/kasan/generic.c
@@ -324,3 +324,46 @@ DEFINE_ASAN_SET_SHADOW(f2);
DEFINE_ASAN_SET_SHADOW(f3);
DEFINE_ASAN_SET_SHADOW(f5);
DEFINE_ASAN_SET_SHADOW(f8);
+
+void kasan_record_aux_stack(void *addr)
+{
+ struct page *page = kasan_addr_to_page(addr);
+ struct kmem_cache *cache;
+ struct kasan_alloc_meta *alloc_info;
+ void *object;
+
+ if (!(page && PageSlab(page)))
+ return;
+
+ cache = page->slab_cache;
+ object = nearest_obj(cache, page, addr);
+ alloc_info = get_alloc_info(cache, object);
+
+ /*
+ * record the last two call_rcu() call stacks.
+ */
+ alloc_info->aux_stack[1] = alloc_info->aux_stack[0];
+ alloc_info->aux_stack[0] = kasan_save_stack(GFP_NOWAIT);
+}
+
+void kasan_set_free_info(struct kmem_cache *cache,
+ void *object, u8 tag)
+{
+ struct kasan_free_meta *free_meta;
+
+ free_meta = get_free_info(cache, object);
+ kasan_set_track(&free_meta->free_track, GFP_NOWAIT);
+
+ /*
+ * the object was freed and has free track set
+ */
+ *(u8 *)kasan_mem_to_shadow(object) = KASAN_KMALLOC_FREETRACK;
+}
+
+struct kasan_track *kasan_get_free_track(struct kmem_cache *cache,
+ void *object, u8 tag)
+{
+ if (*(u8 *)kasan_mem_to_shadow(object) != KASAN_KMALLOC_FREETRACK)
+ return NULL;
+ return &get_free_info(cache, object)->free_track;
+}
diff --git a/mm/kasan/generic_report.c b/mm/kasan/generic_report.c
index e200acb2d292..a38c7a9e192a 100644
--- a/mm/kasan/generic_report.c
+++ b/mm/kasan/generic_report.c
@@ -80,6 +80,7 @@ static const char *get_shadow_bug_type(struct kasan_access_info *info)
break;
case KASAN_FREE_PAGE:
case KASAN_KMALLOC_FREE:
+ case KASAN_KMALLOC_FREETRACK:
bug_type = "use-after-free";
break;
case KASAN_ALLOCA_LEFT:
diff --git a/mm/kasan/init.c b/mm/kasan/init.c
index ce45c491ebcd..fe6be0be1f76 100644
--- a/mm/kasan/init.c
+++ b/mm/kasan/init.c
@@ -250,20 +250,9 @@ int __ref kasan_populate_early_shadow(const void *shadow_start,
* 3,2 - level page tables where we don't have
* puds,pmds, so pgd_populate(), pud_populate()
* is noops.
- *
- * The ifndef is required to avoid build breakage.
- *
- * With 5level-fixup.h, pgd_populate() is not nop and
- * we reference kasan_early_shadow_p4d. It's not defined
- * unless 5-level paging enabled.
- *
- * The ifndef can be dropped once all KASAN-enabled
- * architectures will switch to pgtable-nop4d.h.
*/
-#ifndef __ARCH_HAS_5LEVEL_HACK
pgd_populate(&init_mm, pgd,
lm_alias(kasan_early_shadow_p4d));
-#endif
p4d = p4d_offset(pgd, addr);
p4d_populate(&init_mm, p4d,
lm_alias(kasan_early_shadow_pud));
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index cfade6413528..ac499456740f 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -17,15 +17,17 @@
#define KASAN_PAGE_REDZONE 0xFE /* redzone for kmalloc_large allocations */
#define KASAN_KMALLOC_REDZONE 0xFC /* redzone inside slub object */
#define KASAN_KMALLOC_FREE 0xFB /* object was freed (kmem_cache_free/kfree) */
+#define KASAN_KMALLOC_FREETRACK 0xFA /* object was freed and has free track set */
#else
#define KASAN_FREE_PAGE KASAN_TAG_INVALID
#define KASAN_PAGE_REDZONE KASAN_TAG_INVALID
#define KASAN_KMALLOC_REDZONE KASAN_TAG_INVALID
#define KASAN_KMALLOC_FREE KASAN_TAG_INVALID
+#define KASAN_KMALLOC_FREETRACK KASAN_TAG_INVALID
#endif
-#define KASAN_GLOBAL_REDZONE 0xFA /* redzone for global variable */
-#define KASAN_VMALLOC_INVALID 0xF9 /* unallocated space in vmapped page */
+#define KASAN_GLOBAL_REDZONE 0xF9 /* redzone for global variable */
+#define KASAN_VMALLOC_INVALID 0xF8 /* unallocated space in vmapped page */
/*
* Stack redzone shadow values
@@ -104,7 +106,15 @@ struct kasan_track {
struct kasan_alloc_meta {
struct kasan_track alloc_track;
+#ifdef CONFIG_KASAN_GENERIC
+ /*
+ * call_rcu() call stack is stored into struct kasan_alloc_meta.
+ * The free stack is stored into struct kasan_free_meta.
+ */
+ depot_stack_handle_t aux_stack[2];
+#else
struct kasan_track free_track[KASAN_NR_FREE_STACKS];
+#endif
#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
u8 free_pointer_tag[KASAN_NR_FREE_STACKS];
u8 free_track_idx;
@@ -119,6 +129,9 @@ struct kasan_free_meta {
* Otherwise it might be used for the allocator freelist.
*/
struct qlist_node quarantine_link;
+#ifdef CONFIG_KASAN_GENERIC
+ struct kasan_track free_track;
+#endif
};
struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache,
@@ -159,6 +172,12 @@ void kasan_report_invalid_free(void *object, unsigned long ip);
struct page *kasan_addr_to_page(const void *addr);
+depot_stack_handle_t kasan_save_stack(gfp_t flags);
+void kasan_set_track(struct kasan_track *track, gfp_t flags);
+void kasan_set_free_info(struct kmem_cache *cache, void *object, u8 tag);
+struct kasan_track *kasan_get_free_track(struct kmem_cache *cache,
+ void *object, u8 tag);
+
#if defined(CONFIG_KASAN_GENERIC) && \
(defined(CONFIG_SLAB) || defined(CONFIG_SLUB))
void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache);
diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c
index 978bc4a3eb51..4c5375810449 100644
--- a/mm/kasan/quarantine.c
+++ b/mm/kasan/quarantine.c
@@ -145,6 +145,7 @@ static void qlink_free(struct qlist_node *qlink, struct kmem_cache *cache)
if (IS_ENABLED(CONFIG_SLAB))
local_irq_save(flags);
+ *(u8 *)kasan_mem_to_shadow(object) = KASAN_KMALLOC_FREE;
___cache_free(cache, object, _THIS_IP_);
if (IS_ENABLED(CONFIG_SLAB))
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 80f23c9da6b0..4f49fa6cd1aa 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -29,6 +29,7 @@
#include <linux/kasan.h>
#include <linux/module.h>
#include <linux/sched/task_stack.h>
+#include <linux/uaccess.h>
#include <asm/sections.h>
@@ -105,15 +106,20 @@ static void end_report(unsigned long *flags)
kasan_enable_current();
}
+static void print_stack(depot_stack_handle_t stack)
+{
+ unsigned long *entries;
+ unsigned int nr_entries;
+
+ nr_entries = stack_depot_fetch(stack, &entries);
+ stack_trace_print(entries, nr_entries, 0);
+}
+
static void print_track(struct kasan_track *track, const char *prefix)
{
pr_err("%s by task %u:\n", prefix, track->pid);
if (track->stack) {
- unsigned long *entries;
- unsigned int nr_entries;
-
- nr_entries = stack_depot_fetch(track->stack, &entries);
- stack_trace_print(entries, nr_entries, 0);
+ print_stack(track->stack);
} else {
pr_err("(stack is not available)\n");
}
@@ -159,26 +165,6 @@ static void describe_object_addr(struct kmem_cache *cache, void *object,
(void *)(object_addr + cache->object_size));
}
-static struct kasan_track *kasan_get_free_track(struct kmem_cache *cache,
- void *object, u8 tag)
-{
- struct kasan_alloc_meta *alloc_meta;
- int i = 0;
-
- alloc_meta = get_alloc_info(cache, object);
-
-#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
- for (i = 0; i < KASAN_NR_FREE_STACKS; i++) {
- if (alloc_meta->free_pointer_tag[i] == tag)
- break;
- }
- if (i == KASAN_NR_FREE_STACKS)
- i = alloc_meta->free_track_idx;
-#endif
-
- return &alloc_meta->free_track[i];
-}
-
static void describe_object(struct kmem_cache *cache, void *object,
const void *addr, u8 tag)
{
@@ -190,8 +176,23 @@ static void describe_object(struct kmem_cache *cache, void *object,
print_track(&alloc_info->alloc_track, "Allocated");
pr_err("\n");
free_track = kasan_get_free_track(cache, object, tag);
- print_track(free_track, "Freed");
- pr_err("\n");
+ if (free_track) {
+ print_track(free_track, "Freed");
+ pr_err("\n");
+ }
+
+#ifdef CONFIG_KASAN_GENERIC
+ if (alloc_info->aux_stack[0]) {
+ pr_err("Last call_rcu():\n");
+ print_stack(alloc_info->aux_stack[0]);
+ pr_err("\n");
+ }
+ if (alloc_info->aux_stack[1]) {
+ pr_err("Second to last call_rcu():\n");
+ print_stack(alloc_info->aux_stack[1]);
+ pr_err("\n");
+ }
+#endif
}
describe_object_addr(cache, object, addr);
@@ -454,7 +455,7 @@ static void print_shadow_for_address(const void *addr)
}
}
-bool report_enabled(void)
+static bool report_enabled(void)
{
if (current->kasan_depth)
return false;
@@ -479,7 +480,8 @@ void kasan_report_invalid_free(void *object, unsigned long ip)
end_report(&flags);
}
-void __kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip)
+static void __kasan_report(unsigned long addr, size_t size, bool is_write,
+ unsigned long ip)
{
struct kasan_access_info info;
void *tagged_addr;
@@ -518,6 +520,22 @@ void __kasan_report(unsigned long addr, size_t size, bool is_write, unsigned lon
end_report(&flags);
}
+bool kasan_report(unsigned long addr, size_t size, bool is_write,
+ unsigned long ip)
+{
+ unsigned long flags = user_access_save();
+ bool ret = false;
+
+ if (likely(report_enabled())) {
+ __kasan_report(addr, size, is_write, ip);
+ ret = true;
+ }
+
+ user_access_restore(flags);
+
+ return ret;
+}
+
#ifdef CONFIG_KASAN_INLINE
/*
* With CONFIG_KASAN_INLINE, accesses to bogus pointers (outside the high
diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c
index 8a959fdd30e3..e02a36a51f42 100644
--- a/mm/kasan/tags.c
+++ b/mm/kasan/tags.c
@@ -161,3 +161,40 @@ void __hwasan_tag_memory(unsigned long addr, u8 tag, unsigned long size)
kasan_poison_shadow((void *)addr, size, tag);
}
EXPORT_SYMBOL(__hwasan_tag_memory);
+
+void kasan_set_free_info(struct kmem_cache *cache,
+ void *object, u8 tag)
+{
+ struct kasan_alloc_meta *alloc_meta;
+ u8 idx = 0;
+
+ alloc_meta = get_alloc_info(cache, object);
+
+#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
+ idx = alloc_meta->free_track_idx;
+ alloc_meta->free_pointer_tag[idx] = tag;
+ alloc_meta->free_track_idx = (idx + 1) % KASAN_NR_FREE_STACKS;
+#endif
+
+ kasan_set_track(&alloc_meta->free_track[idx], GFP_NOWAIT);
+}
+
+struct kasan_track *kasan_get_free_track(struct kmem_cache *cache,
+ void *object, u8 tag)
+{
+ struct kasan_alloc_meta *alloc_meta;
+ int i = 0;
+
+ alloc_meta = get_alloc_info(cache, object);
+
+#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
+ for (i = 0; i < KASAN_NR_FREE_STACKS; i++) {
+ if (alloc_meta->free_pointer_tag[i] == tag)
+ break;
+ }
+ if (i == KASAN_NR_FREE_STACKS)
+ i = alloc_meta->free_track_idx;
+#endif
+
+ return &alloc_meta->free_track[i];
+}
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index cd280afb246e..58b0d9c502a1 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -28,6 +28,8 @@ enum scan_result {
SCAN_SUCCEED,
SCAN_PMD_NULL,
SCAN_EXCEED_NONE_PTE,
+ SCAN_EXCEED_SWAP_PTE,
+ SCAN_EXCEED_SHARED_PTE,
SCAN_PTE_NON_PRESENT,
SCAN_PTE_UFFD_WP,
SCAN_PAGE_RO,
@@ -47,7 +49,6 @@ enum scan_result {
SCAN_DEL_PAGE_LRU,
SCAN_ALLOC_HUGE_PAGE_FAIL,
SCAN_CGROUP_CHARGE_FAIL,
- SCAN_EXCEED_SWAP_PTE,
SCAN_TRUNCATED,
SCAN_PAGE_HAS_PRIVATE,
};
@@ -55,6 +56,9 @@ enum scan_result {
#define CREATE_TRACE_POINTS
#include <trace/events/huge_memory.h>
+static struct task_struct *khugepaged_thread __read_mostly;
+static DEFINE_MUTEX(khugepaged_mutex);
+
/* default scan 8*512 pte (or vmas) every 30 second */
static unsigned int khugepaged_pages_to_scan __read_mostly;
static unsigned int khugepaged_pages_collapsed;
@@ -72,6 +76,7 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait);
*/
static unsigned int khugepaged_max_ptes_none __read_mostly;
static unsigned int khugepaged_max_ptes_swap __read_mostly;
+static unsigned int khugepaged_max_ptes_shared __read_mostly;
#define MM_SLOTS_HASH_BITS 10
static __read_mostly DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS);
@@ -291,15 +296,43 @@ static struct kobj_attribute khugepaged_max_ptes_swap_attr =
__ATTR(max_ptes_swap, 0644, khugepaged_max_ptes_swap_show,
khugepaged_max_ptes_swap_store);
+static ssize_t khugepaged_max_ptes_shared_show(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ char *buf)
+{
+ return sprintf(buf, "%u\n", khugepaged_max_ptes_shared);
+}
+
+static ssize_t khugepaged_max_ptes_shared_store(struct kobject *kobj,
+ struct kobj_attribute *attr,
+ const char *buf, size_t count)
+{
+ int err;
+ unsigned long max_ptes_shared;
+
+ err = kstrtoul(buf, 10, &max_ptes_shared);
+ if (err || max_ptes_shared > HPAGE_PMD_NR-1)
+ return -EINVAL;
+
+ khugepaged_max_ptes_shared = max_ptes_shared;
+
+ return count;
+}
+
+static struct kobj_attribute khugepaged_max_ptes_shared_attr =
+ __ATTR(max_ptes_shared, 0644, khugepaged_max_ptes_shared_show,
+ khugepaged_max_ptes_shared_store);
+
static struct attribute *khugepaged_attr[] = {
&khugepaged_defrag_attr.attr,
&khugepaged_max_ptes_none_attr.attr,
+ &khugepaged_max_ptes_swap_attr.attr,
+ &khugepaged_max_ptes_shared_attr.attr,
&pages_to_scan_attr.attr,
&pages_collapsed_attr.attr,
&full_scans_attr.attr,
&scan_sleep_millisecs_attr.attr,
&alloc_sleep_millisecs_attr.attr,
- &khugepaged_max_ptes_swap_attr.attr,
NULL,
};
@@ -359,6 +392,7 @@ int __init khugepaged_init(void)
khugepaged_pages_to_scan = HPAGE_PMD_NR * 8;
khugepaged_max_ptes_none = HPAGE_PMD_NR - 1;
khugepaged_max_ptes_swap = HPAGE_PMD_NR / 8;
+ khugepaged_max_ptes_shared = HPAGE_PMD_NR / 2;
return 0;
}
@@ -400,7 +434,7 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm,
static inline int khugepaged_test_exit(struct mm_struct *mm)
{
- return atomic_read(&mm->mm_users) == 0;
+ return atomic_read(&mm->mm_users) == 0 || !mmget_still_valid(mm);
}
static bool hugepage_vma_check(struct vm_area_struct *vma,
@@ -435,7 +469,7 @@ int __khugepaged_enter(struct mm_struct *mm)
return -ENOMEM;
/* __khugepaged_exit() must not run from under us */
- VM_BUG_ON_MM(khugepaged_test_exit(mm), mm);
+ VM_BUG_ON_MM(atomic_read(&mm->mm_users) == 0, mm);
if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) {
free_mm_slot(mm_slot);
return 0;
@@ -503,36 +537,61 @@ void __khugepaged_exit(struct mm_struct *mm)
* under mmap sem read mode). Stop here (after we
* return all pagetables will be destroyed) until
* khugepaged has finished working on the pagetables
- * under the mmap_sem.
+ * under the mmap_lock.
*/
- down_write(&mm->mmap_sem);
- up_write(&mm->mmap_sem);
+ mmap_write_lock(mm);
+ mmap_write_unlock(mm);
}
}
static void release_pte_page(struct page *page)
{
- dec_node_page_state(page, NR_ISOLATED_ANON + page_is_file_lru(page));
+ mod_node_page_state(page_pgdat(page),
+ NR_ISOLATED_ANON + page_is_file_lru(page),
+ -compound_nr(page));
unlock_page(page);
putback_lru_page(page);
}
-static void release_pte_pages(pte_t *pte, pte_t *_pte)
+static void release_pte_pages(pte_t *pte, pte_t *_pte,
+ struct list_head *compound_pagelist)
{
+ struct page *page, *tmp;
+
while (--_pte >= pte) {
pte_t pteval = *_pte;
- if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval)))
- release_pte_page(pte_page(pteval));
+
+ page = pte_page(pteval);
+ if (!pte_none(pteval) && !is_zero_pfn(pte_pfn(pteval)) &&
+ !PageCompound(page))
+ release_pte_page(page);
+ }
+
+ list_for_each_entry_safe(page, tmp, compound_pagelist, lru) {
+ list_del(&page->lru);
+ release_pte_page(page);
}
}
+static bool is_refcount_suitable(struct page *page)
+{
+ int expected_refcount;
+
+ expected_refcount = total_mapcount(page);
+ if (PageSwapCache(page))
+ expected_refcount += compound_nr(page);
+
+ return page_count(page) == expected_refcount;
+}
+
static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
unsigned long address,
- pte_t *pte)
+ pte_t *pte,
+ struct list_head *compound_pagelist)
{
struct page *page = NULL;
pte_t *_pte;
- int none_or_zero = 0, result = 0, referenced = 0;
+ int none_or_zero = 0, shared = 0, result = 0, referenced = 0;
bool writable = false;
for (_pte = pte; _pte < pte+HPAGE_PMD_NR;
@@ -558,13 +617,27 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
goto out;
}
- /* TODO: teach khugepaged to collapse THP mapped with pte */
- if (PageCompound(page)) {
- result = SCAN_PAGE_COMPOUND;
+ VM_BUG_ON_PAGE(!PageAnon(page), page);
+
+ if (page_mapcount(page) > 1 &&
+ ++shared > khugepaged_max_ptes_shared) {
+ result = SCAN_EXCEED_SHARED_PTE;
goto out;
}
- VM_BUG_ON_PAGE(!PageAnon(page), page);
+ if (PageCompound(page)) {
+ struct page *p;
+ page = compound_head(page);
+
+ /*
+ * Check if we have dealt with the compound page
+ * already
+ */
+ list_for_each_entry(p, compound_pagelist, lru) {
+ if (page == p)
+ goto next;
+ }
+ }
/*
* We can do it before isolate_lru_page because the
@@ -578,28 +651,30 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
}
/*
- * cannot use mapcount: can't collapse if there's a gup pin.
- * The page must only be referenced by the scanned process
- * and page swap cache.
+ * Check if the page has any GUP (or other external) pins.
+ *
+ * The page table that maps the page has been already unlinked
+ * from the page table tree and this process cannot get
+ * an additional pin on the page.
+ *
+ * New pins can come later if the page is shared across fork,
+ * but not from this process. The other process cannot write to
+ * the page, only trigger CoW.
*/
- if (page_count(page) != 1 + PageSwapCache(page)) {
+ if (!is_refcount_suitable(page)) {
unlock_page(page);
result = SCAN_PAGE_COUNT;
goto out;
}
- if (pte_write(pteval)) {
- writable = true;
- } else {
- if (PageSwapCache(page) &&
- !reuse_swap_page(page, NULL)) {
- unlock_page(page);
- result = SCAN_SWAP_CACHE_PAGE;
- goto out;
- }
+ if (!pte_write(pteval) && PageSwapCache(page) &&
+ !reuse_swap_page(page, NULL)) {
/*
- * Page is not in the swap cache. It can be collapsed
- * into a THP.
+ * Page is in the swap cache and cannot be re-used.
+ * It cannot be collapsed into a THP.
*/
+ unlock_page(page);
+ result = SCAN_SWAP_CACHE_PAGE;
+ goto out;
}
/*
@@ -611,16 +686,23 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
result = SCAN_DEL_PAGE_LRU;
goto out;
}
- inc_node_page_state(page,
- NR_ISOLATED_ANON + page_is_file_lru(page));
+ mod_node_page_state(page_pgdat(page),
+ NR_ISOLATED_ANON + page_is_file_lru(page),
+ compound_nr(page));
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageLRU(page), page);
+ if (PageCompound(page))
+ list_add_tail(&page->lru, compound_pagelist);
+next:
/* There should be enough young pte to collapse the page */
if (pte_young(pteval) ||
page_is_young(page) || PageReferenced(page) ||
mmu_notifier_test_young(vma->vm_mm, address))
referenced++;
+
+ if (pte_write(pteval))
+ writable = true;
}
if (likely(writable)) {
if (likely(referenced)) {
@@ -634,7 +716,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
}
out:
- release_pte_pages(pte, _pte);
+ release_pte_pages(pte, _pte, compound_pagelist);
trace_mm_collapse_huge_page_isolate(page, none_or_zero,
referenced, writable, result);
return 0;
@@ -643,13 +725,14 @@ out:
static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
struct vm_area_struct *vma,
unsigned long address,
- spinlock_t *ptl)
+ spinlock_t *ptl,
+ struct list_head *compound_pagelist)
{
+ struct page *src_page, *tmp;
pte_t *_pte;
for (_pte = pte; _pte < pte + HPAGE_PMD_NR;
_pte++, page++, address += PAGE_SIZE) {
pte_t pteval = *_pte;
- struct page *src_page;
if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) {
clear_user_highpage(page, address);
@@ -669,8 +752,8 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
} else {
src_page = pte_page(pteval);
copy_user_highpage(page, src_page, address, vma);
- VM_BUG_ON_PAGE(page_mapcount(src_page) != 1, src_page);
- release_pte_page(src_page);
+ if (!PageCompound(src_page))
+ release_pte_page(src_page);
/*
* ptl mostly unnecessary, but preempt has to
* be disabled to update the per-cpu stats
@@ -687,6 +770,11 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
free_page_and_swap_cache(src_page);
}
}
+
+ list_for_each_entry_safe(src_page, tmp, compound_pagelist, lru) {
+ list_del(&src_page->lru);
+ release_pte_page(src_page);
+ }
}
static void khugepaged_alloc_sleep(void)
@@ -829,6 +917,18 @@ static struct page *khugepaged_alloc_hugepage(bool *wait)
static bool khugepaged_prealloc_page(struct page **hpage, bool *wait)
{
+ /*
+ * If the hpage allocated earlier was briefly exposed in page cache
+ * before collapse_file() failed, it is possible that racing lookups
+ * have not yet completed, and would then be unpleasantly surprised by
+ * finding the hpage reused for the same mapping at a different offset.
+ * Just release the previous allocation if there is any danger of that.
+ */
+ if (*hpage && page_count(*hpage) > 1) {
+ put_page(*hpage);
+ *hpage = NULL;
+ }
+
if (!*hpage)
*hpage = khugepaged_alloc_hugepage(wait);
@@ -848,8 +948,8 @@ khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node)
#endif
/*
- * If mmap_sem temporarily dropped, revalidate vma
- * before taking mmap_sem.
+ * If mmap_lock temporarily dropped, revalidate vma
+ * before taking mmap_lock.
* Return 0 if succeeds, otherwise return none-zero
* value (scan code).
*/
@@ -873,6 +973,9 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
return SCAN_ADDRESS_RANGE;
if (!hugepage_vma_check(vma, vma->vm_flags))
return SCAN_VMA_CHECK;
+ /* Anon VMA expected */
+ if (!vma->anon_vma || vma->vm_ops)
+ return SCAN_VMA_CHECK;
return 0;
}
@@ -881,7 +984,7 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
* Only done if khugepaged_scan_pmd believes it is worthwhile.
*
* Called and returns without pte mapped or spinlocks held,
- * but with mmap_sem held to protect against vma changes.
+ * but with mmap_lock held to protect against vma changes.
*/
static bool __collapse_huge_page_swapin(struct mm_struct *mm,
@@ -899,11 +1002,6 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
.pgoff = linear_page_index(vma, address),
};
- /* we only decide to swapin, if there is enough young ptes */
- if (referenced < HPAGE_PMD_NR/2) {
- trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
- return false;
- }
vmf.pte = pte_offset_map(pmd, address);
for (; vmf.address < address + HPAGE_PMD_NR*PAGE_SIZE;
vmf.pte++, vmf.address += PAGE_SIZE) {
@@ -913,9 +1011,9 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
swapped_in++;
ret = do_swap_page(&vmf);
- /* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */
+ /* do_swap_page returns VM_FAULT_RETRY with released mmap_lock */
if (ret & VM_FAULT_RETRY) {
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
if (hugepage_vma_revalidate(mm, address, &vmf.vma)) {
/* vma is no longer available, don't continue to swapin */
trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0);
@@ -936,6 +1034,11 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
}
vmf.pte--;
pte_unmap(vmf.pte);
+
+ /* Drain LRU add pagevec to remove extra pin on the swapped in pages */
+ if (swapped_in)
+ lru_add_drain();
+
trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1);
return true;
}
@@ -943,15 +1046,15 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm,
static void collapse_huge_page(struct mm_struct *mm,
unsigned long address,
struct page **hpage,
- int node, int referenced)
+ int node, int referenced, int unmapped)
{
+ LIST_HEAD(compound_pagelist);
pmd_t *pmd, _pmd;
pte_t *pte;
pgtable_t pgtable;
struct page *new_page;
spinlock_t *pmd_ptl, *pte_ptl;
int isolated = 0, result = 0;
- struct mem_cgroup *memcg;
struct vm_area_struct *vma;
struct mmu_notifier_range range;
gfp_t gfp;
@@ -962,60 +1065,56 @@ static void collapse_huge_page(struct mm_struct *mm,
gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE;
/*
- * Before allocating the hugepage, release the mmap_sem read lock.
+ * Before allocating the hugepage, release the mmap_lock read lock.
* The allocation can take potentially a long time if it involves
- * sync compaction, and we do not need to hold the mmap_sem during
+ * sync compaction, and we do not need to hold the mmap_lock during
* that. We will recheck the vma after taking it again in write mode.
*/
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
new_page = khugepaged_alloc_page(hpage, gfp, node);
if (!new_page) {
result = SCAN_ALLOC_HUGE_PAGE_FAIL;
goto out_nolock;
}
- if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) {
+ if (unlikely(mem_cgroup_charge(new_page, mm, gfp))) {
result = SCAN_CGROUP_CHARGE_FAIL;
goto out_nolock;
}
+ count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
result = hugepage_vma_revalidate(mm, address, &vma);
if (result) {
- mem_cgroup_cancel_charge(new_page, memcg, true);
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
goto out_nolock;
}
pmd = mm_find_pmd(mm, address);
if (!pmd) {
result = SCAN_PMD_NULL;
- mem_cgroup_cancel_charge(new_page, memcg, true);
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
goto out_nolock;
}
/*
- * __collapse_huge_page_swapin always returns with mmap_sem locked.
- * If it fails, we release mmap_sem and jump out_nolock.
+ * __collapse_huge_page_swapin always returns with mmap_lock locked.
+ * If it fails, we release mmap_lock and jump out_nolock.
* Continuing to collapse causes inconsistency.
*/
- if (!__collapse_huge_page_swapin(mm, vma, address, pmd, referenced)) {
- mem_cgroup_cancel_charge(new_page, memcg, true);
- up_read(&mm->mmap_sem);
+ if (unmapped && !__collapse_huge_page_swapin(mm, vma, address,
+ pmd, referenced)) {
+ mmap_read_unlock(mm);
goto out_nolock;
}
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
/*
* Prevent all access to pagetables with the exception of
* gup_fast later handled by the ptep_clear_flush and the VM
* handled by the anon_vma lock + PG_lock.
*/
- down_write(&mm->mmap_sem);
- result = SCAN_ANY_PROCESS;
- if (!mmget_still_valid(mm))
- goto out;
+ mmap_write_lock(mm);
result = hugepage_vma_revalidate(mm, address, &vma);
if (result)
goto out;
@@ -1044,7 +1143,8 @@ static void collapse_huge_page(struct mm_struct *mm,
mmu_notifier_invalidate_range_end(&range);
spin_lock(pte_ptl);
- isolated = __collapse_huge_page_isolate(vma, address, pte);
+ isolated = __collapse_huge_page_isolate(vma, address, pte,
+ &compound_pagelist);
spin_unlock(pte_ptl);
if (unlikely(!isolated)) {
@@ -1069,7 +1169,8 @@ static void collapse_huge_page(struct mm_struct *mm,
*/
anon_vma_unlock_write(vma->anon_vma);
- __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl);
+ __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl,
+ &compound_pagelist);
pte_unmap(pte);
__SetPageUptodate(new_page);
pgtable = pmd_pgtable(_pmd);
@@ -1087,9 +1188,7 @@ static void collapse_huge_page(struct mm_struct *mm,
spin_lock(pmd_ptl);
BUG_ON(!pmd_none(*pmd));
page_add_new_anon_rmap(new_page, vma, address, true);
- mem_cgroup_commit_charge(new_page, memcg, false, true);
- count_memcg_events(memcg, THP_COLLAPSE_ALLOC, 1);
- lru_cache_add_active_or_unevictable(new_page, vma);
+ lru_cache_add_inactive_or_unevictable(new_page, vma);
pgtable_trans_huge_deposit(mm, pmd, pgtable);
set_pmd_at(mm, address, pmd, _pmd);
update_mmu_cache_pmd(vma, address, pmd);
@@ -1100,12 +1199,13 @@ static void collapse_huge_page(struct mm_struct *mm,
khugepaged_pages_collapsed++;
result = SCAN_SUCCEED;
out_up_write:
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
out_nolock:
+ if (!IS_ERR_OR_NULL(*hpage))
+ mem_cgroup_uncharge(*hpage);
trace_mm_collapse_huge_page(mm, isolated, result);
return;
out:
- mem_cgroup_cancel_charge(new_page, memcg, true);
goto out_up_write;
}
@@ -1116,7 +1216,8 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
{
pmd_t *pmd;
pte_t *pte, *_pte;
- int ret = 0, none_or_zero = 0, result = 0, referenced = 0;
+ int ret = 0, result = 0, referenced = 0;
+ int none_or_zero = 0, shared = 0;
struct page *page = NULL;
unsigned long _address;
spinlock_t *ptl;
@@ -1188,12 +1289,14 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
goto out_unmap;
}
- /* TODO: teach khugepaged to collapse THP mapped with pte */
- if (PageCompound(page)) {
- result = SCAN_PAGE_COMPOUND;
+ if (page_mapcount(page) > 1 &&
+ ++shared > khugepaged_max_ptes_shared) {
+ result = SCAN_EXCEED_SHARED_PTE;
goto out_unmap;
}
+ page = compound_head(page);
+
/*
* Record which node the original page is from and save this
* information to khugepaged_node_load[].
@@ -1220,11 +1323,23 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
}
/*
- * cannot use mapcount: can't collapse if there's a gup pin.
- * The page must only be referenced by the scanned process
- * and page swap cache.
+ * Check if the page has any GUP (or other external) pins.
+ *
+ * Here the check is racy: it may see total_mapcount > refcount
+ * in some cases.
+ * For example, one process with one forked child process.
+ * The parent has the PMD split due to MADV_DONTNEED, then
+ * the child is trying to unmap the whole PMD, but khugepaged
+ * may be scanning the parent between the child clearing the
+ * PageDoubleMap flag and decrementing the mapcount. So
+ * khugepaged may see total_mapcount > refcount.
+ *
+ * But such a case is ephemeral; we could always retry collapse
+ * later. However it may report a false positive if the page
+ * has excessive GUP pins (i.e. 512). Anyway the same check
+ * will be done again later, so the risk seems low.
*/
- if (page_count(page) != 1 + PageSwapCache(page)) {
+ if (!is_refcount_suitable(page)) {
result = SCAN_PAGE_COUNT;
goto out_unmap;
}
@@ -1233,22 +1348,21 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
mmu_notifier_test_young(vma->vm_mm, address))
referenced++;
}
- if (writable) {
- if (referenced) {
- result = SCAN_SUCCEED;
- ret = 1;
- } else {
- result = SCAN_LACK_REFERENCED_PAGE;
- }
- } else {
+ if (!writable) {
result = SCAN_PAGE_RO;
+ } else if (!referenced || (unmapped && referenced < HPAGE_PMD_NR/2)) {
+ result = SCAN_LACK_REFERENCED_PAGE;
+ } else {
+ result = SCAN_SUCCEED;
+ ret = 1;
}
out_unmap:
pte_unmap_unlock(pte, ptl);
if (ret) {
node = khugepaged_find_target_node();
- /* collapse_huge_page will return with the mmap_sem released */
- collapse_huge_page(mm, address, hpage, node, referenced);
+ /* collapse_huge_page will return with the mmap_lock released */
+ collapse_huge_page(mm, address, hpage, node,
+ referenced, unmapped);
}
out:
trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced,
@@ -1310,7 +1424,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
{
unsigned long haddr = addr & HPAGE_PMD_MASK;
struct vm_area_struct *vma = find_vma(mm, haddr);
- struct page *hpage = NULL;
+ struct page *hpage;
pte_t *start_pte, *pte;
pmd_t *pmd, _pmd;
spinlock_t *ptl;
@@ -1330,9 +1444,17 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE))
return;
+ hpage = find_lock_page(vma->vm_file->f_mapping,
+ linear_page_index(vma, haddr));
+ if (!hpage)
+ return;
+
+ if (!PageHead(hpage))
+ goto drop_hpage;
+
pmd = mm_find_pmd(mm, haddr);
if (!pmd)
- return;
+ goto drop_hpage;
start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl);
@@ -1351,30 +1473,11 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
page = vm_normal_page(vma, addr, *pte);
- if (!page || !PageCompound(page))
- goto abort;
-
- if (!hpage) {
- hpage = compound_head(page);
- /*
- * The mapping of the THP should not change.
- *
- * Note that uprobe, debugger, or MAP_PRIVATE may
- * change the page table, but the new page will
- * not pass PageCompound() check.
- */
- if (WARN_ON(hpage->mapping != vma->vm_file->f_mapping))
- goto abort;
- }
-
/*
- * Confirm the page maps to the correct subpage.
- *
- * Note that uprobe, debugger, or MAP_PRIVATE may change
- * the page table, but the new page will not pass
- * PageCompound() check.
+ * Note that uprobe, debugger, or MAP_PRIVATE may change the
+ * page table, but the new page will not be a subpage of hpage.
*/
- if (WARN_ON(hpage + i != page))
+ if (hpage + i != page)
goto abort;
count++;
}
@@ -1393,21 +1496,26 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr)
pte_unmap_unlock(start_pte, ptl);
/* step 3: set proper refcount and mm_counters. */
- if (hpage) {
+ if (count) {
page_ref_sub(hpage, count);
add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count);
}
/* step 4: collapse pmd */
ptl = pmd_lock(vma->vm_mm, pmd);
- _pmd = pmdp_collapse_flush(vma, addr, pmd);
+ _pmd = pmdp_collapse_flush(vma, haddr, pmd);
spin_unlock(ptl);
mm_dec_nr_ptes(mm);
pte_free(mm, pmd_pgtable(_pmd));
+
+drop_hpage:
+ unlock_page(hpage);
+ put_page(hpage);
return;
abort:
pte_unmap_unlock(start_pte, ptl);
+ goto drop_hpage;
}
static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
@@ -1418,7 +1526,7 @@ static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
if (likely(mm_slot->nr_pte_mapped_thp == 0))
return 0;
- if (!down_write_trylock(&mm->mmap_sem))
+ if (!mmap_write_trylock(mm))
return -EBUSY;
if (unlikely(khugepaged_test_exit(mm)))
@@ -1429,13 +1537,14 @@ static int khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot)
out:
mm_slot->nr_pte_mapped_thp = 0;
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
return 0;
}
static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
{
struct vm_area_struct *vma;
+ struct mm_struct *mm;
unsigned long addr;
pmd_t *pmd, _pmd;
@@ -1444,11 +1553,11 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
/*
* Check vma->anon_vma to exclude MAP_PRIVATE mappings that
* got written to. These VMAs are likely not worth investing
- * down_write(mmap_sem) as PMD-mapping is likely to be split
+ * mmap_write_lock(mm) as PMD-mapping is likely to be split
* later.
*
* Not that vma->anon_vma check is racy: it can be set up after
- * the check but before we took mmap_sem by the fault path.
+ * the check but before we took mmap_lock by the fault path.
* But page lock would prevent establishing any new ptes of the
* page, so we are safe.
*
@@ -1464,27 +1573,30 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
continue;
if (vma->vm_end < addr + HPAGE_PMD_SIZE)
continue;
- pmd = mm_find_pmd(vma->vm_mm, addr);
+ mm = vma->vm_mm;
+ pmd = mm_find_pmd(mm, addr);
if (!pmd)
continue;
/*
- * We need exclusive mmap_sem to retract page table.
+ * We need exclusive mmap_lock to retract page table.
*
* We use trylock due to lock inversion: we need to acquire
- * mmap_sem while holding page lock. Fault path does it in
+ * mmap_lock while holding page lock. Fault path does it in
* reverse order. Trylock is a way to avoid deadlock.
*/
- if (down_write_trylock(&vma->vm_mm->mmap_sem)) {
- spinlock_t *ptl = pmd_lock(vma->vm_mm, pmd);
- /* assume page table is clear */
- _pmd = pmdp_collapse_flush(vma, addr, pmd);
- spin_unlock(ptl);
- up_write(&vma->vm_mm->mmap_sem);
- mm_dec_nr_ptes(vma->vm_mm);
- pte_free(vma->vm_mm, pmd_pgtable(_pmd));
+ if (mmap_write_trylock(mm)) {
+ if (!khugepaged_test_exit(mm)) {
+ spinlock_t *ptl = pmd_lock(mm, pmd);
+ /* assume page table is clear */
+ _pmd = pmdp_collapse_flush(vma, addr, pmd);
+ spin_unlock(ptl);
+ mm_dec_nr_ptes(mm);
+ pte_free(mm, pmd_pgtable(_pmd));
+ }
+ mmap_write_unlock(mm);
} else {
/* Try again later */
- khugepaged_add_pte_mapped_thp(vma->vm_mm, addr);
+ khugepaged_add_pte_mapped_thp(mm, addr);
}
}
i_mmap_unlock_write(mapping);
@@ -1515,7 +1627,6 @@ static void collapse_file(struct mm_struct *mm,
struct address_space *mapping = file->f_mapping;
gfp_t gfp;
struct page *new_page;
- struct mem_cgroup *memcg;
pgoff_t index, end = start + HPAGE_PMD_NR;
LIST_HEAD(pagelist);
XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER);
@@ -1534,10 +1645,11 @@ static void collapse_file(struct mm_struct *mm,
goto out;
}
- if (unlikely(mem_cgroup_try_charge(new_page, mm, gfp, &memcg, true))) {
+ if (unlikely(mem_cgroup_charge(new_page, mm, gfp))) {
result = SCAN_CGROUP_CHARGE_FAIL;
goto out;
}
+ count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC);
/* This will be less messy when we use multi-index entries */
do {
@@ -1547,7 +1659,6 @@ static void collapse_file(struct mm_struct *mm,
break;
xas_unlock_irq(&xas);
if (!xas_nomem(&xas, GFP_KERNEL)) {
- mem_cgroup_cancel_charge(new_page, memcg, true);
result = SCAN_FAIL;
goto out;
}
@@ -1613,7 +1724,7 @@ static void collapse_file(struct mm_struct *mm,
xas_unlock_irq(&xas);
page_cache_sync_readahead(mapping, &file->f_ra,
file, index,
- PAGE_SIZE);
+ end - index);
/* drain pagevecs to help isolate_lru_page() */
lru_add_drain();
page = find_lock_page(mapping, index);
@@ -1741,12 +1852,9 @@ out_unlock:
}
if (nr_none) {
- struct zone *zone = page_zone(new_page);
-
- __mod_node_page_state(zone->zone_pgdat, NR_FILE_PAGES, nr_none);
+ __mod_lruvec_page_state(new_page, NR_FILE_PAGES, nr_none);
if (is_shmem)
- __mod_node_page_state(zone->zone_pgdat,
- NR_SHMEM, nr_none);
+ __mod_lruvec_page_state(new_page, NR_SHMEM, nr_none);
}
xa_locked:
@@ -1784,15 +1892,9 @@ xa_unlocked:
SetPageUptodate(new_page);
page_ref_add(new_page, HPAGE_PMD_NR - 1);
- mem_cgroup_commit_charge(new_page, memcg, false, true);
-
- if (is_shmem) {
+ if (is_shmem)
set_page_dirty(new_page);
- lru_cache_add_anon(new_page);
- } else {
- lru_cache_add_file(new_page);
- }
- count_memcg_events(memcg, THP_COLLAPSE_ALLOC, 1);
+ lru_cache_add(new_page);
/*
* Remove pte page tables, so we can re-fault the page as huge.
@@ -1839,13 +1941,14 @@ xa_unlocked:
VM_BUG_ON(nr_none);
xas_unlock_irq(&xas);
- mem_cgroup_cancel_charge(new_page, memcg, true);
new_page->mapping = NULL;
}
unlock_page(new_page);
out:
VM_BUG_ON(!list_empty(&pagelist));
+ if (!IS_ERR_OR_NULL(*hpage))
+ mem_cgroup_uncharge(*hpage);
/* TODO: tracepoints */
}
@@ -1967,8 +2070,8 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages,
* the next mm on the list.
*/
vma = NULL;
- if (unlikely(!down_read_trylock(&mm->mmap_sem)))
- goto breakouterloop_mmap_sem;
+ if (unlikely(!mmap_read_trylock(mm)))
+ goto breakouterloop_mmap_lock;
if (likely(!khugepaged_test_exit(mm)))
vma = find_vma(mm, khugepaged_scan.address);
@@ -2012,7 +2115,7 @@ skip:
pgoff_t pgoff = linear_page_index(vma,
khugepaged_scan.address);
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
ret = 1;
khugepaged_scan_file(mm, file, pgoff, hpage);
fput(file);
@@ -2025,15 +2128,15 @@ skip:
khugepaged_scan.address += HPAGE_PMD_SIZE;
progress += HPAGE_PMD_NR;
if (ret)
- /* we released mmap_sem so break loop */
- goto breakouterloop_mmap_sem;
+ /* we released mmap_lock so break loop */
+ goto breakouterloop_mmap_lock;
if (progress >= pages)
goto breakouterloop;
}
}
breakouterloop:
- up_read(&mm->mmap_sem); /* exit_mmap will destroy ptes after this */
-breakouterloop_mmap_sem:
+ mmap_read_unlock(mm); /* exit_mmap will destroy ptes after this */
+breakouterloop_mmap_lock:
spin_lock(&khugepaged_mm_lock);
VM_BUG_ON(khugepaged_scan.mm_slot != mm_slot);
@@ -2084,6 +2187,8 @@ static void khugepaged_do_scan(void)
barrier(); /* write khugepaged_pages_to_scan to local stack */
+ lru_add_drain_all();
+
while (progress < pages) {
if (!khugepaged_prealloc_page(&hpage, &wait))
break;
@@ -2202,8 +2307,6 @@ static void set_recommended_min_free_kbytes(void)
int start_stop_khugepaged(void)
{
- static struct task_struct *khugepaged_thread __read_mostly;
- static DEFINE_MUTEX(khugepaged_mutex);
int err = 0;
mutex_lock(&khugepaged_mutex);
@@ -2230,3 +2333,11 @@ fail:
mutex_unlock(&khugepaged_mutex);
return err;
}
+
+void khugepaged_min_free_kbytes_update(void)
+{
+ mutex_lock(&khugepaged_mutex);
+ if (khugepaged_enabled() && khugepaged_thread)
+ set_recommended_min_free_kbytes();
+ mutex_unlock(&khugepaged_mutex);
+}
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index e362dc3d2028..5e252d91eb14 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1169,8 +1169,10 @@ static bool update_checksum(struct kmemleak_object *object)
u32 old_csum = object->checksum;
kasan_disable_current();
+ kcsan_disable_current();
object->checksum = crc32(0, (void *)object->pointer, object->size);
kasan_enable_current();
+ kcsan_enable_current();
return object->checksum != old_csum;
}
diff --git a/mm/ksm.c b/mm/ksm.c
index 281c00129a2e..9afccc36dbd2 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -442,7 +442,7 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm,
/*
* ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's
* page tables after it has passed through ksm_exit() - which, if necessary,
- * takes mmap_sem briefly to serialize against them. ksm_exit() does not set
+ * takes mmap_lock briefly to serialize against them. ksm_exit() does not set
* a special flag: they can just back out as soon as mm_users goes to zero.
* ksm_test_exit() is used throughout to make this test for exit: in some
* places for correctness, in some places just to avoid unnecessary work.
@@ -480,7 +480,8 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr)
break;
if (PageKsm(page))
ret = handle_mm_fault(vma, addr,
- FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE);
+ FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE,
+ NULL);
else
ret = VM_FAULT_WRITE;
put_page(page);
@@ -542,11 +543,11 @@ static void break_cow(struct rmap_item *rmap_item)
*/
put_anon_vma(rmap_item->anon_vma);
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
vma = find_mergeable_vma(mm, addr);
if (vma)
break_ksm(vma, addr);
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
}
static struct page *get_mergeable_page(struct rmap_item *rmap_item)
@@ -556,7 +557,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
struct vm_area_struct *vma;
struct page *page;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
vma = find_mergeable_vma(mm, addr);
if (!vma)
goto out;
@@ -572,7 +573,7 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item)
out:
page = NULL;
}
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
return page;
}
@@ -612,7 +613,7 @@ static struct stable_node *alloc_stable_node_chain(struct stable_node *dup,
* Move the old stable node to the second dimension
* queued in the hlist_dup. The invariant is that all
* dup stable_nodes in the chain->hlist point to pages
- * that are wrprotected and have the exact same
+ * that are write protected and have the exact same
* content.
*/
stable_node_chain_add_dup(dup, chain);
@@ -831,7 +832,7 @@ static void remove_trailing_rmap_items(struct mm_slot *mm_slot,
* Though it's very tempting to unmerge rmap_items from stable tree rather
* than check every pte of a given vma, the locking doesn't quite work for
* that - an rmap_item is assigned to the stable tree after inserting ksm
- * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing
+ * page and upping mmap_lock. Nor does it fit with the way we skip dup'ing
* rmap_items from parent to child at fork time (so as not to waste time
* if exit comes before the next scan reaches it).
*
@@ -976,7 +977,7 @@ static int unmerge_and_remove_all_rmap_items(void)
for (mm_slot = ksm_scan.mm_slot;
mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) {
mm = mm_slot->mm;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
for (vma = mm->mmap; vma; vma = vma->vm_next) {
if (ksm_test_exit(mm))
break;
@@ -989,7 +990,7 @@ static int unmerge_and_remove_all_rmap_items(void)
}
remove_trailing_rmap_items(mm_slot, &mm_slot->rmap_list);
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
spin_lock(&ksm_mmlist_lock);
ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next,
@@ -1012,7 +1013,7 @@ static int unmerge_and_remove_all_rmap_items(void)
return 0;
error:
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
spin_lock(&ksm_mmlist_lock);
ksm_scan.mm_slot = &ksm_mm_head;
spin_unlock(&ksm_mmlist_lock);
@@ -1148,7 +1149,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
/*
* No need to check ksm_use_zero_pages here: we can only have a
- * zero_page here if ksm_use_zero_pages was enabled alreaady.
+ * zero_page here if ksm_use_zero_pages was enabled already.
*/
if (!is_zero_pfn(page_to_pfn(kpage))) {
get_page(kpage);
@@ -1280,7 +1281,7 @@ static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
struct vm_area_struct *vma;
int err = -EFAULT;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
vma = find_mergeable_vma(mm, rmap_item->address);
if (!vma)
goto out;
@@ -1292,11 +1293,11 @@ static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item,
/* Unstable nid is in union with stable anon_vma: remove first */
remove_rmap_item_from_tree(rmap_item);
- /* Must get reference to anon_vma while still holding mmap_sem */
+ /* Must get reference to anon_vma while still holding mmap_lock */
rmap_item->anon_vma = vma->anon_vma;
get_anon_vma(vma->anon_vma);
out:
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
return err;
}
@@ -1608,7 +1609,7 @@ again:
* continue. All KSM pages belonging to the
* stable_node dups in a stable_node chain
* have the same content and they're
- * wrprotected at all times. Any will work
+ * write protected at all times. Any will work
* fine to continue the walk.
*/
tree_page = get_ksm_page(stable_node_any,
@@ -1843,7 +1844,7 @@ again:
* continue. All KSM pages belonging to the
* stable_node dups in a stable_node chain
* have the same content and they're
- * wrprotected at all times. Any will work
+ * write protected at all times. Any will work
* fine to continue the walk.
*/
tree_page = get_ksm_page(stable_node_any,
@@ -2001,7 +2002,7 @@ static void stable_tree_append(struct rmap_item *rmap_item,
* duplicate. page_migration could break later if rmap breaks,
* so we can as well crash here. We really need to check for
* rmap_hlist_len == STABLE_NODE_CHAIN, but we can as well check
- * for other negative values as an undeflow if detected here
+ * for other negative values as an underflow if detected here
* for the first time (and not when decreasing rmap_hlist_len)
* would be sign of memory corruption in the stable_node.
*/
@@ -2110,7 +2111,7 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
if (ksm_use_zero_pages && (checksum == zero_checksum)) {
struct vm_area_struct *vma;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
vma = find_mergeable_vma(mm, rmap_item->address);
if (vma) {
err = try_to_merge_one_page(vma, page,
@@ -2122,7 +2123,7 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item)
*/
err = 0;
}
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
/*
* In case of failure, the page was not really empty, so we
* need to continue. Otherwise we're done.
@@ -2285,7 +2286,7 @@ next_mm:
}
mm = slot->mm;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
if (ksm_test_exit(mm))
vma = NULL;
else
@@ -2319,7 +2320,7 @@ next_mm:
ksm_scan.address += PAGE_SIZE;
} else
put_page(*page);
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
return rmap_item;
}
put_page(*page);
@@ -2343,13 +2344,13 @@ next_mm:
struct mm_slot, mm_list);
if (ksm_scan.address == 0) {
/*
- * We've completed a full scan of all vmas, holding mmap_sem
+ * We've completed a full scan of all vmas, holding mmap_lock
* throughout, and found no VM_MERGEABLE: so do the same as
* __ksm_exit does to remove this mm from all our lists now.
* This applies either when cleaning up after __ksm_exit
* (but beware: we can reach here even before __ksm_exit),
* or when all VM_MERGEABLE areas have been unmapped (and
- * mmap_sem then protects against race with MADV_MERGEABLE).
+ * mmap_lock then protects against race with MADV_MERGEABLE).
*/
hash_del(&slot->link);
list_del(&slot->mm_list);
@@ -2357,12 +2358,12 @@ next_mm:
free_mm_slot(slot);
clear_bit(MMF_VM_MERGEABLE, &mm->flags);
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
mmdrop(mm);
} else {
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
/*
- * up_read(&mm->mmap_sem) first because after
+ * mmap_read_unlock(mm) first because after
* spin_unlock(&ksm_mmlist_lock) run, the "mm" may
* already have been freed under us by __ksm_exit()
* because the "mm_slot" is still hashed and
@@ -2387,7 +2388,7 @@ next_mm:
static void ksm_do_scan(unsigned int scan_npages)
{
struct rmap_item *rmap_item;
- struct page *uninitialized_var(page);
+ struct page *page;
while (scan_npages-- && likely(!freezing(current))) {
cond_resched();
@@ -2536,7 +2537,7 @@ void __ksm_exit(struct mm_struct *mm)
* This process is exiting: if it's straightforward (as is the
* case when ksmd was never running), free mm_slot immediately.
* But if it's at the cursor or has rmap_items linked to it, use
- * mmap_sem to synchronize with any break_cows before pagetables
+ * mmap_lock to synchronize with any break_cows before pagetables
* are freed, and leave the mm_slot on the list for ksmd to free.
* Beware: ksm may already have noticed it exiting and freed the slot.
*/
@@ -2560,8 +2561,8 @@ void __ksm_exit(struct mm_struct *mm)
clear_bit(MMF_VM_MERGEABLE, &mm->flags);
mmdrop(mm);
} else if (mm_slot) {
- down_write(&mm->mmap_sem);
- up_write(&mm->mmap_sem);
+ mmap_write_lock(mm);
+ mmap_write_unlock(mm);
}
}
@@ -2585,6 +2586,10 @@ struct page *ksm_might_need_to_copy(struct page *page,
return page; /* let do_swap_page report the error */
new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address);
+ if (new_page && mem_cgroup_charge(new_page, vma->vm_mm, GFP_KERNEL)) {
+ put_page(new_page);
+ new_page = NULL;
+ }
if (new_page) {
copy_user_highpage(new_page, page, address, vma);
@@ -2660,31 +2665,6 @@ again:
goto again;
}
-bool reuse_ksm_page(struct page *page,
- struct vm_area_struct *vma,
- unsigned long address)
-{
-#ifdef CONFIG_DEBUG_VM
- if (WARN_ON(is_zero_pfn(page_to_pfn(page))) ||
- WARN_ON(!page_mapped(page)) ||
- WARN_ON(!PageLocked(page))) {
- dump_page(page, "reuse_ksm_page");
- return false;
- }
-#endif
-
- if (PageSwapCache(page) || !page_stable_node(page))
- return false;
- /* Prohibit parallel get_ksm_page() */
- if (!page_ref_freeze(page, 1))
- return false;
-
- page_move_anon_rmap(page, vma);
- page->index = linear_page_index(vma, address);
- page_ref_unfreeze(page, 1);
-
- return true;
-}
#ifdef CONFIG_MIGRATION
void ksm_migrate_page(struct page *newpage, struct page *oldpage)
{
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 4d5294c39bba..5aa6e44bc2ae 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -180,7 +180,7 @@ unsigned long list_lru_count_one(struct list_lru *lru,
rcu_read_lock();
l = list_lru_from_memcg_idx(nlru, memcg_cache_id(memcg));
- count = l->nr_items;
+ count = READ_ONCE(l->nr_items);
rcu_read_unlock();
return count;
@@ -213,7 +213,7 @@ restart:
/*
* decrement nr_to_walk first so that we don't livelock if we
- * get stuck on large numbesr of LRU_RETRY items
+ * get stuck on large numbers of LRU_RETRY items
*/
if (!*nr_to_walk)
break;
@@ -373,14 +373,14 @@ static void memcg_destroy_list_lru_node(struct list_lru_node *nlru)
struct list_lru_memcg *memcg_lrus;
/*
* This is called when shrinker has already been unregistered,
- * and nobody can use it. So, there is no need to use kvfree_rcu().
+ * and nobody can use it. So, there is no need to use kvfree_rcu_local().
*/
memcg_lrus = rcu_dereference_protected(nlru->memcg_lrus, true);
__memcg_destroy_list_lru_node(memcg_lrus, 0, memcg_nr_cache_ids);
kvfree(memcg_lrus);
}
-static void kvfree_rcu(struct rcu_head *head)
+static void kvfree_rcu_local(struct rcu_head *head)
{
struct list_lru_memcg *mlru;
@@ -419,7 +419,7 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru,
rcu_assign_pointer(nlru->memcg_lrus, new);
spin_unlock_irq(&nlru->lock);
- call_rcu(&old->rcu, kvfree_rcu);
+ call_rcu(&old->rcu, kvfree_rcu_local);
return 0;
}
diff --git a/mm/maccess.c b/mm/maccess.c
index 3ca8d97e5010..3bd70405f2d8 100644
--- a/mm/maccess.c
+++ b/mm/maccess.c
@@ -1,104 +1,130 @@
// SPDX-License-Identifier: GPL-2.0-only
/*
- * Access kernel memory without faulting.
+ * Access kernel or user memory without faulting.
*/
#include <linux/export.h>
#include <linux/mm.h>
#include <linux/uaccess.h>
-static __always_inline long
-probe_read_common(void *dst, const void __user *src, size_t size)
+bool __weak copy_from_kernel_nofault_allowed(const void *unsafe_src,
+ size_t size)
{
- long ret;
+ return true;
+}
+
+#ifdef HAVE_GET_KERNEL_NOFAULT
+
+#define copy_from_kernel_nofault_loop(dst, src, len, type, err_label) \
+ while (len >= sizeof(type)) { \
+ __get_kernel_nofault(dst, src, type, err_label); \
+ dst += sizeof(type); \
+ src += sizeof(type); \
+ len -= sizeof(type); \
+ }
+
+long copy_from_kernel_nofault(void *dst, const void *src, size_t size)
+{
+ if (!copy_from_kernel_nofault_allowed(src, size))
+ return -ERANGE;
pagefault_disable();
- ret = __copy_from_user_inatomic(dst, src, size);
+ copy_from_kernel_nofault_loop(dst, src, size, u64, Efault);
+ copy_from_kernel_nofault_loop(dst, src, size, u32, Efault);
+ copy_from_kernel_nofault_loop(dst, src, size, u16, Efault);
+ copy_from_kernel_nofault_loop(dst, src, size, u8, Efault);
+ pagefault_enable();
+ return 0;
+Efault:
pagefault_enable();
+ return -EFAULT;
+}
+EXPORT_SYMBOL_GPL(copy_from_kernel_nofault);
+
+#define copy_to_kernel_nofault_loop(dst, src, len, type, err_label) \
+ while (len >= sizeof(type)) { \
+ __put_kernel_nofault(dst, src, type, err_label); \
+ dst += sizeof(type); \
+ src += sizeof(type); \
+ len -= sizeof(type); \
+ }
- return ret ? -EFAULT : 0;
+long copy_to_kernel_nofault(void *dst, const void *src, size_t size)
+{
+ pagefault_disable();
+ copy_to_kernel_nofault_loop(dst, src, size, u64, Efault);
+ copy_to_kernel_nofault_loop(dst, src, size, u32, Efault);
+ copy_to_kernel_nofault_loop(dst, src, size, u16, Efault);
+ copy_to_kernel_nofault_loop(dst, src, size, u8, Efault);
+ pagefault_enable();
+ return 0;
+Efault:
+ pagefault_enable();
+ return -EFAULT;
}
-static __always_inline long
-probe_write_common(void __user *dst, const void *src, size_t size)
+long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count)
{
- long ret;
+ const void *src = unsafe_addr;
+
+ if (unlikely(count <= 0))
+ return 0;
+ if (!copy_from_kernel_nofault_allowed(unsafe_addr, count))
+ return -ERANGE;
pagefault_disable();
- ret = __copy_to_user_inatomic(dst, src, size);
+ do {
+ __get_kernel_nofault(dst, src, u8, Efault);
+ dst++;
+ src++;
+ } while (dst[-1] && src - unsafe_addr < count);
pagefault_enable();
- return ret ? -EFAULT : 0;
+ dst[-1] = '\0';
+ return src - unsafe_addr;
+Efault:
+ pagefault_enable();
+ dst[-1] = '\0';
+ return -EFAULT;
}
-
+#else /* HAVE_GET_KERNEL_NOFAULT */
/**
- * probe_kernel_read(): safely attempt to read from a kernel-space location
+ * copy_from_kernel_nofault(): safely attempt to read from kernel-space
* @dst: pointer to the buffer that shall take the data
* @src: address to read from
* @size: size of the data chunk
*
- * Safely read from address @src to the buffer at @dst. If a kernel fault
- * happens, handle that and return -EFAULT.
+ * Safely read from kernel address @src to the buffer at @dst. If a kernel
+ * fault happens, handle that and return -EFAULT. If @src is not a valid kernel
+ * address, return -ERANGE.
*
* We ensure that the copy_from_user is executed in atomic context so that
- * do_page_fault() doesn't attempt to take mmap_sem. This makes
- * probe_kernel_read() suitable for use within regions where the caller
- * already holds mmap_sem, or other locks which nest inside mmap_sem.
- *
- * probe_kernel_read_strict() is the same as probe_kernel_read() except for
- * the case where architectures have non-overlapping user and kernel address
- * ranges: probe_kernel_read_strict() will additionally return -EFAULT for
- * probing memory on a user address range where probe_user_read() is supposed
- * to be used instead.
+ * do_page_fault() doesn't attempt to take mmap_lock. This makes
+ * copy_from_kernel_nofault() suitable for use within regions where the caller
+ * already holds mmap_lock, or other locks which nest inside mmap_lock.
*/
-
-long __weak probe_kernel_read(void *dst, const void *src, size_t size)
- __attribute__((alias("__probe_kernel_read")));
-
-long __weak probe_kernel_read_strict(void *dst, const void *src, size_t size)
- __attribute__((alias("__probe_kernel_read")));
-
-long __probe_kernel_read(void *dst, const void *src, size_t size)
+long copy_from_kernel_nofault(void *dst, const void *src, size_t size)
{
long ret;
mm_segment_t old_fs = get_fs();
- set_fs(KERNEL_DS);
- ret = probe_read_common(dst, (__force const void __user *)src, size);
- set_fs(old_fs);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(probe_kernel_read);
-
-/**
- * probe_user_read(): safely attempt to read from a user-space location
- * @dst: pointer to the buffer that shall take the data
- * @src: address to read from. This must be a user address.
- * @size: size of the data chunk
- *
- * Safely read from user address @src to the buffer at @dst. If a kernel fault
- * happens, handle that and return -EFAULT.
- */
-
-long __weak probe_user_read(void *dst, const void __user *src, size_t size)
- __attribute__((alias("__probe_user_read")));
+ if (!copy_from_kernel_nofault_allowed(src, size))
+ return -ERANGE;
-long __probe_user_read(void *dst, const void __user *src, size_t size)
-{
- long ret = -EFAULT;
- mm_segment_t old_fs = get_fs();
-
- set_fs(USER_DS);
- if (access_ok(src, size))
- ret = probe_read_common(dst, src, size);
+ set_fs(KERNEL_DS);
+ pagefault_disable();
+ ret = __copy_from_user_inatomic(dst, (__force const void __user *)src,
+ size);
+ pagefault_enable();
set_fs(old_fs);
- return ret;
+ if (ret)
+ return -EFAULT;
+ return 0;
}
-EXPORT_SYMBOL_GPL(probe_user_read);
+EXPORT_SYMBOL_GPL(copy_from_kernel_nofault);
/**
- * probe_kernel_write(): safely attempt to write to a location
+ * copy_to_kernel_nofault(): safely attempt to write to a location
* @dst: address to write to
* @src: pointer to the data that shall be written
* @size: size of the data chunk
@@ -106,52 +132,25 @@ EXPORT_SYMBOL_GPL(probe_user_read);
* Safely write to address @dst from the buffer at @src. If a kernel fault
* happens, handle that and return -EFAULT.
*/
-
-long __weak probe_kernel_write(void *dst, const void *src, size_t size)
- __attribute__((alias("__probe_kernel_write")));
-
-long __probe_kernel_write(void *dst, const void *src, size_t size)
+long copy_to_kernel_nofault(void *dst, const void *src, size_t size)
{
long ret;
mm_segment_t old_fs = get_fs();
set_fs(KERNEL_DS);
- ret = probe_write_common((__force void __user *)dst, src, size);
- set_fs(old_fs);
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(probe_kernel_write);
-
-/**
- * probe_user_write(): safely attempt to write to a user-space location
- * @dst: address to write to
- * @src: pointer to the data that shall be written
- * @size: size of the data chunk
- *
- * Safely write to address @dst from the buffer at @src. If a kernel fault
- * happens, handle that and return -EFAULT.
- */
-
-long __weak probe_user_write(void __user *dst, const void *src, size_t size)
- __attribute__((alias("__probe_user_write")));
-
-long __probe_user_write(void __user *dst, const void *src, size_t size)
-{
- long ret = -EFAULT;
- mm_segment_t old_fs = get_fs();
-
- set_fs(USER_DS);
- if (access_ok(dst, size))
- ret = probe_write_common(dst, src, size);
+ pagefault_disable();
+ ret = __copy_to_user_inatomic((__force void __user *)dst, src, size);
+ pagefault_enable();
set_fs(old_fs);
- return ret;
+ if (ret)
+ return -EFAULT;
+ return 0;
}
-EXPORT_SYMBOL_GPL(probe_user_write);
/**
- * strncpy_from_unsafe: - Copy a NUL terminated string from unsafe address.
+ * strncpy_from_kernel_nofault: - Copy a NUL terminated string from unsafe
+ * address.
* @dst: Destination address, in kernel space. This buffer must be at
* least @count bytes long.
* @unsafe_addr: Unsafe address.
@@ -161,27 +160,14 @@ EXPORT_SYMBOL_GPL(probe_user_write);
*
* On success, returns the length of the string INCLUDING the trailing NUL.
*
- * If access fails, returns -EFAULT (some data may have been copied
- * and the trailing NUL added).
+ * If access fails, returns -EFAULT (some data may have been copied and the
+ * trailing NUL added). If @unsafe_addr is not a valid kernel address, return
+ * -ERANGE.
*
* If @count is smaller than the length of the string, copies @count-1 bytes,
* sets the last byte of @dst buffer to NUL and returns @count.
- *
- * strncpy_from_unsafe_strict() is the same as strncpy_from_unsafe() except
- * for the case where architectures have non-overlapping user and kernel address
- * ranges: strncpy_from_unsafe_strict() will additionally return -EFAULT for
- * probing memory on a user address range where strncpy_from_unsafe_user() is
- * supposed to be used instead.
*/
-
-long __weak strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count)
- __attribute__((alias("__strncpy_from_unsafe")));
-
-long __weak strncpy_from_unsafe_strict(char *dst, const void *unsafe_addr,
- long count)
- __attribute__((alias("__strncpy_from_unsafe")));
-
-long __strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count)
+long strncpy_from_kernel_nofault(char *dst, const void *unsafe_addr, long count)
{
mm_segment_t old_fs = get_fs();
const void *src = unsafe_addr;
@@ -189,6 +175,8 @@ long __strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count)
if (unlikely(count <= 0))
return 0;
+ if (!copy_from_kernel_nofault_allowed(unsafe_addr, count))
+ return -ERANGE;
set_fs(KERNEL_DS);
pagefault_disable();
@@ -203,9 +191,64 @@ long __strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count)
return ret ? -EFAULT : src - unsafe_addr;
}
+#endif /* HAVE_GET_KERNEL_NOFAULT */
+
+/**
+ * copy_from_user_nofault(): safely attempt to read from a user-space location
+ * @dst: pointer to the buffer that shall take the data
+ * @src: address to read from. This must be a user address.
+ * @size: size of the data chunk
+ *
+ * Safely read from user address @src to the buffer at @dst. If a kernel fault
+ * happens, handle that and return -EFAULT.
+ */
+long copy_from_user_nofault(void *dst, const void __user *src, size_t size)
+{
+ long ret = -EFAULT;
+ mm_segment_t old_fs = force_uaccess_begin();
+
+ if (access_ok(src, size)) {
+ pagefault_disable();
+ ret = __copy_from_user_inatomic(dst, src, size);
+ pagefault_enable();
+ }
+ force_uaccess_end(old_fs);
+
+ if (ret)
+ return -EFAULT;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(copy_from_user_nofault);
/**
- * strncpy_from_unsafe_user: - Copy a NUL terminated string from unsafe user
+ * copy_to_user_nofault(): safely attempt to write to a user-space location
+ * @dst: address to write to
+ * @src: pointer to the data that shall be written
+ * @size: size of the data chunk
+ *
+ * Safely write to address @dst from the buffer at @src. If a kernel fault
+ * happens, handle that and return -EFAULT.
+ */
+long copy_to_user_nofault(void __user *dst, const void *src, size_t size)
+{
+ long ret = -EFAULT;
+ mm_segment_t old_fs = force_uaccess_begin();
+
+ if (access_ok(dst, size)) {
+ pagefault_disable();
+ ret = __copy_to_user_inatomic(dst, src, size);
+ pagefault_enable();
+ }
+ force_uaccess_end(old_fs);
+
+ if (ret)
+ return -EFAULT;
+ return 0;
+}
+EXPORT_SYMBOL_GPL(copy_to_user_nofault);
+
+/**
+ * strncpy_from_user_nofault: - Copy a NUL terminated string from unsafe user
* address.
* @dst: Destination address, in kernel space. This buffer must be at
* least @count bytes long.
@@ -222,20 +265,20 @@ long __strncpy_from_unsafe(char *dst, const void *unsafe_addr, long count)
* If @count is smaller than the length of the string, copies @count-1 bytes,
* sets the last byte of @dst buffer to NUL and returns @count.
*/
-long strncpy_from_unsafe_user(char *dst, const void __user *unsafe_addr,
+long strncpy_from_user_nofault(char *dst, const void __user *unsafe_addr,
long count)
{
- mm_segment_t old_fs = get_fs();
+ mm_segment_t old_fs;
long ret;
if (unlikely(count <= 0))
return 0;
- set_fs(USER_DS);
+ old_fs = force_uaccess_begin();
pagefault_disable();
ret = strncpy_from_user(dst, unsafe_addr, count);
pagefault_enable();
- set_fs(old_fs);
+ force_uaccess_end(old_fs);
if (ret >= count) {
ret = count;
@@ -248,7 +291,7 @@ long strncpy_from_unsafe_user(char *dst, const void __user *unsafe_addr,
}
/**
- * strnlen_unsafe_user: - Get the size of a user string INCLUDING final NUL.
+ * strnlen_user_nofault: - Get the size of a user string INCLUDING final NUL.
* @unsafe_addr: The string to measure.
* @count: Maximum count (including NUL)
*
@@ -263,16 +306,16 @@ long strncpy_from_unsafe_user(char *dst, const void __user *unsafe_addr,
* Unlike strnlen_user, this can be used from IRQ handler etc. because
* it disables pagefaults.
*/
-long strnlen_unsafe_user(const void __user *unsafe_addr, long count)
+long strnlen_user_nofault(const void __user *unsafe_addr, long count)
{
- mm_segment_t old_fs = get_fs();
+ mm_segment_t old_fs;
int ret;
- set_fs(USER_DS);
+ old_fs = force_uaccess_begin();
pagefault_disable();
ret = strnlen_user(unsafe_addr, count);
pagefault_enable();
- set_fs(old_fs);
+ force_uaccess_end(old_fs);
return ret;
}
diff --git a/mm/madvise.c b/mm/madvise.c
index 8cbd8c1bfe15..0e0d61003fc6 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -40,7 +40,7 @@ struct madvise_walk_private {
/*
* Any behaviour which results in changes to the vma->vm_flags needs to
- * take mmap_sem for writing. Others, which simply traverse vmas, need
+ * take mmap_lock for writing. Others, which simply traverse vmas, need
* to only take it for reading.
*/
static int madvise_need_mmap_write(int behavior)
@@ -165,7 +165,7 @@ static long madvise_behavior(struct vm_area_struct *vma,
success:
/*
- * vm_flags is protected by the mmap_sem held in write mode.
+ * vm_flags is protected by the mmap_lock held in write mode.
*/
vma->vm_flags = new_flags;
@@ -285,16 +285,16 @@ static long madvise_willneed(struct vm_area_struct *vma,
* Filesystem's fadvise may need to take various locks. We need to
* explicitly grab a reference because the vma (and hence the
* vma's reference to the file) can go away as soon as we drop
- * mmap_sem.
+ * mmap_lock.
*/
- *prev = NULL; /* tell sys_madvise we drop mmap_sem */
+ *prev = NULL; /* tell sys_madvise we drop mmap_lock */
get_file(file);
- up_read(&current->mm->mmap_sem);
offset = (loff_t)(start - vma->vm_start)
+ ((loff_t)vma->vm_pgoff << PAGE_SHIFT);
+ mmap_read_unlock(current->mm);
vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
fput(file);
- down_read(&current->mm->mmap_sem);
+ mmap_read_lock(current->mm);
return 0;
}
@@ -381,9 +381,9 @@ huge_unlock:
return 0;
}
+regular_page:
if (pmd_trans_unstable(pmd))
return 0;
-regular_page:
#endif
tlb_change_page_size(tlb, PAGE_SIZE);
orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
@@ -768,9 +768,9 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
return -EINVAL;
if (!userfaultfd_remove(vma, start, end)) {
- *prev = NULL; /* mmap_sem has been dropped, prev is stale */
+ *prev = NULL; /* mmap_lock has been dropped, prev is stale */
- down_read(&current->mm->mmap_sem);
+ mmap_read_lock(current->mm);
vma = find_vma(current->mm, start);
if (!vma)
return -ENOMEM;
@@ -791,7 +791,7 @@ static long madvise_dontneed_free(struct vm_area_struct *vma,
if (end > vma->vm_end) {
/*
* Don't fail if end > vma->vm_end. If the old
- * vma was splitted while the mmap_sem was
+ * vma was split while the mmap_lock was
* released the effect of the concurrent
* operation may not cause madvise() to
* have an undefined result. There may be an
@@ -826,7 +826,7 @@ static long madvise_remove(struct vm_area_struct *vma,
int error;
struct file *f;
- *prev = NULL; /* tell sys_madvise we drop mmap_sem */
+ *prev = NULL; /* tell sys_madvise we drop mmap_lock */
if (vma->vm_flags & VM_LOCKED)
return -EINVAL;
@@ -847,18 +847,18 @@ static long madvise_remove(struct vm_area_struct *vma,
* Filesystem's fallocate may need to take i_mutex. We need to
* explicitly grab a reference because the vma (and hence the
* vma's reference to the file) can go away as soon as we drop
- * mmap_sem.
+ * mmap_lock.
*/
get_file(f);
if (userfaultfd_remove(vma, start, end)) {
- /* mmap_sem was not released by userfaultfd_remove() */
- up_read(&current->mm->mmap_sem);
+ /* mmap_lock was not released by userfaultfd_remove() */
+ mmap_read_unlock(current->mm);
}
error = vfs_fallocate(f,
FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
offset, end - start);
fput(f);
- down_read(&current->mm->mmap_sem);
+ mmap_read_lock(current->mm);
return error;
}
@@ -1089,7 +1089,7 @@ int do_madvise(unsigned long start, size_t len_in, int behavior)
write = madvise_need_mmap_write(behavior);
if (write) {
- if (down_write_killable(&current->mm->mmap_sem))
+ if (mmap_write_lock_killable(current->mm))
return -EINTR;
/*
@@ -1105,11 +1105,11 @@ int do_madvise(unsigned long start, size_t len_in, int behavior)
* model.
*/
if (!mmget_still_valid(current->mm)) {
- up_write(&current->mm->mmap_sem);
+ mmap_write_unlock(current->mm);
return -EINTR;
}
} else {
- down_read(&current->mm->mmap_sem);
+ mmap_read_lock(current->mm);
}
/*
@@ -1153,15 +1153,15 @@ int do_madvise(unsigned long start, size_t len_in, int behavior)
goto out;
if (prev)
vma = prev->vm_next;
- else /* madvise_remove dropped mmap_sem */
+ else /* madvise_remove dropped mmap_lock */
vma = find_vma(current->mm, start);
}
out:
blk_finish_plug(&plug);
if (write)
- up_write(&current->mm->mmap_sem);
+ mmap_write_unlock(current->mm);
else
- up_read(&current->mm->mmap_sem);
+ mmap_read_unlock(current->mm);
return error;
}
diff --git a/mm/memblock.c b/mm/memblock.c
index c79ba6f9920c..45f198750be9 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -44,19 +44,20 @@
* in the system, for instance when the memory is restricted with
* ``mem=`` command line parameter
* * ``reserved`` - describes the regions that were allocated
- * * ``physmap`` - describes the actual physical memory regardless of
- * the possible restrictions; the ``physmap`` type is only available
- * on some architectures.
+ * * ``physmem`` - describes the actual physical memory available during
+ * boot regardless of the possible restrictions and memory hot(un)plug;
+ * the ``physmem`` type is only available on some architectures.
*
* Each region is represented by :c:type:`struct memblock_region` that
* defines the region extents, its attributes and NUMA node id on NUMA
* systems. Every memory type is described by the :c:type:`struct
* memblock_type` which contains an array of memory regions along with
- * the allocator metadata. The memory types are nicely wrapped with
- * :c:type:`struct memblock`. This structure is statically initialzed
- * at build time. The region arrays for the "memory" and "reserved"
- * types are initially sized to %INIT_MEMBLOCK_REGIONS and for the
- * "physmap" type to %INIT_PHYSMEM_REGIONS.
+ * the allocator metadata. The "memory" and "reserved" types are nicely
+ * wrapped with :c:type:`struct memblock`. This structure is statically
+ * initialized at build time. The region arrays are initially sized to
+ * %INIT_MEMBLOCK_REGIONS for "memory" and %INIT_MEMBLOCK_RESERVED_REGIONS
+ * for "reserved". The region array for "physmem" is initially sized to
+ * %INIT_PHYSMEM_REGIONS.
* The memblock_allow_resize() enables automatic resizing of the region
* arrays during addition of new regions. This feature should be used
* with care so that memory allocated for the region array will not
@@ -78,7 +79,7 @@
* * memblock_alloc*() - these functions return the **virtual** address
* of the allocated memory.
*
- * Note, that both API variants use implict assumptions about allowed
+ * Note, that both API variants use implicit assumptions about allowed
* memory ranges and the fallback methods. Consult the documentation
* of memblock_alloc_internal() and memblock_alloc_range_nid()
* functions for more elaborate description.
@@ -87,8 +88,8 @@
* function frees all the memory to the buddy page allocator.
*
* Unless an architecture enables %CONFIG_ARCH_KEEP_MEMBLOCK, the
- * memblock data structures will be discarded after the system
- * initialization completes.
+ * memblock data structures (except "physmem") will be discarded after the
+ * system initialization completes.
*/
#ifndef CONFIG_NEED_MULTIPLE_NODES
@@ -104,7 +105,7 @@ unsigned long long max_possible_pfn;
static struct memblock_region memblock_memory_init_regions[INIT_MEMBLOCK_REGIONS] __initdata_memblock;
static struct memblock_region memblock_reserved_init_regions[INIT_MEMBLOCK_RESERVED_REGIONS] __initdata_memblock;
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
-static struct memblock_region memblock_physmem_init_regions[INIT_PHYSMEM_REGIONS] __initdata_memblock;
+static struct memblock_region memblock_physmem_init_regions[INIT_PHYSMEM_REGIONS];
#endif
struct memblock memblock __initdata_memblock = {
@@ -118,17 +119,19 @@ struct memblock memblock __initdata_memblock = {
.reserved.max = INIT_MEMBLOCK_RESERVED_REGIONS,
.reserved.name = "reserved",
-#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
- .physmem.regions = memblock_physmem_init_regions,
- .physmem.cnt = 1, /* empty dummy entry */
- .physmem.max = INIT_PHYSMEM_REGIONS,
- .physmem.name = "physmem",
-#endif
-
.bottom_up = false,
.current_limit = MEMBLOCK_ALLOC_ANYWHERE,
};
+#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
+struct memblock_type physmem = {
+ .regions = memblock_physmem_init_regions,
+ .cnt = 1, /* empty dummy entry */
+ .max = INIT_PHYSMEM_REGIONS,
+ .name = "physmem",
+};
+#endif
+
int memblock_debug __initdata_memblock;
static bool system_has_some_mirror __initdata_memblock = false;
static int memblock_can_resize __initdata_memblock;
@@ -620,7 +623,7 @@ repeat:
* area, insert that portion.
*/
if (rbase > base) {
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+#ifdef CONFIG_NEED_MULTIPLE_NODES
WARN_ON(nid != memblock_get_region_node(rgn));
#endif
WARN_ON(flags != rgn->flags);
@@ -838,7 +841,7 @@ int __init_memblock memblock_physmem_add(phys_addr_t base, phys_addr_t size)
memblock_dbg("%s: [%pa-%pa] %pS\n", __func__,
&base, &end, (void *)_RET_IP_);
- return memblock_add_range(&memblock.physmem, base, size, MAX_NUMNODES, 0);
+ return memblock_add_range(&physmem, base, size, MAX_NUMNODES, 0);
}
#endif
@@ -1019,12 +1022,10 @@ static bool should_skip_region(struct memblock_region *m, int nid, int flags)
* As both region arrays are sorted, the function advances the two indices
* in lockstep and returns each intersection.
*/
-void __init_memblock __next_mem_range(u64 *idx, int nid,
- enum memblock_flags flags,
- struct memblock_type *type_a,
- struct memblock_type *type_b,
- phys_addr_t *out_start,
- phys_addr_t *out_end, int *out_nid)
+void __next_mem_range(u64 *idx, int nid, enum memblock_flags flags,
+ struct memblock_type *type_a,
+ struct memblock_type *type_b, phys_addr_t *out_start,
+ phys_addr_t *out_end, int *out_nid)
{
int idx_a = *idx & 0xffffffff;
int idx_b = *idx >> 32;
@@ -1197,7 +1198,6 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid,
*idx = ULLONG_MAX;
}
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
/*
* Common iterator interface used to define for_each_mem_pfn_range().
*/
@@ -1207,13 +1207,15 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid,
{
struct memblock_type *type = &memblock.memory;
struct memblock_region *r;
+ int r_nid;
while (++*idx < type->cnt) {
r = &type->regions[*idx];
+ r_nid = memblock_get_region_node(r);
if (PFN_UP(r->base) >= PFN_DOWN(r->base + r->size))
continue;
- if (nid == MAX_NUMNODES || nid == r->nid)
+ if (nid == MAX_NUMNODES || nid == r_nid)
break;
}
if (*idx >= type->cnt) {
@@ -1226,7 +1228,7 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid,
if (out_end_pfn)
*out_end_pfn = PFN_DOWN(r->base + r->size);
if (out_nid)
- *out_nid = r->nid;
+ *out_nid = r_nid;
}
/**
@@ -1245,6 +1247,7 @@ void __init_memblock __next_mem_pfn_range(int *idx, int nid,
int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
struct memblock_type *type, int nid)
{
+#ifdef CONFIG_NEED_MULTIPLE_NODES
int start_rgn, end_rgn;
int i, ret;
@@ -1256,9 +1259,10 @@ int __init_memblock memblock_set_node(phys_addr_t base, phys_addr_t size,
memblock_set_region_node(&type->regions[i], nid);
memblock_merge_regions(type);
+#endif
return 0;
}
-#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
+
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
/**
* __next_mem_pfn_range_in_zone - iterator for for_each_*_range_in_zone()
@@ -1797,7 +1801,6 @@ bool __init_memblock memblock_is_map_memory(phys_addr_t addr)
return !memblock_is_nomap(&memblock.memory.regions[i]);
}
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
int __init_memblock memblock_search_pfn_nid(unsigned long pfn,
unsigned long *start_pfn, unsigned long *end_pfn)
{
@@ -1810,9 +1813,8 @@ int __init_memblock memblock_search_pfn_nid(unsigned long pfn,
*start_pfn = PFN_DOWN(type->regions[mid].base);
*end_pfn = PFN_DOWN(type->regions[mid].base + type->regions[mid].size);
- return type->regions[mid].nid;
+ return memblock_get_region_node(&type->regions[mid]);
}
-#endif
/**
* memblock_is_region_memory - check if a region is a subset of memory
@@ -1903,7 +1905,7 @@ static void __init_memblock memblock_dump(struct memblock_type *type)
size = rgn->size;
end = base + size - 1;
flags = rgn->flags;
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
+#ifdef CONFIG_NEED_MULTIPLE_NODES
if (memblock_get_region_node(rgn) != MAX_NUMNODES)
snprintf(nid_buf, sizeof(nid_buf), " on node %d",
memblock_get_region_node(rgn));
@@ -1923,7 +1925,7 @@ void __init_memblock __memblock_dump_all(void)
memblock_dump(&memblock.memory);
memblock_dump(&memblock.reserved);
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
- memblock_dump(&memblock.physmem);
+ memblock_dump(&physmem);
#endif
}
@@ -2063,8 +2065,8 @@ static int __init memblock_init_debugfs(void)
debugfs_create_file("reserved", 0444, root,
&memblock.reserved, &memblock_debug_fops);
#ifdef CONFIG_HAVE_MEMBLOCK_PHYS_MAP
- debugfs_create_file("physmem", 0444, root,
- &memblock.physmem, &memblock_debug_fops);
+ debugfs_create_file("physmem", 0444, root, &physmem,
+ &memblock_debug_fops);
#endif
return 0;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a3b97f103966..6877c765b8d0 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -73,8 +73,6 @@ EXPORT_SYMBOL(memory_cgrp_subsys);
struct mem_cgroup *root_mem_cgroup __read_mostly;
-#define MEM_CGROUP_RECLAIM_RETRIES 5
-
/* Socket memory accounting disabled? */
static bool cgroup_memory_nosocket;
@@ -83,9 +81,9 @@ static bool cgroup_memory_nokmem;
/* Whether the swap controller is active */
#ifdef CONFIG_MEMCG_SWAP
-int do_swap_account __read_mostly;
+bool cgroup_memory_noswap __read_mostly;
#else
-#define do_swap_account 0
+#define cgroup_memory_noswap 1
#endif
#ifdef CONFIG_CGROUP_WRITEBACK
@@ -95,7 +93,7 @@ static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq);
/* Whether legacy memory+swap accounting is active */
static bool do_memsw_account(void)
{
- return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && do_swap_account;
+ return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap;
}
#define THRESHOLDS_EVENTS_TARGET 128
@@ -257,8 +255,100 @@ struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
}
#ifdef CONFIG_MEMCG_KMEM
+extern spinlock_t css_set_lock;
+
+static void obj_cgroup_release(struct percpu_ref *ref)
+{
+ struct obj_cgroup *objcg = container_of(ref, struct obj_cgroup, refcnt);
+ struct mem_cgroup *memcg;
+ unsigned int nr_bytes;
+ unsigned int nr_pages;
+ unsigned long flags;
+
+ /*
+ * At this point all allocated objects are freed, and
+ * objcg->nr_charged_bytes can't have an arbitrary byte value.
+ * However, it can be PAGE_SIZE or (x * PAGE_SIZE).
+ *
+ * The following sequence can lead to it:
+ * 1) CPU0: objcg == stock->cached_objcg
+ * 2) CPU1: we do a small allocation (e.g. 92 bytes),
+ * PAGE_SIZE bytes are charged
+ * 3) CPU1: a process from another memcg is allocating something,
+ * the stock is flushed,
+ * objcg->nr_charged_bytes = PAGE_SIZE - 92
+ * 5) CPU0: we do release this object,
+ * 92 bytes are added to stock->nr_bytes
+ * 6) CPU0: stock is flushed,
+ * 92 bytes are added to objcg->nr_charged_bytes
+ *
+ * As a result, nr_charged_bytes == PAGE_SIZE.
+ * This page will be uncharged in obj_cgroup_release().
+ */
+ nr_bytes = atomic_read(&objcg->nr_charged_bytes);
+ WARN_ON_ONCE(nr_bytes & (PAGE_SIZE - 1));
+ nr_pages = nr_bytes >> PAGE_SHIFT;
+
+ spin_lock_irqsave(&css_set_lock, flags);
+ memcg = obj_cgroup_memcg(objcg);
+ if (nr_pages)
+ __memcg_kmem_uncharge(memcg, nr_pages);
+ list_del(&objcg->list);
+ mem_cgroup_put(memcg);
+ spin_unlock_irqrestore(&css_set_lock, flags);
+
+ percpu_ref_exit(ref);
+ kfree_rcu(objcg, rcu);
+}
+
+static struct obj_cgroup *obj_cgroup_alloc(void)
+{
+ struct obj_cgroup *objcg;
+ int ret;
+
+ objcg = kzalloc(sizeof(struct obj_cgroup), GFP_KERNEL);
+ if (!objcg)
+ return NULL;
+
+ ret = percpu_ref_init(&objcg->refcnt, obj_cgroup_release, 0,
+ GFP_KERNEL);
+ if (ret) {
+ kfree(objcg);
+ return NULL;
+ }
+ INIT_LIST_HEAD(&objcg->list);
+ return objcg;
+}
+
+static void memcg_reparent_objcgs(struct mem_cgroup *memcg,
+ struct mem_cgroup *parent)
+{
+ struct obj_cgroup *objcg, *iter;
+
+ objcg = rcu_replace_pointer(memcg->objcg, NULL, true);
+
+ spin_lock_irq(&css_set_lock);
+
+ /* Move active objcg to the parent's list */
+ xchg(&objcg->memcg, parent);
+ css_get(&parent->css);
+ list_add(&objcg->list, &parent->objcg_list);
+
+ /* Move already reparented objcgs to the parent's list */
+ list_for_each_entry(iter, &memcg->objcg_list, list) {
+ css_get(&parent->css);
+ xchg(&iter->memcg, parent);
+ css_put(&memcg->css);
+ }
+ list_splice(&memcg->objcg_list, &parent->objcg_list);
+
+ spin_unlock_irq(&css_set_lock);
+
+ percpu_ref_kill(&objcg->refcnt);
+}
+
/*
- * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
+ * This will be used as a shrinker list's index.
* The main reason for not using cgroup id for this:
* this works better in sparse environments, where we have a lot of memcgs,
* but only a few kmem-limited. Or also, if we have, for instance, 200
@@ -301,14 +391,12 @@ void memcg_put_cache_ids(void)
/*
* A lot of the calls to the cache allocation functions are expected to be
- * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
+ * inlined by the compiler. Since the calls to memcg_slab_pre_alloc_hook() are
* conditional to this static branch, we'll have to allow modules that does
* kmem_cache_alloc and the such to see this symbol as well
*/
DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
EXPORT_SYMBOL(memcg_kmem_enabled_key);
-
-struct workqueue_struct *memcg_kmem_cache_wq;
#endif
static int memcg_shrinker_map_size;
@@ -477,10 +565,17 @@ ino_t page_cgroup_ino(struct page *page)
unsigned long ino = 0;
rcu_read_lock();
- if (PageSlab(page) && !PageTail(page))
- memcg = memcg_from_slab_page(page);
- else
- memcg = READ_ONCE(page->mem_cgroup);
+ memcg = page->mem_cgroup;
+
+ /*
+ * The lowest bit set means that memcg isn't a valid
+ * memcg pointer, but an obj_cgroups pointer.
+ * In this case the page is shared and doesn't belong
+ * to any specific memory cgroup.
+ */
+ if ((unsigned long) memcg & 0x1UL)
+ memcg = NULL;
+
while (memcg && !(memcg->css.flags & CSS_ONLINE))
memcg = parent_mem_cgroup(memcg);
if (memcg)
@@ -681,13 +776,16 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
*/
void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val)
{
- long x;
+ long x, threshold = MEMCG_CHARGE_BATCH;
if (mem_cgroup_disabled())
return;
+ if (memcg_stat_item_in_bytes(idx))
+ threshold <<= PAGE_SHIFT;
+
x = val + __this_cpu_read(memcg->vmstats_percpu->stat[idx]);
- if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
+ if (unlikely(abs(x) > threshold)) {
struct mem_cgroup *mi;
/*
@@ -713,29 +811,12 @@ parent_nodeinfo(struct mem_cgroup_per_node *pn, int nid)
return mem_cgroup_nodeinfo(parent, nid);
}
-/**
- * __mod_lruvec_state - update lruvec memory statistics
- * @lruvec: the lruvec
- * @idx: the stat item
- * @val: delta to add to the counter, can be negative
- *
- * The lruvec is the intersection of the NUMA node and a cgroup. This
- * function updates the all three counters that are affected by a
- * change of state at this level: per-node, per-cgroup, per-lruvec.
- */
-void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
- int val)
+void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
+ int val)
{
- pg_data_t *pgdat = lruvec_pgdat(lruvec);
struct mem_cgroup_per_node *pn;
struct mem_cgroup *memcg;
- long x;
-
- /* Update node */
- __mod_node_page_state(pgdat, idx, val);
-
- if (mem_cgroup_disabled())
- return;
+ long x, threshold = MEMCG_CHARGE_BATCH;
pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
memcg = pn->memcg;
@@ -746,8 +827,12 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
/* Update lruvec */
__this_cpu_add(pn->lruvec_stat_local->count[idx], val);
+ if (vmstat_item_in_bytes(idx))
+ threshold <<= PAGE_SHIFT;
+
x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]);
- if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) {
+ if (unlikely(abs(x) > threshold)) {
+ pg_data_t *pgdat = lruvec_pgdat(lruvec);
struct mem_cgroup_per_node *pi;
for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id))
@@ -757,6 +842,27 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
__this_cpu_write(pn->lruvec_stat_cpu->count[idx], x);
}
+/**
+ * __mod_lruvec_state - update lruvec memory statistics
+ * @lruvec: the lruvec
+ * @idx: the stat item
+ * @val: delta to add to the counter, can be negative
+ *
+ * The lruvec is the intersection of the NUMA node and a cgroup. This
+ * function updates all three counters that are affected by a
+ * change of state at this level: per-node, per-cgroup, per-lruvec.
+ */
+void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
+ int val)
+{
+ /* Update node */
+ __mod_node_page_state(lruvec_pgdat(lruvec), idx, val);
+
+ /* Update memcg and lruvec */
+ if (!mem_cgroup_disabled())
+ __mod_memcg_lruvec_state(lruvec, idx, val);
+}
+
void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val)
{
pg_data_t *pgdat = page_pgdat(virt_to_page(p));
@@ -834,25 +940,8 @@ static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event)
static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
struct page *page,
- bool compound, int nr_pages)
+ int nr_pages)
{
- /*
- * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
- * counted as CACHE even if it's on ANON LRU.
- */
- if (PageAnon(page))
- __mod_memcg_state(memcg, MEMCG_RSS, nr_pages);
- else {
- __mod_memcg_state(memcg, MEMCG_CACHE, nr_pages);
- if (PageSwapBacked(page))
- __mod_memcg_state(memcg, NR_SHMEM, nr_pages);
- }
-
- if (compound) {
- VM_BUG_ON_PAGE(!PageTransHuge(page), page);
- __mod_memcg_state(memcg, MEMCG_RSS_HUGE, nr_pages);
- }
-
/* pagein of a big page is an event. So, ignore page size */
if (nr_pages > 0)
__count_memcg_events(memcg, PGPGIN, 1);
@@ -1021,7 +1110,7 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
struct mem_cgroup *prev,
struct mem_cgroup_reclaim_cookie *reclaim)
{
- struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
+ struct mem_cgroup_reclaim_iter *iter;
struct cgroup_subsys_state *css = NULL;
struct mem_cgroup *memcg = NULL;
struct mem_cgroup *pos = NULL;
@@ -1218,9 +1307,8 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
* @page: the page
* @pgdat: pgdat of the page
*
- * This function is only safe when following the LRU page isolation
- * and putback protocol: the LRU lock must be held, and the page must
- * either be PageLRU() or the caller must have isolated/allocated it.
+ * This function relies on page->mem_cgroup being stable - see the
+ * access rules in commit_charge().
*/
struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat)
{
@@ -1314,7 +1402,7 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
if (do_memsw_account()) {
count = page_counter_read(&memcg->memsw);
limit = READ_ONCE(memcg->memsw.max);
- if (count <= limit)
+ if (count < limit)
margin = min(margin, limit - count);
else
margin = 0;
@@ -1389,18 +1477,19 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
*/
seq_buf_printf(&s, "anon %llu\n",
- (u64)memcg_page_state(memcg, MEMCG_RSS) *
+ (u64)memcg_page_state(memcg, NR_ANON_MAPPED) *
PAGE_SIZE);
seq_buf_printf(&s, "file %llu\n",
- (u64)memcg_page_state(memcg, MEMCG_CACHE) *
+ (u64)memcg_page_state(memcg, NR_FILE_PAGES) *
PAGE_SIZE);
seq_buf_printf(&s, "kernel_stack %llu\n",
- (u64)memcg_page_state(memcg, MEMCG_KERNEL_STACK_KB) *
+ (u64)memcg_page_state(memcg, NR_KERNEL_STACK_KB) *
1024);
seq_buf_printf(&s, "slab %llu\n",
- (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) +
- memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE)) *
- PAGE_SIZE);
+ (u64)(memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B) +
+ memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B)));
+ seq_buf_printf(&s, "percpu %llu\n",
+ (u64)memcg_page_state(memcg, MEMCG_PERCPU_B));
seq_buf_printf(&s, "sock %llu\n",
(u64)memcg_page_state(memcg, MEMCG_SOCK) *
PAGE_SIZE);
@@ -1418,15 +1507,11 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
(u64)memcg_page_state(memcg, NR_WRITEBACK) *
PAGE_SIZE);
- /*
- * TODO: We should eventually replace our own MEMCG_RSS_HUGE counter
- * with the NR_ANON_THP vm counter, but right now it's a pain in the
- * arse because it requires migrating the work out of rmap to a place
- * where the page->mem_cgroup is set up and stable.
- */
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
seq_buf_printf(&s, "anon_thp %llu\n",
- (u64)memcg_page_state(memcg, MEMCG_RSS_HUGE) *
- PAGE_SIZE);
+ (u64)memcg_page_state(memcg, NR_ANON_THPS) *
+ HPAGE_PMD_SIZE);
+#endif
for (i = 0; i < NR_LRU_LISTS; i++)
seq_buf_printf(&s, "%s %llu\n", lru_list_name(i),
@@ -1434,11 +1519,9 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
PAGE_SIZE);
seq_buf_printf(&s, "slab_reclaimable %llu\n",
- (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE) *
- PAGE_SIZE);
+ (u64)memcg_page_state(memcg, NR_SLAB_RECLAIMABLE_B));
seq_buf_printf(&s, "slab_unreclaimable %llu\n",
- (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE) *
- PAGE_SIZE);
+ (u64)memcg_page_state(memcg, NR_SLAB_UNRECLAIMABLE_B));
/* Accumulated memory events */
@@ -1447,10 +1530,18 @@ static char *memory_stat_format(struct mem_cgroup *memcg)
seq_buf_printf(&s, "%s %lu\n", vm_event_name(PGMAJFAULT),
memcg_events(memcg, PGMAJFAULT));
- seq_buf_printf(&s, "workingset_refault %lu\n",
- memcg_page_state(memcg, WORKINGSET_REFAULT));
- seq_buf_printf(&s, "workingset_activate %lu\n",
- memcg_page_state(memcg, WORKINGSET_ACTIVATE));
+ seq_buf_printf(&s, "workingset_refault_anon %lu\n",
+ memcg_page_state(memcg, WORKINGSET_REFAULT_ANON));
+ seq_buf_printf(&s, "workingset_refault_file %lu\n",
+ memcg_page_state(memcg, WORKINGSET_REFAULT_FILE));
+ seq_buf_printf(&s, "workingset_activate_anon %lu\n",
+ memcg_page_state(memcg, WORKINGSET_ACTIVATE_ANON));
+ seq_buf_printf(&s, "workingset_activate_file %lu\n",
+ memcg_page_state(memcg, WORKINGSET_ACTIVATE_FILE));
+ seq_buf_printf(&s, "workingset_restore_anon %lu\n",
+ memcg_page_state(memcg, WORKINGSET_RESTORE_ANON));
+ seq_buf_printf(&s, "workingset_restore_file %lu\n",
+ memcg_page_state(memcg, WORKINGSET_RESTORE_FILE));
seq_buf_printf(&s, "workingset_nodereclaim %lu\n",
memcg_page_state(memcg, WORKINGSET_NODERECLAIM));
@@ -1580,15 +1671,21 @@ static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
.gfp_mask = gfp_mask,
.order = order,
};
- bool ret;
+ bool ret = true;
if (mutex_lock_killable(&oom_lock))
return true;
+
+ if (mem_cgroup_margin(memcg) >= (1 << order))
+ goto unlock;
+
/*
* A few threads which were not waiting at mutex_lock_killable() can
* fail to bail out. Therefore, check again after holding oom_lock.
*/
ret = should_force_charge() || out_of_memory(&oc);
+
+unlock:
mutex_unlock(&oom_lock);
return ret;
}
@@ -1979,6 +2076,7 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
*/
struct mem_cgroup *lock_page_memcg(struct page *page)
{
+ struct page *head = compound_head(page); /* rmap on tail pages */
struct mem_cgroup *memcg;
unsigned long flags;
@@ -1998,7 +2096,7 @@ struct mem_cgroup *lock_page_memcg(struct page *page)
if (mem_cgroup_disabled())
return NULL;
again:
- memcg = page->mem_cgroup;
+ memcg = head->mem_cgroup;
if (unlikely(!memcg))
return NULL;
@@ -2006,7 +2104,7 @@ again:
return memcg;
spin_lock_irqsave(&memcg->move_lock, flags);
- if (memcg != page->mem_cgroup) {
+ if (memcg != head->mem_cgroup) {
spin_unlock_irqrestore(&memcg->move_lock, flags);
goto again;
}
@@ -2049,13 +2147,21 @@ void __unlock_page_memcg(struct mem_cgroup *memcg)
*/
void unlock_page_memcg(struct page *page)
{
- __unlock_page_memcg(page->mem_cgroup);
+ struct page *head = compound_head(page);
+
+ __unlock_page_memcg(head->mem_cgroup);
}
EXPORT_SYMBOL(unlock_page_memcg);
struct memcg_stock_pcp {
struct mem_cgroup *cached; /* this never be root cgroup */
unsigned int nr_pages;
+
+#ifdef CONFIG_MEMCG_KMEM
+ struct obj_cgroup *cached_objcg;
+ unsigned int nr_bytes;
+#endif
+
struct work_struct work;
unsigned long flags;
#define FLUSHING_CACHED_CHARGE 0
@@ -2063,6 +2169,22 @@ struct memcg_stock_pcp {
static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
static DEFINE_MUTEX(percpu_charge_mutex);
+#ifdef CONFIG_MEMCG_KMEM
+static void drain_obj_stock(struct memcg_stock_pcp *stock);
+static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
+ struct mem_cgroup *root_memcg);
+
+#else
+static inline void drain_obj_stock(struct memcg_stock_pcp *stock)
+{
+}
+static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
+ struct mem_cgroup *root_memcg)
+{
+ return false;
+}
+#endif
+
/**
* consume_stock: Try to consume stocked charge on this cpu.
* @memcg: memcg to consume from.
@@ -2103,13 +2225,17 @@ static void drain_stock(struct memcg_stock_pcp *stock)
{
struct mem_cgroup *old = stock->cached;
+ if (!old)
+ return;
+
if (stock->nr_pages) {
page_counter_uncharge(&old->memory, stock->nr_pages);
if (do_memsw_account())
page_counter_uncharge(&old->memsw, stock->nr_pages);
- css_put_many(&old->css, stock->nr_pages);
stock->nr_pages = 0;
}
+
+ css_put(&old->css);
stock->cached = NULL;
}
@@ -2125,6 +2251,7 @@ static void drain_local_stock(struct work_struct *dummy)
local_irq_save(flags);
stock = this_cpu_ptr(&memcg_stock);
+ drain_obj_stock(stock);
drain_stock(stock);
clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
@@ -2145,6 +2272,7 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
stock = this_cpu_ptr(&memcg_stock);
if (stock->cached != memcg) { /* reset if necessary */
drain_stock(stock);
+ css_get(&memcg->css);
stock->cached = memcg;
}
stock->nr_pages += nr_pages;
@@ -2183,6 +2311,8 @@ static void drain_all_stock(struct mem_cgroup *root_memcg)
if (memcg && stock->nr_pages &&
mem_cgroup_is_descendant(memcg, root_memcg))
flush = true;
+ if (obj_stock_flush_required(stock, root_memcg))
+ flush = true;
rcu_read_unlock();
if (flush &&
@@ -2245,17 +2375,29 @@ static int memcg_hotplug_cpu_dead(unsigned int cpu)
return 0;
}
-static void reclaim_high(struct mem_cgroup *memcg,
- unsigned int nr_pages,
- gfp_t gfp_mask)
+static unsigned long reclaim_high(struct mem_cgroup *memcg,
+ unsigned int nr_pages,
+ gfp_t gfp_mask)
{
+ unsigned long nr_reclaimed = 0;
+
do {
- if (page_counter_read(&memcg->memory) <= READ_ONCE(memcg->high))
+ unsigned long pflags;
+
+ if (page_counter_read(&memcg->memory) <=
+ READ_ONCE(memcg->memory.high))
continue;
+
memcg_memory_event(memcg, MEMCG_HIGH);
- try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
+
+ psi_memstall_enter(&pflags);
+ nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
+ gfp_mask, true);
+ psi_memstall_leave(&pflags);
} while ((memcg = parent_mem_cgroup(memcg)) &&
!mem_cgroup_is_root(memcg));
+
+ return nr_reclaimed;
}
static void high_work_func(struct work_struct *work)
@@ -2280,7 +2422,7 @@ static void high_work_func(struct work_struct *work)
*
* - MEMCG_DELAY_PRECISION_SHIFT: Extra precision bits while translating the
* overage ratio to a delay.
- * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down down the
+ * - MEMCG_DELAY_SCALING_SHIFT: The number of bits to scale down the
* proposed penalty in order to reduce to a reasonable number of jiffies, and
* to produce a reasonable delay curve.
*
@@ -2319,41 +2461,64 @@ static void high_work_func(struct work_struct *work)
#define MEMCG_DELAY_PRECISION_SHIFT 20
#define MEMCG_DELAY_SCALING_SHIFT 14
-/*
- * Get the number of jiffies that we should penalise a mischievous cgroup which
- * is exceeding its memory.high by checking both it and its ancestors.
- */
-static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
- unsigned int nr_pages)
+static u64 calculate_overage(unsigned long usage, unsigned long high)
{
- unsigned long penalty_jiffies;
- u64 max_overage = 0;
+ u64 overage;
- do {
- unsigned long usage, high;
- u64 overage;
+ if (usage <= high)
+ return 0;
- usage = page_counter_read(&memcg->memory);
- high = READ_ONCE(memcg->high);
+ /*
+ * Prevent division by 0 in overage calculation by acting as if
+ * it was a threshold of 1 page
+ */
+ high = max(high, 1UL);
- if (usage <= high)
- continue;
+ overage = usage - high;
+ overage <<= MEMCG_DELAY_PRECISION_SHIFT;
+ return div64_u64(overage, high);
+}
- /*
- * Prevent division by 0 in overage calculation by acting as if
- * it was a threshold of 1 page
- */
- high = max(high, 1UL);
+static u64 mem_find_max_overage(struct mem_cgroup *memcg)
+{
+ u64 overage, max_overage = 0;
- overage = usage - high;
- overage <<= MEMCG_DELAY_PRECISION_SHIFT;
- overage = div64_u64(overage, high);
+ do {
+ overage = calculate_overage(page_counter_read(&memcg->memory),
+ READ_ONCE(memcg->memory.high));
+ max_overage = max(overage, max_overage);
+ } while ((memcg = parent_mem_cgroup(memcg)) &&
+ !mem_cgroup_is_root(memcg));
+
+ return max_overage;
+}
- if (overage > max_overage)
- max_overage = overage;
+static u64 swap_find_max_overage(struct mem_cgroup *memcg)
+{
+ u64 overage, max_overage = 0;
+
+ do {
+ overage = calculate_overage(page_counter_read(&memcg->swap),
+ READ_ONCE(memcg->swap.high));
+ if (overage)
+ memcg_memory_event(memcg, MEMCG_SWAP_HIGH);
+ max_overage = max(overage, max_overage);
} while ((memcg = parent_mem_cgroup(memcg)) &&
!mem_cgroup_is_root(memcg));
+ return max_overage;
+}
+
+/*
+ * Get the number of jiffies that we should penalise a mischievous cgroup which
+ * is exceeding its memory.high by checking both it and its ancestors.
+ */
+static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
+ unsigned int nr_pages,
+ u64 max_overage)
+{
+ unsigned long penalty_jiffies;
+
if (!max_overage)
return 0;
@@ -2377,14 +2542,7 @@ static unsigned long calculate_high_delay(struct mem_cgroup *memcg,
* MEMCG_CHARGE_BATCH pages is nominal, so work out how much smaller or
* larger the current charge patch is than that.
*/
- penalty_jiffies = penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
-
- /*
- * Clamp the max delay per usermode return so as to still keep the
- * application moving forwards and also permit diagnostics, albeit
- * extremely slowly.
- */
- return min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
+ return penalty_jiffies * nr_pages / MEMCG_CHARGE_BATCH;
}
/*
@@ -2395,21 +2553,48 @@ void mem_cgroup_handle_over_high(void)
{
unsigned long penalty_jiffies;
unsigned long pflags;
+ unsigned long nr_reclaimed;
unsigned int nr_pages = current->memcg_nr_pages_over_high;
+ int nr_retries = MAX_RECLAIM_RETRIES;
struct mem_cgroup *memcg;
+ bool in_retry = false;
if (likely(!nr_pages))
return;
memcg = get_mem_cgroup_from_mm(current->mm);
- reclaim_high(memcg, nr_pages, GFP_KERNEL);
current->memcg_nr_pages_over_high = 0;
+retry_reclaim:
+ /*
+ * The allocating task should reclaim at least the batch size, but for
+ * subsequent retries we only want to do what's necessary to prevent oom
+ * or breaching resource isolation.
+ *
+ * This is distinct from memory.max or page allocator behaviour because
+ * memory.high is currently batched, whereas memory.max and the page
+ * allocator run every time an allocation is made.
+ */
+ nr_reclaimed = reclaim_high(memcg,
+ in_retry ? SWAP_CLUSTER_MAX : nr_pages,
+ GFP_KERNEL);
+
/*
* memory.high is breached and reclaim is unable to keep up. Throttle
* allocators proactively to slow down excessive growth.
*/
- penalty_jiffies = calculate_high_delay(memcg, nr_pages);
+ penalty_jiffies = calculate_high_delay(memcg, nr_pages,
+ mem_find_max_overage(memcg));
+
+ penalty_jiffies += calculate_high_delay(memcg, nr_pages,
+ swap_find_max_overage(memcg));
+
+ /*
+ * Clamp the max delay per usermode return so as to still keep the
+ * application moving forwards and also permit diagnostics, albeit
+ * extremely slowly.
+ */
+ penalty_jiffies = min(penalty_jiffies, MEMCG_MAX_HIGH_DELAY_JIFFIES);
/*
* Don't sleep if the amount of jiffies this memcg owes us is so low
@@ -2421,6 +2606,16 @@ void mem_cgroup_handle_over_high(void)
goto out;
/*
+ * If reclaim is making forward progress but we're still over
+ * memory.high, we want to encourage that rather than doing allocator
+ * throttling.
+ */
+ if (nr_reclaimed || nr_retries--) {
+ in_retry = true;
+ goto retry_reclaim;
+ }
+
+ /*
* If we exit early, we're guaranteed to die (since
* schedule_timeout_killable sets TASK_KILLABLE). This means we don't
* need to account for any ill-begotten jiffies to pay them off later.
@@ -2437,13 +2632,14 @@ static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
unsigned int nr_pages)
{
unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
- int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+ int nr_retries = MAX_RECLAIM_RETRIES;
struct mem_cgroup *mem_over_limit;
struct page_counter *counter;
+ enum oom_status oom_status;
unsigned long nr_reclaimed;
bool may_swap = true;
bool drained = false;
- enum oom_status oom_status;
+ unsigned long pflags;
if (mem_cgroup_is_root(memcg))
return 0;
@@ -2503,8 +2699,10 @@ retry:
memcg_memory_event(mem_over_limit, MEMCG_MAX);
+ psi_memstall_enter(&pflags);
nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
gfp_mask, may_swap);
+ psi_memstall_leave(&pflags);
if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
goto retry;
@@ -2556,7 +2754,7 @@ retry:
get_order(nr_pages * PAGE_SIZE));
switch (oom_status) {
case OOM_SUCCESS:
- nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+ nr_retries = MAX_RECLAIM_RETRIES;
goto retry;
case OOM_FAILED:
goto force;
@@ -2575,12 +2773,10 @@ force:
page_counter_charge(&memcg->memory, nr_pages);
if (do_memsw_account())
page_counter_charge(&memcg->memsw, nr_pages);
- css_get_many(&memcg->css, nr_pages);
return 0;
done_restock:
- css_get_many(&memcg->css, batch);
if (batch > nr_pages)
refill_stock(memcg, batch - nr_pages);
@@ -2594,12 +2790,32 @@ done_restock:
* reclaim, the cost of mismatch is negligible.
*/
do {
- if (page_counter_read(&memcg->memory) > READ_ONCE(memcg->high)) {
- /* Don't bother a random interrupted task */
- if (in_interrupt()) {
+ bool mem_high, swap_high;
+
+ mem_high = page_counter_read(&memcg->memory) >
+ READ_ONCE(memcg->memory.high);
+ swap_high = page_counter_read(&memcg->swap) >
+ READ_ONCE(memcg->swap.high);
+
+ /* Don't bother a random interrupted task */
+ if (in_interrupt()) {
+ if (mem_high) {
schedule_work(&memcg->high_work);
break;
}
+ continue;
+ }
+
+ if (mem_high || swap_high) {
+ /*
+ * The allocating tasks in this cgroup will need to do
+ * reclaim or be throttled to prevent further growth
+ * of the memory or swap footprints.
+ *
+ * Target some best-effort fairness between the tasks,
+ * and distribute reclaim work and delay penalties
+ * based on how much each task is actually allocating.
+ */
current->memcg_nr_pages_over_high += batch;
set_notify_resume(current);
break;
@@ -2609,6 +2825,7 @@ done_restock:
return 0;
}
+#if defined(CONFIG_MEMCG_KMEM) || defined(CONFIG_MMU)
static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
{
if (mem_cgroup_is_root(memcg))
@@ -2617,76 +2834,44 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
page_counter_uncharge(&memcg->memory, nr_pages);
if (do_memsw_account())
page_counter_uncharge(&memcg->memsw, nr_pages);
-
- css_put_many(&memcg->css, nr_pages);
-}
-
-static void lock_page_lru(struct page *page, int *isolated)
-{
- pg_data_t *pgdat = page_pgdat(page);
-
- spin_lock_irq(&pgdat->lru_lock);
- if (PageLRU(page)) {
- struct lruvec *lruvec;
-
- lruvec = mem_cgroup_page_lruvec(page, pgdat);
- ClearPageLRU(page);
- del_page_from_lru_list(page, lruvec, page_lru(page));
- *isolated = 1;
- } else
- *isolated = 0;
-}
-
-static void unlock_page_lru(struct page *page, int isolated)
-{
- pg_data_t *pgdat = page_pgdat(page);
-
- if (isolated) {
- struct lruvec *lruvec;
-
- lruvec = mem_cgroup_page_lruvec(page, pgdat);
- VM_BUG_ON_PAGE(PageLRU(page), page);
- SetPageLRU(page);
- add_page_to_lru_list(page, lruvec, page_lru(page));
- }
- spin_unlock_irq(&pgdat->lru_lock);
}
+#endif
-static void commit_charge(struct page *page, struct mem_cgroup *memcg,
- bool lrucare)
+static void commit_charge(struct page *page, struct mem_cgroup *memcg)
{
- int isolated;
-
VM_BUG_ON_PAGE(page->mem_cgroup, page);
-
/*
- * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page
- * may already be on some other mem_cgroup's LRU. Take care of it.
- */
- if (lrucare)
- lock_page_lru(page, &isolated);
-
- /*
- * Nobody should be changing or seriously looking at
- * page->mem_cgroup at this point:
+ * Any of the following ensures page->mem_cgroup stability:
*
- * - the page is uncharged
- *
- * - the page is off-LRU
- *
- * - an anonymous fault has exclusive page access, except for
- * a locked page table
- *
- * - a page cache insertion, a swapin fault, or a migration
- * have the page locked
+ * - the page lock
+ * - LRU isolation
+ * - lock_page_memcg()
+ * - exclusive reference
*/
page->mem_cgroup = memcg;
-
- if (lrucare)
- unlock_page_lru(page, isolated);
}
#ifdef CONFIG_MEMCG_KMEM
+int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
+ gfp_t gfp)
+{
+ unsigned int objects = objs_per_slab_page(s, page);
+ void *vec;
+
+ vec = kcalloc_node(objects, sizeof(struct obj_cgroup *), gfp,
+ page_to_nid(page));
+ if (!vec)
+ return -ENOMEM;
+
+ if (cmpxchg(&page->obj_cgroups, NULL,
+ (struct obj_cgroup **) ((unsigned long)vec | 0x1UL)))
+ kfree(vec);
+ else
+ kmemleak_not_leak(vec);
+
+ return 0;
+}
+
/*
* Returns a pointer to the memory cgroup to which the kernel object is charged.
*
@@ -2703,17 +2888,50 @@ struct mem_cgroup *mem_cgroup_from_obj(void *p)
page = virt_to_head_page(p);
/*
- * Slab pages don't have page->mem_cgroup set because corresponding
- * kmem caches can be reparented during the lifetime. That's why
- * memcg_from_slab_page() should be used instead.
+ * Slab objects are accounted individually, not per-page.
+ * Memcg membership data for each individual object is saved in
+ * the page->obj_cgroups.
*/
- if (PageSlab(page))
- return memcg_from_slab_page(page);
+ if (page_has_obj_cgroups(page)) {
+ struct obj_cgroup *objcg;
+ unsigned int off;
+
+ off = obj_to_index(page->slab_cache, page, p);
+ objcg = page_obj_cgroups(page)[off];
+ if (objcg)
+ return obj_cgroup_memcg(objcg);
+
+ return NULL;
+ }
/* All other pages use page->mem_cgroup */
return page->mem_cgroup;
}
+__always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
+{
+ struct obj_cgroup *objcg = NULL;
+ struct mem_cgroup *memcg;
+
+ if (unlikely(!current->mm && !current->active_memcg))
+ return NULL;
+
+ rcu_read_lock();
+ if (unlikely(current->active_memcg))
+ memcg = rcu_dereference(current->active_memcg);
+ else
+ memcg = mem_cgroup_from_task(current);
+
+ for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
+ objcg = rcu_dereference(memcg->objcg);
+ if (objcg && obj_cgroup_tryget(objcg))
+ break;
+ }
+ rcu_read_unlock();
+
+ return objcg;
+}
+
static int memcg_alloc_cache_id(void)
{
int id, size;
@@ -2739,9 +2957,7 @@ static int memcg_alloc_cache_id(void)
else if (size > MEMCG_CACHES_MAX_SIZE)
size = MEMCG_CACHES_MAX_SIZE;
- err = memcg_update_all_caches(size);
- if (!err)
- err = memcg_update_all_list_lrus(size);
+ err = memcg_update_all_list_lrus(size);
if (!err)
memcg_nr_cache_ids = size;
@@ -2759,143 +2975,6 @@ static void memcg_free_cache_id(int id)
ida_simple_remove(&memcg_cache_ida, id);
}
-struct memcg_kmem_cache_create_work {
- struct mem_cgroup *memcg;
- struct kmem_cache *cachep;
- struct work_struct work;
-};
-
-static void memcg_kmem_cache_create_func(struct work_struct *w)
-{
- struct memcg_kmem_cache_create_work *cw =
- container_of(w, struct memcg_kmem_cache_create_work, work);
- struct mem_cgroup *memcg = cw->memcg;
- struct kmem_cache *cachep = cw->cachep;
-
- memcg_create_kmem_cache(memcg, cachep);
-
- css_put(&memcg->css);
- kfree(cw);
-}
-
-/*
- * Enqueue the creation of a per-memcg kmem_cache.
- */
-static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
- struct kmem_cache *cachep)
-{
- struct memcg_kmem_cache_create_work *cw;
-
- if (!css_tryget_online(&memcg->css))
- return;
-
- cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN);
- if (!cw)
- return;
-
- cw->memcg = memcg;
- cw->cachep = cachep;
- INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
-
- queue_work(memcg_kmem_cache_wq, &cw->work);
-}
-
-static inline bool memcg_kmem_bypass(void)
-{
- if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
- return true;
- return false;
-}
-
-/**
- * memcg_kmem_get_cache: select the correct per-memcg cache for allocation
- * @cachep: the original global kmem cache
- *
- * Return the kmem_cache we're supposed to use for a slab allocation.
- * We try to use the current memcg's version of the cache.
- *
- * If the cache does not exist yet, if we are the first user of it, we
- * create it asynchronously in a workqueue and let the current allocation
- * go through with the original cache.
- *
- * This function takes a reference to the cache it returns to assure it
- * won't get destroyed while we are working with it. Once the caller is
- * done with it, memcg_kmem_put_cache() must be called to release the
- * reference.
- */
-struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
-{
- struct mem_cgroup *memcg;
- struct kmem_cache *memcg_cachep;
- struct memcg_cache_array *arr;
- int kmemcg_id;
-
- VM_BUG_ON(!is_root_cache(cachep));
-
- if (memcg_kmem_bypass())
- return cachep;
-
- rcu_read_lock();
-
- if (unlikely(current->active_memcg))
- memcg = current->active_memcg;
- else
- memcg = mem_cgroup_from_task(current);
-
- if (!memcg || memcg == root_mem_cgroup)
- goto out_unlock;
-
- kmemcg_id = READ_ONCE(memcg->kmemcg_id);
- if (kmemcg_id < 0)
- goto out_unlock;
-
- arr = rcu_dereference(cachep->memcg_params.memcg_caches);
-
- /*
- * Make sure we will access the up-to-date value. The code updating
- * memcg_caches issues a write barrier to match the data dependency
- * barrier inside READ_ONCE() (see memcg_create_kmem_cache()).
- */
- memcg_cachep = READ_ONCE(arr->entries[kmemcg_id]);
-
- /*
- * If we are in a safe context (can wait, and not in interrupt
- * context), we could be be predictable and return right away.
- * This would guarantee that the allocation being performed
- * already belongs in the new cache.
- *
- * However, there are some clashes that can arrive from locking.
- * For instance, because we acquire the slab_mutex while doing
- * memcg_create_kmem_cache, this means no further allocation
- * could happen with the slab_mutex held. So it's better to
- * defer everything.
- *
- * If the memcg is dying or memcg_cache is about to be released,
- * don't bother creating new kmem_caches. Because memcg_cachep
- * is ZEROed as the fist step of kmem offlining, we don't need
- * percpu_ref_tryget_live() here. css_tryget_online() check in
- * memcg_schedule_kmem_cache_create() will prevent us from
- * creation of a new kmem_cache.
- */
- if (unlikely(!memcg_cachep))
- memcg_schedule_kmem_cache_create(memcg, cachep);
- else if (percpu_ref_tryget(&memcg_cachep->memcg_params.refcnt))
- cachep = memcg_cachep;
-out_unlock:
- rcu_read_unlock();
- return cachep;
-}
-
-/**
- * memcg_kmem_put_cache: drop reference taken by memcg_kmem_get_cache
- * @cachep: the cache returned by memcg_kmem_get_cache
- */
-void memcg_kmem_put_cache(struct kmem_cache *cachep)
-{
- if (!is_root_cache(cachep))
- percpu_ref_put(&cachep->memcg_params.refcnt);
-}
-
/**
* __memcg_kmem_charge: charge a number of kernel pages to a memcg
* @memcg: memory cgroup to charge
@@ -2969,6 +3048,7 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
if (!ret) {
page->mem_cgroup = memcg;
__SetPageKmemcg(page);
+ return 0;
}
}
css_put(&memcg->css);
@@ -2991,13 +3071,146 @@ void __memcg_kmem_uncharge_page(struct page *page, int order)
VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
__memcg_kmem_uncharge(memcg, nr_pages);
page->mem_cgroup = NULL;
+ css_put(&memcg->css);
/* slab pages do not have PageKmemcg flag set */
if (PageKmemcg(page))
__ClearPageKmemcg(page);
+}
+
+static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
+{
+ struct memcg_stock_pcp *stock;
+ unsigned long flags;
+ bool ret = false;
+
+ local_irq_save(flags);
+
+ stock = this_cpu_ptr(&memcg_stock);
+ if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) {
+ stock->nr_bytes -= nr_bytes;
+ ret = true;
+ }
+
+ local_irq_restore(flags);
+
+ return ret;
+}
+
+static void drain_obj_stock(struct memcg_stock_pcp *stock)
+{
+ struct obj_cgroup *old = stock->cached_objcg;
+
+ if (!old)
+ return;
+
+ if (stock->nr_bytes) {
+ unsigned int nr_pages = stock->nr_bytes >> PAGE_SHIFT;
+ unsigned int nr_bytes = stock->nr_bytes & (PAGE_SIZE - 1);
+
+ if (nr_pages) {
+ rcu_read_lock();
+ __memcg_kmem_uncharge(obj_cgroup_memcg(old), nr_pages);
+ rcu_read_unlock();
+ }
+
+ /*
+ * The leftover is flushed to the centralized per-memcg value.
+ * On the next attempt to refill obj stock it will be moved
+ * to a per-cpu stock (probably on another CPU), see
+ * refill_obj_stock().
+ *
+ * How often it's flushed is a trade-off between the memory
+ * limit enforcement accuracy and potential CPU contention,
+ * so it might be changed in the future.
+ */
+ atomic_add(nr_bytes, &old->nr_charged_bytes);
+ stock->nr_bytes = 0;
+ }
+
+ obj_cgroup_put(old);
+ stock->cached_objcg = NULL;
+}
+
+static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
+ struct mem_cgroup *root_memcg)
+{
+ struct mem_cgroup *memcg;
+
+ if (stock->cached_objcg) {
+ memcg = obj_cgroup_memcg(stock->cached_objcg);
+ if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
+ return true;
+ }
+
+ return false;
+}
+
+static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
+{
+ struct memcg_stock_pcp *stock;
+ unsigned long flags;
+
+ local_irq_save(flags);
+
+ stock = this_cpu_ptr(&memcg_stock);
+ if (stock->cached_objcg != objcg) { /* reset if necessary */
+ drain_obj_stock(stock);
+ obj_cgroup_get(objcg);
+ stock->cached_objcg = objcg;
+ stock->nr_bytes = atomic_xchg(&objcg->nr_charged_bytes, 0);
+ }
+ stock->nr_bytes += nr_bytes;
+
+ if (stock->nr_bytes > PAGE_SIZE)
+ drain_obj_stock(stock);
+
+ local_irq_restore(flags);
+}
+
+int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
+{
+ struct mem_cgroup *memcg;
+ unsigned int nr_pages, nr_bytes;
+ int ret;
+
+ if (consume_obj_stock(objcg, size))
+ return 0;
- css_put_many(&memcg->css, nr_pages);
+ /*
+ * In theory, objcg->nr_charged_bytes can have enough
+ * pre-charged bytes to satisfy the allocation. However,
+ * flushing objcg->nr_charged_bytes requires two atomic
+ * operations, and objcg->nr_charged_bytes can't be big,
+ * so it's better to ignore it and try to grab some new pages.
+ * objcg->nr_charged_bytes will be flushed in
+ * refill_obj_stock(), called from this function or
+ * independently later.
+ */
+ rcu_read_lock();
+ memcg = obj_cgroup_memcg(objcg);
+ css_get(&memcg->css);
+ rcu_read_unlock();
+
+ nr_pages = size >> PAGE_SHIFT;
+ nr_bytes = size & (PAGE_SIZE - 1);
+
+ if (nr_bytes)
+ nr_pages += 1;
+
+ ret = __memcg_kmem_charge(memcg, gfp, nr_pages);
+ if (!ret && nr_bytes)
+ refill_obj_stock(objcg, PAGE_SIZE - nr_bytes);
+
+ css_put(&memcg->css);
+ return ret;
}
+
+void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
+{
+ refill_obj_stock(objcg, size);
+}
+
#endif /* CONFIG_MEMCG_KMEM */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -3008,15 +3221,16 @@ void __memcg_kmem_uncharge_page(struct page *page, int order)
*/
void mem_cgroup_split_huge_fixup(struct page *head)
{
+ struct mem_cgroup *memcg = head->mem_cgroup;
int i;
if (mem_cgroup_disabled())
return;
- for (i = 1; i < HPAGE_PMD_NR; i++)
- head[i].mem_cgroup = head->mem_cgroup;
-
- __mod_memcg_state(head->mem_cgroup, MEMCG_RSS_HUGE, -HPAGE_PMD_NR);
+ for (i = 1; i < HPAGE_PMD_NR; i++) {
+ css_get(&memcg->css);
+ head[i].mem_cgroup = memcg;
+ }
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -3201,7 +3415,7 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
* Test whether @memcg has children, dead or alive. Note that this
* function doesn't care whether @memcg has use_hierarchy enabled and
* returns %true if there are child csses according to the cgroup
- * hierarchy. Testing use_hierarchy is the caller's responsiblity.
+ * hierarchy. Testing use_hierarchy is the caller's responsibility.
*/
static inline bool memcg_has_children(struct mem_cgroup *memcg)
{
@@ -3220,7 +3434,7 @@ static inline bool memcg_has_children(struct mem_cgroup *memcg)
*/
static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
{
- int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+ int nr_retries = MAX_RECLAIM_RETRIES;
/* we call try-to-free pages for make this cgroup empty */
lru_add_drain_all();
@@ -3299,8 +3513,8 @@ static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
unsigned long val;
if (mem_cgroup_is_root(memcg)) {
- val = memcg_page_state(memcg, MEMCG_CACHE) +
- memcg_page_state(memcg, MEMCG_RSS);
+ val = memcg_page_state(memcg, NR_FILE_PAGES) +
+ memcg_page_state(memcg, NR_ANON_MAPPED);
if (swap)
val += memcg_page_state(memcg, MEMCG_SWAP);
} else {
@@ -3417,6 +3631,7 @@ static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg)
#ifdef CONFIG_MEMCG_KMEM
static int memcg_online_kmem(struct mem_cgroup *memcg)
{
+ struct obj_cgroup *objcg;
int memcg_id;
if (cgroup_memory_nokmem)
@@ -3429,7 +3644,16 @@ static int memcg_online_kmem(struct mem_cgroup *memcg)
if (memcg_id < 0)
return memcg_id;
- static_branch_inc(&memcg_kmem_enabled_key);
+ objcg = obj_cgroup_alloc();
+ if (!objcg) {
+ memcg_free_cache_id(memcg_id);
+ return -ENOMEM;
+ }
+ objcg->memcg = memcg;
+ rcu_assign_pointer(memcg->objcg, objcg);
+
+ static_branch_enable(&memcg_kmem_enabled_key);
+
/*
* A memory cgroup is considered kmem-online as soon as it gets
* kmemcg_id. Setting the id after enabling static branching will
@@ -3438,7 +3662,6 @@ static int memcg_online_kmem(struct mem_cgroup *memcg)
*/
memcg->kmemcg_id = memcg_id;
memcg->kmem_state = KMEM_ONLINE;
- INIT_LIST_HEAD(&memcg->kmem_caches);
return 0;
}
@@ -3451,22 +3674,14 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg)
if (memcg->kmem_state != KMEM_ONLINE)
return;
- /*
- * Clear the online state before clearing memcg_caches array
- * entries. The slab_mutex in memcg_deactivate_kmem_caches()
- * guarantees that no cache will be created for this cgroup
- * after we are done (see memcg_create_kmem_cache()).
- */
+
memcg->kmem_state = KMEM_ALLOCATED;
parent = parent_mem_cgroup(memcg);
if (!parent)
parent = root_mem_cgroup;
- /*
- * Deactivate and reparent kmem_caches.
- */
- memcg_deactivate_kmem_caches(memcg, parent);
+ memcg_reparent_objcgs(memcg, parent);
kmemcg_id = memcg->kmemcg_id;
BUG_ON(kmemcg_id < 0);
@@ -3499,11 +3714,6 @@ static void memcg_free_kmem(struct mem_cgroup *memcg)
/* css_alloc() failed, offlining didn't happen */
if (unlikely(memcg->kmem_state == KMEM_ONLINE))
memcg_offline_kmem(memcg);
-
- if (memcg->kmem_state == KMEM_ALLOCATED) {
- WARN_ON(!list_empty(&memcg->kmem_caches));
- static_branch_dec(&memcg_kmem_enabled_key);
- }
}
#else
static int memcg_online_kmem(struct mem_cgroup *memcg)
@@ -3688,7 +3898,7 @@ static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
#define LRU_ALL ((1 << NR_LRU_LISTS) - 1)
static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
- int nid, unsigned int lru_mask)
+ int nid, unsigned int lru_mask, bool tree)
{
struct lruvec *lruvec = mem_cgroup_lruvec(memcg, NODE_DATA(nid));
unsigned long nr = 0;
@@ -3699,13 +3909,17 @@ static unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
for_each_lru(lru) {
if (!(BIT(lru) & lru_mask))
continue;
- nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
+ if (tree)
+ nr += lruvec_page_state(lruvec, NR_LRU_BASE + lru);
+ else
+ nr += lruvec_page_state_local(lruvec, NR_LRU_BASE + lru);
}
return nr;
}
static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
- unsigned int lru_mask)
+ unsigned int lru_mask,
+ bool tree)
{
unsigned long nr = 0;
enum lru_list lru;
@@ -3713,7 +3927,10 @@ static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
for_each_lru(lru) {
if (!(BIT(lru) & lru_mask))
continue;
- nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru);
+ if (tree)
+ nr += memcg_page_state(memcg, NR_LRU_BASE + lru);
+ else
+ nr += memcg_page_state_local(memcg, NR_LRU_BASE + lru);
}
return nr;
}
@@ -3733,34 +3950,28 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v)
};
const struct numa_stat *stat;
int nid;
- unsigned long nr;
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
- nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
- seq_printf(m, "%s=%lu", stat->name, nr);
- for_each_node_state(nid, N_MEMORY) {
- nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
- stat->lru_mask);
- seq_printf(m, " N%d=%lu", nid, nr);
- }
+ seq_printf(m, "%s=%lu", stat->name,
+ mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
+ false));
+ for_each_node_state(nid, N_MEMORY)
+ seq_printf(m, " N%d=%lu", nid,
+ mem_cgroup_node_nr_lru_pages(memcg, nid,
+ stat->lru_mask, false));
seq_putc(m, '\n');
}
for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
- struct mem_cgroup *iter;
-
- nr = 0;
- for_each_mem_cgroup_tree(iter, memcg)
- nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask);
- seq_printf(m, "hierarchical_%s=%lu", stat->name, nr);
- for_each_node_state(nid, N_MEMORY) {
- nr = 0;
- for_each_mem_cgroup_tree(iter, memcg)
- nr += mem_cgroup_node_nr_lru_pages(
- iter, nid, stat->lru_mask);
- seq_printf(m, " N%d=%lu", nid, nr);
- }
+
+ seq_printf(m, "hierarchical_%s=%lu", stat->name,
+ mem_cgroup_nr_lru_pages(memcg, stat->lru_mask,
+ true));
+ for_each_node_state(nid, N_MEMORY)
+ seq_printf(m, " N%d=%lu", nid,
+ mem_cgroup_node_nr_lru_pages(memcg, nid,
+ stat->lru_mask, true));
seq_putc(m, '\n');
}
@@ -3769,9 +3980,11 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v)
#endif /* CONFIG_NUMA */
static const unsigned int memcg1_stats[] = {
- MEMCG_CACHE,
- MEMCG_RSS,
- MEMCG_RSS_HUGE,
+ NR_FILE_PAGES,
+ NR_ANON_MAPPED,
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ NR_ANON_THPS,
+#endif
NR_SHMEM,
NR_FILE_MAPPED,
NR_FILE_DIRTY,
@@ -3782,7 +3995,9 @@ static const unsigned int memcg1_stats[] = {
static const char *const memcg1_stat_names[] = {
"cache",
"rss",
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
"rss_huge",
+#endif
"shmem",
"mapped_file",
"dirty",
@@ -3808,11 +4023,16 @@ static int memcg_stat_show(struct seq_file *m, void *v)
BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
+ unsigned long nr;
+
if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
continue;
- seq_printf(m, "%s %lu\n", memcg1_stat_names[i],
- memcg_page_state_local(memcg, memcg1_stats[i]) *
- PAGE_SIZE);
+ nr = memcg_page_state_local(memcg, memcg1_stats[i]);
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ if (memcg1_stats[i] == NR_ANON_THPS)
+ nr *= HPAGE_PMD_NR;
+#endif
+ seq_printf(m, "%s %lu\n", memcg1_stat_names[i], nr * PAGE_SIZE);
}
for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
@@ -3858,23 +4078,17 @@ static int memcg_stat_show(struct seq_file *m, void *v)
{
pg_data_t *pgdat;
struct mem_cgroup_per_node *mz;
- struct zone_reclaim_stat *rstat;
- unsigned long recent_rotated[2] = {0, 0};
- unsigned long recent_scanned[2] = {0, 0};
+ unsigned long anon_cost = 0;
+ unsigned long file_cost = 0;
for_each_online_pgdat(pgdat) {
mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
- rstat = &mz->lruvec.reclaim_stat;
- recent_rotated[0] += rstat->recent_rotated[0];
- recent_rotated[1] += rstat->recent_rotated[1];
- recent_scanned[0] += rstat->recent_scanned[0];
- recent_scanned[1] += rstat->recent_scanned[1];
+ anon_cost += mz->lruvec.anon_cost;
+ file_cost += mz->lruvec.file_cost;
}
- seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
- seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
- seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
- seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);
+ seq_printf(m, "anon_cost %lu\n", anon_cost);
+ seq_printf(m, "file_cost %lu\n", file_cost);
}
#endif
@@ -4330,7 +4544,6 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
*pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY);
- /* this should eventually include NR_UNSTABLE_NFS */
*pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK);
*pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) +
memcg_exact_page_state(memcg, NR_ACTIVE_FILE);
@@ -4338,7 +4551,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
while ((parent = parent_mem_cgroup(memcg))) {
unsigned long ceiling = min(READ_ONCE(memcg->memory.max),
- READ_ONCE(memcg->high));
+ READ_ONCE(memcg->memory.high));
unsigned long used = page_counter_read(&memcg->memory);
*pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
@@ -4810,9 +5023,6 @@ static struct cftype mem_cgroup_legacy_files[] = {
(defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG))
{
.name = "kmem.slabinfo",
- .seq_start = memcg_slab_start,
- .seq_next = memcg_slab_next,
- .seq_stop = memcg_slab_stop,
.seq_show = memcg_slab_show,
},
#endif
@@ -4850,7 +5060,7 @@ static struct cftype mem_cgroup_legacy_files[] = {
* limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of
* memory-controlled cgroups to 64k.
*
- * However, there usually are many references to the oflline CSS after
+ * However, there usually are many references to the offline CSS after
* the cgroup has been destroyed, such as page cache or reclaimable
* slab objects, that don't need to hang on to the ID. We want to keep
* those dead CSS from occupying IDs, or we might quickly exhaust the
@@ -4927,13 +5137,15 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
if (!pn)
return 1;
- pn->lruvec_stat_local = alloc_percpu(struct lruvec_stat);
+ pn->lruvec_stat_local = alloc_percpu_gfp(struct lruvec_stat,
+ GFP_KERNEL_ACCOUNT);
if (!pn->lruvec_stat_local) {
kfree(pn);
return 1;
}
- pn->lruvec_stat_cpu = alloc_percpu(struct lruvec_stat);
+ pn->lruvec_stat_cpu = alloc_percpu_gfp(struct lruvec_stat,
+ GFP_KERNEL_ACCOUNT);
if (!pn->lruvec_stat_cpu) {
free_percpu(pn->lruvec_stat_local);
kfree(pn);
@@ -5007,11 +5219,13 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
goto fail;
}
- memcg->vmstats_local = alloc_percpu(struct memcg_vmstats_percpu);
+ memcg->vmstats_local = alloc_percpu_gfp(struct memcg_vmstats_percpu,
+ GFP_KERNEL_ACCOUNT);
if (!memcg->vmstats_local)
goto fail;
- memcg->vmstats_percpu = alloc_percpu(struct memcg_vmstats_percpu);
+ memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu,
+ GFP_KERNEL_ACCOUNT);
if (!memcg->vmstats_percpu)
goto fail;
@@ -5032,6 +5246,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void)
memcg->socket_pressure = jiffies;
#ifdef CONFIG_MEMCG_KMEM
memcg->kmemcg_id = -1;
+ INIT_LIST_HEAD(&memcg->objcg_list);
#endif
#ifdef CONFIG_CGROUP_WRITEBACK
INIT_LIST_HEAD(&memcg->cgwb_list);
@@ -5059,12 +5274,15 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
struct mem_cgroup *memcg;
long error = -ENOMEM;
+ memalloc_use_memcg(parent);
memcg = mem_cgroup_alloc();
+ memalloc_unuse_memcg();
if (IS_ERR(memcg))
return ERR_CAST(memcg);
- WRITE_ONCE(memcg->high, PAGE_COUNTER_MAX);
+ page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
memcg->soft_limit = PAGE_COUNTER_MAX;
+ page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
if (parent) {
memcg->swappiness = mem_cgroup_swappiness(parent);
memcg->oom_kill_disable = parent->oom_kill_disable;
@@ -5093,9 +5311,6 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
/* The following stuff does not apply to the root */
if (!parent) {
-#ifdef CONFIG_MEMCG_KMEM
- INIT_LIST_HEAD(&memcg->kmem_caches);
-#endif
root_mem_cgroup = memcg;
return &memcg->css;
}
@@ -5216,8 +5431,9 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
page_counter_set_min(&memcg->memory, 0);
page_counter_set_low(&memcg->memory, 0);
- WRITE_ONCE(memcg->high, PAGE_COUNTER_MAX);
+ page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX);
memcg->soft_limit = PAGE_COUNTER_MAX;
+ page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX);
memcg_wb_domain_size_changed(memcg);
}
@@ -5308,8 +5524,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
* we call find_get_page() with swapper_space directly.
*/
page = find_get_page(swap_address_space(ent), swp_offset(ent));
- if (do_memsw_account())
- entry->val = ent.val;
+ entry->val = ent.val;
return page;
}
@@ -5343,8 +5558,7 @@ static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
page = find_get_entry(mapping, pgoff);
if (xa_is_value(page)) {
swp_entry_t swp = radix_to_swp_entry(page);
- if (do_memsw_account())
- *entry = swp;
+ *entry = swp;
page = find_get_page(swap_address_space(swp),
swp_offset(swp));
}
@@ -5375,10 +5589,8 @@ static int mem_cgroup_move_account(struct page *page,
{
struct lruvec *from_vec, *to_vec;
struct pglist_data *pgdat;
- unsigned long flags;
- unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
+ unsigned int nr_pages = compound ? thp_nr_pages(page) : 1;
int ret;
- bool anon;
VM_BUG_ON(from == to);
VM_BUG_ON_PAGE(PageLRU(page), page);
@@ -5396,30 +5608,47 @@ static int mem_cgroup_move_account(struct page *page,
if (page->mem_cgroup != from)
goto out_unlock;
- anon = PageAnon(page);
-
pgdat = page_pgdat(page);
from_vec = mem_cgroup_lruvec(from, pgdat);
to_vec = mem_cgroup_lruvec(to, pgdat);
- spin_lock_irqsave(&from->move_lock, flags);
+ lock_page_memcg(page);
- if (!anon && page_mapped(page)) {
- __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages);
- __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages);
- }
+ if (PageAnon(page)) {
+ if (page_mapped(page)) {
+ __mod_lruvec_state(from_vec, NR_ANON_MAPPED, -nr_pages);
+ __mod_lruvec_state(to_vec, NR_ANON_MAPPED, nr_pages);
+ if (PageTransHuge(page)) {
+ __mod_lruvec_state(from_vec, NR_ANON_THPS,
+ -nr_pages);
+ __mod_lruvec_state(to_vec, NR_ANON_THPS,
+ nr_pages);
+ }
- /*
- * move_lock grabbed above and caller set from->moving_account, so
- * mod_memcg_page_state will serialize updates to PageDirty.
- * So mapping should be stable for dirty pages.
- */
- if (!anon && PageDirty(page)) {
- struct address_space *mapping = page_mapping(page);
+ }
+ } else {
+ __mod_lruvec_state(from_vec, NR_FILE_PAGES, -nr_pages);
+ __mod_lruvec_state(to_vec, NR_FILE_PAGES, nr_pages);
- if (mapping_cap_account_dirty(mapping)) {
- __mod_lruvec_state(from_vec, NR_FILE_DIRTY, -nr_pages);
- __mod_lruvec_state(to_vec, NR_FILE_DIRTY, nr_pages);
+ if (PageSwapBacked(page)) {
+ __mod_lruvec_state(from_vec, NR_SHMEM, -nr_pages);
+ __mod_lruvec_state(to_vec, NR_SHMEM, nr_pages);
+ }
+
+ if (page_mapped(page)) {
+ __mod_lruvec_state(from_vec, NR_FILE_MAPPED, -nr_pages);
+ __mod_lruvec_state(to_vec, NR_FILE_MAPPED, nr_pages);
+ }
+
+ if (PageDirty(page)) {
+ struct address_space *mapping = page_mapping(page);
+
+ if (mapping_cap_account_dirty(mapping)) {
+ __mod_lruvec_state(from_vec, NR_FILE_DIRTY,
+ -nr_pages);
+ __mod_lruvec_state(to_vec, NR_FILE_DIRTY,
+ nr_pages);
+ }
}
}
@@ -5429,22 +5658,33 @@ static int mem_cgroup_move_account(struct page *page,
}
/*
+ * All state has been migrated, let's switch to the new memcg.
+ *
* It is safe to change page->mem_cgroup here because the page
- * is referenced, charged, and isolated - we can't race with
- * uncharging, charging, migration, or LRU putback.
+ * is referenced, charged, isolated, and locked: we can't race
+ * with (un)charging, migration, LRU putback, or anything else
+ * that would rely on a stable page->mem_cgroup.
+ *
+ * Note that lock_page_memcg is a memcg lock, not a page lock,
+ * to save space. As soon as we switch page->mem_cgroup to a
+ * new memcg that isn't locked, the above state can change
+ * concurrently again. Make sure we're truly done with it.
*/
+ smp_mb();
+
+ css_get(&to->css);
+ css_put(&from->css);
- /* caller should have done css_get */
page->mem_cgroup = to;
- spin_unlock_irqrestore(&from->move_lock, flags);
+ __unlock_page_memcg(from);
ret = 0;
local_irq_disable();
- mem_cgroup_charge_statistics(to, page, compound, nr_pages);
+ mem_cgroup_charge_statistics(to, page, nr_pages);
memcg_check_events(to, page);
- mem_cgroup_charge_statistics(from, page, compound, -nr_pages);
+ mem_cgroup_charge_statistics(from, page, -nr_pages);
memcg_check_events(from, page);
local_irq_enable();
out_unlock:
@@ -5603,9 +5843,9 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
{
unsigned long precharge;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL);
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
precharge = mc.precharge;
mc.precharge = 0;
@@ -5656,9 +5896,6 @@ static void __mem_cgroup_clear_mc(void)
if (!mem_cgroup_is_root(mc.to))
page_counter_uncharge(&mc.to->memory, mc.moved_swap);
- mem_cgroup_id_get_many(mc.to, mc.moved_swap);
- css_put_many(&mc.to->css, mc.moved_swap);
-
mc.moved_swap = 0;
}
memcg_oom_recover(from);
@@ -5847,7 +6084,8 @@ put: /* get_mctgt_type() gets the page */
ent = target.ent;
if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
mc.precharge--;
- /* we fixup refcnts and charges later. */
+ mem_cgroup_id_get_many(mc.to, 1);
+ /* we fixup other refcnts and charges later. */
mc.moved_swap++;
}
break;
@@ -5888,9 +6126,9 @@ static void mem_cgroup_move_charge(void)
atomic_inc(&mc.from->moving_account);
synchronize_rcu();
retry:
- if (unlikely(!down_read_trylock(&mc.mm->mmap_sem))) {
+ if (unlikely(!mmap_read_trylock(mc.mm))) {
/*
- * Someone who are holding the mmap_sem might be waiting in
+ * Someone who is holding the mmap_lock might be waiting in
* waitq. So we cancel all extra charges, wake up all waiters,
* and retry. Because we cancel precharges, we might not be able
* to move enough charges, but moving charge is a best-effort
@@ -5907,7 +6145,7 @@ retry:
walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops,
NULL);
- up_read(&mc.mm->mmap_sem);
+ mmap_read_unlock(mc.mm);
atomic_dec(&mc.from->moving_account);
}
@@ -6015,14 +6253,15 @@ static ssize_t memory_low_write(struct kernfs_open_file *of,
static int memory_high_show(struct seq_file *m, void *v)
{
- return seq_puts_memcg_tunable(m, READ_ONCE(mem_cgroup_from_seq(m)->high));
+ return seq_puts_memcg_tunable(m,
+ READ_ONCE(mem_cgroup_from_seq(m)->memory.high));
}
static ssize_t memory_high_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
- unsigned int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
+ unsigned int nr_retries = MAX_RECLAIM_RETRIES;
bool drained = false;
unsigned long high;
int err;
@@ -6032,8 +6271,6 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
if (err)
return err;
- WRITE_ONCE(memcg->high, high);
-
for (;;) {
unsigned long nr_pages = page_counter_read(&memcg->memory);
unsigned long reclaimed;
@@ -6057,6 +6294,10 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
break;
}
+ page_counter_set_high(&memcg->memory, high);
+
+ memcg_wb_domain_size_changed(memcg);
+
return nbytes;
}
@@ -6070,7 +6311,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
char *buf, size_t nbytes, loff_t off)
{
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
- unsigned int nr_reclaims = MEM_CGROUP_RECLAIM_RETRIES;
+ unsigned int nr_reclaims = MAX_RECLAIM_RETRIES;
bool drained = false;
unsigned long max;
int err;
@@ -6227,7 +6468,6 @@ static struct cftype memory_files[] = {
},
{
.name = "stat",
- .flags = CFTYPE_NOT_ON_ROOT,
.seq_show = memory_stat_show,
},
{
@@ -6349,11 +6589,16 @@ static unsigned long effective_protection(unsigned long usage,
* We're using unprotected memory for the weight so that if
* some cgroups DO claim explicit protection, we don't protect
* the same bytes twice.
+ *
+ * Check both usage and parent_usage against the respective
+ * protected values. One should imply the other, but they
+ * aren't read atomically - make sure the division is sane.
*/
if (!(cgrp_dfl_root.flags & CGRP_ROOT_MEMORY_RECURSIVE_PROT))
return ep;
-
- if (parent_effective > siblings_protected && usage > protected) {
+ if (parent_effective > siblings_protected &&
+ parent_usage > siblings_protected &&
+ usage > protected) {
unsigned long unclaimed;
unclaimed = parent_effective - siblings_protected;
@@ -6373,40 +6618,42 @@ static unsigned long effective_protection(unsigned long usage,
*
* WARNING: This function is not stateless! It can only be used as part
* of a top-down tree iteration, not for isolated queries.
- *
- * Returns one of the following:
- * MEMCG_PROT_NONE: cgroup memory is not protected
- * MEMCG_PROT_LOW: cgroup memory is protected as long there is
- * an unprotected supply of reclaimable memory from other cgroups.
- * MEMCG_PROT_MIN: cgroup memory is protected
*/
-enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
- struct mem_cgroup *memcg)
+void mem_cgroup_calculate_protection(struct mem_cgroup *root,
+ struct mem_cgroup *memcg)
{
unsigned long usage, parent_usage;
struct mem_cgroup *parent;
if (mem_cgroup_disabled())
- return MEMCG_PROT_NONE;
+ return;
if (!root)
root = root_mem_cgroup;
+
+ /*
+ * Effective values of the reclaim targets are ignored so they
+ * can be stale. Have a look at mem_cgroup_protection for more
+ * details.
+ * TODO: calculation should be more robust so that we do not need
+ * that special casing.
+ */
if (memcg == root)
- return MEMCG_PROT_NONE;
+ return;
usage = page_counter_read(&memcg->memory);
if (!usage)
- return MEMCG_PROT_NONE;
+ return;
parent = parent_mem_cgroup(memcg);
/* No parent means a non-hierarchical mode on v1 memcg */
if (!parent)
- return MEMCG_PROT_NONE;
+ return;
if (parent == root) {
memcg->memory.emin = READ_ONCE(memcg->memory.min);
- memcg->memory.elow = memcg->memory.low;
- goto out;
+ memcg->memory.elow = READ_ONCE(memcg->memory.low);
+ return;
}
parent_usage = page_counter_read(&parent->memory);
@@ -6417,138 +6664,70 @@ enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
atomic_long_read(&parent->memory.children_min_usage)));
WRITE_ONCE(memcg->memory.elow, effective_protection(usage, parent_usage,
- memcg->memory.low, READ_ONCE(parent->memory.elow),
+ READ_ONCE(memcg->memory.low),
+ READ_ONCE(parent->memory.elow),
atomic_long_read(&parent->memory.children_low_usage)));
-
-out:
- if (usage <= memcg->memory.emin)
- return MEMCG_PROT_MIN;
- else if (usage <= memcg->memory.elow)
- return MEMCG_PROT_LOW;
- else
- return MEMCG_PROT_NONE;
}
/**
- * mem_cgroup_try_charge - try charging a page
+ * mem_cgroup_charge - charge a newly allocated page to a cgroup
* @page: page to charge
* @mm: mm context of the victim
* @gfp_mask: reclaim mode
- * @memcgp: charged memcg return
- * @compound: charge the page as compound or small page
*
* Try to charge @page to the memcg that @mm belongs to, reclaiming
* pages according to @gfp_mask if necessary.
*
- * Returns 0 on success, with *@memcgp pointing to the charged memcg.
- * Otherwise, an error code is returned.
- *
- * After page->mapping has been set up, the caller must finalize the
- * charge with mem_cgroup_commit_charge(). Or abort the transaction
- * with mem_cgroup_cancel_charge() in case page instantiation fails.
+ * Returns 0 on success. Otherwise, an error code is returned.
*/
-int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
- gfp_t gfp_mask, struct mem_cgroup **memcgp,
- bool compound)
+int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
{
+ unsigned int nr_pages = thp_nr_pages(page);
struct mem_cgroup *memcg = NULL;
- unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
int ret = 0;
if (mem_cgroup_disabled())
goto out;
if (PageSwapCache(page)) {
+ swp_entry_t ent = { .val = page_private(page), };
+ unsigned short id;
+
/*
* Every swap fault against a single page tries to charge the
* page, bail as early as possible. shmem_unuse() encounters
- * already charged pages, too. The USED bit is protected by
- * the page lock, which serializes swap cache removal, which
+ * already charged pages, too. page->mem_cgroup is protected
+ * by the page lock, which serializes swap cache removal, which
* in turn serializes uncharging.
*/
VM_BUG_ON_PAGE(!PageLocked(page), page);
if (compound_head(page)->mem_cgroup)
goto out;
- if (do_swap_account) {
- swp_entry_t ent = { .val = page_private(page), };
- unsigned short id = lookup_swap_cgroup_id(ent);
-
- rcu_read_lock();
- memcg = mem_cgroup_from_id(id);
- if (memcg && !css_tryget_online(&memcg->css))
- memcg = NULL;
- rcu_read_unlock();
- }
+ id = lookup_swap_cgroup_id(ent);
+ rcu_read_lock();
+ memcg = mem_cgroup_from_id(id);
+ if (memcg && !css_tryget_online(&memcg->css))
+ memcg = NULL;
+ rcu_read_unlock();
}
if (!memcg)
memcg = get_mem_cgroup_from_mm(mm);
ret = try_charge(memcg, gfp_mask, nr_pages);
+ if (ret)
+ goto out_put;
- css_put(&memcg->css);
-out:
- *memcgp = memcg;
- return ret;
-}
-
-int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm,
- gfp_t gfp_mask, struct mem_cgroup **memcgp,
- bool compound)
-{
- struct mem_cgroup *memcg;
- int ret;
-
- ret = mem_cgroup_try_charge(page, mm, gfp_mask, memcgp, compound);
- memcg = *memcgp;
- mem_cgroup_throttle_swaprate(memcg, page_to_nid(page), gfp_mask);
- return ret;
-}
-
-/**
- * mem_cgroup_commit_charge - commit a page charge
- * @page: page to charge
- * @memcg: memcg to charge the page to
- * @lrucare: page might be on LRU already
- * @compound: charge the page as compound or small page
- *
- * Finalize a charge transaction started by mem_cgroup_try_charge(),
- * after page->mapping has been set up. This must happen atomically
- * as part of the page instantiation, i.e. under the page table lock
- * for anonymous pages, under the page lock for page and swap cache.
- *
- * In addition, the page must not be on the LRU during the commit, to
- * prevent racing with task migration. If it might be, use @lrucare.
- *
- * Use mem_cgroup_cancel_charge() to cancel the transaction instead.
- */
-void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
- bool lrucare, bool compound)
-{
- unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
-
- VM_BUG_ON_PAGE(!page->mapping, page);
- VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page);
-
- if (mem_cgroup_disabled())
- return;
- /*
- * Swap faults will attempt to charge the same page multiple
- * times. But reuse_swap_page() might have removed the page
- * from swapcache already, so we can't check PageSwapCache().
- */
- if (!memcg)
- return;
-
- commit_charge(page, memcg, lrucare);
+ css_get(&memcg->css);
+ commit_charge(page, memcg);
local_irq_disable();
- mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
+ mem_cgroup_charge_statistics(memcg, page, nr_pages);
memcg_check_events(memcg, page);
local_irq_enable();
- if (do_memsw_account() && PageSwapCache(page)) {
+ if (PageSwapCache(page)) {
swp_entry_t entry = { .val = page_private(page) };
/*
* The swap entry might not get freed for a long time,
@@ -6557,42 +6736,18 @@ void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
*/
mem_cgroup_uncharge_swap(entry, nr_pages);
}
-}
-/**
- * mem_cgroup_cancel_charge - cancel a page charge
- * @page: page to charge
- * @memcg: memcg to charge the page to
- * @compound: charge the page as compound or small page
- *
- * Cancel a charge transaction started by mem_cgroup_try_charge().
- */
-void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
- bool compound)
-{
- unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
-
- if (mem_cgroup_disabled())
- return;
- /*
- * Swap faults will attempt to charge the same page multiple
- * times. But reuse_swap_page() might have removed the page
- * from swapcache already, so we can't check PageSwapCache().
- */
- if (!memcg)
- return;
-
- cancel_charge(memcg, nr_pages);
+out_put:
+ css_put(&memcg->css);
+out:
+ return ret;
}
struct uncharge_gather {
struct mem_cgroup *memcg;
+ unsigned long nr_pages;
unsigned long pgpgout;
- unsigned long nr_anon;
- unsigned long nr_file;
unsigned long nr_kmem;
- unsigned long nr_huge;
- unsigned long nr_shmem;
struct page *dummy_page;
};
@@ -6603,37 +6758,32 @@ static inline void uncharge_gather_clear(struct uncharge_gather *ug)
static void uncharge_batch(const struct uncharge_gather *ug)
{
- unsigned long nr_pages = ug->nr_anon + ug->nr_file + ug->nr_kmem;
unsigned long flags;
if (!mem_cgroup_is_root(ug->memcg)) {
- page_counter_uncharge(&ug->memcg->memory, nr_pages);
+ page_counter_uncharge(&ug->memcg->memory, ug->nr_pages);
if (do_memsw_account())
- page_counter_uncharge(&ug->memcg->memsw, nr_pages);
+ page_counter_uncharge(&ug->memcg->memsw, ug->nr_pages);
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
memcg_oom_recover(ug->memcg);
}
local_irq_save(flags);
- __mod_memcg_state(ug->memcg, MEMCG_RSS, -ug->nr_anon);
- __mod_memcg_state(ug->memcg, MEMCG_CACHE, -ug->nr_file);
- __mod_memcg_state(ug->memcg, MEMCG_RSS_HUGE, -ug->nr_huge);
- __mod_memcg_state(ug->memcg, NR_SHMEM, -ug->nr_shmem);
__count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
- __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, nr_pages);
+ __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_pages);
memcg_check_events(ug->memcg, ug->dummy_page);
local_irq_restore(flags);
- if (!mem_cgroup_is_root(ug->memcg))
- css_put_many(&ug->memcg->css, nr_pages);
+ /* drop reference from uncharge_page */
+ css_put(&ug->memcg->css);
}
static void uncharge_page(struct page *page, struct uncharge_gather *ug)
{
+ unsigned long nr_pages;
+
VM_BUG_ON_PAGE(PageLRU(page), page);
- VM_BUG_ON_PAGE(page_count(page) && !is_zone_device_page(page) &&
- !PageHWPoison(page) , page);
if (!page->mem_cgroup)
return;
@@ -6650,30 +6800,24 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
uncharge_gather_clear(ug);
}
ug->memcg = page->mem_cgroup;
+
+ /* pairs with css_put in uncharge_batch */
+ css_get(&ug->memcg->css);
}
- if (!PageKmemcg(page)) {
- unsigned int nr_pages = 1;
+ nr_pages = compound_nr(page);
+ ug->nr_pages += nr_pages;
- if (PageTransHuge(page)) {
- nr_pages = compound_nr(page);
- ug->nr_huge += nr_pages;
- }
- if (PageAnon(page))
- ug->nr_anon += nr_pages;
- else {
- ug->nr_file += nr_pages;
- if (PageSwapBacked(page))
- ug->nr_shmem += nr_pages;
- }
+ if (!PageKmemcg(page)) {
ug->pgpgout++;
} else {
- ug->nr_kmem += compound_nr(page);
+ ug->nr_kmem += nr_pages;
__ClearPageKmemcg(page);
}
ug->dummy_page = page;
page->mem_cgroup = NULL;
+ css_put(&ug->memcg->css);
}
static void uncharge_list(struct list_head *page_list)
@@ -6705,8 +6849,7 @@ static void uncharge_list(struct list_head *page_list)
* mem_cgroup_uncharge - uncharge a page
* @page: page to uncharge
*
- * Uncharge a page previously charged with mem_cgroup_try_charge() and
- * mem_cgroup_commit_charge().
+ * Uncharge a page previously charged with mem_cgroup_charge().
*/
void mem_cgroup_uncharge(struct page *page)
{
@@ -6729,7 +6872,7 @@ void mem_cgroup_uncharge(struct page *page)
* @page_list: list of pages to uncharge
*
* Uncharge a list of pages previously charged with
- * mem_cgroup_try_charge() and mem_cgroup_commit_charge().
+ * mem_cgroup_charge().
*/
void mem_cgroup_uncharge_list(struct list_head *page_list)
{
@@ -6775,18 +6918,17 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
return;
/* Force-charge the new page. The old one will be freed soon */
- nr_pages = hpage_nr_pages(newpage);
+ nr_pages = thp_nr_pages(newpage);
page_counter_charge(&memcg->memory, nr_pages);
if (do_memsw_account())
page_counter_charge(&memcg->memsw, nr_pages);
- css_get_many(&memcg->css, nr_pages);
- commit_charge(newpage, memcg, false);
+ css_get(&memcg->css);
+ commit_charge(newpage, memcg);
local_irq_save(flags);
- mem_cgroup_charge_statistics(memcg, newpage, PageTransHuge(newpage),
- nr_pages);
+ mem_cgroup_charge_statistics(memcg, newpage, nr_pages);
memcg_check_events(memcg, newpage);
local_irq_restore(flags);
}
@@ -6905,17 +7047,6 @@ static int __init mem_cgroup_init(void)
{
int cpu, node;
-#ifdef CONFIG_MEMCG_KMEM
- /*
- * Kmem cache creation is mostly done with the slab_mutex held,
- * so use a workqueue with limited concurrency to avoid stalling
- * all worker threads in case lots of cgroups are created and
- * destroyed simultaneously.
- */
- memcg_kmem_cache_wq = alloc_workqueue("memcg_kmem_cache", 0, 1);
- BUG_ON(!memcg_kmem_cache_wq);
-#endif
-
cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL,
memcg_hotplug_cpu_dead);
@@ -6974,7 +7105,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
VM_BUG_ON_PAGE(PageLRU(page), page);
VM_BUG_ON_PAGE(page_count(page), page);
- if (!do_memsw_account())
+ if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
return;
memcg = page->mem_cgroup;
@@ -6989,7 +7120,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
* ancestor for the swap instead and transfer the memory+swap charge.
*/
swap_memcg = mem_cgroup_id_get_online(memcg);
- nr_entries = hpage_nr_pages(page);
+ nr_entries = thp_nr_pages(page);
/* Get references for the tail pages, too */
if (nr_entries > 1)
mem_cgroup_id_get_many(swap_memcg, nr_entries - 1);
@@ -7003,7 +7134,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
if (!mem_cgroup_is_root(memcg))
page_counter_uncharge(&memcg->memory, nr_entries);
- if (memcg != swap_memcg) {
+ if (!cgroup_memory_noswap && memcg != swap_memcg) {
if (!mem_cgroup_is_root(swap_memcg))
page_counter_charge(&swap_memcg->memsw, nr_entries);
page_counter_uncharge(&memcg->memsw, nr_entries);
@@ -7016,12 +7147,10 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
* only synchronisation we have for updating the per-CPU variables.
*/
VM_BUG_ON(!irqs_disabled());
- mem_cgroup_charge_statistics(memcg, page, PageTransHuge(page),
- -nr_entries);
+ mem_cgroup_charge_statistics(memcg, page, -nr_entries);
memcg_check_events(memcg, page);
- if (!mem_cgroup_is_root(memcg))
- css_put_many(&memcg->css, nr_entries);
+ css_put(&memcg->css);
}
/**
@@ -7035,12 +7164,12 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
*/
int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
{
- unsigned int nr_pages = hpage_nr_pages(page);
+ unsigned int nr_pages = thp_nr_pages(page);
struct page_counter *counter;
struct mem_cgroup *memcg;
unsigned short oldid;
- if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) || !do_swap_account)
+ if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
return 0;
memcg = page->mem_cgroup;
@@ -7056,7 +7185,7 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
memcg = mem_cgroup_id_get_online(memcg);
- if (!mem_cgroup_is_root(memcg) &&
+ if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg) &&
!page_counter_try_charge(&memcg->swap, nr_pages, &counter)) {
memcg_memory_event(memcg, MEMCG_SWAP_MAX);
memcg_memory_event(memcg, MEMCG_SWAP_FAIL);
@@ -7084,14 +7213,11 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages)
struct mem_cgroup *memcg;
unsigned short id;
- if (!do_swap_account)
- return;
-
id = swap_cgroup_record(entry, 0, nr_pages);
rcu_read_lock();
memcg = mem_cgroup_from_id(id);
if (memcg) {
- if (!mem_cgroup_is_root(memcg)) {
+ if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg)) {
if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
page_counter_uncharge(&memcg->swap, nr_pages);
else
@@ -7107,7 +7233,7 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg)
{
long nr_swap_pages = get_nr_swap_pages();
- if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
return nr_swap_pages;
for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
nr_swap_pages = min_t(long, nr_swap_pages,
@@ -7124,37 +7250,33 @@ bool mem_cgroup_swap_full(struct page *page)
if (vm_swap_full())
return true;
- if (!do_swap_account || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
+ if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
return false;
memcg = page->mem_cgroup;
if (!memcg)
return false;
- for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg))
- if (page_counter_read(&memcg->swap) * 2 >=
- READ_ONCE(memcg->swap.max))
+ for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) {
+ unsigned long usage = page_counter_read(&memcg->swap);
+
+ if (usage * 2 >= READ_ONCE(memcg->swap.high) ||
+ usage * 2 >= READ_ONCE(memcg->swap.max))
return true;
+ }
return false;
}
-/* for remember boot option*/
-#ifdef CONFIG_MEMCG_SWAP_ENABLED
-static int really_do_swap_account __initdata = 1;
-#else
-static int really_do_swap_account __initdata;
-#endif
-
-static int __init enable_swap_account(char *s)
+static int __init setup_swap_account(char *s)
{
if (!strcmp(s, "1"))
- really_do_swap_account = 1;
+ cgroup_memory_noswap = 0;
else if (!strcmp(s, "0"))
- really_do_swap_account = 0;
+ cgroup_memory_noswap = 1;
return 1;
}
-__setup("swapaccount=", enable_swap_account);
+__setup("swapaccount=", setup_swap_account);
static u64 swap_current_read(struct cgroup_subsys_state *css,
struct cftype *cft)
@@ -7164,6 +7286,29 @@ static u64 swap_current_read(struct cgroup_subsys_state *css,
return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE;
}
+static int swap_high_show(struct seq_file *m, void *v)
+{
+ return seq_puts_memcg_tunable(m,
+ READ_ONCE(mem_cgroup_from_seq(m)->swap.high));
+}
+
+static ssize_t swap_high_write(struct kernfs_open_file *of,
+ char *buf, size_t nbytes, loff_t off)
+{
+ struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
+ unsigned long high;
+ int err;
+
+ buf = strstrip(buf);
+ err = page_counter_memparse(buf, "max", &high);
+ if (err)
+ return err;
+
+ page_counter_set_high(&memcg->swap, high);
+
+ return nbytes;
+}
+
static int swap_max_show(struct seq_file *m, void *v)
{
return seq_puts_memcg_tunable(m,
@@ -7191,6 +7336,8 @@ static int swap_events_show(struct seq_file *m, void *v)
{
struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+ seq_printf(m, "high %lu\n",
+ atomic_long_read(&memcg->memory_events[MEMCG_SWAP_HIGH]));
seq_printf(m, "max %lu\n",
atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX]));
seq_printf(m, "fail %lu\n",
@@ -7206,6 +7353,12 @@ static struct cftype swap_files[] = {
.read_u64 = swap_current_read,
},
{
+ .name = "swap.high",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = swap_high_show,
+ .write = swap_high_write,
+ },
+ {
.name = "swap.max",
.flags = CFTYPE_NOT_ON_ROOT,
.seq_show = swap_max_show,
@@ -7220,7 +7373,7 @@ static struct cftype swap_files[] = {
{ } /* terminate */
};
-static struct cftype memsw_cgroup_files[] = {
+static struct cftype memsw_files[] = {
{
.name = "memsw.usage_in_bytes",
.private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE),
@@ -7247,17 +7400,27 @@ static struct cftype memsw_cgroup_files[] = {
{ }, /* terminate */
};
+/*
+ * If mem_cgroup_swap_init() is implemented as a subsys_initcall()
+ * instead of a core_initcall(), this could mean cgroup_memory_noswap still
+ * remains set to false even when memcg is disabled via "cgroup_disable=memory"
+ * boot parameter. This may result in premature OOPS inside
+ * mem_cgroup_get_nr_swap_pages() function in corner cases.
+ */
static int __init mem_cgroup_swap_init(void)
{
- if (!mem_cgroup_disabled() && really_do_swap_account) {
- do_swap_account = 1;
- WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys,
- swap_files));
- WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys,
- memsw_cgroup_files));
- }
+ /* No memory control -> no swap control */
+ if (mem_cgroup_disabled())
+ cgroup_memory_noswap = true;
+
+ if (cgroup_memory_noswap)
+ return 0;
+
+ WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files));
+ WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files));
+
return 0;
}
-subsys_initcall(mem_cgroup_swap_init);
+core_initcall(mem_cgroup_swap_init);
#endif /* CONFIG_MEMCG_SWAP */
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index a96364be8ab4..f1aa6433f404 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -210,14 +210,15 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags)
{
struct task_struct *t = tk->tsk;
short addr_lsb = tk->size_shift;
- int ret;
+ int ret = 0;
pr_err("Memory failure: %#lx: Sending SIGBUS to %s:%d due to hardware memory corruption\n",
- pfn, t->comm, t->pid);
+ pfn, t->comm, t->pid);
- if ((flags & MF_ACTION_REQUIRED) && t->mm == current->mm) {
- ret = force_sig_mceerr(BUS_MCEERR_AR, (void __user *)tk->addr,
- addr_lsb);
+ if (flags & MF_ACTION_REQUIRED) {
+ WARN_ON_ONCE(t != current);
+ ret = force_sig_mceerr(BUS_MCEERR_AR,
+ (void __user *)tk->addr, addr_lsb);
} else {
/*
* Don't use force here, it's convenient if the signal
@@ -399,9 +400,15 @@ static struct task_struct *find_early_kill_thread(struct task_struct *tsk)
{
struct task_struct *t;
- for_each_thread(tsk, t)
- if ((t->flags & PF_MCE_PROCESS) && (t->flags & PF_MCE_EARLY))
- return t;
+ for_each_thread(tsk, t) {
+ if (t->flags & PF_MCE_PROCESS) {
+ if (t->flags & PF_MCE_EARLY)
+ return t;
+ } else {
+ if (sysctl_memory_failure_early_kill)
+ return t;
+ }
+ }
return NULL;
}
@@ -410,21 +417,26 @@ static struct task_struct *find_early_kill_thread(struct task_struct *tsk)
* to be signaled when some page under the process is hwpoisoned.
* Return task_struct of the dedicated thread (main thread unless explicitly
* specified) if the process is "early kill," and otherwise returns NULL.
+ *
+ * Note that the above is true for Action Optional case, but not for Action
+ * Required case where SIGBUS should be sent only to the current thread.
*/
static struct task_struct *task_early_kill(struct task_struct *tsk,
int force_early)
{
- struct task_struct *t;
if (!tsk->mm)
return NULL;
- if (force_early)
- return tsk;
- t = find_early_kill_thread(tsk);
- if (t)
- return t;
- if (sysctl_memory_failure_early_kill)
- return tsk;
- return NULL;
+ if (force_early) {
+ /*
+ * Comparing ->mm here because current task might represent
+ * a subthread, while tsk always points to the main thread.
+ */
+ if (tsk->mm == current->mm)
+ return current;
+ else
+ return NULL;
+ }
+ return find_early_kill_thread(tsk);
}
/*
@@ -1493,7 +1505,7 @@ static void memory_failure_work_func(struct work_struct *work)
unsigned long proc_flags;
int gotten;
- mf_cpu = this_cpu_ptr(&memory_failure_cpu);
+ mf_cpu = container_of(work, struct memory_failure_cpu, work);
for (;;) {
spin_lock_irqsave(&mf_cpu->lock, proc_flags);
gotten = kfifo_get(&mf_cpu->fifo, &entry);
@@ -1507,6 +1519,19 @@ static void memory_failure_work_func(struct work_struct *work)
}
}
+/*
+ * Process memory_failure work queued on the specified CPU.
+ * Used to avoid return-to-userspace racing with the memory_failure workqueue.
+ */
+void memory_failure_queue_kick(int cpu)
+{
+ struct memory_failure_cpu *mf_cpu;
+
+ mf_cpu = &per_cpu(memory_failure_cpu, cpu);
+ cancel_work_sync(&mf_cpu->work);
+ memory_failure_work_func(&mf_cpu->work);
+}
+
static int __init memory_failure_init(void)
{
struct memory_failure_cpu *mf_cpu;
@@ -1623,9 +1648,12 @@ EXPORT_SYMBOL(unpoison_memory);
static struct page *new_page(struct page *p, unsigned long private)
{
- int nid = page_to_nid(p);
+ struct migration_target_control mtc = {
+ .nid = page_to_nid(p),
+ .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
+ };
- return new_page_nodemask(p, nid, &node_states[N_MEMORY]);
+ return alloc_migration_target(p, (unsigned long)&mtc);
}
/*
diff --git a/mm/memory.c b/mm/memory.c
index f703fe8c8346..eeae590e526a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -71,6 +71,9 @@
#include <linux/dax.h>
#include <linux/oom.h>
#include <linux/numa.h>
+#include <linux/perf_event.h>
+#include <linux/ptrace.h>
+#include <linux/vmalloc.h>
#include <trace/events/kmem.h>
@@ -80,8 +83,8 @@
#include <linux/uaccess.h>
#include <asm/tlb.h>
#include <asm/tlbflush.h>
-#include <asm/pgtable.h>
+#include "pgalloc-track.h"
#include "internal.h"
#if defined(LAST_CPUPID_NOT_IN_PAGE_FLAGS) && !defined(CONFIG_COMPILE_TEST)
@@ -438,7 +441,7 @@ int __pte_alloc(struct mm_struct *mm, pmd_t *pmd)
* of a chain of data-dependent loads, meaning most CPUs (alpha
* being the notable exception) will already guarantee loads are
* seen in-order. See the alpha page table accessors for the
- * smp_read_barrier_depends() barriers in page table walking code.
+ * smp_rmb() barriers in page table walking code.
*/
smp_wmb(); /* Could be smp_wmb__xxx(before|after)_spin_lock */
@@ -692,84 +695,185 @@ out:
* covered by this vma.
*/
-static inline unsigned long
-copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+static unsigned long
+copy_nonpresent_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
unsigned long addr, int *rss)
{
unsigned long vm_flags = vma->vm_flags;
pte_t pte = *src_pte;
struct page *page;
+ swp_entry_t entry = pte_to_swp_entry(pte);
+
+ if (likely(!non_swap_entry(entry))) {
+ if (swap_duplicate(entry) < 0)
+ return entry.val;
+
+ /* make sure dst_mm is on swapoff's mmlist. */
+ if (unlikely(list_empty(&dst_mm->mmlist))) {
+ spin_lock(&mmlist_lock);
+ if (list_empty(&dst_mm->mmlist))
+ list_add(&dst_mm->mmlist,
+ &src_mm->mmlist);
+ spin_unlock(&mmlist_lock);
+ }
+ rss[MM_SWAPENTS]++;
+ } else if (is_migration_entry(entry)) {
+ page = migration_entry_to_page(entry);
- /* pte contains position in swap or file, so copy. */
- if (unlikely(!pte_present(pte))) {
- swp_entry_t entry = pte_to_swp_entry(pte);
-
- if (likely(!non_swap_entry(entry))) {
- if (swap_duplicate(entry) < 0)
- return entry.val;
-
- /* make sure dst_mm is on swapoff's mmlist. */
- if (unlikely(list_empty(&dst_mm->mmlist))) {
- spin_lock(&mmlist_lock);
- if (list_empty(&dst_mm->mmlist))
- list_add(&dst_mm->mmlist,
- &src_mm->mmlist);
- spin_unlock(&mmlist_lock);
- }
- rss[MM_SWAPENTS]++;
- } else if (is_migration_entry(entry)) {
- page = migration_entry_to_page(entry);
-
- rss[mm_counter(page)]++;
-
- if (is_write_migration_entry(entry) &&
- is_cow_mapping(vm_flags)) {
- /*
- * COW mappings require pages in both
- * parent and child to be set to read.
- */
- make_migration_entry_read(&entry);
- pte = swp_entry_to_pte(entry);
- if (pte_swp_soft_dirty(*src_pte))
- pte = pte_swp_mksoft_dirty(pte);
- if (pte_swp_uffd_wp(*src_pte))
- pte = pte_swp_mkuffd_wp(pte);
- set_pte_at(src_mm, addr, src_pte, pte);
- }
- } else if (is_device_private_entry(entry)) {
- page = device_private_entry_to_page(entry);
+ rss[mm_counter(page)]++;
+ if (is_write_migration_entry(entry) &&
+ is_cow_mapping(vm_flags)) {
/*
- * Update rss count even for unaddressable pages, as
- * they should treated just like normal pages in this
- * respect.
- *
- * We will likely want to have some new rss counters
- * for unaddressable pages, at some point. But for now
- * keep things as they are.
+ * COW mappings require pages in both
+ * parent and child to be set to read.
*/
- get_page(page);
- rss[mm_counter(page)]++;
- page_dup_rmap(page, false);
+ make_migration_entry_read(&entry);
+ pte = swp_entry_to_pte(entry);
+ if (pte_swp_soft_dirty(*src_pte))
+ pte = pte_swp_mksoft_dirty(pte);
+ if (pte_swp_uffd_wp(*src_pte))
+ pte = pte_swp_mkuffd_wp(pte);
+ set_pte_at(src_mm, addr, src_pte, pte);
+ }
+ } else if (is_device_private_entry(entry)) {
+ page = device_private_entry_to_page(entry);
- /*
- * We do not preserve soft-dirty information, because so
- * far, checkpoint/restore is the only feature that
- * requires that. And checkpoint/restore does not work
- * when a device driver is involved (you cannot easily
- * save and restore device driver state).
- */
- if (is_write_device_private_entry(entry) &&
- is_cow_mapping(vm_flags)) {
- make_device_private_entry_read(&entry);
- pte = swp_entry_to_pte(entry);
- if (pte_swp_uffd_wp(*src_pte))
- pte = pte_swp_mkuffd_wp(pte);
- set_pte_at(src_mm, addr, src_pte, pte);
- }
+ /*
+ * Update rss count even for unaddressable pages, as
+ * they should be treated just like normal pages in this
+ * respect.
+ *
+ * We will likely want to have some new rss counters
+ * for unaddressable pages, at some point. But for now
+ * keep things as they are.
+ */
+ get_page(page);
+ rss[mm_counter(page)]++;
+ page_dup_rmap(page, false);
+
+ /*
+ * We do not preserve soft-dirty information, because so
+ * far, checkpoint/restore is the only feature that
+ * requires that. And checkpoint/restore does not work
+ * when a device driver is involved (you cannot easily
+ * save and restore device driver state).
+ */
+ if (is_write_device_private_entry(entry) &&
+ is_cow_mapping(vm_flags)) {
+ make_device_private_entry_read(&entry);
+ pte = swp_entry_to_pte(entry);
+ if (pte_swp_uffd_wp(*src_pte))
+ pte = pte_swp_mkuffd_wp(pte);
+ set_pte_at(src_mm, addr, src_pte, pte);
}
- goto out_set_pte;
+ }
+ set_pte_at(dst_mm, addr, dst_pte, pte);
+ return 0;
+}
+
+/*
+ * Copy a present and normal page if necessary.
+ *
+ * NOTE! The usual case is that this doesn't need to do
+ * anything, and can just return a positive value. That
+ * will let the caller know that it can just increase
+ * the page refcount and re-use the pte the traditional
+ * way.
+ *
+ * But _if_ we need to copy it because it needs to be
+ * pinned in the parent (and the child should get its own
+ * copy rather than just a reference to the same page),
+ * we'll do that here and return zero to let the caller
+ * know we're done.
+ *
+ * And if we need a pre-allocated page but don't yet have
+ * one, return a negative error to let the preallocation
+ * code know so that it can do so outside the page table
+ * lock.
+ */
+static inline int
+copy_present_page(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+ pte_t *dst_pte, pte_t *src_pte,
+ struct vm_area_struct *vma, struct vm_area_struct *new,
+ unsigned long addr, int *rss, struct page **prealloc,
+ pte_t pte, struct page *page)
+{
+ struct page *new_page;
+
+ if (!is_cow_mapping(vma->vm_flags))
+ return 1;
+
+ /*
+ * What we want to do is to check whether this page may
+ * have been pinned by the parent process. If so,
+ * instead of wrprotect the pte on both sides, we copy
+ * the page immediately so that we'll always guarantee
+ * the pinned page won't be randomly replaced in the
+ * future.
+ *
+ * The page pinning checks are just "has this mm ever
+ * seen pinning", along with the (inexact) check of
+ * the page count. That might give false positives for
+ * pinning, but it will work correctly.
+ */
+ if (likely(!atomic_read(&src_mm->has_pinned)))
+ return 1;
+ if (likely(!page_maybe_dma_pinned(page)))
+ return 1;
+
+ new_page = *prealloc;
+ if (!new_page)
+ return -EAGAIN;
+
+ /*
+ * We have a prealloc page, all good! Take it
+ * over and copy the page & arm it.
+ */
+ *prealloc = NULL;
+ copy_user_highpage(new_page, page, addr, vma);
+ __SetPageUptodate(new_page);
+ page_add_new_anon_rmap(new_page, new, addr, false);
+ lru_cache_add_inactive_or_unevictable(new_page, new);
+ rss[mm_counter(new_page)]++;
+
+ /* All done, just insert the new page copy in the child */
+ pte = mk_pte(new_page, new->vm_page_prot);
+ pte = maybe_mkwrite(pte_mkdirty(pte), new);
+ set_pte_at(dst_mm, addr, dst_pte, pte);
+ return 0;
+}
+
+/*
+ * Copy one pte. Returns 0 if succeeded, or -EAGAIN if one preallocated page
+ * is required to copy this pte.
+ */
+static inline int
+copy_present_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
+ pte_t *dst_pte, pte_t *src_pte, struct vm_area_struct *vma,
+ struct vm_area_struct *new,
+ unsigned long addr, int *rss, struct page **prealloc)
+{
+ unsigned long vm_flags = vma->vm_flags;
+ pte_t pte = *src_pte;
+ struct page *page;
+
+ page = vm_normal_page(vma, addr, pte);
+ if (page) {
+ int retval;
+
+ retval = copy_present_page(dst_mm, src_mm,
+ dst_pte, src_pte,
+ vma, new,
+ addr, rss, prealloc,
+ pte, page);
+ if (retval <= 0)
+ return retval;
+
+ get_page(page);
+ page_dup_rmap(page, false);
+ rss[mm_counter(page)]++;
}
/*
@@ -797,37 +901,51 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
if (!(vm_flags & VM_UFFD_WP))
pte = pte_clear_uffd_wp(pte);
- page = vm_normal_page(vma, addr, pte);
- if (page) {
- get_page(page);
- page_dup_rmap(page, false);
- rss[mm_counter(page)]++;
- } else if (pte_devmap(pte)) {
- page = pte_page(pte);
- }
-
-out_set_pte:
set_pte_at(dst_mm, addr, dst_pte, pte);
return 0;
}
+static inline struct page *
+page_copy_prealloc(struct mm_struct *src_mm, struct vm_area_struct *vma,
+ unsigned long addr)
+{
+ struct page *new_page;
+
+ new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, addr);
+ if (!new_page)
+ return NULL;
+
+ if (mem_cgroup_charge(new_page, src_mm, GFP_KERNEL)) {
+ put_page(new_page);
+ return NULL;
+ }
+ cgroup_throttle_swaprate(new_page, GFP_KERNEL);
+
+ return new_page;
+}
+
static int copy_pte_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pmd_t *dst_pmd, pmd_t *src_pmd, struct vm_area_struct *vma,
+ struct vm_area_struct *new,
unsigned long addr, unsigned long end)
{
pte_t *orig_src_pte, *orig_dst_pte;
pte_t *src_pte, *dst_pte;
spinlock_t *src_ptl, *dst_ptl;
- int progress = 0;
+ int progress, ret = 0;
int rss[NR_MM_COUNTERS];
swp_entry_t entry = (swp_entry_t){0};
+ struct page *prealloc = NULL;
again:
+ progress = 0;
init_rss_vec(rss);
dst_pte = pte_alloc_map_lock(dst_mm, dst_pmd, addr, &dst_ptl);
- if (!dst_pte)
- return -ENOMEM;
+ if (!dst_pte) {
+ ret = -ENOMEM;
+ goto out;
+ }
src_pte = pte_offset_map(src_pmd, addr);
src_ptl = pte_lockptr(src_mm, src_pmd);
spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING);
@@ -850,10 +968,34 @@ again:
progress++;
continue;
}
- entry.val = copy_one_pte(dst_mm, src_mm, dst_pte, src_pte,
+ if (unlikely(!pte_present(*src_pte))) {
+ entry.val = copy_nonpresent_pte(dst_mm, src_mm,
+ dst_pte, src_pte,
vma, addr, rss);
- if (entry.val)
+ if (entry.val)
+ break;
+ progress += 8;
+ continue;
+ }
+ /* copy_present_pte() will clear `*prealloc' if consumed */
+ ret = copy_present_pte(dst_mm, src_mm, dst_pte, src_pte,
+ vma, new, addr, rss, &prealloc);
+ /*
+ * If we need a pre-allocated page for this pte, drop the
+ * locks, allocate, and try again.
+ */
+ if (unlikely(ret == -EAGAIN))
break;
+ if (unlikely(prealloc)) {
+ /*
+ * A leftover pre-allocated page cannot be reused for a
+ * later pte, so as to strictly follow mempolicy (e.g.,
+ * alloc_page_vma() allocates according to the address).
+ * This can only happen if a pinned pte changed.
+ */
+ put_page(prealloc);
+ prealloc = NULL;
+ }
progress += 8;
} while (dst_pte++, src_pte++, addr += PAGE_SIZE, addr != end);
@@ -865,17 +1007,30 @@ again:
cond_resched();
if (entry.val) {
- if (add_swap_count_continuation(entry, GFP_KERNEL) < 0)
+ if (add_swap_count_continuation(entry, GFP_KERNEL) < 0) {
+ ret = -ENOMEM;
+ goto out;
+ }
+ entry.val = 0;
+ } else if (ret) {
+ WARN_ON_ONCE(ret != -EAGAIN);
+ prealloc = page_copy_prealloc(src_mm, vma, addr);
+ if (!prealloc)
return -ENOMEM;
- progress = 0;
+ /* We've captured and resolved the error. Reset, try again. */
+ ret = 0;
}
if (addr != end)
goto again;
- return 0;
+out:
+ if (unlikely(prealloc))
+ put_page(prealloc);
+ return ret;
}
static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pud_t *dst_pud, pud_t *src_pud, struct vm_area_struct *vma,
+ struct vm_area_struct *new,
unsigned long addr, unsigned long end)
{
pmd_t *src_pmd, *dst_pmd;
@@ -902,7 +1057,7 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src
if (pmd_none_or_clear_bad(src_pmd))
continue;
if (copy_pte_range(dst_mm, src_mm, dst_pmd, src_pmd,
- vma, addr, next))
+ vma, new, addr, next))
return -ENOMEM;
} while (dst_pmd++, src_pmd++, addr = next, addr != end);
return 0;
@@ -910,6 +1065,7 @@ static inline int copy_pmd_range(struct mm_struct *dst_mm, struct mm_struct *src
static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
p4d_t *dst_p4d, p4d_t *src_p4d, struct vm_area_struct *vma,
+ struct vm_area_struct *new,
unsigned long addr, unsigned long end)
{
pud_t *src_pud, *dst_pud;
@@ -936,7 +1092,7 @@ static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src
if (pud_none_or_clear_bad(src_pud))
continue;
if (copy_pmd_range(dst_mm, src_mm, dst_pud, src_pud,
- vma, addr, next))
+ vma, new, addr, next))
return -ENOMEM;
} while (dst_pud++, src_pud++, addr = next, addr != end);
return 0;
@@ -944,6 +1100,7 @@ static inline int copy_pud_range(struct mm_struct *dst_mm, struct mm_struct *src
static inline int copy_p4d_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
pgd_t *dst_pgd, pgd_t *src_pgd, struct vm_area_struct *vma,
+ struct vm_area_struct *new,
unsigned long addr, unsigned long end)
{
p4d_t *src_p4d, *dst_p4d;
@@ -958,14 +1115,14 @@ static inline int copy_p4d_range(struct mm_struct *dst_mm, struct mm_struct *src
if (p4d_none_or_clear_bad(src_p4d))
continue;
if (copy_pud_range(dst_mm, src_mm, dst_p4d, src_p4d,
- vma, addr, next))
+ vma, new, addr, next))
return -ENOMEM;
} while (dst_p4d++, src_p4d++, addr = next, addr != end);
return 0;
}
int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
- struct vm_area_struct *vma)
+ struct vm_area_struct *vma, struct vm_area_struct *new)
{
pgd_t *src_pgd, *dst_pgd;
unsigned long next;
@@ -1020,7 +1177,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
if (pgd_none_or_clear_bad(src_pgd))
continue;
if (unlikely(copy_p4d_range(dst_mm, src_mm, dst_pgd, src_pgd,
- vma, addr, next))) {
+ vma, new, addr, next))) {
ret = -ENOMEM;
break;
}
@@ -1101,7 +1258,7 @@ again:
}
entry = pte_to_swp_entry(ptent);
- if (non_swap_entry(entry) && is_device_private_entry(entry)) {
+ if (is_device_private_entry(entry)) {
struct page *page = device_private_entry_to_page(entry);
if (unlikely(details && details->check_mapping)) {
@@ -1188,7 +1345,7 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb,
* Here there can be other concurrent MADV_DONTNEED or
* trans huge page faults running, and if the pmd is
* none or trans huge it can change under us. This is
- * because MADV_DONTNEED holds the mmap_sem in read
+ * because MADV_DONTNEED holds the mmap_lock in read
* mode.
*/
if (pmd_none_or_trans_huge_or_clear_bad(pmd))
@@ -1214,7 +1371,7 @@ static inline unsigned long zap_pud_range(struct mmu_gather *tlb,
next = pud_addr_end(addr, end);
if (pud_trans_huge(*pud) || pud_devmap(*pud)) {
if (next - addr != HPAGE_PUD_SIZE) {
- VM_BUG_ON_VMA(!rwsem_is_locked(&tlb->mm->mmap_sem), vma);
+ mmap_assert_locked(tlb->mm);
split_huge_pud(vma, pud, addr);
} else if (zap_huge_pud(tlb, vma, pud, addr))
goto next;
@@ -1501,7 +1658,7 @@ out:
}
#ifdef pte_index
-static int insert_page_in_batch_locked(struct mm_struct *mm, pmd_t *pmd,
+static int insert_page_in_batch_locked(struct mm_struct *mm, pte_t *pte,
unsigned long addr, struct page *page, pgprot_t prot)
{
int err;
@@ -1509,8 +1666,9 @@ static int insert_page_in_batch_locked(struct mm_struct *mm, pmd_t *pmd,
if (!page_count(page))
return -EINVAL;
err = validate_page_before_insert(page);
- return err ? err : insert_page_into_pte_locked(
- mm, pte_offset_map(pmd, addr), addr, page, prot);
+ if (err)
+ return err;
+ return insert_page_into_pte_locked(mm, pte, addr, page, prot);
}
/* insert_pages() amortizes the cost of spinlock operations
@@ -1520,7 +1678,8 @@ static int insert_pages(struct vm_area_struct *vma, unsigned long addr,
struct page **pages, unsigned long *num, pgprot_t prot)
{
pmd_t *pmd = NULL;
- spinlock_t *pte_lock = NULL;
+ pte_t *start_pte, *pte;
+ spinlock_t *pte_lock;
struct mm_struct *const mm = vma->vm_mm;
unsigned long curr_page_idx = 0;
unsigned long remaining_pages_total = *num;
@@ -1539,18 +1698,17 @@ more:
ret = -ENOMEM;
if (pte_alloc(mm, pmd))
goto out;
- pte_lock = pte_lockptr(mm, pmd);
while (pages_to_write_in_pmd) {
int pte_idx = 0;
const int batch_size = min_t(int, pages_to_write_in_pmd, 8);
- spin_lock(pte_lock);
- for (; pte_idx < batch_size; ++pte_idx) {
- int err = insert_page_in_batch_locked(mm, pmd,
+ start_pte = pte_offset_map_lock(mm, pmd, addr, &pte_lock);
+ for (pte = start_pte; pte_idx < batch_size; ++pte, ++pte_idx) {
+ int err = insert_page_in_batch_locked(mm, pte,
addr, pages[curr_page_idx], prot);
if (unlikely(err)) {
- spin_unlock(pte_lock);
+ pte_unmap_unlock(start_pte, pte_lock);
ret = err;
remaining_pages_total -= pte_idx;
goto out;
@@ -1558,7 +1716,7 @@ more:
addr += PAGE_SIZE;
++curr_page_idx;
}
- spin_unlock(pte_lock);
+ pte_unmap_unlock(start_pte, pte_lock);
pages_to_write_in_pmd -= batch_size;
remaining_pages_total -= batch_size;
}
@@ -1595,7 +1753,7 @@ int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
if (addr < vma->vm_start || end_addr >= vma->vm_end)
return -EFAULT;
if (!(vma->vm_flags & VM_MIXEDMAP)) {
- BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem));
+ BUG_ON(mmap_read_trylock(vma->vm_mm));
BUG_ON(vma->vm_flags & VM_PFNMAP);
vma->vm_flags |= VM_MIXEDMAP;
}
@@ -1603,7 +1761,7 @@ int vm_insert_pages(struct vm_area_struct *vma, unsigned long addr,
return insert_pages(vma, addr, pages, num, vma->vm_page_prot);
#else
unsigned long idx = 0, pgcount = *num;
- int err;
+ int err = -EINVAL;
for (; idx < pgcount; ++idx) {
err = vm_insert_page(vma, addr + (PAGE_SIZE * idx), pages[idx]);
@@ -1639,7 +1797,7 @@ EXPORT_SYMBOL(vm_insert_pages);
* The page does not need to be reserved.
*
* Usually this function is called from f_op->mmap() handler
- * under mm->mmap_sem write-lock, so it can change vma->vm_flags.
+ * under mm->mmap_lock write-lock, so it can change vma->vm_flags.
* Caller must set VM_MIXEDMAP on vma if it wants to call this
* function from other places, for example from page-fault handler.
*
@@ -1653,7 +1811,7 @@ int vm_insert_page(struct vm_area_struct *vma, unsigned long addr,
if (!page_count(page))
return -EINVAL;
if (!(vma->vm_flags & VM_MIXEDMAP)) {
- BUG_ON(down_read_trylock(&vma->vm_mm->mmap_sem));
+ BUG_ON(mmap_read_trylock(vma->vm_mm));
BUG_ON(vma->vm_flags & VM_PFNMAP);
vma->vm_flags |= VM_MIXEDMAP;
}
@@ -1802,7 +1960,7 @@ out_unlock:
* @pfn: source kernel pfn
* @pgprot: pgprot flags for the inserted page
*
- * This is exactly like vmf_insert_pfn(), except that it allows drivers to
+ * This is exactly like vmf_insert_pfn(), except that it allows drivers
* to override pgprot on a per-page basis.
*
* This only makes sense for IO mappings, and it makes no sense for
@@ -1938,7 +2096,7 @@ static vm_fault_t __vm_insert_mixed(struct vm_area_struct *vma,
* @pfn: source kernel pfn
* @pgprot: pgprot flags for the inserted page
*
- * This is exactly like vmf_insert_mixed(), except that it allows drivers to
+ * This is exactly like vmf_insert_mixed(), except that it allows drivers
* to override pgprot on a per-page basis.
*
* Typically this function should be used by drivers to set caching- and
@@ -2084,7 +2242,7 @@ static inline int remap_p4d_range(struct mm_struct *mm, pgd_t *pgd,
/**
* remap_pfn_range - remap kernel memory to userspace
* @vma: user vma to map to
- * @addr: target user address to start at
+ * @addr: target page aligned user address to start at
* @pfn: page frame number of kernel physical memory address
* @size: size of mapping area
* @prot: page protection flags for this mapping
@@ -2103,6 +2261,9 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
unsigned long remap_pfn = pfn;
int err;
+ if (WARN_ON_ONCE(!PAGE_ALIGNED(addr)))
+ return -EINVAL;
+
/*
* Physically remapped pages are special. Tell the
* rest of the world about it:
@@ -2203,15 +2364,16 @@ EXPORT_SYMBOL(vm_iomap_memory);
static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
unsigned long addr, unsigned long end,
- pte_fn_t fn, void *data, bool create)
+ pte_fn_t fn, void *data, bool create,
+ pgtbl_mod_mask *mask)
{
pte_t *pte;
int err = 0;
- spinlock_t *uninitialized_var(ptl);
+ spinlock_t *ptl;
if (create) {
pte = (mm == &init_mm) ?
- pte_alloc_kernel(pmd, addr) :
+ pte_alloc_kernel_track(pmd, addr, mask) :
pte_alloc_map_lock(mm, pmd, addr, &ptl);
if (!pte)
return -ENOMEM;
@@ -2232,6 +2394,7 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
break;
}
} while (addr += PAGE_SIZE, addr != end);
+ *mask |= PGTBL_PTE_MODIFIED;
arch_leave_lazy_mmu_mode();
@@ -2242,7 +2405,8 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd,
static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
unsigned long addr, unsigned long end,
- pte_fn_t fn, void *data, bool create)
+ pte_fn_t fn, void *data, bool create,
+ pgtbl_mod_mask *mask)
{
pmd_t *pmd;
unsigned long next;
@@ -2251,7 +2415,7 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
BUG_ON(pud_huge(*pud));
if (create) {
- pmd = pmd_alloc(mm, pud, addr);
+ pmd = pmd_alloc_track(mm, pud, addr, mask);
if (!pmd)
return -ENOMEM;
} else {
@@ -2261,7 +2425,7 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
next = pmd_addr_end(addr, end);
if (create || !pmd_none_or_clear_bad(pmd)) {
err = apply_to_pte_range(mm, pmd, addr, next, fn, data,
- create);
+ create, mask);
if (err)
break;
}
@@ -2271,14 +2435,15 @@ static int apply_to_pmd_range(struct mm_struct *mm, pud_t *pud,
static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
unsigned long addr, unsigned long end,
- pte_fn_t fn, void *data, bool create)
+ pte_fn_t fn, void *data, bool create,
+ pgtbl_mod_mask *mask)
{
pud_t *pud;
unsigned long next;
int err = 0;
if (create) {
- pud = pud_alloc(mm, p4d, addr);
+ pud = pud_alloc_track(mm, p4d, addr, mask);
if (!pud)
return -ENOMEM;
} else {
@@ -2288,7 +2453,7 @@ static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
next = pud_addr_end(addr, end);
if (create || !pud_none_or_clear_bad(pud)) {
err = apply_to_pmd_range(mm, pud, addr, next, fn, data,
- create);
+ create, mask);
if (err)
break;
}
@@ -2298,14 +2463,15 @@ static int apply_to_pud_range(struct mm_struct *mm, p4d_t *p4d,
static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
unsigned long addr, unsigned long end,
- pte_fn_t fn, void *data, bool create)
+ pte_fn_t fn, void *data, bool create,
+ pgtbl_mod_mask *mask)
{
p4d_t *p4d;
unsigned long next;
int err = 0;
if (create) {
- p4d = p4d_alloc(mm, pgd, addr);
+ p4d = p4d_alloc_track(mm, pgd, addr, mask);
if (!p4d)
return -ENOMEM;
} else {
@@ -2315,7 +2481,7 @@ static int apply_to_p4d_range(struct mm_struct *mm, pgd_t *pgd,
next = p4d_addr_end(addr, end);
if (create || !p4d_none_or_clear_bad(p4d)) {
err = apply_to_pud_range(mm, p4d, addr, next, fn, data,
- create);
+ create, mask);
if (err)
break;
}
@@ -2328,8 +2494,9 @@ static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr,
void *data, bool create)
{
pgd_t *pgd;
- unsigned long next;
+ unsigned long start = addr, next;
unsigned long end = addr + size;
+ pgtbl_mod_mask mask = 0;
int err = 0;
if (WARN_ON(addr >= end))
@@ -2340,11 +2507,14 @@ static int __apply_to_page_range(struct mm_struct *mm, unsigned long addr,
next = pgd_addr_end(addr, end);
if (!create && pgd_none_or_clear_bad(pgd))
continue;
- err = apply_to_p4d_range(mm, pgd, addr, next, fn, data, create);
+ err = apply_to_p4d_range(mm, pgd, addr, next, fn, data, create, &mask);
if (err)
break;
} while (pgd++, addr = next, addr != end);
+ if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
+ arch_sync_kernel_mappings(start, start + size);
+
return err;
}
@@ -2408,8 +2578,6 @@ static inline bool cow_user_page(struct page *dst, struct page *src,
struct mm_struct *mm = vma->vm_mm;
unsigned long addr = vmf->address;
- debug_dma_assert_idle(src);
-
if (likely(src)) {
copy_user_highpage(dst, src, addr, vma);
return true;
@@ -2436,10 +2604,9 @@ static inline bool cow_user_page(struct page *dst, struct page *src,
if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
/*
* Other thread has already handled the fault
- * and we don't need to do anything. If it's
- * not the case, the fault will be triggered
- * again on the same address.
+ * and we only need to update the local TLB.
*/
+ update_mmu_tlb(vma, addr, vmf->pte);
ret = false;
goto pte_unlock;
}
@@ -2463,13 +2630,14 @@ static inline bool cow_user_page(struct page *dst, struct page *src,
vmf->pte = pte_offset_map_lock(mm, vmf->pmd, addr, &vmf->ptl);
locked = true;
if (!likely(pte_same(*vmf->pte, vmf->orig_pte))) {
- /* The PTE changed under us. Retry page fault. */
+ /* The PTE changed under us, update local tlb */
+ update_mmu_tlb(vma, addr, vmf->pte);
ret = false;
goto pte_unlock;
}
/*
- * The same page can be mapped back since last copy attampt.
+ * The same page can be mapped back since last copy attempt.
* Try to copy again under PTL.
*/
if (__copy_from_user_inatomic(kaddr, uaddr, PAGE_SIZE)) {
@@ -2576,7 +2744,7 @@ static vm_fault_t fault_dirty_shared_page(struct vm_fault *vmf)
* mapping may be NULL here because some device drivers do not
* set page.mapping but still dirty their pages
*
- * Drop the mmap_sem before waiting on IO, if we can. The file
+ * Drop the mmap_lock before waiting on IO, if we can. The file
* is pinning the mapping, as per above.
*/
if ((dirtied || page_mkwrite) && mapping) {
@@ -2621,12 +2789,13 @@ static inline void wp_page_reuse(struct vm_fault *vmf)
if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1))
update_mmu_cache(vma, vmf->address, vmf->pte);
pte_unmap_unlock(vmf->pte, vmf->ptl);
+ count_vm_event(PGREUSE);
}
/*
* Handle the case of a page which we actually need to copy to a new page.
*
- * Called with mmap_sem locked and the old page referenced, but
+ * Called with mmap_lock locked and the old page referenced, but
* without the ptl held.
*
* High level logic flow:
@@ -2647,7 +2816,6 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
struct page *new_page = NULL;
pte_t entry;
int page_copied = 0;
- struct mem_cgroup *memcg;
struct mmu_notifier_range range;
if (unlikely(anon_vma_prepare(vma)))
@@ -2678,8 +2846,9 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
}
}
- if (mem_cgroup_try_charge_delay(new_page, mm, GFP_KERNEL, &memcg, false))
+ if (mem_cgroup_charge(new_page, mm, GFP_KERNEL))
goto oom_free_new;
+ cgroup_throttle_swaprate(new_page, GFP_KERNEL);
__SetPageUptodate(new_page);
@@ -2704,6 +2873,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
}
flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte));
entry = mk_pte(new_page, vma->vm_page_prot);
+ entry = pte_sw_mkyoung(entry);
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
/*
* Clear the pte entry and flush it first, before updating the
@@ -2713,8 +2883,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
*/
ptep_clear_flush_notify(vma, vmf->address, vmf->pte);
page_add_new_anon_rmap(new_page, vma, vmf->address, false);
- mem_cgroup_commit_charge(new_page, memcg, false, false);
- lru_cache_add_active_or_unevictable(new_page, vma);
+ lru_cache_add_inactive_or_unevictable(new_page, vma);
/*
* We call the notify macro here because, when using secondary
* mmu page tables (such as kvm shadow page tables), we want the
@@ -2752,7 +2921,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf)
new_page = old_page;
page_copied = 1;
} else {
- mem_cgroup_cancel_charge(new_page, memcg, false);
+ update_mmu_tlb(vma, vmf->address, vmf->pte);
}
if (new_page)
@@ -2812,6 +2981,7 @@ vm_fault_t finish_mkwrite_fault(struct vm_fault *vmf)
* pte_offset_map_lock.
*/
if (!pte_same(*vmf->pte, vmf->orig_pte)) {
+ update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
pte_unmap_unlock(vmf->pte, vmf->ptl);
return VM_FAULT_NOPAGE;
}
@@ -2889,9 +3059,9 @@ static vm_fault_t wp_page_shared(struct vm_fault *vmf)
* change only once the write actually happens. This avoids a few races,
* and potentially makes it more efficient.
*
- * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * We enter with non-exclusive mmap_lock (to exclude vma changes,
* but allow concurrent faults), with pte both mapped and locked.
- * We return with mmap_sem still held, but pte unmapped and unlocked.
+ * We return with mmap_lock still held, but pte unmapped and unlocked.
*/
static vm_fault_t do_wp_page(struct vm_fault *vmf)
__releases(vmf->ptl)
@@ -2925,49 +3095,25 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf)
* not dirty accountable.
*/
if (PageAnon(vmf->page)) {
- int total_map_swapcount;
- if (PageKsm(vmf->page) && (PageSwapCache(vmf->page) ||
- page_count(vmf->page) != 1))
+ struct page *page = vmf->page;
+
+ /* PageKsm() doesn't necessarily raise the page refcount */
+ if (PageKsm(page) || page_count(page) != 1)
+ goto copy;
+ if (!trylock_page(page))
+ goto copy;
+ if (PageKsm(page) || page_mapcount(page) != 1 || page_count(page) != 1) {
+ unlock_page(page);
goto copy;
- if (!trylock_page(vmf->page)) {
- get_page(vmf->page);
- pte_unmap_unlock(vmf->pte, vmf->ptl);
- lock_page(vmf->page);
- vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
- vmf->address, &vmf->ptl);
- if (!pte_same(*vmf->pte, vmf->orig_pte)) {
- unlock_page(vmf->page);
- pte_unmap_unlock(vmf->pte, vmf->ptl);
- put_page(vmf->page);
- return 0;
- }
- put_page(vmf->page);
- }
- if (PageKsm(vmf->page)) {
- bool reused = reuse_ksm_page(vmf->page, vmf->vma,
- vmf->address);
- unlock_page(vmf->page);
- if (!reused)
- goto copy;
- wp_page_reuse(vmf);
- return VM_FAULT_WRITE;
- }
- if (reuse_swap_page(vmf->page, &total_map_swapcount)) {
- if (total_map_swapcount == 1) {
- /*
- * The page is all ours. Move it to
- * our anon_vma so the rmap code will
- * not search our parent or siblings.
- * Protected against the rmap code by
- * the page lock.
- */
- page_move_anon_rmap(vmf->page, vma);
- }
- unlock_page(vmf->page);
- wp_page_reuse(vmf);
- return VM_FAULT_WRITE;
}
- unlock_page(vmf->page);
+ /*
+ * Ok, we've got the only map reference, and the only
+ * page count reference, and the page is locked,
+ * it's dark out, and we're wearing sunglasses. Hit it.
+ */
+ unlock_page(page);
+ wp_page_reuse(vmf);
+ return VM_FAULT_WRITE;
} else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) ==
(VM_WRITE|VM_SHARED))) {
return wp_page_shared(vmf);
@@ -3079,23 +3225,23 @@ void unmap_mapping_range(struct address_space *mapping,
EXPORT_SYMBOL(unmap_mapping_range);
/*
- * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * We enter with non-exclusive mmap_lock (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
* We return with pte unmapped and unlocked.
*
- * We return with the mmap_sem locked or unlocked in the same cases
+ * We return with the mmap_lock locked or unlocked in the same cases
* as does filemap_fault().
*/
vm_fault_t do_swap_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
struct page *page = NULL, *swapcache;
- struct mem_cgroup *memcg;
swp_entry_t entry;
pte_t pte;
int locked;
int exclusive = 0;
vm_fault_t ret = 0;
+ void *shadow = NULL;
if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte))
goto out;
@@ -3125,16 +3271,33 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
if (!page) {
struct swap_info_struct *si = swp_swap_info(entry);
- if (si->flags & SWP_SYNCHRONOUS_IO &&
- __swap_count(entry) == 1) {
+ if (data_race(si->flags & SWP_SYNCHRONOUS_IO) &&
+ __swap_count(entry) == 1) {
/* skip swapcache */
page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma,
vmf->address);
if (page) {
+ int err;
+
__SetPageLocked(page);
__SetPageSwapBacked(page);
set_page_private(page, entry.val);
- lru_cache_add_anon(page);
+
+ /* Tell memcg to use swap ownership records */
+ SetPageSwapCache(page);
+ err = mem_cgroup_charge(page, vma->vm_mm,
+ GFP_KERNEL);
+ ClearPageSwapCache(page);
+ if (err) {
+ ret = VM_FAULT_OOM;
+ goto out_page;
+ }
+
+ shadow = get_shadow_from_swap_cache(entry);
+ if (shadow)
+ workingset_refault(page, shadow);
+
+ lru_cache_add(page);
swap_readpage(page, true);
}
} else {
@@ -3195,11 +3358,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
goto out_page;
}
- if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL,
- &memcg, false)) {
- ret = VM_FAULT_OOM;
- goto out_page;
- }
+ cgroup_throttle_swaprate(page, GFP_KERNEL);
/*
* Back out if somebody else already faulted in this pte.
@@ -3247,12 +3406,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
/* ksm created a completely new copy */
if (unlikely(page != swapcache && swapcache)) {
page_add_new_anon_rmap(page, vma, vmf->address, false);
- mem_cgroup_commit_charge(page, memcg, false, false);
- lru_cache_add_active_or_unevictable(page, vma);
+ lru_cache_add_inactive_or_unevictable(page, vma);
} else {
do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
- mem_cgroup_commit_charge(page, memcg, true, false);
- activate_page(page);
}
swap_free(entry);
@@ -3287,7 +3443,6 @@ unlock:
out:
return ret;
out_nomap:
- mem_cgroup_cancel_charge(page, memcg, false);
pte_unmap_unlock(vmf->pte, vmf->ptl);
out_page:
unlock_page(page);
@@ -3301,14 +3456,13 @@ out_release:
}
/*
- * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * We enter with non-exclusive mmap_lock (to exclude vma changes,
* but allow concurrent faults), and pte mapped but not yet locked.
- * We return with mmap_sem still held, but pte unmapped and unlocked.
+ * We return with mmap_lock still held, but pte unmapped and unlocked.
*/
static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
{
struct vm_area_struct *vma = vmf->vma;
- struct mem_cgroup *memcg;
struct page *page;
vm_fault_t ret = 0;
pte_t entry;
@@ -3322,10 +3476,10 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
* pte_offset_map() on pmds where a huge pmd might be created
* from a different thread.
*
- * pte_alloc_map() is safe to use under down_write(mmap_sem) or when
+ * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when
* parallel threads are excluded by other means.
*
- * Here we only have down_read(mmap_sem).
+ * Here we only have mmap_read_lock(mm).
*/
if (pte_alloc(vma->vm_mm, vmf->pmd))
return VM_FAULT_OOM;
@@ -3341,8 +3495,10 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
vma->vm_page_prot));
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd,
vmf->address, &vmf->ptl);
- if (!pte_none(*vmf->pte))
+ if (!pte_none(*vmf->pte)) {
+ update_mmu_tlb(vma, vmf->address, vmf->pte);
goto unlock;
+ }
ret = check_stable_address_space(vma->vm_mm);
if (ret)
goto unlock;
@@ -3361,9 +3517,9 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
if (!page)
goto oom;
- if (mem_cgroup_try_charge_delay(page, vma->vm_mm, GFP_KERNEL, &memcg,
- false))
+ if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
goto oom_free_page;
+ cgroup_throttle_swaprate(page, GFP_KERNEL);
/*
* The memory barrier inside __SetPageUptodate makes sure that
@@ -3373,13 +3529,16 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
__SetPageUptodate(page);
entry = mk_pte(page, vma->vm_page_prot);
+ entry = pte_sw_mkyoung(entry);
if (vma->vm_flags & VM_WRITE)
entry = pte_mkwrite(pte_mkdirty(entry));
vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address,
&vmf->ptl);
- if (!pte_none(*vmf->pte))
+ if (!pte_none(*vmf->pte)) {
+ update_mmu_cache(vma, vmf->address, vmf->pte);
goto release;
+ }
ret = check_stable_address_space(vma->vm_mm);
if (ret)
@@ -3388,15 +3547,13 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf)
/* Deliver the page fault to userland, check inside PT lock */
if (userfaultfd_missing(vma)) {
pte_unmap_unlock(vmf->pte, vmf->ptl);
- mem_cgroup_cancel_charge(page, memcg, false);
put_page(page);
return handle_userfault(vmf, VM_UFFD_MISSING);
}
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, vmf->address, false);
- mem_cgroup_commit_charge(page, memcg, false, false);
- lru_cache_add_active_or_unevictable(page, vma);
+ lru_cache_add_inactive_or_unevictable(page, vma);
setpte:
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry);
@@ -3406,7 +3563,6 @@ unlock:
pte_unmap_unlock(vmf->pte, vmf->ptl);
return ret;
release:
- mem_cgroup_cancel_charge(page, memcg, false);
put_page(page);
goto unlock;
oom_free_page:
@@ -3416,7 +3572,7 @@ oom:
}
/*
- * The mmap_sem must have been held on entry, and may have been
+ * The mmap_lock must have been held on entry, and may have been
* released depending on flags and vma->vm_ops->fault() return value.
* See filemap_fault() and __lock_page_retry().
*/
@@ -3611,7 +3767,6 @@ static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
* mapping. If needed, the fucntion allocates page table or use pre-allocated.
*
* @vmf: fault environment
- * @memcg: memcg to charge page (only for private mappings)
* @page: page to map
*
* Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on
@@ -3622,8 +3777,7 @@ static vm_fault_t do_set_pmd(struct vm_fault *vmf, struct page *page)
*
* Return: %0 on success, %VM_FAULT_ code in case of error.
*/
-vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
- struct page *page)
+vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct page *page)
{
struct vm_area_struct *vma = vmf->vma;
bool write = vmf->flags & FAULT_FLAG_WRITE;
@@ -3631,9 +3785,6 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
vm_fault_t ret;
if (pmd_none(*vmf->pmd) && PageTransCompound(page)) {
- /* THP on COW? */
- VM_BUG_ON_PAGE(memcg, page);
-
ret = do_set_pmd(vmf, page);
if (ret != VM_FAULT_FALLBACK)
return ret;
@@ -3646,19 +3797,21 @@ vm_fault_t alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg,
}
/* Re-check under ptl */
- if (unlikely(!pte_none(*vmf->pte)))
+ if (unlikely(!pte_none(*vmf->pte))) {
+ update_mmu_tlb(vma, vmf->address, vmf->pte);
return VM_FAULT_NOPAGE;
+ }
flush_icache_page(vma, page);
entry = mk_pte(page, vma->vm_page_prot);
+ entry = pte_sw_mkyoung(entry);
if (write)
entry = maybe_mkwrite(pte_mkdirty(entry), vma);
/* copy-on-write page */
if (write && !(vma->vm_flags & VM_SHARED)) {
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, vmf->address, false);
- mem_cgroup_commit_charge(page, memcg, false, false);
- lru_cache_add_active_or_unevictable(page, vma);
+ lru_cache_add_inactive_or_unevictable(page, vma);
} else {
inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page));
page_add_file_rmap(page, false);
@@ -3706,7 +3859,7 @@ vm_fault_t finish_fault(struct vm_fault *vmf)
if (!(vmf->vma->vm_flags & VM_SHARED))
ret = check_stable_address_space(vmf->vma->vm_mm);
if (!ret)
- ret = alloc_set_pte(vmf, vmf->memcg, page);
+ ret = alloc_set_pte(vmf, page);
if (vmf->pte)
pte_unmap_unlock(vmf->pte, vmf->ptl);
return ret;
@@ -3866,11 +4019,11 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf)
if (!vmf->cow_page)
return VM_FAULT_OOM;
- if (mem_cgroup_try_charge_delay(vmf->cow_page, vma->vm_mm, GFP_KERNEL,
- &vmf->memcg, false)) {
+ if (mem_cgroup_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL)) {
put_page(vmf->cow_page);
return VM_FAULT_OOM;
}
+ cgroup_throttle_swaprate(vmf->cow_page, GFP_KERNEL);
ret = __do_fault(vmf);
if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY)))
@@ -3888,7 +4041,6 @@ static vm_fault_t do_cow_fault(struct vm_fault *vmf)
goto uncharge_out;
return ret;
uncharge_out:
- mem_cgroup_cancel_charge(vmf->cow_page, vmf->memcg, false);
put_page(vmf->cow_page);
return ret;
}
@@ -3929,11 +4081,11 @@ static vm_fault_t do_shared_fault(struct vm_fault *vmf)
}
/*
- * We enter with non-exclusive mmap_sem (to exclude vma changes,
+ * We enter with non-exclusive mmap_lock (to exclude vma changes,
* but allow concurrent faults).
- * The mmap_sem may have been released depending on flags and our
+ * The mmap_lock may have been released depending on flags and our
* return value. See filemap_fault() and __lock_page_or_retry().
- * If mmap_sem is released, vma may become invalid (for example
+ * If mmap_lock is released, vma may become invalid (for example
* by other thread calling munmap()).
*/
static vm_fault_t do_fault(struct vm_fault *vmf)
@@ -4162,10 +4314,10 @@ static vm_fault_t wp_huge_pud(struct vm_fault *vmf, pud_t orig_pud)
* with external mmu caches can use to update those (ie the Sparc or
* PowerPC hashed page tables that act as extended TLBs).
*
- * We enter with non-exclusive mmap_sem (to exclude vma changes, but allow
+ * We enter with non-exclusive mmap_lock (to exclude vma changes, but allow
* concurrent faults).
*
- * The mmap_sem may have been released depending on flags and our return value.
+ * The mmap_lock may have been released depending on flags and our return value.
* See filemap_fault() and __lock_page_or_retry().
*/
static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
@@ -4187,7 +4339,7 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
/*
* A regular pmd is established and it can't morph into a huge
* pmd from under us anymore at this point because we hold the
- * mmap_sem read mode and khugepaged takes it in write mode.
+ * mmap_lock read mode and khugepaged takes it in write mode.
* So now it's safe to run pte_offset_map().
*/
vmf->pte = pte_offset_map(vmf->pmd, vmf->address);
@@ -4224,8 +4376,10 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd);
spin_lock(vmf->ptl);
entry = vmf->orig_pte;
- if (unlikely(!pte_same(*vmf->pte, entry)))
+ if (unlikely(!pte_same(*vmf->pte, entry))) {
+ update_mmu_tlb(vmf->vma, vmf->address, vmf->pte);
goto unlock;
+ }
if (vmf->flags & FAULT_FLAG_WRITE) {
if (!pte_write(entry))
return do_wp_page(vmf);
@@ -4236,6 +4390,9 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
vmf->flags & FAULT_FLAG_WRITE)) {
update_mmu_cache(vmf->vma, vmf->address, vmf->pte);
} else {
+ /* Skip spurious TLB flush for retried page fault */
+ if (vmf->flags & FAULT_FLAG_TRIED)
+ goto unlock;
/*
* This is needed only for protection faults but the arch code
* is not yet telling us if this is a protection fault or not.
@@ -4253,7 +4410,7 @@ unlock:
/*
* By the time we get here, we already hold the mm semaphore
*
- * The mmap_sem may have been released depending on flags and our
+ * The mmap_lock may have been released depending on flags and our
* return value. See filemap_fault() and __lock_page_or_retry().
*/
static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
@@ -4345,14 +4502,75 @@ retry_pud:
return handle_pte_fault(&vmf);
}
+/**
+ * mm_account_fault - Do page fault accounting
+ *
+ * @regs: the pt_regs struct pointer. When set to NULL, the perf event
+ * counters are not updated, but the per-task accounting is still done for
+ * the task that triggered this page fault.
+ * @address: the faulted address.
+ * @flags: the fault flags.
+ * @ret: the fault retcode.
+ *
+ * This takes care of most of the page fault accounting, including the
+ * PERF_COUNT_SW_PAGE_FAULTS_[MAJ|MIN] perf counter updates. Note, however,
+ * that PERF_COUNT_SW_PAGE_FAULTS itself must still be handled by the
+ * per-arch page fault handlers at the entry of the page fault.
+ */
+static inline void mm_account_fault(struct pt_regs *regs,
+ unsigned long address, unsigned int flags,
+ vm_fault_t ret)
+{
+ bool major;
+
+ /*
+ * We don't do accounting for some specific faults:
+ *
+ * - Unsuccessful faults (e.g. when the address wasn't valid). That
+ * includes arch_vma_access_permitted() failing before reaching here.
+ * So this is not a "this many hardware page faults" counter. We
+ * should use the hw profiling for that.
+ *
+ * - Incomplete faults (VM_FAULT_RETRY). They will only be counted
+ * once they're completed.
+ */
+ if (ret & (VM_FAULT_ERROR | VM_FAULT_RETRY))
+ return;
+
+ /*
+ * We define the fault as a major fault when the final successful fault
+ * is VM_FAULT_MAJOR, or if it retried (which implies that we couldn't
+ * handle it immediately previously).
+ */
+ major = (ret & VM_FAULT_MAJOR) || (flags & FAULT_FLAG_TRIED);
+
+ if (major)
+ current->maj_flt++;
+ else
+ current->min_flt++;
+
+ /*
+ * If the fault is done for GUP, regs will be NULL. In that case we
+ * only update the per-thread fault counters of the task that triggered
+ * the fault, and we skip the perf event updates.
+ */
+ if (!regs)
+ return;
+
+ if (major)
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MAJ, 1, regs, address);
+ else
+ perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS_MIN, 1, regs, address);
+}
+
/*
* By the time we get here, we already hold the mm semaphore
*
- * The mmap_sem may have been released depending on flags and our
+ * The mmap_lock may have been released depending on flags and our
* return value. See filemap_fault() and __lock_page_or_retry().
*/
vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
- unsigned int flags)
+ unsigned int flags, struct pt_regs *regs)
{
vm_fault_t ret;
@@ -4393,6 +4611,8 @@ vm_fault_t handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
mem_cgroup_oom_synchronize(false);
}
+ mm_account_fault(regs, address, flags, ret);
+
return ret;
}
EXPORT_SYMBOL_GPL(handle_mm_fault);
@@ -4434,19 +4654,11 @@ int __pud_alloc(struct mm_struct *mm, p4d_t *p4d, unsigned long address)
smp_wmb(); /* See comment in __pte_alloc */
spin_lock(&mm->page_table_lock);
-#ifndef __ARCH_HAS_5LEVEL_HACK
if (!p4d_present(*p4d)) {
mm_inc_nr_puds(mm);
p4d_populate(mm, p4d, new);
} else /* Another has populated it */
pud_free(mm, new);
-#else
- if (!pgd_present(*p4d)) {
- mm_inc_nr_puds(mm);
- pgd_populate(mm, p4d, new);
- } else /* Another has populated it */
- pud_free(mm, new);
-#endif /* __ARCH_HAS_5LEVEL_HACK */
spin_unlock(&mm->page_table_lock);
return 0;
}
@@ -4665,7 +4877,7 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
void *old_buf = buf;
int write = gup_flags & FOLL_WRITE;
- if (down_read_killable(&mm->mmap_sem))
+ if (mmap_read_lock_killable(mm))
return 0;
/* ignore errors, just check how much was successfully transferred */
@@ -4674,7 +4886,7 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
void *maddr;
struct page *page = NULL;
- ret = get_user_pages_remote(tsk, mm, addr, 1,
+ ret = get_user_pages_remote(mm, addr, 1,
gup_flags, &page, &vma, NULL);
if (ret <= 0) {
#ifndef CONFIG_HAVE_IOREMAP_PROT
@@ -4716,7 +4928,7 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
buf += bytes;
addr += bytes;
}
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
return buf - old_buf;
}
@@ -4773,7 +4985,7 @@ void print_vma_addr(char *prefix, unsigned long ip)
/*
* we might be running from an atomic context so we cannot sleep
*/
- if (!down_read_trylock(&mm->mmap_sem))
+ if (!mmap_read_trylock(mm))
return;
vma = find_vma(mm, ip);
@@ -4792,7 +5004,7 @@ void print_vma_addr(char *prefix, unsigned long ip)
free_page((unsigned long)buf);
}
}
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
}
#if defined(CONFIG_PROVE_LOCKING) || defined(CONFIG_DEBUG_ATOMIC_SLEEP)
@@ -4800,7 +5012,7 @@ void __might_fault(const char *file, int line)
{
/*
* Some code (nfs/sunrpc) uses socket ops on kernel memory while
- * holding the mmap_sem, this is safe because kernel memory doesn't
+ * holding the mmap_lock, this is safe because kernel memory doesn't
* get paged out, therefore we'll never actually fault, and the
* below annotations will generate false positives.
*/
@@ -4811,7 +5023,7 @@ void __might_fault(const char *file, int line)
__might_sleep(file, line, 0);
#if defined(CONFIG_DEBUG_ATOMIC_SLEEP)
if (current->mm)
- might_lock_read(&current->mm->mmap_sem);
+ might_lock_read(&current->mm->mmap_lock);
#endif
}
EXPORT_SYMBOL(__might_fault);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index fc0aad0bc1f5..ce3e73e3a5c1 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -98,11 +98,14 @@ void mem_hotplug_done(void)
u64 max_mem_size = U64_MAX;
/* add this memory to iomem resource */
-static struct resource *register_memory_resource(u64 start, u64 size)
+static struct resource *register_memory_resource(u64 start, u64 size,
+ const char *resource_name)
{
struct resource *res;
unsigned long flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
- char *resource_name = "System RAM";
+
+ if (strcmp(resource_name, "System RAM"))
+ flags |= IORESOURCE_MEM_DRIVER_MANAGED;
/*
* Make sure value parsed from 'mem=' only restricts memory adding
@@ -347,6 +350,16 @@ int __ref __add_pages(int nid, unsigned long pfn, unsigned long nr_pages,
return err;
}
+#ifdef CONFIG_NUMA
+int __weak memory_add_physaddr_to_nid(u64 start)
+{
+ pr_info_once("Unknown target node for memory at 0x%llx, assuming node 0\n",
+ start);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(memory_add_physaddr_to_nid);
+#endif
+
/* find the smallest valid pfn in the range [start_pfn, end_pfn) */
static unsigned long find_smallest_section_pfn(int nid, struct zone *zone,
unsigned long start_pfn,
@@ -468,11 +481,20 @@ void __ref remove_pfn_range_from_zone(struct zone *zone,
unsigned long start_pfn,
unsigned long nr_pages)
{
+ const unsigned long end_pfn = start_pfn + nr_pages;
struct pglist_data *pgdat = zone->zone_pgdat;
- unsigned long flags;
+ unsigned long pfn, cur_nr_pages, flags;
/* Poison struct pages because they are now uninitialized again. */
- page_init_poison(pfn_to_page(start_pfn), sizeof(struct page) * nr_pages);
+ for (pfn = start_pfn; pfn < end_pfn; pfn += cur_nr_pages) {
+ cond_resched();
+
+ /* Select all remaining pages up to the next section boundary */
+ cur_nr_pages =
+ min(end_pfn - pfn, SECTION_ALIGN_UP(pfn + 1) - pfn);
+ page_init_poison(pfn_to_page(pfn),
+ sizeof(struct page) * cur_nr_pages);
+ }
#ifdef CONFIG_ZONE_DEVICE
/*
@@ -707,7 +729,7 @@ void __ref move_pfn_range_to_zone(struct zone *zone, unsigned long start_pfn,
* are reserved so nobody should be touching them so we should be safe
*/
memmap_init_zone(nr_pages, nid, zone_idx(zone), start_pfn,
- MEMMAP_HOTPLUG, altmap);
+ MEMINIT_HOTPLUG, altmap);
set_zone_contiguous(zone);
}
@@ -819,21 +841,26 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages,
zone->zone_pgdat->node_present_pages += onlined_pages;
pgdat_resize_unlock(zone->zone_pgdat, &flags);
+ /*
+ * When exposing larger, physically contiguous memory areas to the
+ * buddy, shuffling in the buddy (when freeing onlined pages, putting
+ * them either to the head or the tail of the freelist) is only helpful
+ * for maintaining the shuffle, but not for creating the initial
+ * shuffle. Shuffle the whole zone to make sure the just onlined pages
+ * are properly distributed across the whole freelist.
+ */
shuffle_zone(zone);
node_states_set_node(nid, &arg);
if (need_zonelists_rebuild)
build_all_zonelists(NULL);
- else
- zone_pcp_update(zone);
+ zone_pcp_update(zone);
init_per_zone_wmark_min();
kswapd_run(nid);
kcompactd_run(nid);
- vm_total_pages = nr_free_pagecache_pages();
-
writeback_set_ratelimit();
memory_notify(MEM_ONLINE, &arg);
@@ -862,10 +889,9 @@ static void reset_node_present_pages(pg_data_t *pgdat)
}
/* we are OK calling __meminit stuff here - we have CONFIG_MEMORY_HOTPLUG */
-static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
+static pg_data_t __ref *hotadd_new_pgdat(int nid)
{
struct pglist_data *pgdat;
- unsigned long start_pfn = PFN_DOWN(start);
pgdat = NODE_DATA(nid);
if (!pgdat) {
@@ -879,13 +905,13 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
} else {
int cpu;
/*
- * Reset the nr_zones, order and classzone_idx before reuse.
- * Note that kswapd will init kswapd_classzone_idx properly
+ * Reset the nr_zones, order and highest_zoneidx before reuse.
+ * Note that kswapd will init kswapd_highest_zoneidx properly
* when it starts in the near future.
*/
pgdat->nr_zones = 0;
pgdat->kswapd_order = 0;
- pgdat->kswapd_classzone_idx = 0;
+ pgdat->kswapd_highest_zoneidx = 0;
for_each_online_cpu(cpu) {
struct per_cpu_nodestat *p;
@@ -895,9 +921,8 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start)
}
/* we can use NODE_DATA(nid) from here */
-
pgdat->node_id = nid;
- pgdat->node_start_pfn = start_pfn;
+ pgdat->node_start_pfn = 0;
/* init node's zones as empty zones, we don't have any present pages.*/
free_area_init_core_hotplug(nid);
@@ -932,7 +957,6 @@ static void rollback_node_hotadd(int nid)
/**
* try_online_node - online a node if offlined
* @nid: the node ID
- * @start: start addr of the node
* @set_node_online: Whether we want to online the node
* called by cpu_up() to online a node without onlined memory.
*
@@ -941,7 +965,7 @@ static void rollback_node_hotadd(int nid)
* 0 -> the node is already online
* -ENOMEM -> the node could not be allocated
*/
-static int __try_online_node(int nid, u64 start, bool set_node_online)
+static int __try_online_node(int nid, bool set_node_online)
{
pg_data_t *pgdat;
int ret = 1;
@@ -949,7 +973,7 @@ static int __try_online_node(int nid, u64 start, bool set_node_online)
if (node_online(nid))
return 0;
- pgdat = hotadd_new_pgdat(nid, start);
+ pgdat = hotadd_new_pgdat(nid);
if (!pgdat) {
pr_err("Cannot online node %d due to NULL pgdat\n", nid);
ret = -ENOMEM;
@@ -973,7 +997,7 @@ int try_online_node(int nid)
int ret;
mem_hotplug_begin();
- ret = __try_online_node(nid, 0, true);
+ ret = __try_online_node(nid, true);
mem_hotplug_done();
return ret;
}
@@ -1017,17 +1041,17 @@ int __ref add_memory_resource(int nid, struct resource *res)
if (ret)
return ret;
+ if (!node_possible(nid)) {
+ WARN(1, "node %d was absent from the node_possible_map\n", nid);
+ return -EINVAL;
+ }
+
mem_hotplug_begin();
- /*
- * Add new range to memblock so that when hotadd_new_pgdat() is called
- * to allocate new pgdat, get_pfn_range_for_nid() will be able to find
- * this new range and calculate total pages correctly. The range will
- * be removed at hot-remove time.
- */
- memblock_add_node(start, size, nid);
+ if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
+ memblock_add_node(start, size, nid);
- ret = __try_online_node(nid, start, false);
+ ret = __try_online_node(nid, false);
if (ret < 0)
goto error;
new_node = ret;
@@ -1056,11 +1080,13 @@ int __ref add_memory_resource(int nid, struct resource *res)
}
/* link memory sections under this node.*/
- ret = link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1));
+ ret = link_mem_sections(nid, PFN_DOWN(start), PFN_UP(start + size - 1),
+ MEMINIT_HOTPLUG);
BUG_ON(ret);
/* create new memmap entry */
- firmware_map_add_hotplug(start, start + size, "System RAM");
+ if (!strcmp(res->name, "System RAM"))
+ firmware_map_add_hotplug(start, start + size, "System RAM");
/* device_online() will take the lock when calling online_pages() */
mem_hotplug_done();
@@ -1074,7 +1100,8 @@ error:
/* rollback pgdat allocation and others */
if (new_node)
rollback_node_hotadd(nid);
- memblock_remove(start, size);
+ if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK))
+ memblock_remove(start, size);
mem_hotplug_done();
return ret;
}
@@ -1085,7 +1112,7 @@ int __ref __add_memory(int nid, u64 start, u64 size)
struct resource *res;
int ret;
- res = register_memory_resource(start, size);
+ res = register_memory_resource(start, size, "System RAM");
if (IS_ERR(res))
return PTR_ERR(res);
@@ -1107,82 +1134,57 @@ int add_memory(int nid, u64 start, u64 size)
}
EXPORT_SYMBOL_GPL(add_memory);
-#ifdef CONFIG_MEMORY_HOTREMOVE
/*
- * A free page on the buddy free lists (not the per-cpu lists) has PageBuddy
- * set and the size of the free page is given by page_order(). Using this,
- * the function determines if the pageblock contains only free pages.
- * Due to buddy contraints, a free page at least the size of a pageblock will
- * be located at the start of the pageblock
+ * Add special, driver-managed memory to the system as system RAM. Such
+ * memory is not exposed via the raw firmware-provided memmap as system
+ * RAM, instead, it is detected and added by a driver - during cold boot,
+ * after a reboot, and after kexec.
+ *
+ * Reasons why this memory should not be used for the initial memmap of a
+ * kexec kernel or for placing kexec images:
+ * - The booting kernel is in charge of determining how this memory will be
+ * used (e.g., use persistent memory as system RAM)
+ * - Coordination with a hypervisor is required before this memory
+ * can be used (e.g., inaccessible parts).
+ *
+ * For this memory, no entries in /sys/firmware/memmap ("raw firmware-provided
+ * memory map") are created. Also, the created memory resource is flagged
+ * with IORESOURCE_MEM_DRIVER_MANAGED, so in-kernel users can special-case
+ * this memory as well (esp., not place kexec images onto it).
+ *
+ * The resource_name (visible via /proc/iomem) has to have the format
+ * "System RAM ($DRIVER)".
*/
-static inline int pageblock_free(struct page *page)
+int add_memory_driver_managed(int nid, u64 start, u64 size,
+ const char *resource_name)
{
- return PageBuddy(page) && page_order(page) >= pageblock_order;
-}
+ struct resource *res;
+ int rc;
-/* Return the pfn of the start of the next active pageblock after a given pfn */
-static unsigned long next_active_pageblock(unsigned long pfn)
-{
- struct page *page = pfn_to_page(pfn);
+ if (!resource_name ||
+ strstr(resource_name, "System RAM (") != resource_name ||
+ resource_name[strlen(resource_name) - 1] != ')')
+ return -EINVAL;
- /* Ensure the starting page is pageblock-aligned */
- BUG_ON(pfn & (pageblock_nr_pages - 1));
+ lock_device_hotplug();
- /* If the entire pageblock is free, move to the end of free page */
- if (pageblock_free(page)) {
- int order;
- /* be careful. we don't have locks, page_order can be changed.*/
- order = page_order(page);
- if ((order < MAX_ORDER) && (order >= pageblock_order))
- return pfn + (1 << order);
+ res = register_memory_resource(start, size, resource_name);
+ if (IS_ERR(res)) {
+ rc = PTR_ERR(res);
+ goto out_unlock;
}
- return pfn + pageblock_nr_pages;
-}
-
-static bool is_pageblock_removable_nolock(unsigned long pfn)
-{
- struct page *page = pfn_to_page(pfn);
- struct zone *zone;
-
- /*
- * We have to be careful here because we are iterating over memory
- * sections which are not zone aware so we might end up outside of
- * the zone but still within the section.
- * We have to take care about the node as well. If the node is offline
- * its NODE_DATA will be NULL - see page_zone.
- */
- if (!node_online(page_to_nid(page)))
- return false;
-
- zone = page_zone(page);
- pfn = page_to_pfn(page);
- if (!zone_spans_pfn(zone, pfn))
- return false;
-
- return !has_unmovable_pages(zone, page, MIGRATE_MOVABLE,
- MEMORY_OFFLINE);
-}
-
-/* Checks if this range of memory is likely to be hot-removable. */
-bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages)
-{
- unsigned long end_pfn, pfn;
-
- end_pfn = min(start_pfn + nr_pages,
- zone_end_pfn(page_zone(pfn_to_page(start_pfn))));
-
- /* Check the starting page of each pageblock within the range */
- for (pfn = start_pfn; pfn < end_pfn; pfn = next_active_pageblock(pfn)) {
- if (!is_pageblock_removable_nolock(pfn))
- return false;
- cond_resched();
- }
+ rc = add_memory_resource(nid, res);
+ if (rc < 0)
+ release_memory_resource(res);
- /* All pageblocks in the memory block are likely to be hot-removable */
- return true;
+out_unlock:
+ unlock_device_hotplug();
+ return rc;
}
+EXPORT_SYMBOL_GPL(add_memory_driver_managed);
+#ifdef CONFIG_MEMORY_HOTREMOVE
/*
* Confirm all pages in a range [start, end) belong to the same zone (skipping
* memory holes). When true, return the zone.
@@ -1224,11 +1226,17 @@ struct zone *test_pages_in_a_zone(unsigned long start_pfn,
/*
* Scan pfn range [start,end) to find movable/migratable pages (LRU pages,
- * non-lru movable pages and hugepages). We scan pfn because it's much
- * easier than scanning over linked list. This function returns the pfn
- * of the first found movable page if it's found, otherwise 0.
+ * non-lru movable pages and hugepages). Will skip over most unmovable
+ * pages (esp., pages that can be skipped when offlining), but bail out on
+ * definitely unmovable pages.
+ *
+ * Returns:
+ * 0 in case a movable page is found and movable_pfn was updated.
+ * -ENOENT in case no movable page was found.
+ * -EBUSY in case a definitely unmovable page was found.
*/
-static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
+static int scan_movable_pages(unsigned long start, unsigned long end,
+ unsigned long *movable_pfn)
{
unsigned long pfn;
@@ -1240,43 +1248,59 @@ static unsigned long scan_movable_pages(unsigned long start, unsigned long end)
continue;
page = pfn_to_page(pfn);
if (PageLRU(page))
- return pfn;
+ goto found;
if (__PageMovable(page))
- return pfn;
+ goto found;
+
+ /*
+ * PageOffline() pages that are not marked __PageMovable() and
+ * have a reference count > 0 (after MEM_GOING_OFFLINE) are
+ * definitely unmovable. If their reference count would be 0,
+ * they could at least be skipped when offlining memory.
+ */
+ if (PageOffline(page) && page_count(page))
+ return -EBUSY;
if (!PageHuge(page))
continue;
head = compound_head(page);
if (page_huge_active(head))
- return pfn;
+ goto found;
skip = compound_nr(head) - (page - head);
pfn += skip - 1;
}
+ return -ENOENT;
+found:
+ *movable_pfn = pfn;
return 0;
}
static struct page *new_node_page(struct page *page, unsigned long private)
{
- int nid = page_to_nid(page);
nodemask_t nmask = node_states[N_MEMORY];
+ struct migration_target_control mtc = {
+ .nid = page_to_nid(page),
+ .nmask = &nmask,
+ .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
+ };
/*
* try to allocate from a different node but reuse this node if there
* are no other online nodes to be used (e.g. we are offlining a part
* of the only existing node)
*/
- node_clear(nid, nmask);
+ node_clear(mtc.nid, nmask);
if (nodes_empty(nmask))
- node_set(nid, nmask);
+ node_set(mtc.nid, nmask);
- return new_page_nodemask(page, nid, &nmask);
+ return alloc_migration_target(page, (unsigned long)&mtc);
}
static int
do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
{
unsigned long pfn;
- struct page *page;
+ struct page *page, *head;
int ret = 0;
LIST_HEAD(source);
@@ -1284,15 +1308,14 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
if (!pfn_valid(pfn))
continue;
page = pfn_to_page(pfn);
+ head = compound_head(page);
if (PageHuge(page)) {
- struct page *head = compound_head(page);
pfn = page_to_pfn(head) + compound_nr(head) - 1;
isolate_huge_page(head, &source);
continue;
} else if (PageTransHuge(page))
- pfn = page_to_pfn(compound_head(page))
- + hpage_nr_pages(page) - 1;
+ pfn = page_to_pfn(head) + thp_nr_pages(page) - 1;
/*
* HWPoison pages have elevated reference counts so the migration would
@@ -1360,7 +1383,7 @@ offline_isolated_pages_cb(unsigned long start, unsigned long nr_pages,
}
/*
- * Check all pages in range, recoreded as memory resource, are isolated.
+ * Check all pages in range, recorded as memory resource, are isolated.
*/
static int
check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
@@ -1372,11 +1395,7 @@ check_pages_isolated_cb(unsigned long start_pfn, unsigned long nr_pages,
static int __init cmdline_parse_movable_node(char *p)
{
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
movable_node_enabled = true;
-#else
- pr_warn("movable_node parameter depends on CONFIG_HAVE_MEMBLOCK_NODE_MAP to work properly\n");
-#endif
return 0;
}
early_param("movable_node", cmdline_parse_movable_node);
@@ -1518,7 +1537,8 @@ static int __ref __offline_pages(unsigned long start_pfn,
}
do {
- for (pfn = start_pfn; pfn;) {
+ pfn = start_pfn;
+ do {
if (signal_pending(current)) {
ret = -EINTR;
reason = "signal backoff";
@@ -1528,14 +1548,19 @@ static int __ref __offline_pages(unsigned long start_pfn,
cond_resched();
lru_add_drain_all();
- pfn = scan_movable_pages(pfn, end_pfn);
- if (pfn) {
+ ret = scan_movable_pages(pfn, end_pfn, &pfn);
+ if (!ret) {
/*
* TODO: fatal migration failures should bail
* out
*/
do_migrate_range(pfn, end_pfn);
}
+ } while (!ret);
+
+ if (ret != -ENOENT) {
+ reason = "unmovable page";
+ goto failed_removal_isolated;
}
/*
@@ -1551,6 +1576,20 @@ static int __ref __offline_pages(unsigned long start_pfn,
/* check again */
ret = walk_system_ram_range(start_pfn, end_pfn - start_pfn,
NULL, check_pages_isolated_cb);
+ /*
+ * per-cpu pages are drained in start_isolate_page_range, but if
+ * there are still pages that are not free, make sure that we
+ * drain again, because when we isolated the range we might
+ * have raced with another thread that was adding pages to pcp
+ * list.
+ *
+ * Forward progress should be still guaranteed because
+ * pages on the pcp list can only belong to ZONE_MOVABLE
+ * because has_unmovable_pages explicitly checks for
+ * PageBuddy on freed pages on other zones.
+ */
+ if (ret)
+ drain_all_pages(zone);
} while (ret);
/* Ok, all of our target is isolated.
@@ -1589,7 +1628,6 @@ static int __ref __offline_pages(unsigned long start_pfn,
kcompactd_stop(node);
}
- vm_total_pages = nr_free_pagecache_pages();
writeback_set_ratelimit();
memory_notify(MEM_OFFLINE, &arg);
@@ -1736,7 +1774,7 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
*/
rc = walk_memory_blocks(start, size, NULL, check_memblock_offlined_cb);
if (rc)
- goto done;
+ return rc;
/* remove memmap entry */
firmware_map_remove(start, start + size, "System RAM");
@@ -1750,15 +1788,18 @@ static int __ref try_remove_memory(int nid, u64 start, u64 size)
mem_hotplug_begin();
arch_remove_memory(nid, start, size, NULL);
- memblock_free(start, size);
- memblock_remove(start, size);
+
+ if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK)) {
+ memblock_free(start, size);
+ memblock_remove(start, size);
+ }
+
__release_memory_resource(start, size);
try_offline_node(nid);
-done:
mem_hotplug_done();
- return rc;
+ return 0;
}
/**
@@ -1797,4 +1838,41 @@ int remove_memory(int nid, u64 start, u64 size)
return rc;
}
EXPORT_SYMBOL_GPL(remove_memory);
+
+/*
+ * Try to offline and remove a memory block. Might take a long time to
+ * finish in case memory is still in use. Primarily useful for memory devices
+ * that logically unplugged all memory (so it's no longer in use) and want to
+ * offline + remove the memory block.
+ */
+int offline_and_remove_memory(int nid, u64 start, u64 size)
+{
+ struct memory_block *mem;
+ int rc = -EINVAL;
+
+ if (!IS_ALIGNED(start, memory_block_size_bytes()) ||
+ size != memory_block_size_bytes())
+ return rc;
+
+ lock_device_hotplug();
+ mem = find_memory_block(__pfn_to_section(PFN_DOWN(start)));
+ if (mem)
+ rc = device_offline(&mem->dev);
+ /* Ignore if the device is already offline. */
+ if (rc > 0)
+ rc = 0;
+
+ /*
+ * In case we succeeded in offlining the memory block, remove it.
+ * This cannot fail as it cannot get onlined in the meantime.
+ */
+ if (!rc) {
+ rc = try_remove_memory(nid, start, size);
+ WARN_ON_ONCE(rc);
+ }
+ unlock_device_hotplug();
+
+ return rc;
+}
+EXPORT_SYMBOL_GPL(offline_and_remove_memory);
#endif /* CONFIG_MEMORY_HOTREMOVE */
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 48ba9729062e..eddbe4e56c73 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -129,7 +129,7 @@ static struct mempolicy preferred_node_policy[MAX_NUMNODES];
/**
* numa_map_to_online_node - Find closest online node
- * @nid: Node id to start the search
+ * @node: Node id to start the search
*
* Lookup the next closest node by distance if @nid is not online.
*/
@@ -224,7 +224,7 @@ static int mpol_new_bind(struct mempolicy *pol, const nodemask_t *nodes)
* handle an empty nodemask with MPOL_PREFERRED here.
*
* Must be called holding task's alloc_lock to protect task's mems_allowed
- * and mempolicy. May also be called holding the mmap_semaphore for write.
+ * and mempolicy. May also be called holding the mmap_lock for write.
*/
static int mpol_set_nodemask(struct mempolicy *pol,
const nodemask_t *nodes, struct nodemask_scratch *nsc)
@@ -368,7 +368,7 @@ static void mpol_rebind_preferred(struct mempolicy *pol,
/*
* mpol_rebind_policy - Migrate a policy to a different set of nodes
*
- * Per-vma policies are protected by mmap_sem. Allocations using per-task
+ * Per-vma policies are protected by mmap_lock. Allocations using per-task
* policies are protected by task->mems_allowed_seq to prevent a premature
* OOM/allocation failure due to parallel nodemask modification.
*/
@@ -398,17 +398,17 @@ void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new)
/*
* Rebind each vma in mm to new nodemask.
*
- * Call holding a reference to mm. Takes mm->mmap_sem during call.
+ * Call holding a reference to mm. Takes mm->mmap_lock during call.
*/
void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new)
{
struct vm_area_struct *vma;
- down_write(&mm->mmap_sem);
+ mmap_write_lock(mm);
for (vma = mm->mmap; vma; vma = vma->vm_next)
mpol_rebind_policy(vma->vm_policy, new);
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
}
static const struct mempolicy_operations mpol_ops[MPOL_MAX] = {
@@ -764,7 +764,7 @@ queue_pages_range(struct mm_struct *mm, unsigned long start, unsigned long end,
/*
* Apply policy to a single VMA
- * This must be called with the mmap_sem held for writing.
+ * This must be called with the mmap_lock held for writing.
*/
static int vma_replace_policy(struct vm_area_struct *vma,
struct mempolicy *pol)
@@ -789,7 +789,7 @@ static int vma_replace_policy(struct vm_area_struct *vma,
}
old = vma->vm_policy;
- vma->vm_policy = new; /* protected by mmap_sem */
+ vma->vm_policy = new; /* protected by mmap_lock */
mpol_put(old);
return 0;
@@ -927,15 +927,12 @@ static int lookup_node(struct mm_struct *mm, unsigned long addr)
int locked = 1;
err = get_user_pages_locked(addr & PAGE_MASK, 1, 0, &p, &locked);
- if (err == 0) {
- /* E.g. GUP interrupted by fatal signal */
- err = -EFAULT;
- } else if (err > 0) {
+ if (err > 0) {
err = page_to_nid(p);
put_page(p);
}
if (locked)
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
return err;
}
@@ -968,10 +965,10 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
* vma/shared policy at addr is NULL. We
* want to return MPOL_DEFAULT in this case.
*/
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
vma = find_vma_intersection(mm, addr, addr+1);
if (!vma) {
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
return -EFAULT;
}
if (vma->vm_ops && vma->vm_ops->get_policy)
@@ -988,7 +985,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
if (flags & MPOL_F_ADDR) {
/*
* Take a refcount on the mpol, lookup_node()
- * wil drop the mmap_sem, so after calling
+ * will drop the mmap_lock, so after calling
* lookup_node() only "pol" remains valid, "vma"
* is stale.
*/
@@ -1030,7 +1027,7 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask,
out:
mpol_cond_put(pol);
if (vma)
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
if (pol_refcount)
mpol_put(pol_refcount);
return err;
@@ -1052,7 +1049,7 @@ static int migrate_page_add(struct page *page, struct list_head *pagelist,
list_add_tail(&head->lru, pagelist);
mod_node_page_state(page_pgdat(head),
NR_ISOLATED_ANON + page_is_file_lru(head),
- hpage_nr_pages(head));
+ thp_nr_pages(head));
} else if (flags & MPOL_MF_STRICT) {
/*
* Non-movable page may reach here. And, there may be
@@ -1068,27 +1065,6 @@ static int migrate_page_add(struct page *page, struct list_head *pagelist,
return 0;
}
-/* page allocation callback for NUMA node migration */
-struct page *alloc_new_node_page(struct page *page, unsigned long node)
-{
- if (PageHuge(page))
- return alloc_huge_page_node(page_hstate(compound_head(page)),
- node);
- else if (PageTransHuge(page)) {
- struct page *thp;
-
- thp = alloc_pages_node(node,
- (GFP_TRANSHUGE | __GFP_THISNODE),
- HPAGE_PMD_ORDER);
- if (!thp)
- return NULL;
- prep_transhuge_page(thp);
- return thp;
- } else
- return __alloc_pages_node(node, GFP_HIGHUSER_MOVABLE |
- __GFP_THISNODE, 0);
-}
-
/*
* Migrate pages from one node to a target node.
* Returns error or the number of pages not migrated.
@@ -1099,6 +1075,10 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
nodemask_t nmask;
LIST_HEAD(pagelist);
int err = 0;
+ struct migration_target_control mtc = {
+ .nid = dest,
+ .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
+ };
nodes_clear(nmask);
node_set(source, nmask);
@@ -1113,8 +1093,8 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest,
flags | MPOL_MF_DISCONTIG_OK, &pagelist);
if (!list_empty(&pagelist)) {
- err = migrate_pages(&pagelist, alloc_new_node_page, NULL, dest,
- MIGRATE_SYNC, MR_SYSCALL);
+ err = migrate_pages(&pagelist, alloc_migration_target, NULL,
+ (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);
if (err)
putback_movable_pages(&pagelist);
}
@@ -1139,7 +1119,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
if (err)
return err;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
/*
* Find a 'source' bit set in 'tmp' whose corresponding 'dest'
@@ -1220,7 +1200,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
if (err < 0)
break;
}
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
if (err < 0)
return err;
return busy;
@@ -1237,7 +1217,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
static struct page *new_page(struct page *page, unsigned long start)
{
struct vm_area_struct *vma;
- unsigned long uninitialized_var(address);
+ unsigned long address;
vma = find_vma(current->mm, start);
while (vma) {
@@ -1343,12 +1323,12 @@ static long do_mbind(unsigned long start, unsigned long len,
{
NODEMASK_SCRATCH(scratch);
if (scratch) {
- down_write(&mm->mmap_sem);
+ mmap_write_lock(mm);
task_lock(current);
err = mpol_set_nodemask(new, nmask, scratch);
task_unlock(current);
if (err)
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
} else
err = -ENOMEM;
NODEMASK_SCRATCH_FREE(scratch);
@@ -1385,7 +1365,7 @@ up_out:
putback_movable_pages(&pagelist);
}
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
mpol_out:
mpol_put(new);
return err;
@@ -1632,14 +1612,14 @@ static int kernel_get_mempolicy(int __user *policy,
unsigned long flags)
{
int err;
- int uninitialized_var(pval);
+ int pval;
nodemask_t nodes;
- addr = untagged_addr(addr);
-
if (nmask != NULL && maxnode < nr_node_ids)
return -EINVAL;
+ addr = untagged_addr(addr);
+
err = do_get_mempolicy(&pval, &nodes, addr, flags);
if (err)
@@ -1893,7 +1873,7 @@ static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
* Return a nodemask representing a mempolicy for filtering nodes for
* page allocation
*/
-static nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
+nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy)
{
/* Lower zones don't get a nodemask applied for MPOL_BIND */
if (unlikely(policy->mode == MPOL_BIND) &&
@@ -2188,7 +2168,7 @@ static struct page *alloc_page_interleave(gfp_t gfp, unsigned order,
*
* This function allocates a page from the kernel page pool and applies
* a NUMA policy associated with the VMA or the current process.
- * When VMA is not NULL caller must hold down_read on the mmap_sem of the
+ * When VMA is not NULL caller must read-lock the mmap_lock of the
* mm_struct of the VMA to prevent it from going away. Should be used for
* all allocations for pages that will be mapped into user space. Returns
* NULL when no page can be allocated.
diff --git a/mm/mempool.c b/mm/mempool.c
index 85efab3da720..79bff63ecf27 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -489,7 +489,7 @@ void mempool_free(void *element, mempool_t *pool)
* ensures that there will be frees which return elements to the
* pool waking up the waiters.
*/
- if (unlikely(pool->curr_nr < pool->min_nr)) {
+ if (unlikely(READ_ONCE(pool->curr_nr) < pool->min_nr)) {
spin_lock_irqsave(&pool->lock, flags);
if (likely(pool->curr_nr < pool->min_nr)) {
add_element(pool, element);
diff --git a/mm/memremap.c b/mm/memremap.c
index 03e38b7a38f1..006dace60b1a 100644
--- a/mm/memremap.c
+++ b/mm/memremap.c
@@ -216,7 +216,7 @@ void *memremap_pages(struct dev_pagemap *pgmap, int nid)
return ERR_PTR(-EINVAL);
}
break;
- case MEMORY_DEVICE_DEVDAX:
+ case MEMORY_DEVICE_GENERIC:
need_devmap_managed = false;
break;
case MEMORY_DEVICE_PCI_P2PDMA:
diff --git a/mm/migrate.c b/mm/migrate.c
index 7160c1556f79..04a98bb2f568 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -193,7 +193,7 @@ void putback_movable_pages(struct list_head *l)
put_page(page);
} else {
mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
- page_is_file_lru(page), -hpage_nr_pages(page));
+ page_is_file_lru(page), -thp_nr_pages(page));
putback_lru_page(page);
}
}
@@ -246,13 +246,13 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
else if (pte_swp_uffd_wp(*pvmw.pte))
pte = pte_mkuffd_wp(pte);
- if (unlikely(is_zone_device_page(new))) {
- if (is_device_private_page(new)) {
- entry = make_device_private_entry(new, pte_write(pte));
- pte = swp_entry_to_pte(entry);
- if (pte_swp_uffd_wp(*pvmw.pte))
- pte = pte_mkuffd_wp(pte);
- }
+ if (unlikely(is_device_private_page(new))) {
+ entry = make_device_private_entry(new, pte_write(pte));
+ pte = swp_entry_to_pte(entry);
+ if (pte_swp_soft_dirty(*pvmw.pte))
+ pte = pte_swp_mksoft_dirty(pte);
+ if (pte_swp_uffd_wp(*pvmw.pte))
+ pte = pte_swp_mkuffd_wp(pte);
}
#ifdef CONFIG_HUGETLB_PAGE
@@ -386,7 +386,7 @@ static int expected_page_refs(struct address_space *mapping, struct page *page)
*/
expected_count += is_device_private_page(page);
if (mapping)
- expected_count += hpage_nr_pages(page) + page_has_private(page);
+ expected_count += thp_nr_pages(page) + page_has_private(page);
return expected_count;
}
@@ -441,7 +441,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
*/
newpage->index = page->index;
newpage->mapping = page->mapping;
- page_ref_add(newpage, hpage_nr_pages(page)); /* add cache reference */
+ page_ref_add(newpage, thp_nr_pages(page)); /* add cache reference */
if (PageSwapBacked(page)) {
__SetPageSwapBacked(newpage);
if (PageSwapCache(page)) {
@@ -474,7 +474,7 @@ int migrate_page_move_mapping(struct address_space *mapping,
* to one less reference.
* We know this isn't the last reference.
*/
- page_ref_unfreeze(page, expected_count - hpage_nr_pages(page));
+ page_ref_unfreeze(page, expected_count - thp_nr_pages(page));
xas_unlock(&xas);
/* Leave irq disabled to prevent preemption while updating stats */
@@ -490,11 +490,18 @@ int migrate_page_move_mapping(struct address_space *mapping,
* are mapped to swap space.
*/
if (newzone != oldzone) {
- __dec_node_state(oldzone->zone_pgdat, NR_FILE_PAGES);
- __inc_node_state(newzone->zone_pgdat, NR_FILE_PAGES);
+ struct lruvec *old_lruvec, *new_lruvec;
+ struct mem_cgroup *memcg;
+
+ memcg = page_memcg(page);
+ old_lruvec = mem_cgroup_lruvec(memcg, oldzone->zone_pgdat);
+ new_lruvec = mem_cgroup_lruvec(memcg, newzone->zone_pgdat);
+
+ __dec_lruvec_state(old_lruvec, NR_FILE_PAGES);
+ __inc_lruvec_state(new_lruvec, NR_FILE_PAGES);
if (PageSwapBacked(page) && !PageSwapCache(page)) {
- __dec_node_state(oldzone->zone_pgdat, NR_SHMEM);
- __inc_node_state(newzone->zone_pgdat, NR_SHMEM);
+ __dec_lruvec_state(old_lruvec, NR_SHMEM);
+ __inc_lruvec_state(new_lruvec, NR_SHMEM);
}
if (dirty && mapping_cap_account_dirty(mapping)) {
__dec_node_state(oldzone->zone_pgdat, NR_FILE_DIRTY);
@@ -584,7 +591,7 @@ static void copy_huge_page(struct page *dst, struct page *src)
} else {
/* thp page */
BUG_ON(!PageTransHuge(src));
- nr_pages = hpage_nr_pages(src);
+ nr_pages = thp_nr_pages(src);
}
for (i = 0; i < nr_pages; i++) {
@@ -661,7 +668,8 @@ void migrate_page_states(struct page *newpage, struct page *page)
copy_page_owner(page, newpage);
- mem_cgroup_migrate(page, newpage);
+ if (!PageHuge(page))
+ mem_cgroup_migrate(page, newpage);
}
EXPORT_SYMBOL(migrate_page_states);
@@ -797,11 +805,7 @@ recheck_buffers:
if (rc != MIGRATEPAGE_SUCCESS)
goto unlock_buffers;
- ClearPagePrivate(page);
- set_page_private(newpage, page_private(page));
- set_page_private(page, 0);
- put_page(page);
- get_page(newpage);
+ attach_page_private(newpage, detach_page_private(page));
bh = head;
do {
@@ -810,8 +814,6 @@ recheck_buffers:
} while (bh != head);
- SetPagePrivate(newpage);
-
if (mode != MIGRATE_SYNC_NO_COPY)
migrate_page_copy(newpage, page);
else
@@ -1032,7 +1034,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
* to the LRU. Later, when the IO completes the pages are
* marked uptodate and unlocked. However, the queueing
* could be merging multiple pages for one bio (e.g.
- * mpage_readpages). If an allocation happens for the
+ * mpage_readahead). If an allocation happens for the
* second or third page, the process can end up locking
* the same page twice and deadlocking. Rather than
* trying to be clever about what pages can be locked,
@@ -1160,21 +1162,10 @@ out:
}
/*
- * gcc 4.7 and 4.8 on arm get an ICEs when inlining unmap_and_move(). Work
- * around it.
- */
-#if defined(CONFIG_ARM) && \
- defined(GCC_VERSION) && GCC_VERSION < 40900 && GCC_VERSION >= 40700
-#define ICE_noinline noinline
-#else
-#define ICE_noinline
-#endif
-
-/*
* Obtain the lock on page, remove all ptes and migrate the page
* to the newly allocated page in newpage.
*/
-static ICE_noinline int unmap_and_move(new_page_t get_new_page,
+static int unmap_and_move(new_page_t get_new_page,
free_page_t put_new_page,
unsigned long private, struct page *page,
int force, enum migrate_mode mode,
@@ -1223,7 +1214,7 @@ out:
*/
if (likely(!__PageMovable(page)))
mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
- page_is_file_lru(page), -hpage_nr_pages(page));
+ page_is_file_lru(page), -thp_nr_pages(page));
}
/*
@@ -1428,22 +1419,35 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
enum migrate_mode mode, int reason)
{
int retry = 1;
+ int thp_retry = 1;
int nr_failed = 0;
int nr_succeeded = 0;
+ int nr_thp_succeeded = 0;
+ int nr_thp_failed = 0;
+ int nr_thp_split = 0;
int pass = 0;
+ bool is_thp = false;
struct page *page;
struct page *page2;
int swapwrite = current->flags & PF_SWAPWRITE;
- int rc;
+ int rc, nr_subpages;
if (!swapwrite)
current->flags |= PF_SWAPWRITE;
- for(pass = 0; pass < 10 && retry; pass++) {
+ for (pass = 0; pass < 10 && (retry || thp_retry); pass++) {
retry = 0;
+ thp_retry = 0;
list_for_each_entry_safe(page, page2, from, lru) {
retry:
+ /*
+ * THP statistics are based on the source huge page.
+ * Capture required information that might get lost
+ * during migration.
+ */
+ is_thp = PageTransHuge(page) && !PageHuge(page);
+ nr_subpages = thp_nr_pages(page);
cond_resched();
if (PageHuge(page))
@@ -1468,21 +1472,35 @@ retry:
* we encounter them after the rest of the list
* is processed.
*/
- if (PageTransHuge(page) && !PageHuge(page)) {
+ if (is_thp) {
lock_page(page);
rc = split_huge_page_to_list(page, from);
unlock_page(page);
if (!rc) {
list_safe_reset_next(page, page2, lru);
+ nr_thp_split++;
goto retry;
}
+
+ nr_thp_failed++;
+ nr_failed += nr_subpages;
+ goto out;
}
nr_failed++;
goto out;
case -EAGAIN:
+ if (is_thp) {
+ thp_retry++;
+ break;
+ }
retry++;
break;
case MIGRATEPAGE_SUCCESS:
+ if (is_thp) {
+ nr_thp_succeeded++;
+ nr_succeeded += nr_subpages;
+ break;
+ }
nr_succeeded++;
break;
default:
@@ -1492,19 +1510,27 @@ retry:
* removed from migration page list and not
* retried in the next outer loop.
*/
+ if (is_thp) {
+ nr_thp_failed++;
+ nr_failed += nr_subpages;
+ break;
+ }
nr_failed++;
break;
}
}
}
- nr_failed += retry;
+ nr_failed += retry + thp_retry;
+ nr_thp_failed += thp_retry;
rc = nr_failed;
out:
- if (nr_succeeded)
- count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
- if (nr_failed)
- count_vm_events(PGMIGRATE_FAIL, nr_failed);
- trace_mm_migrate_pages(nr_succeeded, nr_failed, mode, reason);
+ count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
+ count_vm_events(PGMIGRATE_FAIL, nr_failed);
+ count_vm_events(THP_MIGRATION_SUCCESS, nr_thp_succeeded);
+ count_vm_events(THP_MIGRATION_FAIL, nr_thp_failed);
+ count_vm_events(THP_MIGRATION_SPLIT, nr_thp_split);
+ trace_mm_migrate_pages(nr_succeeded, nr_failed, nr_thp_succeeded,
+ nr_thp_failed, nr_thp_split, mode, reason);
if (!swapwrite)
current->flags &= ~PF_SWAPWRITE;
@@ -1512,6 +1538,49 @@ out:
return rc;
}
+struct page *alloc_migration_target(struct page *page, unsigned long private)
+{
+ struct migration_target_control *mtc;
+ gfp_t gfp_mask;
+ unsigned int order = 0;
+ struct page *new_page = NULL;
+ int nid;
+ int zidx;
+
+ mtc = (struct migration_target_control *)private;
+ gfp_mask = mtc->gfp_mask;
+ nid = mtc->nid;
+ if (nid == NUMA_NO_NODE)
+ nid = page_to_nid(page);
+
+ if (PageHuge(page)) {
+ struct hstate *h = page_hstate(compound_head(page));
+
+ gfp_mask = htlb_modify_alloc_mask(h, gfp_mask);
+ return alloc_huge_page_nodemask(h, nid, mtc->nmask, gfp_mask);
+ }
+
+ if (PageTransHuge(page)) {
+ /*
+ * clear __GFP_RECLAIM to make the migration callback
+ * consistent with regular THP allocations.
+ */
+ gfp_mask &= ~__GFP_RECLAIM;
+ gfp_mask |= GFP_TRANSHUGE;
+ order = HPAGE_PMD_ORDER;
+ }
+ zidx = zone_idx(page_zone(page));
+ if (is_highmem_idx(zidx) || zidx == ZONE_MOVABLE)
+ gfp_mask |= __GFP_HIGHMEM;
+
+ new_page = __alloc_pages_nodemask(gfp_mask, order, nid, mtc->nmask);
+
+ if (new_page && PageTransHuge(new_page))
+ prep_transhuge_page(new_page);
+
+ return new_page;
+}
+
#ifdef CONFIG_NUMA
static int store_status(int __user *status, int start, int value, int nr)
@@ -1529,9 +1598,13 @@ static int do_move_pages_to_node(struct mm_struct *mm,
struct list_head *pagelist, int node)
{
int err;
+ struct migration_target_control mtc = {
+ .nid = node,
+ .gfp_mask = GFP_HIGHUSER_MOVABLE | __GFP_THISNODE,
+ };
- err = migrate_pages(pagelist, alloc_new_node_page, NULL, node,
- MIGRATE_SYNC, MR_SYSCALL);
+ err = migrate_pages(pagelist, alloc_migration_target, NULL,
+ (unsigned long)&mtc, MIGRATE_SYNC, MR_SYSCALL);
if (err)
putback_movable_pages(pagelist);
return err;
@@ -1554,7 +1627,7 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
unsigned int follflags;
int err;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
err = -EFAULT;
vma = find_vma(mm, addr);
if (!vma || addr < vma->vm_start || !vma_migratable(vma))
@@ -1597,7 +1670,7 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr,
list_add_tail(&head->lru, pagelist);
mod_node_page_state(page_pgdat(head),
NR_ISOLATED_ANON + page_is_file_lru(head),
- hpage_nr_pages(head));
+ thp_nr_pages(head));
}
out_putpage:
/*
@@ -1607,7 +1680,7 @@ out_putpage:
*/
put_page(page);
out:
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
return err;
}
@@ -1732,7 +1805,7 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages,
{
unsigned long i;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
for (i = 0; i < nr_pages; i++) {
unsigned long addr = (unsigned long)(*pages);
@@ -1759,7 +1832,7 @@ set_status:
status++;
}
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
}
/*
@@ -1961,7 +2034,7 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
page_lru = page_is_file_lru(page);
mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON + page_lru,
- hpage_nr_pages(page));
+ thp_nr_pages(page));
/*
* Isolating the page has taken another reference, so the
@@ -2119,7 +2192,7 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
* pmd before doing set_pmd_at(), nor to flush the TLB after
* set_pmd_at(). Clearing the pmd here would introduce a race
* condition against MADV_DONTNEED, because MADV_DONTNEED only holds the
- * mmap_sem for reading. If the pmd is set to NULL at any given time,
+ * mmap_lock for reading. If the pmd is set to NULL at any given time,
* MADV_DONTNEED won't wait on the pmd lock and it'll skip clearing this
* pmd.
*/
@@ -2178,6 +2251,16 @@ static int migrate_vma_collect_hole(unsigned long start,
struct migrate_vma *migrate = walk->private;
unsigned long addr;
+ /* Only allow populating anonymous memory. */
+ if (!vma_is_anonymous(walk->vma)) {
+ for (addr = start; addr < end; addr += PAGE_SIZE) {
+ migrate->src[migrate->npages] = 0;
+ migrate->dst[migrate->npages] = 0;
+ migrate->npages++;
+ }
+ return 0;
+ }
+
for (addr = start; addr < end; addr += PAGE_SIZE) {
migrate->src[migrate->npages] = MIGRATE_PFN_MIGRATE;
migrate->dst[migrate->npages] = 0;
@@ -2270,8 +2353,10 @@ again:
pte = *ptep;
if (pte_none(pte)) {
- mpfn = MIGRATE_PFN_MIGRATE;
- migrate->cpages++;
+ if (vma_is_anonymous(vma)) {
+ mpfn = MIGRATE_PFN_MIGRATE;
+ migrate->cpages++;
+ }
goto next;
}
@@ -2286,7 +2371,9 @@ again:
goto next;
page = device_private_entry_to_page(entry);
- if (page->pgmap->owner != migrate->src_owner)
+ if (!(migrate->flags &
+ MIGRATE_VMA_SELECT_DEVICE_PRIVATE) ||
+ page->pgmap->owner != migrate->pgmap_owner)
goto next;
mpfn = migrate_pfn(page_to_pfn(page)) |
@@ -2294,7 +2381,7 @@ again:
if (is_write_device_private_entry(entry))
mpfn |= MIGRATE_PFN_WRITE;
} else {
- if (migrate->src_owner)
+ if (!(migrate->flags & MIGRATE_VMA_SELECT_SYSTEM))
goto next;
pfn = pte_pfn(pte);
if (is_zero_pfn(pfn)) {
@@ -2340,10 +2427,17 @@ again:
entry = make_migration_entry(page, mpfn &
MIGRATE_PFN_WRITE);
swp_pte = swp_entry_to_pte(entry);
- if (pte_soft_dirty(pte))
- swp_pte = pte_swp_mksoft_dirty(swp_pte);
- if (pte_uffd_wp(pte))
- swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ if (pte_present(pte)) {
+ if (pte_soft_dirty(pte))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_uffd_wp(pte))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ } else {
+ if (pte_swp_soft_dirty(pte))
+ swp_pte = pte_swp_mksoft_dirty(swp_pte);
+ if (pte_swp_uffd_wp(pte))
+ swp_pte = pte_swp_mkuffd_wp(swp_pte);
+ }
set_pte_at(mm, addr, ptep, swp_pte);
/*
@@ -2389,8 +2483,14 @@ static void migrate_vma_collect(struct migrate_vma *migrate)
{
struct mmu_notifier_range range;
- mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, NULL,
- migrate->vma->vm_mm, migrate->start, migrate->end);
+ /*
+ * Note that the pgmap_owner is passed to the mmu notifier callback so
+ * that the registered device driver can skip invalidating device
+ * private page mappings that won't be migrated.
+ */
+ mmu_notifier_range_init_migrate(&range, 0, migrate->vma,
+ migrate->vma->vm_mm, migrate->start, migrate->end,
+ migrate->pgmap_owner);
mmu_notifier_invalidate_range_start(&range);
walk_page_range(migrate->vma->vm_mm, migrate->start, migrate->end,
@@ -2621,7 +2721,7 @@ restore:
/**
* migrate_vma_setup() - prepare to migrate a range of memory
- * @args: contains the vma, start, and and pfns arrays for the migration
+ * @args: contains the vma, start, and pfns arrays for the migration
*
* Returns: negative errno on failures, 0 when 0 or more pages were migrated
* without an error.
@@ -2674,7 +2774,7 @@ restore:
* have the MIGRATE_PFN_MIGRATE flag set for their src array entry.
*
* It is safe to update device page table after migrate_vma_pages() because
- * both destination and source page are still locked, and the mmap_sem is held
+ * both destination and source page are still locked, and the mmap_lock is held
* in read mode (hence no one can unmap the range being migrated).
*
* Once the caller is done cleaning up things and updating its page table (if it
@@ -2739,7 +2839,6 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
{
struct vm_area_struct *vma = migrate->vma;
struct mm_struct *mm = vma->vm_mm;
- struct mem_cgroup *memcg;
bool flush = false;
spinlock_t *ptl;
pte_t entry;
@@ -2772,10 +2871,10 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
* pte_offset_map() on pmds where a huge pmd might be created
* from a different thread.
*
- * pte_alloc_map() is safe to use under down_write(mmap_sem) or when
+ * pte_alloc_map() is safe to use under mmap_write_lock(mm) or when
* parallel threads are excluded by other means.
*
- * Here we only have down_read(mmap_sem).
+ * Here we only have mmap_read_lock(mm).
*/
if (pte_alloc(mm, pmdp))
goto abort;
@@ -2786,7 +2885,7 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
if (unlikely(anon_vma_prepare(vma)))
goto abort;
- if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL, &memcg, false))
+ if (mem_cgroup_charge(page, vma->vm_mm, GFP_KERNEL))
goto abort;
/*
@@ -2832,9 +2931,8 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
inc_mm_counter(mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, vma, addr, false);
- mem_cgroup_commit_charge(page, memcg, false, false);
if (!is_zone_device_page(page))
- lru_cache_add_active_or_unevictable(page, vma);
+ lru_cache_add_inactive_or_unevictable(page, vma);
get_page(page);
if (flush) {
@@ -2854,7 +2952,6 @@ static void migrate_vma_insert_page(struct migrate_vma *migrate,
unlock_abort:
pte_unmap_unlock(ptep, ptl);
- mem_cgroup_cancel_charge(page, memcg, false);
abort:
*src &= ~MIGRATE_PFN_MIGRATE;
}
diff --git a/mm/mincore.c b/mm/mincore.c
index 0e6dd9948f1a..453ff112470f 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -17,9 +17,9 @@
#include <linux/swapops.h>
#include <linux/shmem_fs.h>
#include <linux/hugetlb.h>
+#include <linux/pgtable.h>
#include <linux/uaccess.h>
-#include <asm/pgtable.h>
static int mincore_hugetlb(pte_t *pte, unsigned long hmask, unsigned long addr,
unsigned long end, struct mm_walk *walk)
@@ -284,9 +284,9 @@ SYSCALL_DEFINE3(mincore, unsigned long, start, size_t, len,
* Do at most PAGE_SIZE entries per iteration, due to
* the temporary buffer size.
*/
- down_read(&current->mm->mmap_sem);
+ mmap_read_lock(current->mm);
retval = do_mincore(start, min(pages, PAGE_SIZE), tmp);
- up_read(&current->mm->mmap_sem);
+ mmap_read_unlock(current->mm);
if (retval <= 0)
break;
diff --git a/mm/mlock.c b/mm/mlock.c
index a72c1eeded77..884b1216da6a 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -49,7 +49,7 @@ EXPORT_SYMBOL(can_do_mlock);
* When lazy mlocking via vmscan, it is important to ensure that the
* vma's VM_LOCKED status is not concurrently being modified, otherwise we
* may have mlocked a page that is being munlocked. So lazy mlock must take
- * the mmap_sem for read, and verify that the vma really is locked
+ * the mmap_lock for read, and verify that the vma really is locked
* (see mm/rmap.c).
*/
@@ -58,12 +58,14 @@ EXPORT_SYMBOL(can_do_mlock);
*/
void clear_page_mlock(struct page *page)
{
+ int nr_pages;
+
if (!TestClearPageMlocked(page))
return;
- mod_zone_page_state(page_zone(page), NR_MLOCK,
- -hpage_nr_pages(page));
- count_vm_event(UNEVICTABLE_PGCLEARED);
+ nr_pages = thp_nr_pages(page);
+ mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
+ count_vm_events(UNEVICTABLE_PGCLEARED, nr_pages);
/*
* The previous TestClearPageMlocked() corresponds to the smp_mb()
* in __pagevec_lru_add_fn().
@@ -77,7 +79,7 @@ void clear_page_mlock(struct page *page)
* We lost the race. the page already moved to evictable list.
*/
if (PageUnevictable(page))
- count_vm_event(UNEVICTABLE_PGSTRANDED);
+ count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages);
}
}
@@ -94,9 +96,10 @@ void mlock_vma_page(struct page *page)
VM_BUG_ON_PAGE(PageCompound(page) && PageDoubleMap(page), page);
if (!TestSetPageMlocked(page)) {
- mod_zone_page_state(page_zone(page), NR_MLOCK,
- hpage_nr_pages(page));
- count_vm_event(UNEVICTABLE_PGMLOCKED);
+ int nr_pages = thp_nr_pages(page);
+
+ mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages);
+ count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
if (!isolate_lru_page(page))
putback_lru_page(page);
}
@@ -139,7 +142,7 @@ static void __munlock_isolated_page(struct page *page)
/* Did try_to_unlock() succeed or punt? */
if (!PageMlocked(page))
- count_vm_event(UNEVICTABLE_PGMUNLOCKED);
+ count_vm_events(UNEVICTABLE_PGMUNLOCKED, thp_nr_pages(page));
putback_lru_page(page);
}
@@ -155,10 +158,12 @@ static void __munlock_isolated_page(struct page *page)
*/
static void __munlock_isolation_failed(struct page *page)
{
+ int nr_pages = thp_nr_pages(page);
+
if (PageUnevictable(page))
- __count_vm_event(UNEVICTABLE_PGSTRANDED);
+ __count_vm_events(UNEVICTABLE_PGSTRANDED, nr_pages);
else
- __count_vm_event(UNEVICTABLE_PGMUNLOCKED);
+ __count_vm_events(UNEVICTABLE_PGMUNLOCKED, nr_pages);
}
/**
@@ -192,7 +197,7 @@ unsigned int munlock_vma_page(struct page *page)
/*
* Serialize with any parallel __split_huge_page_refcount() which
* might otherwise copy PageMlocked to part of the tail pages before
- * we clear it in the head page. It also stabilizes hpage_nr_pages().
+ * we clear it in the head page. It also stabilizes thp_nr_pages().
*/
spin_lock_irq(&pgdat->lru_lock);
@@ -202,7 +207,7 @@ unsigned int munlock_vma_page(struct page *page)
goto unlock_out;
}
- nr_pages = hpage_nr_pages(page);
+ nr_pages = thp_nr_pages(page);
__mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
if (__munlock_isolate_lru_page(page, true)) {
@@ -381,7 +386,7 @@ static unsigned long __munlock_pagevec_fill(struct pagevec *pvec,
/*
* Initialize pte walk starting at the already pinned page where we
* are sure that there is a pte, as it was pinned under the same
- * mmap_sem write op.
+ * mmap_lock write op.
*/
pte = get_locked_pte(vma->vm_mm, start, &ptl);
/* Make sure we do not cross the page table boundary */
@@ -565,7 +570,7 @@ success:
mm->locked_vm += nr_pages;
/*
- * vm_flags is protected by the mmap_sem held in write mode.
+ * vm_flags is protected by the mmap_lock held in write mode.
* It's okay if try_to_unmap_one unmaps a page just after we
* set VM_LOCKED, populate_vma_page_range will bring it back.
*/
@@ -686,7 +691,7 @@ static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t fla
lock_limit >>= PAGE_SHIFT;
locked = len >> PAGE_SHIFT;
- if (down_write_killable(&current->mm->mmap_sem))
+ if (mmap_write_lock_killable(current->mm))
return -EINTR;
locked += current->mm->locked_vm;
@@ -705,7 +710,7 @@ static __must_check int do_mlock(unsigned long start, size_t len, vm_flags_t fla
if ((locked <= lock_limit) || capable(CAP_IPC_LOCK))
error = apply_vma_lock_flags(start, len, flags);
- up_write(&current->mm->mmap_sem);
+ mmap_write_unlock(current->mm);
if (error)
return error;
@@ -742,10 +747,10 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len)
len = PAGE_ALIGN(len + (offset_in_page(start)));
start &= PAGE_MASK;
- if (down_write_killable(&current->mm->mmap_sem))
+ if (mmap_write_lock_killable(current->mm))
return -EINTR;
ret = apply_vma_lock_flags(start, len, 0);
- up_write(&current->mm->mmap_sem);
+ mmap_write_unlock(current->mm);
return ret;
}
@@ -811,14 +816,14 @@ SYSCALL_DEFINE1(mlockall, int, flags)
lock_limit = rlimit(RLIMIT_MEMLOCK);
lock_limit >>= PAGE_SHIFT;
- if (down_write_killable(&current->mm->mmap_sem))
+ if (mmap_write_lock_killable(current->mm))
return -EINTR;
ret = -ENOMEM;
if (!(flags & MCL_CURRENT) || (current->mm->total_vm <= lock_limit) ||
capable(CAP_IPC_LOCK))
ret = apply_mlockall_flags(flags);
- up_write(&current->mm->mmap_sem);
+ mmap_write_unlock(current->mm);
if (!ret && (flags & MCL_CURRENT))
mm_populate(0, TASK_SIZE);
@@ -829,10 +834,10 @@ SYSCALL_DEFINE0(munlockall)
{
int ret;
- if (down_write_killable(&current->mm->mmap_sem))
+ if (mmap_write_lock_killable(current->mm))
return -EINTR;
ret = apply_mlockall_flags(0);
- up_write(&current->mm->mmap_sem);
+ mmap_write_unlock(current->mm);
return ret;
}
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 7da6991d9435..b06a30fbedff 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -13,6 +13,7 @@
#include <linux/memory.h>
#include <linux/notifier.h>
#include <linux/sched.h>
+#include <linux/mman.h>
#include "internal.h"
#ifdef CONFIG_DEBUG_MEMORY_INIT
@@ -67,26 +68,30 @@ void __init mminit_verify_pageflags_layout(void)
unsigned long or_mask, add_mask;
shift = 8 * sizeof(unsigned long);
- width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH - LAST_CPUPID_SHIFT;
+ width = shift - SECTIONS_WIDTH - NODES_WIDTH - ZONES_WIDTH
+ - LAST_CPUPID_SHIFT - KASAN_TAG_WIDTH;
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_widths",
- "Section %d Node %d Zone %d Lastcpupid %d Flags %d\n",
+ "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d Flags %d\n",
SECTIONS_WIDTH,
NODES_WIDTH,
ZONES_WIDTH,
LAST_CPUPID_WIDTH,
+ KASAN_TAG_WIDTH,
NR_PAGEFLAGS);
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_shifts",
- "Section %d Node %d Zone %d Lastcpupid %d\n",
+ "Section %d Node %d Zone %d Lastcpupid %d Kasantag %d\n",
SECTIONS_SHIFT,
NODES_SHIFT,
ZONES_SHIFT,
- LAST_CPUPID_SHIFT);
+ LAST_CPUPID_SHIFT,
+ KASAN_TAG_WIDTH);
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_pgshifts",
- "Section %lu Node %lu Zone %lu Lastcpupid %lu\n",
+ "Section %lu Node %lu Zone %lu Lastcpupid %lu Kasantag %lu\n",
(unsigned long)SECTIONS_PGSHIFT,
(unsigned long)NODES_PGSHIFT,
(unsigned long)ZONES_PGSHIFT,
- (unsigned long)LAST_CPUPID_PGSHIFT);
+ (unsigned long)LAST_CPUPID_PGSHIFT,
+ (unsigned long)KASAN_TAG_PGSHIFT);
mminit_dprintk(MMINIT_TRACE, "pageflags_layout_nodezoneid",
"Node/Zone ID: %lu -> %lu\n",
(unsigned long)(ZONEID_PGOFF + ZONEID_SHIFT),
@@ -140,14 +145,23 @@ EXPORT_SYMBOL_GPL(mm_kobj);
#ifdef CONFIG_SMP
s32 vm_committed_as_batch = 32;
-static void __meminit mm_compute_batch(void)
+void mm_compute_batch(int overcommit_policy)
{
u64 memsized_batch;
s32 nr = num_present_cpus();
s32 batch = max_t(s32, nr*2, 32);
-
- /* batch size set to 0.4% of (total memory/#cpus), or max int32 */
- memsized_batch = min_t(u64, (totalram_pages()/nr)/256, 0x7fffffff);
+ unsigned long ram_pages = totalram_pages();
+
+ /*
+ * For policy OVERCOMMIT_NEVER, set batch size to 0.4% of
+ * (total memory/#cpus), and lift it to 25% for other policies
+ * to ease the possible lock contention for percpu_counter
+ * vm_committed_as, while the max limit is INT_MAX
+ */
+ if (overcommit_policy == OVERCOMMIT_NEVER)
+ memsized_batch = min_t(u64, ram_pages/nr/256, INT_MAX);
+ else
+ memsized_batch = min_t(u64, ram_pages/nr/4, INT_MAX);
vm_committed_as_batch = max_t(s32, memsized_batch, batch);
}
@@ -158,7 +172,7 @@ static int __meminit mm_compute_batch_notifier(struct notifier_block *self,
switch (action) {
case MEM_ONLINE:
case MEM_OFFLINE:
- mm_compute_batch();
+ mm_compute_batch(sysctl_overcommit_memory);
default:
break;
}
@@ -172,7 +186,7 @@ static struct notifier_block compute_batch_nb __meminitdata = {
static int __init mm_compute_batch_init(void)
{
- mm_compute_batch();
+ mm_compute_batch(sysctl_overcommit_memory);
register_hotmemory_notifier(&compute_batch_nb);
return 0;
diff --git a/mm/mmap.c b/mm/mmap.c
index f609e9ec4a25..bdd19f5b994e 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -132,7 +132,7 @@ void vma_set_page_prot(struct vm_area_struct *vma)
vm_flags &= ~VM_SHARED;
vm_page_prot = vm_pgprot_modify(vm_page_prot, vm_flags);
}
- /* remove_protection_ptes reads vma->vm_page_prot without mmap_sem */
+ /* remove_protection_ptes reads vma->vm_page_prot without mmap_lock */
WRITE_ONCE(vma->vm_page_prot, vm_page_prot);
}
@@ -198,7 +198,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
bool downgraded = false;
LIST_HEAD(uf);
- if (down_write_killable(&mm->mmap_sem))
+ if (mmap_write_lock_killable(mm))
return -EINTR;
origbrk = mm->brk;
@@ -238,14 +238,14 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
/*
* Always allow shrinking brk.
- * __do_munmap() may downgrade mmap_sem to read.
+ * __do_munmap() may downgrade mmap_lock to read.
*/
if (brk <= mm->brk) {
int ret;
/*
- * mm->brk must to be protected by write mmap_sem so update it
- * before downgrading mmap_sem. When __do_munmap() fails,
+ * mm->brk must be protected by write mmap_lock so update it
+ * before downgrading mmap_lock. When __do_munmap() fails,
* mm->brk will be restored from origbrk.
*/
mm->brk = brk;
@@ -272,9 +272,9 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
success:
populate = newbrk > oldbrk && (mm->def_flags & VM_LOCKED) != 0;
if (downgraded)
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
else
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
userfaultfd_unmap_complete(mm, &uf);
if (populate)
mm_populate(oldbrk, newbrk - oldbrk);
@@ -282,7 +282,7 @@ success:
out:
retval = origbrk;
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
return retval;
}
@@ -505,7 +505,7 @@ static __always_inline void vma_rb_erase(struct vm_area_struct *vma,
* After the update, the vma will be reinserted using
* anon_vma_interval_tree_post_update_vma().
*
- * The entire update must be protected by exclusive mmap_sem and by
+ * The entire update must be protected by exclusive mmap_lock and by
* the root anon_vma's mutex.
*/
static inline void
@@ -1030,7 +1030,7 @@ static inline int is_mergeable_anon_vma(struct anon_vma *anon_vma1,
* anon_vmas, nor if same anon_vma is assigned but offsets incompatible.
*
* We don't check here for the merged mmap wrapping around the end of pagecache
- * indices (16TB on ia32) because do_mmap_pgoff() does not permit mmap's which
+ * indices (16TB on ia32) because do_mmap() does not permit mmap's which
* wrap, nor mmaps which cover the final page at index -1UL.
*/
static int
@@ -1207,7 +1207,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
}
/*
- * Rough compatbility check to quickly see if it's even worth looking
+ * Rough compatibility check to quickly see if it's even worth looking
* at sharing an anon_vma.
*
* They need to have the same vm_file, and the flags can only differ
@@ -1361,15 +1361,15 @@ static inline bool file_mmap_ok(struct file *file, struct inode *inode,
}
/*
- * The caller must hold down_write(&current->mm->mmap_sem).
+ * The caller must write-lock current->mm->mmap_lock.
*/
unsigned long do_mmap(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot,
- unsigned long flags, vm_flags_t vm_flags,
- unsigned long pgoff, unsigned long *populate,
- struct list_head *uf)
+ unsigned long flags, unsigned long pgoff,
+ unsigned long *populate, struct list_head *uf)
{
struct mm_struct *mm = current->mm;
+ vm_flags_t vm_flags;
int pkey = 0;
*populate = 0;
@@ -1431,7 +1431,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
* to. we assume access permissions have been handled by the open
* of the memory object, so we don't do any here.
*/
- vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
+ vm_flags = calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) |
mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC;
if (flags & MAP_LOCKED)
@@ -1562,11 +1562,12 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
file = fget(fd);
if (!file)
return -EBADF;
- if (is_file_hugepages(file))
+ if (is_file_hugepages(file)) {
len = ALIGN(len, huge_page_size(hstate_file(file)));
- retval = -EINVAL;
- if (unlikely(flags & MAP_HUGETLB && !is_file_hugepages(file)))
+ } else if (unlikely(flags & MAP_HUGETLB)) {
+ retval = -EINVAL;
goto out_fput;
+ }
} else if (flags & MAP_HUGETLB) {
struct user_struct *user = NULL;
struct hstate *hs;
@@ -1689,7 +1690,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
struct list_head *uf)
{
struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma, *prev;
+ struct vm_area_struct *vma, *prev, *merge;
int error;
struct rb_node **rb_link, *rb_parent;
unsigned long charged = 0;
@@ -1773,6 +1774,29 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
if (error)
goto unmap_and_free_vma;
+ /* If vm_flags changed after call_mmap(), we should try to merge the vma again
+ * as we may succeed this time.
+ */
+ if (unlikely(vm_flags != vma->vm_flags && prev)) {
+ merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags,
+ NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX);
+ if (merge) {
+ /* ->mmap() can change vma->vm_file and fput the original file. So
+ * fput the vma->vm_file here or we would add an extra fput for file
+ * and cause general protection fault ultimately.
+ */
+ fput(vma->vm_file);
+ vm_area_free(vma);
+ vma = merge;
+ /* Update vm_flags and possible addr to pick up the change. We don't
+ * warn here if addr changed as the vma is not linked by vma_link().
+ */
+ addr = vma->vm_start;
+ vm_flags = vma->vm_flags;
+ goto unmap_writable;
+ }
+ }
+
/* Can addr have changed??
*
* Answer: Yes, several device drivers can do it in their
@@ -1795,6 +1819,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
vma_link(mm, vma, prev, rb_link, rb_parent);
/* Once vma denies write, undo our temporary denial count */
if (file) {
+unmap_writable:
if (vm_flags & VM_SHARED)
mapping_unmap_writable(file->f_mapping);
if (vm_flags & VM_DENYWRITE)
@@ -2209,7 +2234,7 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
/*
* mmap_region() will call shmem_zero_setup() to create a file,
* so use shmem's get_unmapped_area in case it can be huge.
- * do_mmap_pgoff() will clear pgoff, so match alignment.
+ * do_mmap() will clear pgoff, so match alignment.
*/
pgoff = 0;
get_area = shmem_get_unmapped_area;
@@ -2371,7 +2396,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
/*
* vma->vm_start/vm_end cannot change under us because the caller
- * is required to hold the mmap_sem in read mode. We need the
+ * is required to hold the mmap_lock in read mode. We need the
* anon_vma lock to serialize against concurrent expand_stacks.
*/
anon_vma_lock_write(vma->anon_vma);
@@ -2389,7 +2414,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address)
if (!error) {
/*
* vma_gap_update() doesn't support concurrent
- * updates, but we only hold a shared mmap_sem
+ * updates, but we only hold a shared mmap_lock
* lock here, so we need to protect against
* concurrent vma expansions.
* anon_vma_lock_write() doesn't help here, as
@@ -2451,7 +2476,7 @@ int expand_downwards(struct vm_area_struct *vma,
/*
* vma->vm_start/vm_end cannot change under us because the caller
- * is required to hold the mmap_sem in read mode. We need the
+ * is required to hold the mmap_lock in read mode. We need the
* anon_vma lock to serialize against concurrent expand_stacks.
*/
anon_vma_lock_write(vma->anon_vma);
@@ -2469,7 +2494,7 @@ int expand_downwards(struct vm_area_struct *vma,
if (!error) {
/*
* vma_gap_update() doesn't support concurrent
- * updates, but we only hold a shared mmap_sem
+ * updates, but we only hold a shared mmap_lock
* lock here, so we need to protect against
* concurrent vma expansions.
* anon_vma_lock_write() doesn't help here, as
@@ -2620,7 +2645,7 @@ static void unmap_region(struct mm_struct *mm,
* Create a list of vma's touched by the unmap, removing them from the mm's
* vma list as we go..
*/
-static void
+static bool
detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *prev, unsigned long end)
{
@@ -2645,6 +2670,17 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct vm_area_struct *vma,
/* Kill the cache */
vmacache_invalidate(mm);
+
+ /*
+ * Do not downgrade mmap_lock if we are next to VM_GROWSDOWN or
+ * VM_GROWSUP VMA. Such VMAs can change their size under
+ * down_read(mmap_lock) and collide with the VMA we are about to unmap.
+ */
+ if (vma && (vma->vm_flags & VM_GROWSDOWN))
+ return false;
+ if (prev && (prev->vm_flags & VM_GROWSUP))
+ return false;
+ return true;
}
/*
@@ -2825,10 +2861,11 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len,
}
/* Detach vmas from rbtree */
- detach_vmas_to_be_unmapped(mm, vma, prev, end);
+ if (!detach_vmas_to_be_unmapped(mm, vma, prev, end))
+ downgrade = false;
if (downgrade)
- downgrade_write(&mm->mmap_sem);
+ mmap_write_downgrade(mm);
unmap_region(mm, vma, prev, start, end);
@@ -2850,20 +2887,20 @@ static int __vm_munmap(unsigned long start, size_t len, bool downgrade)
struct mm_struct *mm = current->mm;
LIST_HEAD(uf);
- if (down_write_killable(&mm->mmap_sem))
+ if (mmap_write_lock_killable(mm))
return -EINTR;
ret = __do_munmap(mm, start, len, &uf, downgrade);
/*
- * Returning 1 indicates mmap_sem is downgraded.
+ * Returning 1 indicates mmap_lock is downgraded.
* But 1 is not legal return value of vm_munmap() and munmap(), reset
* it to 0 before return.
*/
if (ret == 1) {
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
ret = 0;
} else
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
userfaultfd_unmap_complete(mm, &uf);
return ret;
@@ -2911,7 +2948,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
if (pgoff + (size >> PAGE_SHIFT) < pgoff)
return ret;
- if (down_write_killable(&mm->mmap_sem))
+ if (mmap_write_lock_killable(mm))
return -EINTR;
vma = find_vma(mm, start);
@@ -2970,11 +3007,11 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
}
file = get_file(vma->vm_file);
- ret = do_mmap_pgoff(vma->vm_file, start, size,
+ ret = do_mmap(vma->vm_file, start, size,
prot, flags, pgoff, &populate, NULL);
fput(file);
out:
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
if (populate)
mm_populate(ret, populate);
if (!IS_ERR_VALUE(ret))
@@ -3074,12 +3111,12 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags)
if (!len)
return 0;
- if (down_write_killable(&mm->mmap_sem))
+ if (mmap_write_lock_killable(mm))
return -EINTR;
ret = do_brk_flags(addr, len, flags, &uf);
populate = ((mm->def_flags & VM_LOCKED) != 0);
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
userfaultfd_unmap_complete(mm, &uf);
if (populate && !ret)
mm_populate(addr, len);
@@ -3107,12 +3144,12 @@ void exit_mmap(struct mm_struct *mm)
/*
* Manually reap the mm to free as much memory as possible.
* Then, as the oom reaper does, set MMF_OOM_SKIP to disregard
- * this mm from further consideration. Taking mm->mmap_sem for
+ * this mm from further consideration. Taking mm->mmap_lock for
* write after setting MMF_OOM_SKIP will guarantee that the oom
- * reaper will not run on this mm again after mmap_sem is
+ * reaper will not run on this mm again after mmap_lock is
* dropped.
*
- * Nothing can be holding mm->mmap_sem here and the above call
+ * Nothing can be holding mm->mmap_lock here and the above call
* to mmu_notifier_release(mm) ensures mmu notifier callbacks in
* __oom_reap_task_mm() will not block.
*
@@ -3123,8 +3160,8 @@ void exit_mmap(struct mm_struct *mm)
(void)__oom_reap_task_mm(mm);
set_bit(MMF_OOM_SKIP, &mm->flags);
- down_write(&mm->mmap_sem);
- up_write(&mm->mmap_sem);
+ mmap_write_lock(mm);
+ mmap_write_unlock(mm);
}
if (mm->locked_vm) {
@@ -3159,6 +3196,7 @@ void exit_mmap(struct mm_struct *mm)
if (vma->vm_flags & VM_ACCOUNT)
nr_accounted += vma_pages(vma);
vma = remove_vma(vma);
+ cond_resched();
}
vm_unacct_memory(nr_accounted);
}
@@ -3189,7 +3227,7 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma)
* By setting it to reflect the virtual start address of the
* vma, merges and splits can happen in a seamless way, just
* using the existing file pgoff checks and manipulations.
- * Similarly in do_mmap_pgoff and in do_brk.
+ * Similarly in do_mmap and in do_brk.
*/
if (vma_is_anonymous(vma)) {
BUG_ON(vma->anon_vma);
@@ -3437,7 +3475,7 @@ bool vma_is_special_mapping(const struct vm_area_struct *vma,
}
/*
- * Called with mm->mmap_sem held for writing.
+ * Called with mm->mmap_lock held for writing.
* Insert a new vma covering the given region, with the given flags.
* Its pages are supplied by the given array of struct page *.
* The array can be shorter than len >> PAGE_SHIFT if it's null-terminated.
@@ -3474,7 +3512,7 @@ static void vm_lock_anon_vma(struct mm_struct *mm, struct anon_vma *anon_vma)
* The LSB of head.next can't change from under us
* because we hold the mm_all_locks_mutex.
*/
- down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_sem);
+ down_write_nest_lock(&anon_vma->root->rwsem, &mm->mmap_lock);
/*
* We can safely modify head.next after taking the
* anon_vma->root->rwsem. If some other vma in this mm shares
@@ -3504,7 +3542,7 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
*/
if (test_and_set_bit(AS_MM_ALL_LOCKS, &mapping->flags))
BUG();
- down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_sem);
+ down_write_nest_lock(&mapping->i_mmap_rwsem, &mm->mmap_lock);
}
}
@@ -3513,11 +3551,11 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
* operations that could ever happen on a certain mm. This includes
* vmtruncate, try_to_unmap, and all page faults.
*
- * The caller must take the mmap_sem in write mode before calling
+ * The caller must take the mmap_lock in write mode before calling
* mm_take_all_locks(). The caller isn't allowed to release the
- * mmap_sem until mm_drop_all_locks() returns.
+ * mmap_lock until mm_drop_all_locks() returns.
*
- * mmap_sem in write mode is required in order to block all operations
+ * mmap_lock in write mode is required in order to block all operations
* that could modify pagetables and free pages without need of
* altering the vma layout. It's also needed in write mode to avoid new
* anon_vmas to be associated with existing vmas.
@@ -3550,7 +3588,7 @@ int mm_take_all_locks(struct mm_struct *mm)
struct vm_area_struct *vma;
struct anon_vma_chain *avc;
- BUG_ON(down_read_trylock(&mm->mmap_sem));
+ BUG_ON(mmap_read_trylock(mm));
mutex_lock(&mm_all_locks_mutex);
@@ -3622,7 +3660,7 @@ static void vm_unlock_mapping(struct address_space *mapping)
}
/*
- * The mmap_sem cannot be released by the caller until
+ * The mmap_lock cannot be released by the caller until
* mm_drop_all_locks() returns.
*/
void mm_drop_all_locks(struct mm_struct *mm)
@@ -3630,7 +3668,7 @@ void mm_drop_all_locks(struct mm_struct *mm)
struct vm_area_struct *vma;
struct anon_vma_chain *avc;
- BUG_ON(down_read_trylock(&mm->mmap_sem));
+ BUG_ON(mmap_read_trylock(mm));
BUG_ON(!mutex_is_locked(&mm_all_locks_mutex));
for (vma = mm->mmap; vma; vma = vma->vm_next) {
diff --git a/mm/mmu_context.c b/mm/mmu_context.c
deleted file mode 100644
index 3e612ae748e9..000000000000
--- a/mm/mmu_context.c
+++ /dev/null
@@ -1,64 +0,0 @@
-/* Copyright (C) 2009 Red Hat, Inc.
- *
- * See ../COPYING for licensing terms.
- */
-
-#include <linux/mm.h>
-#include <linux/sched.h>
-#include <linux/sched/mm.h>
-#include <linux/sched/task.h>
-#include <linux/mmu_context.h>
-#include <linux/export.h>
-
-#include <asm/mmu_context.h>
-
-/*
- * use_mm
- * Makes the calling kernel thread take on the specified
- * mm context.
- * (Note: this routine is intended to be called only
- * from a kernel thread context)
- */
-void use_mm(struct mm_struct *mm)
-{
- struct mm_struct *active_mm;
- struct task_struct *tsk = current;
-
- task_lock(tsk);
- active_mm = tsk->active_mm;
- if (active_mm != mm) {
- mmgrab(mm);
- tsk->active_mm = mm;
- }
- tsk->mm = mm;
- switch_mm(active_mm, mm, tsk);
- task_unlock(tsk);
-#ifdef finish_arch_post_lock_switch
- finish_arch_post_lock_switch();
-#endif
-
- if (active_mm != mm)
- mmdrop(active_mm);
-}
-EXPORT_SYMBOL_GPL(use_mm);
-
-/*
- * unuse_mm
- * Reverses the effect of use_mm, i.e. releases the
- * specified mm context which was earlier taken on
- * by the calling kernel thread
- * (Note: this routine is intended to be called only
- * from a kernel thread context)
- */
-void unuse_mm(struct mm_struct *mm)
-{
- struct task_struct *tsk = current;
-
- task_lock(tsk);
- sync_mm_rss(mm);
- tsk->mm = NULL;
- /* active_mm is still 'mm' */
- enter_lazy_tlb(mm, tsk);
- task_unlock(tsk);
-}
-EXPORT_SYMBOL_GPL(unuse_mm);
diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c
index a3538cb2bcbe..03c33c93a582 100644
--- a/mm/mmu_gather.c
+++ b/mm/mmu_gather.c
@@ -301,7 +301,7 @@ void tlb_finish_mmu(struct mmu_gather *tlb,
{
/*
* If there are parallel threads are doing PTE changes on same range
- * under non-exclusive lock (e.g., mmap_sem read-side) but defer TLB
+ * under non-exclusive lock (e.g., mmap_lock read-side) but defer TLB
* flush by batching, one thread may end up seeing inconsistent PTEs
* and result in having stale TLB entries. So flush TLB forcefully
* if we detect parallel PTE batching threads.
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 06852b896fa6..4fc918163dd3 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -166,7 +166,7 @@ static void mn_itree_inv_end(struct mmu_notifier_subscriptions *subscriptions)
/**
* mmu_interval_read_begin - Begin a read side critical section against a VA
* range
- * interval_sub: The interval subscription
+ * @interval_sub: The interval subscription
*
* mmu_iterval_read_begin()/mmu_iterval_read_retry() implement a
* collision-retry scheme similar to seqcount for the VA range under
@@ -599,7 +599,7 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm,
}
/*
- * Same as mmu_notifier_register but here the caller must hold the mmap_sem in
+ * Same as mmu_notifier_register but here the caller must hold the mmap_lock in
* write mode. A NULL mn signals the notifier is being registered for itree
* mode.
*/
@@ -609,7 +609,7 @@ int __mmu_notifier_register(struct mmu_notifier *subscription,
struct mmu_notifier_subscriptions *subscriptions = NULL;
int ret;
- lockdep_assert_held_write(&mm->mmap_sem);
+ mmap_assert_write_locked(mm);
BUG_ON(atomic_read(&mm->mm_users) <= 0);
if (IS_ENABLED(CONFIG_LOCKDEP)) {
@@ -623,7 +623,7 @@ int __mmu_notifier_register(struct mmu_notifier *subscription,
/*
* kmalloc cannot be called under mm_take_all_locks(), but we
* know that mm->notifier_subscriptions can't change while we
- * hold the write side of the mmap_sem.
+ * hold the write side of the mmap_lock.
*/
subscriptions = kzalloc(
sizeof(struct mmu_notifier_subscriptions), GFP_KERNEL);
@@ -655,7 +655,7 @@ int __mmu_notifier_register(struct mmu_notifier *subscription,
* readers. acquire can only be used while holding the mmgrab or
* mmget, and is safe because once created the
* mmu_notifier_subscriptions is not freed until the mm is destroyed.
- * As above, users holding the mmap_sem or one of the
+ * As above, users holding the mmap_lock or one of the
* mm_take_all_locks() do not need to use acquire semantics.
*/
if (subscriptions)
@@ -686,10 +686,10 @@ EXPORT_SYMBOL_GPL(__mmu_notifier_register);
/**
* mmu_notifier_register - Register a notifier on a mm
- * @mn: The notifier to attach
+ * @subscription: The notifier to attach
* @mm: The mm to attach the notifier to
*
- * Must not hold mmap_sem nor any other VM related lock when calling
+ * Must not hold mmap_lock nor any other VM related lock when calling
* this registration function. Must also ensure mm_users can't go down
* to zero while this runs to avoid races with mmu_notifier_release,
* so mm has to be current->mm or the mm should be pinned safely such
@@ -708,9 +708,9 @@ int mmu_notifier_register(struct mmu_notifier *subscription,
{
int ret;
- down_write(&mm->mmap_sem);
+ mmap_write_lock(mm);
ret = __mmu_notifier_register(subscription, mm);
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
return ret;
}
EXPORT_SYMBOL_GPL(mmu_notifier_register);
@@ -750,7 +750,7 @@ find_get_mmu_notifier(struct mm_struct *mm, const struct mmu_notifier_ops *ops)
* are the same.
*
* Each call to mmu_notifier_get() must be paired with a call to
- * mmu_notifier_put(). The caller must hold the write side of mm->mmap_sem.
+ * mmu_notifier_put(). The caller must hold the write side of mm->mmap_lock.
*
* While the caller has a mmu_notifier get the mm pointer will remain valid,
* and can be converted to an active mm pointer via mmget_not_zero().
@@ -761,7 +761,7 @@ struct mmu_notifier *mmu_notifier_get_locked(const struct mmu_notifier_ops *ops,
struct mmu_notifier *subscription;
int ret;
- lockdep_assert_held_write(&mm->mmap_sem);
+ mmap_assert_write_locked(mm);
if (mm->notifier_subscriptions) {
subscription = find_get_mmu_notifier(mm, ops);
@@ -856,7 +856,7 @@ static void mmu_notifier_free_rcu(struct rcu_head *rcu)
/**
* mmu_notifier_put - Release the reference on the notifier
- * @mn: The notifier to act on
+ * @subscription: The notifier to act on
*
* This function must be paired with each mmu_notifier_get(), it releases the
* reference obtained by the get. If this is the last reference then process
@@ -965,7 +965,8 @@ static int __mmu_interval_notifier_insert(
* @interval_sub: Interval subscription to register
* @start: Starting virtual address to monitor
* @length: Length of the range to monitor
- * @mm : mm_struct to attach to
+ * @mm: mm_struct to attach to
+ * @ops: Interval notifier operations to be called on matching events
*
* This function subscribes the interval notifier for notifications from the
* mm. Upon return the ops related to mmu_interval_notifier will be called
@@ -983,7 +984,7 @@ int mmu_interval_notifier_insert(struct mmu_interval_notifier *interval_sub,
struct mmu_notifier_subscriptions *subscriptions;
int ret;
- might_lock(&mm->mmap_sem);
+ might_lock(&mm->mmap_lock);
subscriptions = smp_load_acquire(&mm->notifier_subscriptions);
if (!subscriptions || !subscriptions->has_itree) {
@@ -1006,7 +1007,7 @@ int mmu_interval_notifier_insert_locked(
mm->notifier_subscriptions;
int ret;
- lockdep_assert_held_write(&mm->mmap_sem);
+ mmap_assert_write_locked(mm);
if (!subscriptions || !subscriptions->has_itree) {
ret = __mmu_notifier_register(NULL, mm);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 494192ca954b..ce8b8a5eacbb 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -28,7 +28,7 @@
#include <linux/ksm.h>
#include <linux/uaccess.h>
#include <linux/mm_inline.h>
-#include <asm/pgtable.h>
+#include <linux/pgtable.h>
#include <asm/cacheflush.h>
#include <asm/mmu_context.h>
#include <asm/tlbflush.h>
@@ -49,7 +49,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
bool uffd_wp_resolve = cp_flags & MM_CP_UFFD_WP_RESOLVE;
/*
- * Can be called with only the mmap_sem for reading by
+ * Can be called with only the mmap_lock for reading by
* prot_numa so we must check the pmd isn't constantly
* changing from under us from pmd_none to pmd_trans_huge
* and/or the other way around.
@@ -59,7 +59,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
/*
* The pmd points to a regular pte so the pmd can't change
- * from under us even if the mmap_sem is only hold for
+ * from under us even if the mmap_lock is only held for
* reading.
*/
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
@@ -228,7 +228,7 @@ static inline unsigned long change_pmd_range(struct vm_area_struct *vma,
next = pmd_addr_end(addr, end);
/*
- * Automatic NUMA balancing walks the tables with mmap_sem
+ * Automatic NUMA balancing walks the tables with mmap_lock
* held for read. It's possible a parallel update to occur
* between pmd_trans_huge() and a pmd_none_or_clear_bad()
* check leading to a false positive and clearing.
@@ -477,7 +477,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
success:
/*
- * vm_flags and vm_page_prot are protected by the mmap_sem
+ * vm_flags and vm_page_prot are protected by the mmap_lock
* held in write mode.
*/
vma->vm_flags = newflags;
@@ -538,7 +538,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
reqprot = prot;
- if (down_write_killable(&current->mm->mmap_sem))
+ if (mmap_write_lock_killable(current->mm))
return -EINTR;
/*
@@ -628,7 +628,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
prot = reqprot;
}
out:
- up_write(&current->mm->mmap_sem);
+ mmap_write_unlock(current->mm);
return error;
}
@@ -658,7 +658,7 @@ SYSCALL_DEFINE2(pkey_alloc, unsigned long, flags, unsigned long, init_val)
if (init_val & ~PKEY_ACCESS_MASK)
return -EINVAL;
- down_write(&current->mm->mmap_sem);
+ mmap_write_lock(current->mm);
pkey = mm_pkey_alloc(current->mm);
ret = -ENOSPC;
@@ -672,7 +672,7 @@ SYSCALL_DEFINE2(pkey_alloc, unsigned long, flags, unsigned long, init_val)
}
ret = pkey;
out:
- up_write(&current->mm->mmap_sem);
+ mmap_write_unlock(current->mm);
return ret;
}
@@ -680,9 +680,9 @@ SYSCALL_DEFINE1(pkey_free, int, pkey)
{
int ret;
- down_write(&current->mm->mmap_sem);
+ mmap_write_lock(current->mm);
ret = mm_pkey_free(current->mm, pkey);
- up_write(&current->mm->mmap_sem);
+ mmap_write_unlock(current->mm);
/*
* We could provie warnings or errors if any VMA still
diff --git a/mm/mremap.c b/mm/mremap.c
index 6aa6ea605068..138abbae4f75 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -146,7 +146,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
/*
* We don't have to worry about the ordering of src and dst
- * pte locks because exclusive mmap_sem prevents deadlock.
+ * pte locks because exclusive mmap_lock prevents deadlock.
*/
old_pte = pte_offset_map_lock(mm, old_pmd, old_addr, &old_ptl);
new_pte = pte_offset_map(new_pmd, new_addr);
@@ -193,27 +193,41 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
#ifdef CONFIG_HAVE_MOVE_PMD
static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
- unsigned long new_addr, unsigned long old_end,
- pmd_t *old_pmd, pmd_t *new_pmd)
+ unsigned long new_addr, pmd_t *old_pmd, pmd_t *new_pmd)
{
spinlock_t *old_ptl, *new_ptl;
struct mm_struct *mm = vma->vm_mm;
pmd_t pmd;
- if ((old_addr & ~PMD_MASK) || (new_addr & ~PMD_MASK)
- || old_end - old_addr < PMD_SIZE)
- return false;
-
/*
* The destination pmd shouldn't be established, free_pgtables()
- * should have release it.
+ * should have released it.
+ *
+ * However, there's a case during execve() where we use mremap
+ * to move the initial stack, and in that case the target area
+ * may overlap the source area (always moving down).
+ *
+ * If everything is PMD-aligned, that works fine, as moving
+ * each pmd down will clear the source pmd. But if we first
+ * have a few 4kB-only pages that get moved down, and then
+ * hit the "now the rest is PMD-aligned, let's do everything
+ * one pmd at a time", we will still have the old (now empty
+ * of any 4kB pages, but still there) PMD in the page table
+ * tree.
+ *
+ * Warn on it once - because we really should try to figure
+ * out how to do this better - but then say "I won't move
+ * this pmd".
+ *
+ * One alternative might be to just unmap the target pmd at
+ * this point, and verify that it really is empty. We'll see.
*/
- if (WARN_ON(!pmd_none(*new_pmd)))
+ if (WARN_ON_ONCE(!pmd_none(*new_pmd)))
return false;
/*
* We don't have to worry about the ordering of src and dst
- * ptlocks because exclusive mmap_sem prevents deadlock.
+ * ptlocks because exclusive mmap_lock prevents deadlock.
*/
old_ptl = pmd_lock(vma->vm_mm, old_pmd);
new_ptl = pmd_lockptr(mm, new_pmd);
@@ -260,20 +274,23 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
extent = next - old_addr;
if (extent > old_end - old_addr)
extent = old_end - old_addr;
+ next = (new_addr + PMD_SIZE) & PMD_MASK;
+ if (extent > next - new_addr)
+ extent = next - new_addr;
old_pmd = get_old_pmd(vma->vm_mm, old_addr);
if (!old_pmd)
continue;
new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
if (!new_pmd)
break;
- if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd)) {
+ if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd) || pmd_devmap(*old_pmd)) {
if (extent == HPAGE_PMD_SIZE) {
bool moved;
/* See comment in move_ptes() */
if (need_rmap_locks)
take_rmap_locks(vma);
moved = move_huge_pmd(vma, old_addr, new_addr,
- old_end, old_pmd, new_pmd);
+ old_pmd, new_pmd);
if (need_rmap_locks)
drop_rmap_locks(vma);
if (moved)
@@ -293,7 +310,7 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
if (need_rmap_locks)
take_rmap_locks(vma);
moved = move_normal_pmd(vma, old_addr, new_addr,
- old_end, old_pmd, new_pmd);
+ old_pmd, new_pmd);
if (need_rmap_locks)
drop_rmap_locks(vma);
if (moved)
@@ -303,9 +320,6 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
if (pte_alloc(new_vma->vm_mm, new_pmd))
break;
- next = (new_addr + PMD_SIZE) & PMD_MASK;
- if (extent > next - new_addr)
- extent = next - new_addr;
move_ptes(vma, old_pmd, old_addr, old_addr + extent, new_vma,
new_pmd, new_addr, need_rmap_locks);
}
@@ -696,7 +710,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
if (!new_len)
return ret;
- if (down_write_killable(&current->mm->mmap_sem))
+ if (mmap_write_lock_killable(current->mm))
return -EINTR;
if (flags & (MREMAP_FIXED | MREMAP_DONTUNMAP)) {
@@ -710,7 +724,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
* Always allow a shrinking remap: that just unmaps
* the unnecessary pages..
* __do_munmap does all the needed commit accounting, and
- * downgrades mmap_sem to read if so directed.
+ * downgrades mmap_lock to read if so directed.
*/
if (old_len >= new_len) {
int retval;
@@ -720,7 +734,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
if (retval < 0 && old_len != new_len) {
ret = retval;
goto out;
- /* Returning 1 indicates mmap_sem is downgraded to read. */
+ /* Returning 1 indicates mmap_lock is downgraded to read. */
} else if (retval == 1)
downgraded = true;
ret = addr;
@@ -785,12 +799,12 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
out:
if (offset_in_page(ret)) {
vm_unacct_memory(charged);
- locked = 0;
+ locked = false;
}
if (downgraded)
- up_read(&current->mm->mmap_sem);
+ mmap_read_unlock(current->mm);
else
- up_write(&current->mm->mmap_sem);
+ mmap_write_unlock(current->mm);
if (locked && new_len > old_len)
mm_populate(new_addr + old_len, new_len - old_len);
userfaultfd_unmap_complete(mm, &uf_unmap_early);
diff --git a/mm/msync.c b/mm/msync.c
index c3bd3e75f687..69c6d2029531 100644
--- a/mm/msync.c
+++ b/mm/msync.c
@@ -57,7 +57,7 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
* If the interval [start,end) covers some unmapped address ranges,
* just ignore them, but return -ENOMEM at the end.
*/
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
vma = find_vma(mm, start);
for (;;) {
struct file *file;
@@ -88,12 +88,12 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
if ((flags & MS_SYNC) && file &&
(vma->vm_flags & VM_SHARED)) {
get_file(file);
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
error = vfs_fsync_range(file, fstart, fend, 1);
fput(file);
if (error || start >= end)
goto out;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
vma = find_vma(mm, start);
} else {
if (start >= end) {
@@ -104,7 +104,7 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags)
}
}
out_unlock:
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
out:
return error ? : unmapped_error;
}
diff --git a/mm/nommu.c b/mm/nommu.c
index 318df4e236c9..75a327149af1 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -5,7 +5,7 @@
* Replacement code for mm functions to support CPU's that don't
* have any form of memory management unit (thus no virtual memory).
*
- * See Documentation/nommu-mmap.txt
+ * See Documentation/mm/nommu-mmap.rst
*
* Copyright (c) 2004-2008 David Howells <dhowells@redhat.com>
* Copyright (c) 2000-2003 David McCullough <davidm@snapgear.com>
@@ -140,7 +140,7 @@ void vfree(const void *addr)
}
EXPORT_SYMBOL(vfree);
-void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
+void *__vmalloc(unsigned long size, gfp_t gfp_mask)
{
/*
* You can't specify __GFP_HIGHMEM with kmalloc() since kmalloc()
@@ -150,24 +150,33 @@ void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
}
EXPORT_SYMBOL(__vmalloc);
-void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags)
+void *__vmalloc_node_range(unsigned long size, unsigned long align,
+ unsigned long start, unsigned long end, gfp_t gfp_mask,
+ pgprot_t prot, unsigned long vm_flags, int node,
+ const void *caller)
{
- return __vmalloc(size, flags, PAGE_KERNEL);
+ return __vmalloc(size, gfp_mask);
+}
+
+void *__vmalloc_node(unsigned long size, unsigned long align, gfp_t gfp_mask,
+ int node, const void *caller)
+{
+ return __vmalloc(size, gfp_mask);
}
static void *__vmalloc_user_flags(unsigned long size, gfp_t flags)
{
void *ret;
- ret = __vmalloc(size, flags, PAGE_KERNEL);
+ ret = __vmalloc(size, flags);
if (ret) {
struct vm_area_struct *vma;
- down_write(&current->mm->mmap_sem);
+ mmap_write_lock(current->mm);
vma = find_vma(current->mm, (unsigned long)ret);
if (vma)
vma->vm_flags |= VM_USERMAP;
- up_write(&current->mm->mmap_sem);
+ mmap_write_unlock(current->mm);
}
return ret;
@@ -179,12 +188,6 @@ void *vmalloc_user(unsigned long size)
}
EXPORT_SYMBOL(vmalloc_user);
-void *vmalloc_user_node_flags(unsigned long size, int node, gfp_t flags)
-{
- return __vmalloc_user_flags(size, flags | __GFP_ZERO);
-}
-EXPORT_SYMBOL(vmalloc_user_node_flags);
-
struct page *vmalloc_to_page(const void *addr)
{
return virt_to_page(addr);
@@ -230,7 +233,7 @@ long vwrite(char *buf, char *addr, unsigned long count)
*/
void *vmalloc(unsigned long size)
{
- return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL);
+ return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM);
}
EXPORT_SYMBOL(vmalloc);
@@ -248,8 +251,7 @@ EXPORT_SYMBOL(vmalloc);
*/
void *vzalloc(unsigned long size)
{
- return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO,
- PAGE_KERNEL);
+ return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO);
}
EXPORT_SYMBOL(vzalloc);
@@ -289,23 +291,6 @@ void *vzalloc_node(unsigned long size, int node)
EXPORT_SYMBOL(vzalloc_node);
/**
- * vmalloc_exec - allocate virtually contiguous, executable memory
- * @size: allocation size
- *
- * Kernel-internal function to allocate enough pages to cover @size
- * the page level allocator and map them into contiguous and
- * executable kernel virtual space.
- *
- * For tight control over page level allocator and protection flags
- * use __vmalloc() instead.
- */
-
-void *vmalloc_exec(unsigned long size)
-{
- return __vmalloc(size, GFP_KERNEL | __GFP_HIGHMEM, PAGE_KERNEL_EXEC);
-}
-
-/**
* vmalloc_32 - allocate virtually contiguous memory (32bit addressable)
* @size: allocation size
*
@@ -314,7 +299,7 @@ void *vmalloc_exec(unsigned long size)
*/
void *vmalloc_32(unsigned long size)
{
- return __vmalloc(size, GFP_KERNEL, PAGE_KERNEL);
+ return __vmalloc(size, GFP_KERNEL);
}
EXPORT_SYMBOL(vmalloc_32);
@@ -351,7 +336,7 @@ void vunmap(const void *addr)
}
EXPORT_SYMBOL(vunmap);
-void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot)
+void *vm_map_ram(struct page **pages, unsigned int count, int node)
{
BUG();
return NULL;
@@ -369,18 +354,6 @@ void vm_unmap_aliases(void)
}
EXPORT_SYMBOL_GPL(vm_unmap_aliases);
-/*
- * Implement a stub for vmalloc_sync_[un]mapping() if the architecture
- * chose not to have one.
- */
-void __weak vmalloc_sync_mappings(void)
-{
-}
-
-void __weak vmalloc_sync_unmappings(void)
-{
-}
-
struct vm_struct *alloc_vm_area(size_t size, pte_t **ptes)
{
BUG();
@@ -443,7 +416,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
/*
* Ok, looks good - let it rip.
*/
- flush_icache_range(mm->brk, brk);
+ flush_icache_user_range(mm->brk, brk);
return mm->brk = brk;
}
@@ -592,7 +565,7 @@ static void put_nommu_region(struct vm_region *region)
* add a VMA into a process's mm_struct in the appropriate place in the list
* and tree and add to the address space's page tree also if not an anonymous
* page
- * - should be called with mm->mmap_sem held writelocked
+ * - should be called with mm->mmap_lock held writelocked
*/
static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma)
{
@@ -706,7 +679,7 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma)
/*
* look up the first VMA in which addr resides, NULL if none
- * - should be called with mm->mmap_sem at least held readlocked
+ * - should be called with mm->mmap_lock at least held readlocked
*/
struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr)
{
@@ -752,7 +725,7 @@ int expand_stack(struct vm_area_struct *vma, unsigned long address)
/*
* look up the first VMA exactly that exactly matches addr
- * - should be called with mm->mmap_sem at least held readlocked
+ * - should be called with mm->mmap_lock at least held readlocked
*/
static struct vm_area_struct *find_vma_exact(struct mm_struct *mm,
unsigned long addr,
@@ -1105,7 +1078,6 @@ unsigned long do_mmap(struct file *file,
unsigned long len,
unsigned long prot,
unsigned long flags,
- vm_flags_t vm_flags,
unsigned long pgoff,
unsigned long *populate,
struct list_head *uf)
@@ -1113,6 +1085,7 @@ unsigned long do_mmap(struct file *file,
struct vm_area_struct *vma;
struct vm_region *region;
struct rb_node *rb;
+ vm_flags_t vm_flags;
unsigned long capabilities, result;
int ret;
@@ -1131,7 +1104,7 @@ unsigned long do_mmap(struct file *file,
/* we've determined that we can make the mapping, now translate what we
* now know into VMA flags */
- vm_flags |= determine_vm_flags(file, prot, flags, capabilities);
+ vm_flags = determine_vm_flags(file, prot, flags, capabilities);
/* we're going to need to record the mapping */
region = kmem_cache_zalloc(vm_region_jar, GFP_KERNEL);
@@ -1287,7 +1260,7 @@ share:
/* we flush the region from the icache only when the first executable
* mapping of it is made */
if (vma->vm_flags & VM_EXEC && !region->vm_icache_flushed) {
- flush_icache_range(region->vm_start, region->vm_end);
+ flush_icache_user_range(region->vm_start, region->vm_end);
region->vm_icache_flushed = true;
}
@@ -1552,9 +1525,9 @@ int vm_munmap(unsigned long addr, size_t len)
struct mm_struct *mm = current->mm;
int ret;
- down_write(&mm->mmap_sem);
+ mmap_write_lock(mm);
ret = do_munmap(mm, addr, len, NULL);
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
return ret;
}
EXPORT_SYMBOL(vm_munmap);
@@ -1641,9 +1614,9 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
{
unsigned long ret;
- down_write(&current->mm->mmap_sem);
+ mmap_write_lock(current->mm);
ret = do_mremap(addr, old_len, new_len, flags, new_addr);
- up_write(&current->mm->mmap_sem);
+ mmap_write_unlock(current->mm);
return ret;
}
@@ -1715,7 +1688,7 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
struct vm_area_struct *vma;
int write = gup_flags & FOLL_WRITE;
- if (down_read_killable(&mm->mmap_sem))
+ if (mmap_read_lock_killable(mm))
return 0;
/* the access must start within one of the target process's mappings */
@@ -1738,7 +1711,7 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
len = 0;
}
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
return len;
}
@@ -1789,8 +1762,8 @@ EXPORT_SYMBOL_GPL(access_process_vm);
* @newsize: The proposed filesize of the inode
*
* Check the shared mappings on an inode on behalf of a shrinking truncate to
- * make sure that that any outstanding VMAs aren't broken and then shrink the
- * vm_regions that extend that beyond so that do_mmap_pgoff() doesn't
+ * make sure that any outstanding VMAs aren't broken and then shrink the
+ * vm_regions that extend beyond so that do_mmap() doesn't
* automatically grant mappings that are too large.
*/
int nommu_shrink_inode_mappings(struct inode *inode, size_t size,
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index dfc357614e56..e90f25d6385d 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -126,7 +126,7 @@ static bool oom_cpuset_eligible(struct task_struct *tsk, struct oom_control *oc)
/*
* The process p may have detached its own ->mm while exiting or through
- * use_mm(), but one or more of its subthreads may still have a valid
+ * kthread_use_mm(), but one or more of its subthreads may still have a valid
* pointer. Return p, or any of its subthreads with a valid ->mm, with
* task_lock() held.
*/
@@ -184,7 +184,7 @@ static bool is_dump_unreclaim_slabs(void)
global_node_page_state(NR_ISOLATED_FILE) +
global_node_page_state(NR_UNEVICTABLE);
- return (global_node_page_state(NR_SLAB_UNRECLAIMABLE) > nr_lru);
+ return (global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B) > nr_lru);
}
/**
@@ -196,17 +196,17 @@ static bool is_dump_unreclaim_slabs(void)
* predictable as possible. The goal is to return the highest value for the
* task consuming the most memory to avoid subsequent oom failures.
*/
-unsigned long oom_badness(struct task_struct *p, unsigned long totalpages)
+long oom_badness(struct task_struct *p, unsigned long totalpages)
{
long points;
long adj;
if (oom_unkillable_task(p))
- return 0;
+ return LONG_MIN;
p = find_lock_task_mm(p);
if (!p)
- return 0;
+ return LONG_MIN;
/*
* Do not even consider tasks which are explicitly marked oom
@@ -218,7 +218,7 @@ unsigned long oom_badness(struct task_struct *p, unsigned long totalpages)
test_bit(MMF_OOM_SKIP, &p->mm->flags) ||
in_vfork(p)) {
task_unlock(p);
- return 0;
+ return LONG_MIN;
}
/*
@@ -233,11 +233,7 @@ unsigned long oom_badness(struct task_struct *p, unsigned long totalpages)
adj *= totalpages / 1000;
points += adj;
- /*
- * Never return 0 for an eligible task regardless of the root bonus and
- * oom_score_adj (oom_score_adj can't be OOM_SCORE_ADJ_MIN here).
- */
- return points > 0 ? points : 1;
+ return points;
}
static const char * const oom_constraint_text[] = {
@@ -254,7 +250,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc)
{
struct zone *zone;
struct zoneref *z;
- enum zone_type high_zoneidx = gfp_zone(oc->gfp_mask);
+ enum zone_type highest_zoneidx = gfp_zone(oc->gfp_mask);
bool cpuset_limited = false;
int nid;
@@ -294,7 +290,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc)
/* Check this allocation failure is caused by cpuset's wall function */
for_each_zone_zonelist_nodemask(zone, z, oc->zonelist,
- high_zoneidx, oc->nodemask)
+ highest_zoneidx, oc->nodemask)
if (!cpuset_zone_allowed(zone, oc->gfp_mask))
cpuset_limited = true;
@@ -310,7 +306,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc)
static int oom_evaluate_task(struct task_struct *task, void *arg)
{
struct oom_control *oc = arg;
- unsigned long points;
+ long points;
if (oom_unkillable_task(task))
goto next;
@@ -336,12 +332,12 @@ static int oom_evaluate_task(struct task_struct *task, void *arg)
* killed first if it triggers an oom, then select it.
*/
if (oom_task_origin(task)) {
- points = ULONG_MAX;
+ points = LONG_MAX;
goto select;
}
points = oom_badness(task, oc->totalpages);
- if (!points || points < oc->chosen_points)
+ if (points == LONG_MIN || points < oc->chosen_points)
goto next;
select:
@@ -365,6 +361,8 @@ abort:
*/
static void select_bad_process(struct oom_control *oc)
{
+ oc->chosen_points = LONG_MIN;
+
if (is_memcg_oom(oc))
mem_cgroup_scan_tasks(oc->memcg, oom_evaluate_task, oc);
else {
@@ -569,7 +567,7 @@ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
{
bool ret = true;
- if (!down_read_trylock(&mm->mmap_sem)) {
+ if (!mmap_read_trylock(mm)) {
trace_skip_task_reaping(tsk->pid);
return false;
}
@@ -577,8 +575,8 @@ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
/*
* MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't
* work on the mm anymore. The check for MMF_OOM_SKIP must run
- * under mmap_sem for reading because it serializes against the
- * down_write();up_write() cycle in exit_mmap().
+ * under mmap_lock for reading because it serializes against the
+ * mmap_write_lock();mmap_write_unlock() cycle in exit_mmap().
*/
if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
trace_skip_task_reaping(tsk->pid);
@@ -600,7 +598,7 @@ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm)
out_finish:
trace_finish_task_reaping(tsk->pid);
out_unlock:
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
return ret;
}
@@ -611,7 +609,7 @@ static void oom_reap_task(struct task_struct *tsk)
int attempts = 0;
struct mm_struct *mm = tsk->signal->oom_mm;
- /* Retry the down_read_trylock(mmap_sem) a few times */
+ /* Retry the mmap_read_trylock(mm) a few times */
while (attempts++ < MAX_OOM_REAP_RETRIES && !oom_reap_task_mm(tsk, mm))
schedule_timeout_idle(HZ/10);
@@ -629,7 +627,7 @@ done:
/*
* Hide this mm from OOM killer because it has been either reaped or
- * somebody can't call up_write(mmap_sem).
+ * somebody can't call mmap_write_unlock(mm).
*/
set_bit(MMF_OOM_SKIP, &mm->flags);
@@ -863,6 +861,8 @@ static void __oom_kill_process(struct task_struct *victim, const char *message)
p = find_lock_task_mm(victim);
if (!p) {
+ pr_info("%s: OOM victim %d (%s) is already exiting. Skip killing the task\n",
+ message, task_pid_nr(victim), victim->comm);
put_task_struct(victim);
return;
} else if (victim != p) {
@@ -898,7 +898,7 @@ static void __oom_kill_process(struct task_struct *victim, const char *message)
/*
* Kill all user processes sharing victim->mm in other thread groups, if
* any. They don't get access to memory reserves, though, to avoid
- * depletion of all memory. This prevents mm->mmap_sem livelock when an
+ * depletion of all memory. This prevents mm->mmap_lock livelock when an
* oom killed thread cannot exit because it requires the semaphore and
* its contended by another thread trying to allocate memory itself.
* That thread will now get access to memory reserves since it has a
@@ -919,8 +919,8 @@ static void __oom_kill_process(struct task_struct *victim, const char *message)
continue;
}
/*
- * No use_mm() user needs to read from the userspace so we are
- * ok to reap it.
+ * No kthread_use_mm() user needs to read from the userspace so
+ * we are ok to reap it.
*/
if (unlikely(p->flags & PF_KTHREAD))
continue;
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 7326b54ab728..4e4ddd67b71e 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -257,7 +257,7 @@ static void wb_min_max_ratio(struct bdi_writeback *wb,
* requiring writeback.
*
* This number of dirtyable pages is the base value of which the
- * user-configurable dirty ratio is the effictive number of pages that
+ * user-configurable dirty ratio is the effective number of pages that
* are allowed to be actually dirtied. Per individual zone, or
* globally by using the sum of dirtyable pages over all zones.
*
@@ -387,8 +387,7 @@ static unsigned long global_dirtyable_memory(void)
* Calculate @dtc->thresh and ->bg_thresh considering
* vm_dirty_{bytes|ratio} and dirty_background_{bytes|ratio}. The caller
* must ensure that @dtc->avail is set before calling this function. The
- * dirty limits will be lifted by 1/4 for PF_LESS_THROTTLE (ie. nfsd) and
- * real-time tasks.
+ * dirty limits will be lifted by 1/4 for real-time tasks.
*/
static void domain_dirty_limits(struct dirty_throttle_control *dtc)
{
@@ -436,7 +435,7 @@ static void domain_dirty_limits(struct dirty_throttle_control *dtc)
if (bg_thresh >= thresh)
bg_thresh = thresh / 2;
tsk = current;
- if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk)) {
+ if (rt_task(tsk)) {
bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32;
thresh += thresh / 4 + global_wb_domain.dirty_limit / 32;
}
@@ -486,7 +485,7 @@ static unsigned long node_dirty_limit(struct pglist_data *pgdat)
else
dirty = vm_dirty_ratio * node_memory / 100;
- if (tsk->flags & PF_LESS_THROTTLE || rt_task(tsk))
+ if (rt_task(tsk))
dirty += dirty / 4;
return dirty;
@@ -505,15 +504,13 @@ bool node_dirty_ok(struct pglist_data *pgdat)
unsigned long nr_pages = 0;
nr_pages += node_page_state(pgdat, NR_FILE_DIRTY);
- nr_pages += node_page_state(pgdat, NR_UNSTABLE_NFS);
nr_pages += node_page_state(pgdat, NR_WRITEBACK);
return nr_pages <= limit;
}
int dirty_background_ratio_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
@@ -524,8 +521,7 @@ int dirty_background_ratio_handler(struct ctl_table *table, int write,
}
int dirty_background_bytes_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
int ret;
@@ -535,9 +531,8 @@ int dirty_background_bytes_handler(struct ctl_table *table, int write,
return ret;
}
-int dirty_ratio_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+int dirty_ratio_handler(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
{
int old_ratio = vm_dirty_ratio;
int ret;
@@ -551,8 +546,7 @@ int dirty_ratio_handler(struct ctl_table *table, int write,
}
int dirty_bytes_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
unsigned long old_bytes = vm_dirty_bytes;
int ret;
@@ -759,7 +753,7 @@ static void mdtc_calc_avail(struct dirty_throttle_control *mdtc,
* bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set.
*
* Return: @wb's dirty limit in pages. The term "dirty" in the context of
- * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages.
+ * dirty balancing includes all PG_dirty and PG_writeback pages.
*/
static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc)
{
@@ -1567,7 +1561,7 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ?
&mdtc_stor : NULL;
struct dirty_throttle_control *sdtc;
- unsigned long nr_reclaimable; /* = file_dirty + unstable_nfs */
+ unsigned long nr_reclaimable; /* = file_dirty */
long period;
long pause;
long max_pause;
@@ -1587,14 +1581,7 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
unsigned long m_thresh = 0;
unsigned long m_bg_thresh = 0;
- /*
- * Unstable writes are a feature of certain networked
- * filesystems (i.e. NFS) in which data may have been
- * written to the server's write cache, but has not yet
- * been flushed to permanent storage.
- */
- nr_reclaimable = global_node_page_state(NR_FILE_DIRTY) +
- global_node_page_state(NR_UNSTABLE_NFS);
+ nr_reclaimable = global_node_page_state(NR_FILE_DIRTY);
gdtc->avail = global_dirtyable_memory();
gdtc->dirty = nr_reclaimable + global_node_page_state(NR_WRITEBACK);
@@ -1653,8 +1640,12 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
if (dirty <= dirty_freerun_ceiling(thresh, bg_thresh) &&
(!mdtc ||
m_dirty <= dirty_freerun_ceiling(m_thresh, m_bg_thresh))) {
- unsigned long intv = dirty_poll_interval(dirty, thresh);
- unsigned long m_intv = ULONG_MAX;
+ unsigned long intv;
+ unsigned long m_intv;
+
+free_running:
+ intv = dirty_poll_interval(dirty, thresh);
+ m_intv = ULONG_MAX;
current->dirty_paused_when = now;
current->nr_dirtied = 0;
@@ -1673,9 +1664,20 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
* Calculate global domain's pos_ratio and select the
* global dtc by default.
*/
- if (!strictlimit)
+ if (!strictlimit) {
wb_dirty_limits(gdtc);
+ if ((current->flags & PF_LOCAL_THROTTLE) &&
+ gdtc->wb_dirty <
+ dirty_freerun_ceiling(gdtc->wb_thresh,
+ gdtc->wb_bg_thresh))
+ /*
+ * LOCAL_THROTTLE tasks must not be throttled
+ * when below the per-wb freerun ceiling.
+ */
+ goto free_running;
+ }
+
dirty_exceeded = (gdtc->wb_dirty > gdtc->wb_thresh) &&
((gdtc->dirty > gdtc->thresh) || strictlimit);
@@ -1689,9 +1691,20 @@ static void balance_dirty_pages(struct bdi_writeback *wb,
* both global and memcg domains. Choose the one
* w/ lower pos_ratio.
*/
- if (!strictlimit)
+ if (!strictlimit) {
wb_dirty_limits(mdtc);
+ if ((current->flags & PF_LOCAL_THROTTLE) &&
+ mdtc->wb_dirty <
+ dirty_freerun_ceiling(mdtc->wb_thresh,
+ mdtc->wb_bg_thresh))
+ /*
+ * LOCAL_THROTTLE tasks must not be
+ * throttled when below the per-wb
+ * freerun ceiling.
+ */
+ goto free_running;
+ }
dirty_exceeded |= (mdtc->wb_dirty > mdtc->wb_thresh) &&
((mdtc->dirty > mdtc->thresh) || strictlimit);
@@ -1938,8 +1951,7 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb)
* as we're trying to decide whether to put more under writeback.
*/
gdtc->avail = global_dirtyable_memory();
- gdtc->dirty = global_node_page_state(NR_FILE_DIRTY) +
- global_node_page_state(NR_UNSTABLE_NFS);
+ gdtc->dirty = global_node_page_state(NR_FILE_DIRTY);
domain_dirty_limits(gdtc);
if (gdtc->dirty > gdtc->bg_thresh)
@@ -1972,7 +1984,7 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb)
* sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
*/
int dirty_writeback_centisecs_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos)
+ void *buffer, size_t *length, loff_t *ppos)
{
unsigned int old_interval = dirty_writeback_interval;
int ret;
@@ -2064,13 +2076,11 @@ static int page_writeback_cpu_online(unsigned int cpu)
* Called early on to tune the page writeback dirty limits.
*
* We used to scale dirty pages according to how total memory
- * related to pages that could be allocated for buffers (by
- * comparing nr_free_buffer_pages() to vm_total_pages.
+ * related to pages that could be allocated for buffers.
*
* However, that was when we used "dirty_ratio" to scale with
* all memory, and we don't do that any more. "dirty_ratio"
- * is now applied to total non-HIGHPAGE memory (by subtracting
- * totalhigh_pages from vm_total_pages), and as such we can't
+ * is now applied to total non-HIGHPAGE memory, and as such we can't
* get into the old insane situation any more where we had
* large amounts of dirty pages compared to a small amount of
* non-HIGHMEM memory.
@@ -2164,7 +2174,6 @@ int write_cache_pages(struct address_space *mapping,
int error;
struct pagevec pvec;
int nr_pages;
- pgoff_t uninitialized_var(writeback_index);
pgoff_t index;
pgoff_t end; /* Inclusive */
pgoff_t done_index;
@@ -2173,8 +2182,7 @@ int write_cache_pages(struct address_space *mapping,
pagevec_init(&pvec);
if (wbc->range_cyclic) {
- writeback_index = mapping->writeback_index; /* prev offset */
- index = writeback_index;
+ index = mapping->writeback_index; /* prev offset */
end = -1;
} else {
index = wbc->range_start >> PAGE_SHIFT;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 13cc653122b7..780c8f023b28 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -68,6 +68,8 @@
#include <linux/lockdep.h>
#include <linux/nmi.h>
#include <linux/psi.h>
+#include <linux/padata.h>
+#include <linux/khugepaged.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
@@ -302,14 +304,14 @@ const char * const migratetype_names[MIGRATE_TYPES] = {
#endif
};
-compound_page_dtor * const compound_page_dtors[] = {
- NULL,
- free_compound_page,
+compound_page_dtor * const compound_page_dtors[NR_COMPOUND_DTORS] = {
+ [NULL_COMPOUND_DTOR] = NULL,
+ [COMPOUND_PAGE_DTOR] = free_compound_page,
#ifdef CONFIG_HUGETLB_PAGE
- free_huge_page,
+ [HUGETLB_PAGE_DTOR] = free_huge_page,
#endif
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- free_transhuge_page,
+ [TRANSHUGE_PAGE_DTOR] = free_transhuge_page,
#endif
};
@@ -335,7 +337,6 @@ static unsigned long nr_kernel_pages __initdata;
static unsigned long nr_all_pages __initdata;
static unsigned long dma_reserve __initdata;
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
static unsigned long arch_zone_lowest_possible_pfn[MAX_NR_ZONES] __initdata;
static unsigned long arch_zone_highest_possible_pfn[MAX_NR_ZONES] __initdata;
static unsigned long required_kernelcore __initdata;
@@ -348,7 +349,6 @@ static bool mirrored_kernelcore __meminitdata;
/* movable_zone is the "real" zone pages in ZONE_MOVABLE are taken from */
int movable_zone;
EXPORT_SYMBOL(movable_zone);
-#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
#if MAX_NUMNODES > 1
unsigned int nr_node_ids __read_mostly = MAX_NUMNODES;
@@ -460,25 +460,23 @@ static inline int pfn_to_bitidx(struct page *page, unsigned long pfn)
{
#ifdef CONFIG_SPARSEMEM
pfn &= (PAGES_PER_SECTION-1);
- return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
#else
pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages);
- return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
#endif /* CONFIG_SPARSEMEM */
+ return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
}
/**
* get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
* @page: The page within the block of interest
* @pfn: The target page frame number
- * @end_bitidx: The last bit of interest to retrieve
* @mask: mask of bits that the caller is interested in
*
* Return: pageblock_bits flags
*/
-static __always_inline unsigned long __get_pfnblock_flags_mask(struct page *page,
+static __always_inline
+unsigned long __get_pfnblock_flags_mask(struct page *page,
unsigned long pfn,
- unsigned long end_bitidx,
unsigned long mask)
{
unsigned long *bitmap;
@@ -491,20 +489,18 @@ static __always_inline unsigned long __get_pfnblock_flags_mask(struct page *page
bitidx &= (BITS_PER_LONG-1);
word = bitmap[word_bitidx];
- bitidx += end_bitidx;
- return (word >> (BITS_PER_LONG - bitidx - 1)) & mask;
+ return (word >> bitidx) & mask;
}
unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
- unsigned long end_bitidx,
unsigned long mask)
{
- return __get_pfnblock_flags_mask(page, pfn, end_bitidx, mask);
+ return __get_pfnblock_flags_mask(page, pfn, mask);
}
static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned long pfn)
{
- return __get_pfnblock_flags_mask(page, pfn, PB_migrate_end, MIGRATETYPE_MASK);
+ return __get_pfnblock_flags_mask(page, pfn, MIGRATETYPE_MASK);
}
/**
@@ -512,12 +508,10 @@ static __always_inline int get_pfnblock_migratetype(struct page *page, unsigned
* @page: The page within the block of interest
* @flags: The flags to set
* @pfn: The target page frame number
- * @end_bitidx: The last bit of interest
* @mask: mask of bits that the caller is interested in
*/
void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
unsigned long pfn,
- unsigned long end_bitidx,
unsigned long mask)
{
unsigned long *bitmap;
@@ -534,9 +528,8 @@ void set_pfnblock_flags_mask(struct page *page, unsigned long flags,
VM_BUG_ON_PAGE(!zone_spans_pfn(page_zone(page), pfn), page);
- bitidx += end_bitidx;
- mask <<= (BITS_PER_LONG - bitidx - 1);
- flags <<= (BITS_PER_LONG - bitidx - 1);
+ mask <<= bitidx;
+ flags <<= bitidx;
word = READ_ONCE(bitmap[word_bitidx]);
for (;;) {
@@ -553,8 +546,8 @@ void set_pageblock_migratetype(struct page *page, int migratetype)
migratetype < MIGRATE_PCPTYPES))
migratetype = MIGRATE_UNMOVABLE;
- set_pageblock_flags_group(page, (unsigned long)migratetype,
- PB_migrate, PB_migrate_end);
+ set_pfnblock_flags_mask(page, (unsigned long)migratetype,
+ page_to_pfn(page), MIGRATETYPE_MASK);
}
#ifdef CONFIG_DEBUG_VM
@@ -609,8 +602,7 @@ static inline int __maybe_unused bad_range(struct zone *zone, struct page *page)
}
#endif
-static void bad_page(struct page *page, const char *reason,
- unsigned long bad_flags)
+static void bad_page(struct page *page, const char *reason)
{
static unsigned long resume;
static unsigned long nr_shown;
@@ -639,10 +631,6 @@ static void bad_page(struct page *page, const char *reason,
pr_alert("BUG: Bad page state in process %s pfn:%05lx\n",
current->comm, page_to_pfn(page));
__dump_page(page, reason);
- bad_flags &= page->flags;
- if (bad_flags)
- pr_alert("bad because of flags: %#lx(%pGp)\n",
- bad_flags, &bad_flags);
dump_page_owner(page);
print_modules();
@@ -679,8 +667,6 @@ void prep_compound_page(struct page *page, unsigned int order)
int i;
int nr_pages = 1 << order;
- set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
- set_compound_order(page, order);
__SetPageHead(page);
for (i = 1; i < nr_pages; i++) {
struct page *p = page + i;
@@ -688,6 +674,9 @@ void prep_compound_page(struct page *page, unsigned int order)
p->mapping = TAIL_MAPPING;
set_compound_head(p, page);
}
+
+ set_compound_page_dtor(page, COMPOUND_PAGE_DTOR);
+ set_compound_order(page, order);
atomic_set(compound_mapcount_ptr(page), -1);
if (hpage_pincount_available(page))
atomic_set(compound_pincount_ptr(page), 0);
@@ -819,11 +808,10 @@ static inline struct capture_control *task_capc(struct zone *zone)
{
struct capture_control *capc = current->capture_control;
- return capc &&
+ return unlikely(capc) &&
!(current->flags & PF_KTHREAD) &&
!capc->page &&
- capc->cc->zone == zone &&
- capc->cc->direct_compaction ? capc : NULL;
+ capc->cc->zone == zone ? capc : NULL;
}
static inline bool
@@ -967,7 +955,7 @@ static inline void __free_one_page(struct page *page,
int migratetype, bool report)
{
struct capture_control *capc = task_capc(zone);
- unsigned long uninitialized_var(buddy_pfn);
+ unsigned long buddy_pfn;
unsigned long combined_pfn;
unsigned int max_order;
struct page *buddy;
@@ -1077,13 +1065,9 @@ static inline bool page_expected_state(struct page *page,
return true;
}
-static void free_pages_check_bad(struct page *page)
+static const char *page_bad_reason(struct page *page, unsigned long flags)
{
- const char *bad_reason;
- unsigned long bad_flags;
-
- bad_reason = NULL;
- bad_flags = 0;
+ const char *bad_reason = NULL;
if (unlikely(atomic_read(&page->_mapcount) != -1))
bad_reason = "nonzero mapcount";
@@ -1091,24 +1075,32 @@ static void free_pages_check_bad(struct page *page)
bad_reason = "non-NULL mapping";
if (unlikely(page_ref_count(page) != 0))
bad_reason = "nonzero _refcount";
- if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_FREE)) {
- bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
- bad_flags = PAGE_FLAGS_CHECK_AT_FREE;
+ if (unlikely(page->flags & flags)) {
+ if (flags == PAGE_FLAGS_CHECK_AT_PREP)
+ bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag(s) set";
+ else
+ bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
}
#ifdef CONFIG_MEMCG
if (unlikely(page->mem_cgroup))
bad_reason = "page still charged to cgroup";
#endif
- bad_page(page, bad_reason, bad_flags);
+ return bad_reason;
}
-static inline int free_pages_check(struct page *page)
+static void check_free_page_bad(struct page *page)
+{
+ bad_page(page,
+ page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE));
+}
+
+static inline int check_free_page(struct page *page)
{
if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE)))
return 0;
/* Something has gone sideways, find it */
- free_pages_check_bad(page);
+ check_free_page_bad(page);
return 1;
}
@@ -1130,7 +1122,7 @@ static int free_tail_pages_check(struct page *head_page, struct page *page)
case 1:
/* the first tail page: ->mapping may be compound_mapcount() */
if (unlikely(compound_mapcount(page))) {
- bad_page(page, "nonzero compound_mapcount", 0);
+ bad_page(page, "nonzero compound_mapcount");
goto out;
}
break;
@@ -1142,17 +1134,17 @@ static int free_tail_pages_check(struct page *head_page, struct page *page)
break;
default:
if (page->mapping != TAIL_MAPPING) {
- bad_page(page, "corrupted mapping in tail page", 0);
+ bad_page(page, "corrupted mapping in tail page");
goto out;
}
break;
}
if (unlikely(!PageTail(page))) {
- bad_page(page, "PageTail not set", 0);
+ bad_page(page, "PageTail not set");
goto out;
}
if (unlikely(compound_head(page) != head_page)) {
- bad_page(page, "compound_head not consistent", 0);
+ bad_page(page, "compound_head not consistent");
goto out;
}
ret = 0;
@@ -1166,8 +1158,11 @@ static void kernel_init_free_pages(struct page *page, int numpages)
{
int i;
+ /* s390's use of memset() could override KASAN redzones. */
+ kasan_disable_current();
for (i = 0; i < numpages; i++)
clear_highpage(page + i);
+ kasan_enable_current();
}
static __always_inline bool free_pages_prepare(struct page *page,
@@ -1194,7 +1189,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
for (i = 1; i < (1 << order); i++) {
if (compound)
bad += free_tail_pages_check(page, page + i);
- if (unlikely(free_pages_check(page + i))) {
+ if (unlikely(check_free_page(page + i))) {
bad++;
continue;
}
@@ -1206,7 +1201,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
if (memcg_kmem_enabled() && PageKmemcg(page))
__memcg_kmem_uncharge_page(page, order);
if (check_free)
- bad += free_pages_check(page);
+ bad += check_free_page(page);
if (bad)
return false;
@@ -1253,7 +1248,7 @@ static bool free_pcp_prepare(struct page *page)
static bool bulkfree_pcp_prepare(struct page *page)
{
if (debug_pagealloc_enabled_static())
- return free_pages_check(page);
+ return check_free_page(page);
else
return false;
}
@@ -1274,7 +1269,7 @@ static bool free_pcp_prepare(struct page *page)
static bool bulkfree_pcp_prepare(struct page *page)
{
- return free_pages_check(page);
+ return check_free_page(page);
}
#endif /* CONFIG_DEBUG_VM */
@@ -1308,6 +1303,11 @@ static void free_pcppages_bulk(struct zone *zone, int count,
struct page *page, *tmp;
LIST_HEAD(head);
+ /*
+ * Ensure a proper count is passed; otherwise we would get stuck in the
+ * below while (list_empty(list)) loop.
+ */
+ count = min(pcp->count, count);
while (count) {
struct list_head *list;
@@ -1499,45 +1499,49 @@ void __free_pages_core(struct page *page, unsigned int order)
__free_pages(page, order);
}
-#if defined(CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID) || \
- defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP)
+#ifdef CONFIG_NEED_MULTIPLE_NODES
static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;
-int __meminit early_pfn_to_nid(unsigned long pfn)
+#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
+
+/*
+ * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
+ */
+int __meminit __early_pfn_to_nid(unsigned long pfn,
+ struct mminit_pfnnid_cache *state)
{
- static DEFINE_SPINLOCK(early_pfn_lock);
+ unsigned long start_pfn, end_pfn;
int nid;
- spin_lock(&early_pfn_lock);
- nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
- if (nid < 0)
- nid = first_online_node;
- spin_unlock(&early_pfn_lock);
+ if (state->last_start <= pfn && pfn < state->last_end)
+ return state->last_nid;
+
+ nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
+ if (nid != NUMA_NO_NODE) {
+ state->last_start = start_pfn;
+ state->last_end = end_pfn;
+ state->last_nid = nid;
+ }
return nid;
}
-#endif
+#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
-#ifdef CONFIG_NODES_SPAN_OTHER_NODES
-/* Only safe to use early in boot when initialisation is single-threaded */
-static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
+int __meminit early_pfn_to_nid(unsigned long pfn)
{
+ static DEFINE_SPINLOCK(early_pfn_lock);
int nid;
+ spin_lock(&early_pfn_lock);
nid = __early_pfn_to_nid(pfn, &early_pfnnid_cache);
- if (nid >= 0 && nid != node)
- return false;
- return true;
-}
+ if (nid < 0)
+ nid = first_online_node;
+ spin_unlock(&early_pfn_lock);
-#else
-static inline bool __meminit early_pfn_in_nid(unsigned long pfn, int node)
-{
- return true;
+ return nid;
}
-#endif
-
+#endif /* CONFIG_NEED_MULTIPLE_NODES */
void __init memblock_free_pages(struct page *page, unsigned long pfn,
unsigned int order)
@@ -1692,7 +1696,6 @@ static void __init deferred_free_pages(unsigned long pfn,
} else if (!(pfn & nr_pgmask)) {
deferred_free_range(pfn - nr_free, nr_free);
nr_free = 1;
- touch_nmi_watchdog();
} else {
nr_free++;
}
@@ -1722,7 +1725,6 @@ static unsigned long __init deferred_init_pages(struct zone *zone,
continue;
} else if (!page || !(pfn & nr_pgmask)) {
page = pfn_to_page(pfn);
- touch_nmi_watchdog();
} else {
page++;
}
@@ -1816,16 +1818,43 @@ deferred_init_maxorder(u64 *i, struct zone *zone, unsigned long *start_pfn,
return nr_pages;
}
+static void __init
+deferred_init_memmap_chunk(unsigned long start_pfn, unsigned long end_pfn,
+ void *arg)
+{
+ unsigned long spfn, epfn;
+ struct zone *zone = arg;
+ u64 i;
+
+ deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn, start_pfn);
+
+ /*
+ * Initialize and free pages in MAX_ORDER sized increments so that we
+ * can avoid introducing any issues with the buddy allocator.
+ */
+ while (spfn < end_pfn) {
+ deferred_init_maxorder(&i, zone, &spfn, &epfn);
+ cond_resched();
+ }
+}
+
+/* An arch may override for more concurrency. */
+__weak int __init
+deferred_page_init_max_threads(const struct cpumask *node_cpumask)
+{
+ return 1;
+}
+
/* Initialise remaining memory on a node */
static int __init deferred_init_memmap(void *data)
{
pg_data_t *pgdat = data;
const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
- unsigned long spfn = 0, epfn = 0, nr_pages = 0;
+ unsigned long spfn = 0, epfn = 0;
unsigned long first_init_pfn, flags;
unsigned long start = jiffies;
struct zone *zone;
- int zid;
+ int zid, max_threads;
u64 i;
/* Bind memory initialisation thread to a local node if possible */
@@ -1845,6 +1874,13 @@ static int __init deferred_init_memmap(void *data)
BUG_ON(pgdat->first_deferred_pfn > pgdat_end_pfn(pgdat));
pgdat->first_deferred_pfn = ULONG_MAX;
+ /*
+ * Once we unlock here, the zone cannot be grown anymore, thus if an
+ * interrupt thread must allocate this early in boot, zone must be
+ * pre-grown prior to start of deferred page initialization.
+ */
+ pgdat_resize_unlock(pgdat, &flags);
+
/* Only the highest zone is deferred so find it */
for (zid = 0; zid < MAX_NR_ZONES; zid++) {
zone = pgdat->node_zones + zid;
@@ -1857,21 +1893,30 @@ static int __init deferred_init_memmap(void *data)
first_init_pfn))
goto zone_empty;
- /*
- * Initialize and free pages in MAX_ORDER sized increments so
- * that we can avoid introducing any issues with the buddy
- * allocator.
- */
- while (spfn < epfn)
- nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
-zone_empty:
- pgdat_resize_unlock(pgdat, &flags);
+ max_threads = deferred_page_init_max_threads(cpumask);
+ while (spfn < epfn) {
+ unsigned long epfn_align = ALIGN(epfn, PAGES_PER_SECTION);
+ struct padata_mt_job job = {
+ .thread_fn = deferred_init_memmap_chunk,
+ .fn_arg = zone,
+ .start = spfn,
+ .size = epfn_align - spfn,
+ .align = PAGES_PER_SECTION,
+ .min_chunk = PAGES_PER_SECTION,
+ .max_threads = max_threads,
+ };
+
+ padata_do_multithreaded(&job);
+ deferred_init_mem_pfn_range_in_zone(&i, zone, &spfn, &epfn,
+ epfn_align);
+ }
+zone_empty:
/* Sanity check that the next zone really is unpopulated */
WARN_ON(++zid < MAX_NR_ZONES && populated_zone(++zone));
- pr_info("node %d initialised, %lu pages in %ums\n",
- pgdat->node_id, nr_pages, jiffies_to_msecs(jiffies - start));
+ pr_info("node %d deferred pages initialised in %ums\n",
+ pgdat->node_id, jiffies_to_msecs(jiffies - start));
pgdat_init_report_one_done();
return 0;
@@ -1909,17 +1954,6 @@ deferred_grow_zone(struct zone *zone, unsigned int order)
pgdat_resize_lock(pgdat, &flags);
/*
- * If deferred pages have been initialized while we were waiting for
- * the lock, return true, as the zone was grown. The caller will retry
- * this zone. We won't return to this function since the caller also
- * has this static branch.
- */
- if (!static_branch_unlikely(&deferred_pages)) {
- pgdat_resize_unlock(pgdat, &flags);
- return true;
- }
-
- /*
* If someone grew this zone while we were waiting for spinlock, return
* true, as there might be enough pages already.
*/
@@ -1947,6 +1981,7 @@ deferred_grow_zone(struct zone *zone, unsigned int order)
first_deferred_pfn = spfn;
nr_pages += deferred_init_maxorder(&i, zone, &spfn, &epfn);
+ touch_nmi_watchdog();
/* We should only stop along section boundaries */
if ((first_deferred_pfn ^ spfn) < PAGES_PER_SECTION)
@@ -2092,31 +2127,14 @@ static inline void expand(struct zone *zone, struct page *page,
static void check_new_page_bad(struct page *page)
{
- const char *bad_reason = NULL;
- unsigned long bad_flags = 0;
-
- if (unlikely(atomic_read(&page->_mapcount) != -1))
- bad_reason = "nonzero mapcount";
- if (unlikely(page->mapping != NULL))
- bad_reason = "non-NULL mapping";
- if (unlikely(page_ref_count(page) != 0))
- bad_reason = "nonzero _refcount";
if (unlikely(page->flags & __PG_HWPOISON)) {
- bad_reason = "HWPoisoned (hardware-corrupted)";
- bad_flags = __PG_HWPOISON;
/* Don't complain about hwpoisoned pages */
page_mapcount_reset(page); /* remove PageBuddy */
return;
}
- if (unlikely(page->flags & PAGE_FLAGS_CHECK_AT_PREP)) {
- bad_reason = "PAGE_FLAGS_CHECK_AT_PREP flag set";
- bad_flags = PAGE_FLAGS_CHECK_AT_PREP;
- }
-#ifdef CONFIG_MEMCG
- if (unlikely(page->mem_cgroup))
- bad_reason = "page still charged to cgroup";
-#endif
- bad_page(page, bad_reason, bad_flags);
+
+ bad_page(page,
+ page_bad_reason(page, PAGE_FLAGS_CHECK_AT_PREP));
}
/*
@@ -2257,7 +2275,7 @@ struct page *__rmqueue_smallest(struct zone *zone, unsigned int order,
* This array describes the order lists are fallen back to when
* the free lists for the desirable migrate type are depleted
*/
-static int fallbacks[MIGRATE_TYPES][4] = {
+static int fallbacks[MIGRATE_TYPES][3] = {
[MIGRATE_UNMOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
[MIGRATE_MOVABLE] = { MIGRATE_RECLAIMABLE, MIGRATE_UNMOVABLE, MIGRATE_TYPES },
[MIGRATE_RECLAIMABLE] = { MIGRATE_UNMOVABLE, MIGRATE_MOVABLE, MIGRATE_TYPES },
@@ -2609,7 +2627,7 @@ static bool unreserve_highatomic_pageblock(const struct alloc_context *ac,
int order;
bool ret;
- for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->high_zoneidx,
+ for_each_zone_zonelist_nodemask(zone, z, zonelist, ac->highest_zoneidx,
ac->nodemask) {
/*
* Preserve at least one pageblock unless memory pressure
@@ -2768,10 +2786,24 @@ __rmqueue(struct zone *zone, unsigned int order, int migratetype,
{
struct page *page;
+#ifdef CONFIG_CMA
+ /*
+ * Balance movable allocations between regular and CMA areas by
+ * allocating from CMA when over half of the zone's free memory
+ * is in the CMA area.
+ */
+ if (alloc_flags & ALLOC_CMA &&
+ zone_page_state(zone, NR_FREE_CMA_PAGES) >
+ zone_page_state(zone, NR_FREE_PAGES) / 2) {
+ page = __rmqueue_cma_fallback(zone, order);
+ if (page)
+ return page;
+ }
+#endif
retry:
page = __rmqueue_smallest(zone, order, migratetype);
if (unlikely(!page)) {
- if (migratetype == MIGRATE_MOVABLE)
+ if (alloc_flags & ALLOC_CMA)
page = __rmqueue_cma_fallback(zone, order);
if (!page && __rmqueue_fallback(zone, order, migratetype,
@@ -3336,9 +3368,16 @@ struct page *rmqueue(struct zone *preferred_zone,
struct page *page;
if (likely(order == 0)) {
- page = rmqueue_pcplist(preferred_zone, zone, gfp_flags,
+ /*
+ * The MIGRATE_MOVABLE pcplist may hold pages from the CMA area, so
+ * we need to skip it when allocating from the CMA area isn't allowed.
+ */
+ if (!IS_ENABLED(CONFIG_CMA) || alloc_flags & ALLOC_CMA ||
+ migratetype != MIGRATE_MOVABLE) {
+ page = rmqueue_pcplist(preferred_zone, zone, gfp_flags,
migratetype, alloc_flags);
- goto out;
+ goto out;
+ }
}
/*
@@ -3350,7 +3389,13 @@ struct page *rmqueue(struct zone *preferred_zone,
do {
page = NULL;
- if (alloc_flags & ALLOC_HARDER) {
+ /*
+ * order-0 request can reach here when the pcplist is skipped
+ * due to non-CMA allocation context. HIGHATOMIC area is
+ * reserved for high-order atomic allocation, so order-0
+ * request should skip it.
+ */
+ if (order > 0 && alloc_flags & ALLOC_HARDER) {
page = __rmqueue_smallest(zone, order, MIGRATE_HIGHATOMIC);
if (page)
trace_mm_page_alloc_zone_locked(page, order, migratetype);
@@ -3457,6 +3502,29 @@ static noinline bool should_fail_alloc_page(gfp_t gfp_mask, unsigned int order)
}
ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE);
+static inline long __zone_watermark_unusable_free(struct zone *z,
+ unsigned int order, unsigned int alloc_flags)
+{
+ const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
+ long unusable_free = (1 << order) - 1;
+
+ /*
+ * If the caller does not have rights to ALLOC_HARDER then subtract
+ * the high-atomic reserves. This will over-estimate the size of the
+ * atomic reserve but it avoids a search.
+ */
+ if (likely(!alloc_harder))
+ unusable_free += z->nr_reserved_highatomic;
+
+#ifdef CONFIG_CMA
+ /* If allocation can't use CMA areas don't use free CMA pages */
+ if (!(alloc_flags & ALLOC_CMA))
+ unusable_free += zone_page_state(z, NR_FREE_CMA_PAGES);
+#endif
+
+ return unusable_free;
+}
+
/*
* Return true if free base pages are above 'mark'. For high-order checks it
* will return true of the order-0 watermark is reached and there is at least
@@ -3464,7 +3532,7 @@ ALLOW_ERROR_INJECTION(should_fail_alloc_page, TRUE);
* to check in the allocation paths if no pages are free.
*/
bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
- int classzone_idx, unsigned int alloc_flags,
+ int highest_zoneidx, unsigned int alloc_flags,
long free_pages)
{
long min = mark;
@@ -3472,19 +3540,12 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
const bool alloc_harder = (alloc_flags & (ALLOC_HARDER|ALLOC_OOM));
/* free_pages may go negative - that's OK */
- free_pages -= (1 << order) - 1;
+ free_pages -= __zone_watermark_unusable_free(z, order, alloc_flags);
if (alloc_flags & ALLOC_HIGH)
min -= min / 2;
- /*
- * If the caller does not have rights to ALLOC_HARDER then subtract
- * the high-atomic reserves. This will over-estimate the size of the
- * atomic reserve but it avoids a search.
- */
- if (likely(!alloc_harder)) {
- free_pages -= z->nr_reserved_highatomic;
- } else {
+ if (unlikely(alloc_harder)) {
/*
* OOM victims can try even harder than normal ALLOC_HARDER
* users on the grounds that it's definitely going to be in
@@ -3497,19 +3558,12 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
min -= min / 4;
}
-
-#ifdef CONFIG_CMA
- /* If allocation can't use CMA areas don't use free CMA pages */
- if (!(alloc_flags & ALLOC_CMA))
- free_pages -= zone_page_state(z, NR_FREE_CMA_PAGES);
-#endif
-
/*
* Check watermarks for an order-0 allocation request. If these
* are not met, then a high-order request also cannot go ahead
* even if a suitable page happened to be free.
*/
- if (free_pages <= min + z->lowmem_reserve[classzone_idx])
+ if (free_pages <= min + z->lowmem_reserve[highest_zoneidx])
return false;
/* If this is an order-0 request then the watermark is fine */
@@ -3542,47 +3596,61 @@ bool __zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
}
bool zone_watermark_ok(struct zone *z, unsigned int order, unsigned long mark,
- int classzone_idx, unsigned int alloc_flags)
+ int highest_zoneidx, unsigned int alloc_flags)
{
- return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
+ return __zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
zone_page_state(z, NR_FREE_PAGES));
}
static inline bool zone_watermark_fast(struct zone *z, unsigned int order,
- unsigned long mark, int classzone_idx, unsigned int alloc_flags)
+ unsigned long mark, int highest_zoneidx,
+ unsigned int alloc_flags, gfp_t gfp_mask)
{
- long free_pages = zone_page_state(z, NR_FREE_PAGES);
- long cma_pages = 0;
+ long free_pages;
-#ifdef CONFIG_CMA
- /* If allocation can't use CMA areas don't use free CMA pages */
- if (!(alloc_flags & ALLOC_CMA))
- cma_pages = zone_page_state(z, NR_FREE_CMA_PAGES);
-#endif
+ free_pages = zone_page_state(z, NR_FREE_PAGES);
/*
* Fast check for order-0 only. If this fails then the reserves
- * need to be calculated. There is a corner case where the check
- * passes but only the high-order atomic reserve are free. If
- * the caller is !atomic then it'll uselessly search the free
- * list. That corner case is then slower but it is harmless.
+ * need to be calculated.
*/
- if (!order && (free_pages - cma_pages) > mark + z->lowmem_reserve[classzone_idx])
+ if (!order) {
+ long fast_free;
+
+ fast_free = free_pages;
+ fast_free -= __zone_watermark_unusable_free(z, 0, alloc_flags);
+ if (fast_free > mark + z->lowmem_reserve[highest_zoneidx])
+ return true;
+ }
+
+ if (__zone_watermark_ok(z, order, mark, highest_zoneidx, alloc_flags,
+ free_pages))
return true;
+ /*
+ * Ignore watermark boosting for GFP_ATOMIC order-0 allocations
+ * when checking the min watermark. The min watermark is the
+ * point where boosting is ignored so that kswapd is woken up
+ * when below the low watermark.
+ */
+ if (unlikely(!order && (gfp_mask & __GFP_ATOMIC) && z->watermark_boost
+ && ((alloc_flags & ALLOC_WMARK_MASK) == WMARK_MIN))) {
+ mark = z->_watermark[WMARK_MIN];
+ return __zone_watermark_ok(z, order, mark, highest_zoneidx,
+ alloc_flags, free_pages);
+ }
- return __zone_watermark_ok(z, order, mark, classzone_idx, alloc_flags,
- free_pages);
+ return false;
}
bool zone_watermark_ok_safe(struct zone *z, unsigned int order,
- unsigned long mark, int classzone_idx)
+ unsigned long mark, int highest_zoneidx)
{
long free_pages = zone_page_state(z, NR_FREE_PAGES);
if (z->percpu_drift_mark && free_pages < z->percpu_drift_mark)
free_pages = zone_page_state_snapshot(z, NR_FREE_PAGES);
- return __zone_watermark_ok(z, order, mark, classzone_idx, 0,
+ return __zone_watermark_ok(z, order, mark, highest_zoneidx, 0,
free_pages);
}
@@ -3639,6 +3707,20 @@ alloc_flags_nofragment(struct zone *zone, gfp_t gfp_mask)
return alloc_flags;
}
+static inline unsigned int current_alloc_flags(gfp_t gfp_mask,
+ unsigned int alloc_flags)
+{
+#ifdef CONFIG_CMA
+ unsigned int pflags = current->flags;
+
+ if (!(pflags & PF_MEMALLOC_NOCMA) &&
+ gfp_migratetype(gfp_mask) == MIGRATE_MOVABLE)
+ alloc_flags |= ALLOC_CMA;
+
+#endif
+ return alloc_flags;
+}
+
/*
* get_page_from_freelist goes through the zonelist trying to allocate
* a page.
@@ -3659,8 +3741,8 @@ retry:
*/
no_fallback = alloc_flags & ALLOC_NOFRAGMENT;
z = ac->preferred_zoneref;
- for_next_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
- ac->nodemask) {
+ for_next_zone_zonelist_nodemask(zone, z, ac->zonelist,
+ ac->highest_zoneidx, ac->nodemask) {
struct page *page;
unsigned long mark;
@@ -3715,7 +3797,8 @@ retry:
mark = wmark_pages(zone, alloc_flags & ALLOC_WMARK_MASK);
if (!zone_watermark_fast(zone, order, mark,
- ac_classzone_idx(ac), alloc_flags)) {
+ ac->highest_zoneidx, alloc_flags,
+ gfp_mask)) {
int ret;
#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT
@@ -3748,7 +3831,7 @@ retry:
default:
/* did we reclaim enough */
if (zone_watermark_ok(zone, order, mark,
- ac_classzone_idx(ac), alloc_flags))
+ ac->highest_zoneidx, alloc_flags))
goto try_this_zone;
continue;
@@ -3907,7 +3990,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
if (gfp_mask & __GFP_RETRY_MAYFAIL)
goto out;
/* The OOM killer does not needlessly kill tasks for lowmem */
- if (ac->high_zoneidx < ZONE_NORMAL)
+ if (ac->highest_zoneidx < ZONE_NORMAL)
goto out;
if (pm_suspended_storage())
goto out;
@@ -4110,10 +4193,10 @@ should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_fla
* Let's give them a good hope and keep retrying while the order-0
* watermarks are OK.
*/
- for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
- ac->nodemask) {
+ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
+ ac->highest_zoneidx, ac->nodemask) {
if (zone_watermark_ok(zone, 0, min_wmark_pages(zone),
- ac_classzone_idx(ac), alloc_flags))
+ ac->highest_zoneidx, alloc_flags))
return true;
}
return false;
@@ -4219,7 +4302,7 @@ retry:
/*
* If an allocation failed after direct reclaim, it could be because
* pages are pinned on the per-cpu lists or in high alloc reserves.
- * Shrink them them and try again
+ * Shrink them and try again
*/
if (!page && !drained) {
unreserve_highatomic_pageblock(ac, false);
@@ -4237,12 +4320,12 @@ static void wake_all_kswapds(unsigned int order, gfp_t gfp_mask,
struct zoneref *z;
struct zone *zone;
pg_data_t *last_pgdat = NULL;
- enum zone_type high_zoneidx = ac->high_zoneidx;
+ enum zone_type highest_zoneidx = ac->highest_zoneidx;
- for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, high_zoneidx,
+ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, highest_zoneidx,
ac->nodemask) {
if (last_pgdat != zone->zone_pgdat)
- wakeup_kswapd(zone, gfp_mask, order, high_zoneidx);
+ wakeup_kswapd(zone, gfp_mask, order, highest_zoneidx);
last_pgdat = zone->zone_pgdat;
}
}
@@ -4284,10 +4367,8 @@ gfp_to_alloc_flags(gfp_t gfp_mask)
} else if (unlikely(rt_task(current)) && !in_interrupt())
alloc_flags |= ALLOC_HARDER;
-#ifdef CONFIG_CMA
- if (gfpflags_to_migratetype(gfp_mask) == MIGRATE_MOVABLE)
- alloc_flags |= ALLOC_CMA;
-#endif
+ alloc_flags = current_alloc_flags(gfp_mask, alloc_flags);
+
return alloc_flags;
}
@@ -4377,8 +4458,8 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
* request even if all reclaimable pages are considered then we are
* screwed and have to go OOM.
*/
- for_each_zone_zonelist_nodemask(zone, z, ac->zonelist, ac->high_zoneidx,
- ac->nodemask) {
+ for_each_zone_zonelist_nodemask(zone, z, ac->zonelist,
+ ac->highest_zoneidx, ac->nodemask) {
unsigned long available;
unsigned long reclaimable;
unsigned long min_wmark = min_wmark_pages(zone);
@@ -4392,7 +4473,7 @@ should_reclaim_retry(gfp_t gfp_mask, unsigned order,
* reclaimable pages?
*/
wmark = __zone_watermark_ok(zone, order, min_wmark,
- ac_classzone_idx(ac), alloc_flags, available);
+ ac->highest_zoneidx, alloc_flags, available);
trace_reclaim_retry_zone(z, order, reclaimable,
available, min_wmark, *no_progress_loops, wmark);
if (wmark) {
@@ -4511,7 +4592,7 @@ retry_cpuset:
* could end up iterating over non-eligible zones endlessly.
*/
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
- ac->high_zoneidx, ac->nodemask);
+ ac->highest_zoneidx, ac->nodemask);
if (!ac->preferred_zoneref->zone)
goto nopage;
@@ -4588,7 +4669,7 @@ retry:
reserve_flags = __gfp_pfmemalloc_flags(gfp_mask);
if (reserve_flags)
- alloc_flags = reserve_flags;
+ alloc_flags = current_alloc_flags(gfp_mask, reserve_flags);
/*
* Reset the nodemask and zonelist iterators if memory policies can be
@@ -4598,7 +4679,7 @@ retry:
if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) {
ac->nodemask = NULL;
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
- ac->high_zoneidx, ac->nodemask);
+ ac->highest_zoneidx, ac->nodemask);
}
/* Attempt with potentially adjusted zonelist and alloc_flags */
@@ -4665,7 +4746,7 @@ retry:
/* Avoid allocations with no watermarks from looping endlessly */
if (tsk_is_oom_victim(current) &&
- (alloc_flags == ALLOC_OOM ||
+ (alloc_flags & ALLOC_OOM ||
(gfp_mask & __GFP_NOMEMALLOC)))
goto nopage;
@@ -4732,14 +4813,18 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
struct alloc_context *ac, gfp_t *alloc_mask,
unsigned int *alloc_flags)
{
- ac->high_zoneidx = gfp_zone(gfp_mask);
+ ac->highest_zoneidx = gfp_zone(gfp_mask);
ac->zonelist = node_zonelist(preferred_nid, gfp_mask);
ac->nodemask = nodemask;
- ac->migratetype = gfpflags_to_migratetype(gfp_mask);
+ ac->migratetype = gfp_migratetype(gfp_mask);
if (cpusets_enabled()) {
*alloc_mask |= __GFP_HARDWALL;
- if (!ac->nodemask)
+ /*
+ * When we are in the interrupt context, it is irrelevant
+ * to the current task context. It means that any node is ok.
+ */
+ if (!in_interrupt() && !ac->nodemask)
ac->nodemask = &cpuset_current_mems_allowed;
else
*alloc_flags |= ALLOC_CPUSET;
@@ -4753,8 +4838,7 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order,
if (should_fail_alloc_page(gfp_mask, order))
return false;
- if (IS_ENABLED(CONFIG_CMA) && ac->migratetype == MIGRATE_MOVABLE)
- *alloc_flags |= ALLOC_CMA;
+ *alloc_flags = current_alloc_flags(gfp_mask, *alloc_flags);
return true;
}
@@ -4771,7 +4855,7 @@ static inline void finalise_ac(gfp_t gfp_mask, struct alloc_context *ac)
* may get reset for allocations that ignore memory policies.
*/
ac->preferred_zoneref = first_zones_zonelist(ac->zonelist,
- ac->high_zoneidx, ac->nodemask);
+ ac->highest_zoneidx, ac->nodemask);
}
/*
@@ -5133,19 +5217,6 @@ unsigned long nr_free_buffer_pages(void)
}
EXPORT_SYMBOL_GPL(nr_free_buffer_pages);
-/**
- * nr_free_pagecache_pages - count number of pages beyond high watermark
- *
- * nr_free_pagecache_pages() counts the number of pages which are beyond the
- * high watermark within all zones.
- *
- * Return: number of pages beyond high watermark within all zones.
- */
-unsigned long nr_free_pagecache_pages(void)
-{
- return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
-}
-
static inline void show_node(struct zone *zone)
{
if (IS_ENABLED(CONFIG_NUMA))
@@ -5188,8 +5259,8 @@ long si_mem_available(void)
* items that are in use, and cannot be freed. Cap this estimate at the
* low watermark.
*/
- reclaimable = global_node_page_state(NR_SLAB_RECLAIMABLE) +
- global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
+ reclaimable = global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B) +
+ global_node_page_state(NR_KERNEL_MISC_RECLAIMABLE);
available += reclaimable - min(reclaimable / 2, wmark_low);
if (available < 0)
@@ -5319,7 +5390,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n"
" active_file:%lu inactive_file:%lu isolated_file:%lu\n"
- " unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n"
+ " unevictable:%lu dirty:%lu writeback:%lu\n"
" slab_reclaimable:%lu slab_unreclaimable:%lu\n"
" mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n"
" free:%lu free_pcp:%lu free_cma:%lu\n",
@@ -5332,9 +5403,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
global_node_page_state(NR_UNEVICTABLE),
global_node_page_state(NR_FILE_DIRTY),
global_node_page_state(NR_WRITEBACK),
- global_node_page_state(NR_UNSTABLE_NFS),
- global_node_page_state(NR_SLAB_RECLAIMABLE),
- global_node_page_state(NR_SLAB_UNRECLAIMABLE),
+ global_node_page_state_pages(NR_SLAB_RECLAIMABLE_B),
+ global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B),
global_node_page_state(NR_FILE_MAPPED),
global_node_page_state(NR_SHMEM),
global_zone_page_state(NR_PAGETABLE),
@@ -5365,7 +5435,10 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
" anon_thp: %lukB"
#endif
" writeback_tmp:%lukB"
- " unstable:%lukB"
+ " kernel_stack:%lukB"
+#ifdef CONFIG_SHADOW_CALL_STACK
+ " shadow_call_stack:%lukB"
+#endif
" all_unreclaimable? %s"
"\n",
pgdat->node_id,
@@ -5387,7 +5460,10 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR),
#endif
K(node_page_state(pgdat, NR_WRITEBACK_TEMP)),
- K(node_page_state(pgdat, NR_UNSTABLE_NFS)),
+ node_page_state(pgdat, NR_KERNEL_STACK_KB),
+#ifdef CONFIG_SHADOW_CALL_STACK
+ node_page_state(pgdat, NR_KERNEL_SCS_KB),
+#endif
pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
"yes" : "no");
}
@@ -5419,7 +5495,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
" present:%lukB"
" managed:%lukB"
" mlocked:%lukB"
- " kernel_stack:%lukB"
" pagetables:%lukB"
" bounce:%lukB"
" free_pcp:%lukB"
@@ -5441,7 +5516,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
K(zone->present_pages),
K(zone_managed_pages(zone)),
K(zone_page_state(zone, NR_MLOCK)),
- zone_page_state(zone, NR_KERNEL_STACK_KB),
K(zone_page_state(zone, NR_PAGETABLE)),
K(zone_page_state(zone, NR_BOUNCE)),
K(free_pcp),
@@ -5540,36 +5614,17 @@ static int __parse_numa_zonelist_order(char *s)
return 0;
}
-static __init int setup_numa_zonelist_order(char *s)
-{
- if (!s)
- return 0;
-
- return __parse_numa_zonelist_order(s);
-}
-early_param("numa_zonelist_order", setup_numa_zonelist_order);
-
char numa_zonelist_order[] = "Node";
/*
* sysctl handler for numa_zonelist_order
*/
int numa_zonelist_order_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *length,
- loff_t *ppos)
+ void *buffer, size_t *length, loff_t *ppos)
{
- char *str;
- int ret;
-
- if (!write)
- return proc_dostring(table, write, buffer, length, ppos);
- str = memdup_user_nul(buffer, 16);
- if (IS_ERR(str))
- return PTR_ERR(str);
-
- ret = __parse_numa_zonelist_order(str);
- kfree(str);
- return ret;
+ if (write)
+ return __parse_numa_zonelist_order(buffer);
+ return proc_dostring(table, write, buffer, length, ppos);
}
@@ -5689,14 +5744,13 @@ static void build_zonelists(pg_data_t *pgdat)
{
static int node_order[MAX_NUMNODES];
int node, load, nr_nodes = 0;
- nodemask_t used_mask;
+ nodemask_t used_mask = NODE_MASK_NONE;
int local_node, prev_node;
/* NUMA-aware ordering of nodes */
local_node = pgdat->node_id;
load = nr_online_nodes;
prev_node = local_node;
- nodes_clear(used_mask);
memset(node_order, 0, sizeof(node_order));
while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
@@ -5876,13 +5930,16 @@ build_all_zonelists_init(void)
*/
void __ref build_all_zonelists(pg_data_t *pgdat)
{
+ unsigned long vm_total_pages;
+
if (system_state == SYSTEM_BOOTING) {
build_all_zonelists_init();
} else {
__build_all_zonelists(pgdat);
/* cpuset refresh routine should be here */
}
- vm_total_pages = nr_free_pagecache_pages();
+ /* Get the number of free pages beyond high watermark in all zones. */
+ vm_total_pages = nr_free_zone_pages(gfp_zone(GFP_HIGHUSER_MOVABLE));
/*
* Disable grouping by mobility if the number of pages in the
* system is too low to allow the mechanism to work. It would be
@@ -5908,7 +5965,6 @@ void __ref build_all_zonelists(pg_data_t *pgdat)
static bool __meminit
overlap_memmap_init(unsigned long zone, unsigned long *pfn)
{
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
static struct memblock_region *r;
if (mirrored_kernelcore && zone == ZONE_MOVABLE) {
@@ -5924,34 +5980,16 @@ overlap_memmap_init(unsigned long zone, unsigned long *pfn)
return true;
}
}
-#endif
return false;
}
-#ifdef CONFIG_SPARSEMEM
-/* Skip PFNs that belong to non-present sections */
-static inline __meminit unsigned long next_pfn(unsigned long pfn)
-{
- const unsigned long section_nr = pfn_to_section_nr(++pfn);
-
- if (present_section_nr(section_nr))
- return pfn;
- return section_nr_to_pfn(next_present_section_nr(section_nr));
-}
-#else
-static inline __meminit unsigned long next_pfn(unsigned long pfn)
-{
- return pfn++;
-}
-#endif
-
/*
* Initially all pages are reserved - free ones are freed
* up by memblock_free_all() once the early boot process is
* done. Non-atomic initialization, single-pass.
*/
void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
- unsigned long start_pfn, enum memmap_context context,
+ unsigned long start_pfn, enum meminit_context context,
struct vmem_altmap *altmap)
{
unsigned long pfn, end_pfn = start_pfn + size;
@@ -5983,15 +6021,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
* There can be holes in boot-time mem_map[]s handed to this
* function. They do not exist on hotplugged memory.
*/
- if (context == MEMMAP_EARLY) {
- if (!early_pfn_valid(pfn)) {
- pfn = next_pfn(pfn);
- continue;
- }
- if (!early_pfn_in_nid(pfn, nid)) {
- pfn++;
- continue;
- }
+ if (context == MEMINIT_EARLY) {
if (overlap_memmap_init(zone, &pfn))
continue;
if (defer_init(nid, pfn, end_pfn))
@@ -6000,7 +6030,7 @@ void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
page = pfn_to_page(pfn);
__init_single_page(page, pfn, zone, nid);
- if (context == MEMMAP_HOTPLUG)
+ if (context == MEMINIT_HOTPLUG)
__SetPageReserved(page);
/*
@@ -6083,7 +6113,7 @@ void __ref memmap_init_zone_device(struct zone *zone,
* check here not to call set_pageblock_migratetype() against
* pfn out of zone.
*
- * Please note that MEMMAP_HOTPLUG path doesn't clear memmap
+ * Please note that MEMINIT_HOTPLUG path doesn't clear memmap
* because this is done early in section_activate()
*/
if (!(pfn & (pageblock_nr_pages - 1))) {
@@ -6107,9 +6137,23 @@ static void __meminit zone_init_free_lists(struct zone *zone)
}
void __meminit __weak memmap_init(unsigned long size, int nid,
- unsigned long zone, unsigned long start_pfn)
+ unsigned long zone,
+ unsigned long range_start_pfn)
{
- memmap_init_zone(size, nid, zone, start_pfn, MEMMAP_EARLY, NULL);
+ unsigned long start_pfn, end_pfn;
+ unsigned long range_end_pfn = range_start_pfn + size;
+ int i;
+
+ for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, NULL) {
+ start_pfn = clamp(start_pfn, range_start_pfn, range_end_pfn);
+ end_pfn = clamp(end_pfn, range_start_pfn, range_end_pfn);
+
+ if (end_pfn > start_pfn) {
+ size = end_pfn - start_pfn;
+ memmap_init_zone(size, nid, zone, start_pfn,
+ MEMINIT_EARLY, NULL);
+ }
+ }
}
static int zone_batchsize(struct zone *zone)
@@ -6168,7 +6212,7 @@ static int zone_batchsize(struct zone *zone)
* locking.
*
* Any new users of pcp->batch and pcp->high should ensure they can cope with
- * those fields changing asynchronously (acording the the above rule).
+ * those fields changing asynchronously (according to the above rule).
*
* mutex_is_locked(&pcp_batch_high_lock) required when calling this function
* outside of boot time (or some other assurance that no concurrent updaters
@@ -6261,10 +6305,25 @@ void __init setup_per_cpu_pageset(void)
{
struct pglist_data *pgdat;
struct zone *zone;
+ int __maybe_unused cpu;
for_each_populated_zone(zone)
setup_zone_pageset(zone);
+#ifdef CONFIG_NUMA
+ /*
+ * Unpopulated zones continue using the boot pagesets.
+ * The numa stats for these pagesets need to be reset.
+ * Otherwise, they will end up skewing the stats of
+ * the nodes these zones are associated with.
+ */
+ for_each_possible_cpu(cpu) {
+ struct per_cpu_pageset *pcp = &per_cpu(boot_pageset, cpu);
+ memset(pcp->vm_numa_stat_diff, 0,
+ sizeof(pcp->vm_numa_stat_diff));
+ }
+#endif
+
for_each_online_pgdat(pgdat)
pgdat->per_cpu_nodestats =
alloc_percpu(struct per_cpu_nodestat);
@@ -6307,73 +6366,6 @@ void __meminit init_currently_empty_zone(struct zone *zone,
zone->initialized = 1;
}
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
-#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
-
-/*
- * Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
- */
-int __meminit __early_pfn_to_nid(unsigned long pfn,
- struct mminit_pfnnid_cache *state)
-{
- unsigned long start_pfn, end_pfn;
- int nid;
-
- if (state->last_start <= pfn && pfn < state->last_end)
- return state->last_nid;
-
- nid = memblock_search_pfn_nid(pfn, &start_pfn, &end_pfn);
- if (nid != NUMA_NO_NODE) {
- state->last_start = start_pfn;
- state->last_end = end_pfn;
- state->last_nid = nid;
- }
-
- return nid;
-}
-#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
-
-/**
- * free_bootmem_with_active_regions - Call memblock_free_early_nid for each active range
- * @nid: The node to free memory on. If MAX_NUMNODES, all nodes are freed.
- * @max_low_pfn: The highest PFN that will be passed to memblock_free_early_nid
- *
- * If an architecture guarantees that all ranges registered contain no holes
- * and may be freed, this this function may be used instead of calling
- * memblock_free_early_nid() manually.
- */
-void __init free_bootmem_with_active_regions(int nid, unsigned long max_low_pfn)
-{
- unsigned long start_pfn, end_pfn;
- int i, this_nid;
-
- for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid) {
- start_pfn = min(start_pfn, max_low_pfn);
- end_pfn = min(end_pfn, max_low_pfn);
-
- if (start_pfn < end_pfn)
- memblock_free_early_nid(PFN_PHYS(start_pfn),
- (end_pfn - start_pfn) << PAGE_SHIFT,
- this_nid);
- }
-}
-
-/**
- * sparse_memory_present_with_active_regions - Call memory_present for each active range
- * @nid: The node to call memory_present for. If MAX_NUMNODES, all nodes will be used.
- *
- * If an architecture guarantees that all ranges registered contain no holes and may
- * be freed, this function may be used instead of calling memory_present() manually.
- */
-void __init sparse_memory_present_with_active_regions(int nid)
-{
- unsigned long start_pfn, end_pfn;
- int i, this_nid;
-
- for_each_mem_pfn_range(i, nid, &start_pfn, &end_pfn, &this_nid)
- memory_present(this_nid, start_pfn, end_pfn);
-}
-
/**
* get_pfn_range_for_nid - Return the start and end page frames for a node
* @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned.
@@ -6470,8 +6462,7 @@ static unsigned long __init zone_spanned_pages_in_node(int nid,
unsigned long node_start_pfn,
unsigned long node_end_pfn,
unsigned long *zone_start_pfn,
- unsigned long *zone_end_pfn,
- unsigned long *ignored)
+ unsigned long *zone_end_pfn)
{
unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
@@ -6535,8 +6526,7 @@ unsigned long __init absent_pages_in_range(unsigned long start_pfn,
static unsigned long __init zone_absent_pages_in_node(int nid,
unsigned long zone_type,
unsigned long node_start_pfn,
- unsigned long node_end_pfn,
- unsigned long *ignored)
+ unsigned long node_end_pfn)
{
unsigned long zone_low = arch_zone_lowest_possible_pfn[zone_type];
unsigned long zone_high = arch_zone_highest_possible_pfn[zone_type];
@@ -6583,45 +6573,9 @@ static unsigned long __init zone_absent_pages_in_node(int nid,
return nr_absent;
}
-#else /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
-static inline unsigned long __init zone_spanned_pages_in_node(int nid,
- unsigned long zone_type,
- unsigned long node_start_pfn,
- unsigned long node_end_pfn,
- unsigned long *zone_start_pfn,
- unsigned long *zone_end_pfn,
- unsigned long *zones_size)
-{
- unsigned int zone;
-
- *zone_start_pfn = node_start_pfn;
- for (zone = 0; zone < zone_type; zone++)
- *zone_start_pfn += zones_size[zone];
-
- *zone_end_pfn = *zone_start_pfn + zones_size[zone_type];
-
- return zones_size[zone_type];
-}
-
-static inline unsigned long __init zone_absent_pages_in_node(int nid,
- unsigned long zone_type,
- unsigned long node_start_pfn,
- unsigned long node_end_pfn,
- unsigned long *zholes_size)
-{
- if (!zholes_size)
- return 0;
-
- return zholes_size[zone_type];
-}
-
-#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
-
static void __init calculate_node_totalpages(struct pglist_data *pgdat,
unsigned long node_start_pfn,
- unsigned long node_end_pfn,
- unsigned long *zones_size,
- unsigned long *zholes_size)
+ unsigned long node_end_pfn)
{
unsigned long realtotalpages = 0, totalpages = 0;
enum zone_type i;
@@ -6629,17 +6583,21 @@ static void __init calculate_node_totalpages(struct pglist_data *pgdat,
for (i = 0; i < MAX_NR_ZONES; i++) {
struct zone *zone = pgdat->node_zones + i;
unsigned long zone_start_pfn, zone_end_pfn;
+ unsigned long spanned, absent;
unsigned long size, real_size;
- size = zone_spanned_pages_in_node(pgdat->node_id, i,
- node_start_pfn,
- node_end_pfn,
- &zone_start_pfn,
- &zone_end_pfn,
- zones_size);
- real_size = size - zone_absent_pages_in_node(pgdat->node_id, i,
- node_start_pfn, node_end_pfn,
- zholes_size);
+ spanned = zone_spanned_pages_in_node(pgdat->node_id, i,
+ node_start_pfn,
+ node_end_pfn,
+ &zone_start_pfn,
+ &zone_end_pfn);
+ absent = zone_absent_pages_in_node(pgdat->node_id, i,
+ node_start_pfn,
+ node_end_pfn);
+
+ size = spanned;
+ real_size = size - absent;
+
if (size)
zone->zone_start_pfn = zone_start_pfn;
else
@@ -6939,10 +6897,8 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat)
*/
if (pgdat == NODE_DATA(0)) {
mem_map = NODE_DATA(0)->node_mem_map;
-#if defined(CONFIG_HAVE_MEMBLOCK_NODE_MAP) || defined(CONFIG_FLATMEM)
if (page_to_pfn(mem_map) != pgdat->node_start_pfn)
mem_map -= offset;
-#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
}
#endif
}
@@ -6959,30 +6915,25 @@ static inline void pgdat_set_deferred_range(pg_data_t *pgdat)
static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {}
#endif
-void __init free_area_init_node(int nid, unsigned long *zones_size,
- unsigned long node_start_pfn,
- unsigned long *zholes_size)
+static void __init free_area_init_node(int nid)
{
pg_data_t *pgdat = NODE_DATA(nid);
unsigned long start_pfn = 0;
unsigned long end_pfn = 0;
/* pg_data_t should be reset to zero when it's allocated */
- WARN_ON(pgdat->nr_zones || pgdat->kswapd_classzone_idx);
+ WARN_ON(pgdat->nr_zones || pgdat->kswapd_highest_zoneidx);
+
+ get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
pgdat->node_id = nid;
- pgdat->node_start_pfn = node_start_pfn;
+ pgdat->node_start_pfn = start_pfn;
pgdat->per_cpu_nodestats = NULL;
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
- get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
+
pr_info("Initmem setup node %d [mem %#018Lx-%#018Lx]\n", nid,
(u64)start_pfn << PAGE_SHIFT,
end_pfn ? ((u64)end_pfn << PAGE_SHIFT) - 1 : 0);
-#else
- start_pfn = node_start_pfn;
-#endif
- calculate_node_totalpages(pgdat, start_pfn, end_pfn,
- zones_size, zholes_size);
+ calculate_node_totalpages(pgdat, start_pfn, end_pfn);
alloc_node_mem_map(pgdat);
pgdat_set_deferred_range(pgdat);
@@ -6990,6 +6941,11 @@ void __init free_area_init_node(int nid, unsigned long *zones_size,
free_area_init_core(pgdat);
}
+void __init free_area_init_memoryless_node(int nid)
+{
+ free_area_init_node(nid);
+}
+
#if !defined(CONFIG_FLAT_NODE_MEM_MAP)
/*
* Initialize all valid struct pages in the range [spfn, epfn) and mark them
@@ -7073,8 +7029,6 @@ static inline void __init init_unavailable_mem(void)
}
#endif /* !CONFIG_FLAT_NODE_MEM_MAP */
-#ifdef CONFIG_HAVE_MEMBLOCK_NODE_MAP
-
#if MAX_NUMNODES > 1
/*
* Figure out the number of possible node ids.
@@ -7138,24 +7092,6 @@ unsigned long __init node_map_pfn_alignment(void)
return ~accl_mask + 1;
}
-/* Find the lowest pfn for a node */
-static unsigned long __init find_min_pfn_for_node(int nid)
-{
- unsigned long min_pfn = ULONG_MAX;
- unsigned long start_pfn;
- int i;
-
- for_each_mem_pfn_range(i, nid, &start_pfn, NULL, NULL)
- min_pfn = min(min_pfn, start_pfn);
-
- if (min_pfn == ULONG_MAX) {
- pr_warn("Could not find start_pfn for node %d\n", nid);
- return 0;
- }
-
- return min_pfn;
-}
-
/**
* find_min_pfn_with_active_regions - Find the minimum PFN registered
*
@@ -7164,7 +7100,7 @@ static unsigned long __init find_min_pfn_for_node(int nid)
*/
unsigned long __init find_min_pfn_with_active_regions(void)
{
- return find_min_pfn_for_node(MAX_NUMNODES);
+ return PHYS_PFN(memblock_start_of_DRAM());
}
/*
@@ -7217,7 +7153,7 @@ static void __init find_zone_movable_pfns_for_nodes(void)
if (!memblock_is_hotpluggable(r))
continue;
- nid = r->nid;
+ nid = memblock_get_region_node(r);
usable_startpfn = PFN_DOWN(r->base);
zone_movable_pfn[nid] = zone_movable_pfn[nid] ?
@@ -7238,7 +7174,7 @@ static void __init find_zone_movable_pfns_for_nodes(void)
if (memblock_is_mirror(r))
continue;
- nid = r->nid;
+ nid = memblock_get_region_node(r);
usable_startpfn = memblock_region_memory_base_pfn(r);
@@ -7253,7 +7189,7 @@ static void __init find_zone_movable_pfns_for_nodes(void)
}
if (mem_below_4gb_not_mirrored)
- pr_warn("This configuration results in unmirrored kernel memory.");
+ pr_warn("This configuration results in unmirrored kernel memory.\n");
goto out2;
}
@@ -7418,8 +7354,17 @@ static void check_for_memory(pg_data_t *pgdat, int nid)
}
}
+/*
+ * Some architectures, e.g. ARC, may have ZONE_HIGHMEM below ZONE_NORMAL. For
+ * such cases we allow max_zone_pfn to be sorted in descending order.
+ */
+bool __weak arch_has_descending_max_zone_pfns(void)
+{
+ return false;
+}
+
/**
- * free_area_init_nodes - Initialise all pg_data_t and zone data
+ * free_area_init - Initialise all pg_data_t and zone data
* @max_zone_pfn: an array of max PFNs for each zone
*
* This will call free_area_init_node() for each active node in the system.
@@ -7431,10 +7376,11 @@ static void check_for_memory(pg_data_t *pgdat, int nid)
* starts where the previous one ended. For example, ZONE_DMA32 starts
* at arch_max_dma_pfn.
*/
-void __init free_area_init_nodes(unsigned long *max_zone_pfn)
+void __init free_area_init(unsigned long *max_zone_pfn)
{
unsigned long start_pfn, end_pfn;
- int i, nid;
+ int i, nid, zone;
+ bool descending;
/* Record where the zone boundaries are */
memset(arch_zone_lowest_possible_pfn, 0,
@@ -7443,14 +7389,20 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
sizeof(arch_zone_highest_possible_pfn));
start_pfn = find_min_pfn_with_active_regions();
+ descending = arch_has_descending_max_zone_pfns();
for (i = 0; i < MAX_NR_ZONES; i++) {
- if (i == ZONE_MOVABLE)
+ if (descending)
+ zone = MAX_NR_ZONES - i - 1;
+ else
+ zone = i;
+
+ if (zone == ZONE_MOVABLE)
continue;
- end_pfn = max(max_zone_pfn[i], start_pfn);
- arch_zone_lowest_possible_pfn[i] = start_pfn;
- arch_zone_highest_possible_pfn[i] = end_pfn;
+ end_pfn = max(max_zone_pfn[zone], start_pfn);
+ arch_zone_lowest_possible_pfn[zone] = start_pfn;
+ arch_zone_highest_possible_pfn[zone] = end_pfn;
start_pfn = end_pfn;
}
@@ -7503,8 +7455,7 @@ void __init free_area_init_nodes(unsigned long *max_zone_pfn)
init_unavailable_mem();
for_each_online_node(nid) {
pg_data_t *pgdat = NODE_DATA(nid);
- free_area_init_node(nid, NULL,
- find_min_pfn_for_node(nid), NULL);
+ free_area_init_node(nid);
/* Any memory on that node */
if (pgdat->node_present_pages)
@@ -7569,8 +7520,6 @@ static int __init cmdline_parse_movablecore(char *p)
early_param("kernelcore", cmdline_parse_kernelcore);
early_param("movablecore", cmdline_parse_movablecore);
-#endif /* CONFIG_HAVE_MEMBLOCK_NODE_MAP */
-
void adjust_managed_page_count(struct page *page, long count)
{
atomic_long_add(count, &page_zone(page)->managed_pages);
@@ -7693,13 +7642,6 @@ void __init set_dma_reserve(unsigned long new_dma_reserve)
dma_reserve = new_dma_reserve;
}
-void __init free_area_init(unsigned long *zones_size)
-{
- init_unavailable_mem();
- free_area_init_node(0, zones_size,
- __pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
-}
-
static int page_alloc_cpu_dead(unsigned int cpu)
{
@@ -7817,9 +7759,10 @@ static void setup_per_zone_lowmem_reserve(void)
idx--;
lower_zone = pgdat->node_zones + idx;
- if (sysctl_lowmem_reserve_ratio[idx] < 1) {
- sysctl_lowmem_reserve_ratio[idx] = 0;
+ if (!sysctl_lowmem_reserve_ratio[idx] ||
+ !zone_managed_pages(lower_zone)) {
lower_zone->lowmem_reserve[j] = 0;
+ continue;
} else {
lower_zone->lowmem_reserve[j] =
managed_pages / sysctl_lowmem_reserve_ratio[idx];
@@ -7884,9 +7827,9 @@ static void __setup_per_zone_wmarks(void)
mult_frac(zone_managed_pages(zone),
watermark_scale_factor, 10000));
+ zone->watermark_boost = 0;
zone->_watermark[WMARK_LOW] = min_wmark_pages(zone) + tmp;
zone->_watermark[WMARK_HIGH] = min_wmark_pages(zone) + tmp * 2;
- zone->watermark_boost = 0;
spin_unlock_irqrestore(&zone->lock, flags);
}
@@ -7915,7 +7858,7 @@ void setup_per_zone_wmarks(void)
* Initialise min_free_kbytes.
*
* For small machines we want it small (128k min). For large machines
- * we want it large (64MB max). But it is not linear, because network
+ * we want it large (256MB max). But it is not linear, because network
* bandwidth does not increase linearly with machine size. We use
*
* min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
@@ -7962,9 +7905,11 @@ int __meminit init_per_zone_wmark_min(void)
setup_min_slab_ratio();
#endif
+ khugepaged_min_free_kbytes_update();
+
return 0;
}
-core_initcall(init_per_zone_wmark_min)
+postcore_initcall(init_per_zone_wmark_min)
/*
* min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
@@ -7972,7 +7917,7 @@ core_initcall(init_per_zone_wmark_min)
* changes.
*/
int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos)
+ void *buffer, size_t *length, loff_t *ppos)
{
int rc;
@@ -7987,20 +7932,8 @@ int min_free_kbytes_sysctl_handler(struct ctl_table *table, int write,
return 0;
}
-int watermark_boost_factor_sysctl_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos)
-{
- int rc;
-
- rc = proc_dointvec_minmax(table, write, buffer, length, ppos);
- if (rc)
- return rc;
-
- return 0;
-}
-
int watermark_scale_factor_sysctl_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos)
+ void *buffer, size_t *length, loff_t *ppos)
{
int rc;
@@ -8030,7 +7963,7 @@ static void setup_min_unmapped_ratio(void)
int sysctl_min_unmapped_ratio_sysctl_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos)
+ void *buffer, size_t *length, loff_t *ppos)
{
int rc;
@@ -8057,7 +7990,7 @@ static void setup_min_slab_ratio(void)
}
int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos)
+ void *buffer, size_t *length, loff_t *ppos)
{
int rc;
@@ -8081,9 +8014,17 @@ int sysctl_min_slab_ratio_sysctl_handler(struct ctl_table *table, int write,
* if in function of the boot time zone sizes.
*/
int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos)
+ void *buffer, size_t *length, loff_t *ppos)
{
+ int i;
+
proc_dointvec_minmax(table, write, buffer, length, ppos);
+
+ for (i = 0; i < MAX_NR_ZONES; i++) {
+ if (sysctl_lowmem_reserve_ratio[i] < 1)
+ sysctl_lowmem_reserve_ratio[i] = 0;
+ }
+
setup_per_zone_lowmem_reserve();
return 0;
}
@@ -8103,7 +8044,7 @@ static void __zone_pcp_update(struct zone *zone)
* pagelist can have before it gets flushed back to buddy allocator.
*/
int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos)
+ void *buffer, size_t *length, loff_t *ppos)
{
struct zone *zone;
int old_percpu_pagelist_fraction;
@@ -8247,7 +8188,7 @@ void *__init alloc_large_system_hash(const char *tablename,
table = memblock_alloc_raw(size,
SMP_CACHE_BYTES);
} else if (get_order(size) >= MAX_ORDER || hashdist) {
- table = __vmalloc(size, gfp_flags, PAGE_KERNEL);
+ table = __vmalloc(size, gfp_flags);
virt = true;
} else {
/*
@@ -8284,7 +8225,7 @@ void *__init alloc_large_system_hash(const char *tablename,
* race condition. So you can't expect this function should be exact.
*
* Returns a page without holding a reference. If the caller wants to
- * dereference that page (e.g., dumping), it has to make sure that that it
+ * dereference that page (e.g., dumping), it has to make sure that it
* cannot get removed (e.g., via memory unplug) concurrently.
*
*/
@@ -8372,6 +8313,19 @@ struct page *has_unmovable_pages(struct zone *zone, struct page *page,
if ((flags & MEMORY_OFFLINE) && PageHWPoison(page))
continue;
+ /*
+ * We treat all PageOffline() pages as movable when offlining
+ * to give drivers a chance to decrement their reference count
+ * in MEM_GOING_OFFLINE in order to indicate that these pages
+ * can be offlined as there are no direct references anymore.
+ * For actually unmovable PageOffline() where the driver does
+ * not support this, we will fail later when trying to actually
+ * move these pages that still have a reference count > 0.
+ * (false negatives in this function only)
+ */
+ if ((flags & MEMORY_OFFLINE) && PageOffline(page))
+ continue;
+
if (__PageMovable(page) || PageLRU(page))
continue;
@@ -8411,10 +8365,14 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
unsigned long start, unsigned long end)
{
/* This function is based on compact_zone() from compaction.c. */
- unsigned long nr_reclaimed;
+ unsigned int nr_reclaimed;
unsigned long pfn = start;
unsigned int tries = 0;
int ret = 0;
+ struct migration_target_control mtc = {
+ .nid = zone_to_nid(cc->zone),
+ .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL,
+ };
migrate_prep();
@@ -8441,8 +8399,8 @@ static int __alloc_contig_migrate_range(struct compact_control *cc,
&cc->migratepages);
cc->nr_migratepages -= nr_reclaimed;
- ret = migrate_pages(&cc->migratepages, alloc_migrate_target,
- NULL, 0, cc->mode, MR_CONTIG_RANGE);
+ ret = migrate_pages(&cc->migratepages, alloc_migration_target,
+ NULL, (unsigned long)&mtc, cc->mode, MR_CONTIG_RANGE);
}
if (ret < 0) {
putback_movable_pages(&cc->migratepages);
@@ -8603,6 +8561,7 @@ done:
pfn_max_align_up(end), migratetype);
return ret;
}
+EXPORT_SYMBOL(alloc_contig_range);
static int __alloc_contig_pages(unsigned long start_pfn,
unsigned long nr_pages, gfp_t gfp_mask)
@@ -8718,6 +8677,7 @@ void free_contig_range(unsigned long pfn, unsigned int nr_pages)
}
WARN(count != 0, "%d pages are still in use!\n", count);
}
+EXPORT_SYMBOL(free_contig_range);
/*
* The zone indicated has a new number of managed_pages; batch sizes and percpu
@@ -8790,6 +8750,17 @@ __offline_isolated_pages(unsigned long start_pfn, unsigned long end_pfn)
offlined_pages++;
continue;
}
+ /*
+ * At this point all remaining PageOffline() pages have a
+ * reference count of 0 and can simply be skipped.
+ */
+ if (PageOffline(page)) {
+ BUG_ON(page_count(page));
+ BUG_ON(PageBuddy(page));
+ pfn++;
+ offlined_pages++;
+ continue;
+ }
BUG_ON(page_count(page));
BUG_ON(!PageBuddy(page));
diff --git a/mm/page_counter.c b/mm/page_counter.c
index c56db2d5e159..afe22ad335cc 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -72,13 +72,13 @@ void page_counter_charge(struct page_counter *counter, unsigned long nr_pages)
long new;
new = atomic_long_add_return(nr_pages, &c->usage);
- propagate_protected_usage(counter, new);
+ propagate_protected_usage(c, new);
/*
* This is indeed racy, but we can live with some
* inaccuracy in the watermark.
*/
- if (new > c->watermark)
- c->watermark = new;
+ if (new > READ_ONCE(c->watermark))
+ WRITE_ONCE(c->watermark, new);
}
}
@@ -116,22 +116,23 @@ bool page_counter_try_charge(struct page_counter *counter,
new = atomic_long_add_return(nr_pages, &c->usage);
if (new > c->max) {
atomic_long_sub(nr_pages, &c->usage);
- propagate_protected_usage(counter, new);
+ propagate_protected_usage(c, new);
/*
* This is racy, but we can live with some
- * inaccuracy in the failcnt.
+ * inaccuracy in the failcnt which is only used
+ * to report stats.
*/
- c->failcnt++;
+ data_race(c->failcnt++);
*fail = c;
goto failed;
}
- propagate_protected_usage(counter, new);
+ propagate_protected_usage(c, new);
/*
* Just like with failcnt, we can live with some
* inaccuracy in the watermark.
*/
- if (new > c->watermark)
- c->watermark = new;
+ if (new > READ_ONCE(c->watermark))
+ WRITE_ONCE(c->watermark, new);
}
return true;
diff --git a/mm/page_idle.c b/mm/page_idle.c
index 295512465065..057c61df12db 100644
--- a/mm/page_idle.c
+++ b/mm/page_idle.c
@@ -4,6 +4,7 @@
#include <linux/fs.h>
#include <linux/sysfs.h>
#include <linux/kobject.h>
+#include <linux/memory_hotplug.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/pagemap.h>
@@ -30,13 +31,9 @@
*/
static struct page *page_idle_get_page(unsigned long pfn)
{
- struct page *page;
+ struct page *page = pfn_to_online_page(pfn);
pg_data_t *pgdat;
- if (!pfn_valid(pfn))
- return NULL;
-
- page = pfn_to_page(pfn);
if (!page || !PageLRU(page) ||
!get_page_unless_zero(page))
return NULL;
diff --git a/mm/page_io.c b/mm/page_io.c
index 76965be1d40e..e485a6e8a6cd 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -25,7 +25,6 @@
#include <linux/psi.h>
#include <linux/uio.h>
#include <linux/sched/task.h>
-#include <asm/pgtable.h>
static struct bio *get_swap_bio(gfp_t gfp_flags,
struct page *page, bio_end_io_t end_io)
@@ -41,7 +40,7 @@ static struct bio *get_swap_bio(gfp_t gfp_flags,
bio->bi_iter.bi_sector <<= PAGE_SHIFT - 9;
bio->bi_end_io = end_io;
- bio_add_page(bio, page, PAGE_SIZE * hpage_nr_pages(page), 0);
+ bio_add_page(bio, page, thp_size(page), 0);
}
return bio;
}
@@ -86,7 +85,7 @@ static void swap_slot_free_notify(struct page *page)
return;
sis = page_swap_info(page);
- if (!(sis->flags & SWP_BLKDEV))
+ if (data_race(!(sis->flags & SWP_BLKDEV)))
return;
/*
@@ -275,9 +274,26 @@ static inline void count_swpout_vm_event(struct page *page)
if (unlikely(PageTransHuge(page)))
count_vm_event(THP_SWPOUT);
#endif
- count_vm_events(PSWPOUT, hpage_nr_pages(page));
+ count_vm_events(PSWPOUT, thp_nr_pages(page));
}
+#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
+static void bio_associate_blkg_from_page(struct bio *bio, struct page *page)
+{
+ struct cgroup_subsys_state *css;
+
+ if (!page->mem_cgroup)
+ return;
+
+ rcu_read_lock();
+ css = cgroup_e_css(page->mem_cgroup->css.cgroup, &io_cgrp_subsys);
+ bio_associate_blkg_from_css(bio, css);
+ rcu_read_unlock();
+}
+#else
+#define bio_associate_blkg_from_page(bio, page) do { } while (0)
+#endif /* CONFIG_MEMCG && CONFIG_BLK_CGROUP */
+
int __swap_writepage(struct page *page, struct writeback_control *wbc,
bio_end_io_t end_write_func)
{
@@ -286,7 +302,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
struct swap_info_struct *sis = page_swap_info(page);
VM_BUG_ON_PAGE(!PageSwapCache(page), page);
- if (sis->flags & SWP_FS) {
+ if (data_race(sis->flags & SWP_FS)) {
struct kiocb kiocb;
struct file *swap_file = sis->swap_file;
struct address_space *mapping = swap_file->f_mapping;
@@ -377,7 +393,7 @@ int swap_readpage(struct page *page, bool synchronous)
goto out;
}
- if (sis->flags & SWP_FS) {
+ if (data_race(sis->flags & SWP_FS)) {
struct file *swap_file = sis->swap_file;
struct address_space *mapping = swap_file->f_mapping;
@@ -425,7 +441,7 @@ int swap_readpage(struct page *page, bool synchronous)
break;
if (!blk_poll(disk->queue, qc, true))
- io_schedule();
+ blk_io_schedule();
}
__set_current_state(TASK_RUNNING);
bio_put(bio);
@@ -439,7 +455,7 @@ int swap_set_page_dirty(struct page *page)
{
struct swap_info_struct *sis = page_swap_info(page);
- if (sis->flags & SWP_FS) {
+ if (data_race(sis->flags & SWP_FS)) {
struct address_space *mapping = sis->swap_file->f_mapping;
VM_BUG_ON_PAGE(!PageSwapCache(page), page);
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index 2c11a38d6e87..63a3db10a8c0 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -151,6 +151,7 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
* a bit mask)
* MEMORY_OFFLINE - isolate to offline (!allocate) memory
* e.g., skip over PageHWPoison() pages
+ * and PageOffline() pages.
* REPORT_FAILURE - report details about the failure to
* isolate the range
*
@@ -169,6 +170,14 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
* pageblocks we may have modified and return -EBUSY to caller. This
* prevents two threads from simultaneously working on overlapping ranges.
*
+ * Please note that there is no strong synchronization with the page allocator
+ * either. Pages might be freed while their page blocks are marked ISOLATED.
+ * In some cases pages might still end up on pcp lists and that would allow
+ * for their allocation even when they are in fact isolated already. Depending
+ * on how strong of a guarantee the caller needs drain_all_pages might be needed
+ * (e.g. __offline_pages will need to call it after check for isolated range for
+ * a next retry).
+ *
* Return: the number of isolated pageblocks on success and -EBUSY if any part
* of range cannot be isolated.
*/
@@ -259,6 +268,14 @@ __test_page_isolated_in_pageblock(unsigned long pfn, unsigned long end_pfn,
else if ((flags & MEMORY_OFFLINE) && PageHWPoison(page))
/* A HWPoisoned page cannot be also PageBuddy */
pfn++;
+ else if ((flags & MEMORY_OFFLINE) && PageOffline(page) &&
+ !page_count(page))
+ /*
+ * The responsible driver agreed to skip PageOffline()
+ * pages when offlining memory by dropping its
+ * reference in MEM_GOING_OFFLINE.
+ */
+ pfn++;
else
break;
}
@@ -297,8 +314,3 @@ int test_pages_isolated(unsigned long start_pfn, unsigned long end_pfn,
return pfn < end_pfn ? -EBUSY : 0;
}
-
-struct page *alloc_migrate_target(struct page *page, unsigned long private)
-{
- return new_page_nodemask(page, numa_node_id(), &node_states[N_MEMORY]);
-}
diff --git a/mm/page_owner.c b/mm/page_owner.c
index 18ecde9f45b2..360461509423 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -312,8 +312,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m,
continue;
page_owner = get_page_owner(page_ext);
- page_mt = gfpflags_to_migratetype(
- page_owner->gfp_mask);
+ page_mt = gfp_migratetype(page_owner->gfp_mask);
if (pageblock_mt != page_mt) {
if (is_migrate_cma(pageblock_mt))
count[MIGRATE_MOVABLE]++;
@@ -359,7 +358,7 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
/* Print information relevant to grouping pages by mobility */
pageblock_mt = get_pageblock_migratetype(page);
- page_mt = gfpflags_to_migratetype(page_owner->gfp_mask);
+ page_mt = gfp_migratetype(page_owner->gfp_mask);
ret += snprintf(kbuf + ret, count - ret,
"PFN %lu type %s Block %lu type %s Flags %#lx(%pGp)\n",
pfn,
@@ -416,7 +415,7 @@ void __dump_page_owner(struct page *page)
page_owner = get_page_owner(page_ext);
gfp_mask = page_owner->gfp_mask;
- mt = gfpflags_to_migratetype(gfp_mask);
+ mt = gfp_migratetype(gfp_mask);
if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) {
pr_alert("page_owner info is not present (never set?)\n");
diff --git a/mm/page_reporting.h b/mm/page_reporting.h
index aa6d37f4dc22..2c385dd4ddbd 100644
--- a/mm/page_reporting.h
+++ b/mm/page_reporting.h
@@ -7,7 +7,7 @@
#include <linux/page-isolation.h>
#include <linux/jump_label.h>
#include <linux/slab.h>
-#include <asm/pgtable.h>
+#include <linux/pgtable.h>
#include <linux/scatterlist.h>
#define PAGE_REPORTING_MIN_ORDER pageblock_order
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index 719c35246cfa..5e77b269c330 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -61,7 +61,7 @@ static inline bool pfn_is_match(struct page *page, unsigned long pfn)
return page_pfn == pfn;
/* THP can be referenced by any subpage */
- return pfn >= page_pfn && pfn - page_pfn < hpage_nr_pages(page);
+ return pfn >= page_pfn && pfn - page_pfn < thp_nr_pages(page);
}
/**
@@ -227,7 +227,7 @@ next_pte:
if (pvmw->address >= pvmw->vma->vm_end ||
pvmw->address >=
__vma_address(pvmw->page, pvmw->vma) +
- hpage_nr_pages(pvmw->page) * PAGE_SIZE)
+ thp_size(pvmw->page))
return not_found(pvmw);
/* Did we cross page table boundary? */
if (pvmw->address % PMD_SIZE == 0) {
@@ -268,7 +268,7 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma)
unsigned long start, end;
start = __vma_address(page, vma);
- end = start + PAGE_SIZE * (hpage_nr_pages(page) - 1);
+ end = start + thp_size(page) - PAGE_SIZE;
if (unlikely(end < vma->vm_start || start >= vma->vm_end))
return 0;
diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 928df1638c30..e81640d9f177 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -373,7 +373,7 @@ static int __walk_page_range(unsigned long start, unsigned long end,
* caller-specific data to callbacks, @private should be helpful.
*
* Locking:
- * Callers of walk_page_range() and walk_page_vma() should hold @mm->mmap_sem,
+ * Callers of walk_page_range() and walk_page_vma() should hold @mm->mmap_lock,
* because these function traverse vma list and/or access to vma's data.
*/
int walk_page_range(struct mm_struct *mm, unsigned long start,
@@ -395,7 +395,7 @@ int walk_page_range(struct mm_struct *mm, unsigned long start,
if (!walk.mm)
return -EINVAL;
- lockdep_assert_held(&walk.mm->mmap_sem);
+ mmap_assert_locked(walk.mm);
vma = find_vma(walk.mm, start);
do {
@@ -453,7 +453,7 @@ int walk_page_range_novma(struct mm_struct *mm, unsigned long start,
if (start >= end || !walk.mm)
return -EINVAL;
- lockdep_assert_held(&walk.mm->mmap_sem);
+ mmap_assert_locked(walk.mm);
return __walk_page_range(start, end, &walk);
}
@@ -472,7 +472,7 @@ int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
if (!walk.mm)
return -EINVAL;
- lockdep_assert_held(&walk.mm->mmap_sem);
+ mmap_assert_locked(walk.mm);
err = walk_page_test(vma->vm_start, vma->vm_end, &walk);
if (err > 0)
@@ -498,11 +498,11 @@ int walk_page_vma(struct vm_area_struct *vma, const struct mm_walk_ops *ops,
* Also see walk_page_range() for additional information.
*
* Locking:
- * This function can't require that the struct mm_struct::mmap_sem is held,
+ * This function can't require that the struct mm_struct::mmap_lock is held,
* since @mapping may be mapped by multiple processes. Instead
* @mapping->i_mmap_rwsem must be held. This might have implications in the
* callbacks, and it's up tho the caller to ensure that the
- * struct mm_struct::mmap_sem is not needed.
+ * struct mm_struct::mmap_lock is not needed.
*
* Also this means that a caller can't rely on the struct
* vm_area_struct::vm_flags to be constant across a call,
diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h
index 0468ba500bd4..18b768ac7dca 100644
--- a/mm/percpu-internal.h
+++ b/mm/percpu-internal.h
@@ -6,6 +6,25 @@
#include <linux/percpu.h>
/*
+ * There are two chunk types: root and memcg-aware.
+ * Chunks of each type have their own separate list of slots.
+ *
+ * Memcg-aware chunks have an attached vector of obj_cgroup pointers, which is
+ * used to store memcg membership data of a percpu object. Obj_cgroups are
+ * ref-counted pointers to a memory cgroup with an ability to switch dynamically
+ * to the parent memory cgroup. This allows reclaiming a deleted memory cgroup
+ * without reclaiming all outstanding objects that hold a reference to it.
+ */
+enum pcpu_chunk_type {
+ PCPU_CHUNK_ROOT,
+#ifdef CONFIG_MEMCG_KMEM
+ PCPU_CHUNK_MEMCG,
+#endif
+ PCPU_NR_CHUNK_TYPES,
+ PCPU_FAIL_ALLOC = PCPU_NR_CHUNK_TYPES
+};
+
+/*
* pcpu_block_md is the metadata block struct.
* Each chunk's bitmap is split into a number of full blocks.
* All units are in terms of bits.
@@ -54,6 +73,9 @@ struct pcpu_chunk {
int end_offset; /* additional area required to
have the region end page
aligned */
+#ifdef CONFIG_MEMCG_KMEM
+ struct obj_cgroup **obj_cgroups; /* vector of object cgroups */
+#endif
int nr_pages; /* # of pages served by this chunk */
int nr_populated; /* # of populated pages */
@@ -63,7 +85,7 @@ struct pcpu_chunk {
extern spinlock_t pcpu_lock;
-extern struct list_head *pcpu_slot;
+extern struct list_head *pcpu_chunk_lists;
extern int pcpu_nr_slots;
extern int pcpu_nr_empty_pop_pages;
@@ -106,6 +128,37 @@ static inline int pcpu_chunk_map_bits(struct pcpu_chunk *chunk)
return pcpu_nr_pages_to_map_bits(chunk->nr_pages);
}
+#ifdef CONFIG_MEMCG_KMEM
+static inline enum pcpu_chunk_type pcpu_chunk_type(struct pcpu_chunk *chunk)
+{
+ if (chunk->obj_cgroups)
+ return PCPU_CHUNK_MEMCG;
+ return PCPU_CHUNK_ROOT;
+}
+
+static inline bool pcpu_is_memcg_chunk(enum pcpu_chunk_type chunk_type)
+{
+ return chunk_type == PCPU_CHUNK_MEMCG;
+}
+
+#else
+static inline enum pcpu_chunk_type pcpu_chunk_type(struct pcpu_chunk *chunk)
+{
+ return PCPU_CHUNK_ROOT;
+}
+
+static inline bool pcpu_is_memcg_chunk(enum pcpu_chunk_type chunk_type)
+{
+ return false;
+}
+#endif
+
+static inline struct list_head *pcpu_chunk_list(enum pcpu_chunk_type chunk_type)
+{
+ return &pcpu_chunk_lists[pcpu_nr_slots *
+ pcpu_is_memcg_chunk(chunk_type)];
+}
+
#ifdef CONFIG_PERCPU_STATS
#include <linux/spinlock.h>
diff --git a/mm/percpu-km.c b/mm/percpu-km.c
index 20d2b69a13b0..35c9941077ee 100644
--- a/mm/percpu-km.c
+++ b/mm/percpu-km.c
@@ -44,7 +44,8 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
/* nada */
}
-static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp)
+static struct pcpu_chunk *pcpu_create_chunk(enum pcpu_chunk_type type,
+ gfp_t gfp)
{
const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT;
struct pcpu_chunk *chunk;
@@ -52,7 +53,7 @@ static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp)
unsigned long flags;
int i;
- chunk = pcpu_alloc_chunk(gfp);
+ chunk = pcpu_alloc_chunk(type, gfp);
if (!chunk)
return NULL;
diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c
index 32558063c3f9..c8400a2adbc2 100644
--- a/mm/percpu-stats.c
+++ b/mm/percpu-stats.c
@@ -34,11 +34,15 @@ static int find_max_nr_alloc(void)
{
struct pcpu_chunk *chunk;
int slot, max_nr_alloc;
+ enum pcpu_chunk_type type;
max_nr_alloc = 0;
- for (slot = 0; slot < pcpu_nr_slots; slot++)
- list_for_each_entry(chunk, &pcpu_slot[slot], list)
- max_nr_alloc = max(max_nr_alloc, chunk->nr_alloc);
+ for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++)
+ for (slot = 0; slot < pcpu_nr_slots; slot++)
+ list_for_each_entry(chunk, &pcpu_chunk_list(type)[slot],
+ list)
+ max_nr_alloc = max(max_nr_alloc,
+ chunk->nr_alloc);
return max_nr_alloc;
}
@@ -129,6 +133,9 @@ static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk,
P("cur_min_alloc", cur_min_alloc);
P("cur_med_alloc", cur_med_alloc);
P("cur_max_alloc", cur_max_alloc);
+#ifdef CONFIG_MEMCG_KMEM
+ P("memcg_aware", pcpu_is_memcg_chunk(pcpu_chunk_type(chunk)));
+#endif
seq_putc(m, '\n');
}
@@ -137,6 +144,7 @@ static int percpu_stats_show(struct seq_file *m, void *v)
struct pcpu_chunk *chunk;
int slot, max_nr_alloc;
int *buffer;
+ enum pcpu_chunk_type type;
alloc_buffer:
spin_lock_irq(&pcpu_lock);
@@ -202,18 +210,18 @@ alloc_buffer:
chunk_map_stats(m, pcpu_reserved_chunk, buffer);
}
- for (slot = 0; slot < pcpu_nr_slots; slot++) {
- list_for_each_entry(chunk, &pcpu_slot[slot], list) {
- if (chunk == pcpu_first_chunk) {
- seq_puts(m, "Chunk: <- First Chunk\n");
- chunk_map_stats(m, chunk, buffer);
-
-
- } else {
- seq_puts(m, "Chunk:\n");
- chunk_map_stats(m, chunk, buffer);
+ for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++) {
+ for (slot = 0; slot < pcpu_nr_slots; slot++) {
+ list_for_each_entry(chunk, &pcpu_chunk_list(type)[slot],
+ list) {
+ if (chunk == pcpu_first_chunk) {
+ seq_puts(m, "Chunk: <- First Chunk\n");
+ chunk_map_stats(m, chunk, buffer);
+ } else {
+ seq_puts(m, "Chunk:\n");
+ chunk_map_stats(m, chunk, buffer);
+ }
}
-
}
}
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index a2b395acef89..e46f7a6917f9 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -328,12 +328,13 @@ static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
pcpu_free_pages(chunk, pages, page_start, page_end);
}
-static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp)
+static struct pcpu_chunk *pcpu_create_chunk(enum pcpu_chunk_type type,
+ gfp_t gfp)
{
struct pcpu_chunk *chunk;
struct vm_struct **vms;
- chunk = pcpu_alloc_chunk(gfp);
+ chunk = pcpu_alloc_chunk(type, gfp);
if (!chunk)
return NULL;
diff --git a/mm/percpu.c b/mm/percpu.c
index 7da7d7737dab..1ed1a349eab8 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -37,9 +37,14 @@
* takes care of normal allocations.
*
* The allocator organizes chunks into lists according to free size and
- * tries to allocate from the fullest chunk first. Each chunk is managed
- * by a bitmap with metadata blocks. The allocation map is updated on
- * every allocation and free to reflect the current state while the boundary
+ * memcg-awareness. To make a percpu allocation memcg-aware the __GFP_ACCOUNT
+ * flag should be passed. All memcg-aware allocations are sharing one set
+ * of chunks and all unaccounted allocations and allocations performed
+ * by processes belonging to the root memory cgroup are using the second set.
+ *
+ * The allocator tries to allocate from the fullest chunk first. Each chunk
+ * is managed by a bitmap with metadata blocks. The allocation map is updated
+ * on every allocation and free to reflect the current state while the boundary
* map is only updated on allocation. Each metadata block contains
* information to help mitigate the need to iterate over large portions
* of the bitmap. The reverse mapping from page to chunk is stored in
@@ -81,6 +86,7 @@
#include <linux/kmemleak.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
+#include <linux/memcontrol.h>
#include <asm/cacheflush.h>
#include <asm/sections.h>
@@ -160,7 +166,7 @@ struct pcpu_chunk *pcpu_reserved_chunk __ro_after_init;
DEFINE_SPINLOCK(pcpu_lock); /* all internal data structures */
static DEFINE_MUTEX(pcpu_alloc_mutex); /* chunk create/destroy, [de]pop, map ext */
-struct list_head *pcpu_slot __ro_after_init; /* chunk list slots */
+struct list_head *pcpu_chunk_lists __ro_after_init; /* chunk list slots */
/* chunks which need their map areas extended, protected by pcpu_lock */
static LIST_HEAD(pcpu_map_extend_chunks);
@@ -482,7 +488,7 @@ static void *pcpu_mem_zalloc(size_t size, gfp_t gfp)
if (size <= PAGE_SIZE)
return kzalloc(size, gfp);
else
- return __vmalloc(size, gfp | __GFP_ZERO, PAGE_KERNEL);
+ return __vmalloc(size, gfp | __GFP_ZERO);
}
/**
@@ -500,6 +506,9 @@ static void __pcpu_chunk_move(struct pcpu_chunk *chunk, int slot,
bool move_front)
{
if (chunk != pcpu_reserved_chunk) {
+ struct list_head *pcpu_slot;
+
+ pcpu_slot = pcpu_chunk_list(pcpu_chunk_type(chunk));
if (move_front)
list_move(&chunk->list, &pcpu_slot[slot]);
else
@@ -1211,11 +1220,14 @@ static int pcpu_alloc_area(struct pcpu_chunk *chunk, int alloc_bits,
*
* This function determines the size of an allocation to free using
* the boundary bitmap and clears the allocation map.
+ *
+ * RETURNS:
+ * Number of freed bytes.
*/
-static void pcpu_free_area(struct pcpu_chunk *chunk, int off)
+static int pcpu_free_area(struct pcpu_chunk *chunk, int off)
{
struct pcpu_block_md *chunk_md = &chunk->chunk_md;
- int bit_off, bits, end, oslot;
+ int bit_off, bits, end, oslot, freed;
lockdep_assert_held(&pcpu_lock);
pcpu_stats_area_dealloc(chunk);
@@ -1230,8 +1242,10 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int off)
bits = end - bit_off;
bitmap_clear(chunk->alloc_map, bit_off, bits);
+ freed = bits * PCPU_MIN_ALLOC_SIZE;
+
/* update metadata */
- chunk->free_bytes += bits * PCPU_MIN_ALLOC_SIZE;
+ chunk->free_bytes += freed;
/* update first free bit */
chunk_md->first_free = min(chunk_md->first_free, bit_off);
@@ -1239,6 +1253,8 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int off)
pcpu_block_update_hint_free(chunk, bit_off, bits);
pcpu_chunk_relocate(chunk, oslot);
+
+ return freed;
}
static void pcpu_init_md_block(struct pcpu_block_md *block, int nr_bits)
@@ -1300,7 +1316,7 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
/* allocate chunk */
alloc_size = sizeof(struct pcpu_chunk) +
- BITS_TO_LONGS(region_size >> PAGE_SHIFT);
+ BITS_TO_LONGS(region_size >> PAGE_SHIFT) * sizeof(unsigned long);
chunk = memblock_alloc(alloc_size, SMP_CACHE_BYTES);
if (!chunk)
panic("%s: Failed to allocate %zu bytes\n", __func__,
@@ -1334,6 +1350,10 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
panic("%s: Failed to allocate %zu bytes\n", __func__,
alloc_size);
+#ifdef CONFIG_MEMCG_KMEM
+ /* first chunk isn't memcg-aware */
+ chunk->obj_cgroups = NULL;
+#endif
pcpu_init_md_blocks(chunk);
/* manage populated page bitmap */
@@ -1373,7 +1393,7 @@ static struct pcpu_chunk * __init pcpu_alloc_first_chunk(unsigned long tmp_addr,
return chunk;
}
-static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp)
+static struct pcpu_chunk *pcpu_alloc_chunk(enum pcpu_chunk_type type, gfp_t gfp)
{
struct pcpu_chunk *chunk;
int region_bits;
@@ -1401,6 +1421,16 @@ static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp)
if (!chunk->md_blocks)
goto md_blocks_fail;
+#ifdef CONFIG_MEMCG_KMEM
+ if (pcpu_is_memcg_chunk(type)) {
+ chunk->obj_cgroups =
+ pcpu_mem_zalloc(pcpu_chunk_map_bits(chunk) *
+ sizeof(struct obj_cgroup *), gfp);
+ if (!chunk->obj_cgroups)
+ goto objcg_fail;
+ }
+#endif
+
pcpu_init_md_blocks(chunk);
/* init metadata */
@@ -1408,6 +1438,10 @@ static struct pcpu_chunk *pcpu_alloc_chunk(gfp_t gfp)
return chunk;
+#ifdef CONFIG_MEMCG_KMEM
+objcg_fail:
+ pcpu_mem_free(chunk->md_blocks);
+#endif
md_blocks_fail:
pcpu_mem_free(chunk->bound_map);
bound_map_fail:
@@ -1422,6 +1456,9 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk)
{
if (!chunk)
return;
+#ifdef CONFIG_MEMCG_KMEM
+ pcpu_mem_free(chunk->obj_cgroups);
+#endif
pcpu_mem_free(chunk->md_blocks);
pcpu_mem_free(chunk->bound_map);
pcpu_mem_free(chunk->alloc_map);
@@ -1498,7 +1535,8 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk,
int page_start, int page_end, gfp_t gfp);
static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk,
int page_start, int page_end);
-static struct pcpu_chunk *pcpu_create_chunk(gfp_t gfp);
+static struct pcpu_chunk *pcpu_create_chunk(enum pcpu_chunk_type type,
+ gfp_t gfp);
static void pcpu_destroy_chunk(struct pcpu_chunk *chunk);
static struct page *pcpu_addr_to_page(void *addr);
static int __init pcpu_verify_alloc_info(const struct pcpu_alloc_info *ai);
@@ -1540,6 +1578,87 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
return pcpu_get_page_chunk(pcpu_addr_to_page(addr));
}
+#ifdef CONFIG_MEMCG_KMEM
+static enum pcpu_chunk_type pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp,
+ struct obj_cgroup **objcgp)
+{ /* choose the chunk type for an allocation of @size bytes and pre-charge it */
+ struct obj_cgroup *objcg;
+
+ if (!memcg_kmem_enabled() || !(gfp & __GFP_ACCOUNT) ||
+ memcg_kmem_bypass())
+ return PCPU_CHUNK_ROOT; /* accounting disabled, not requested or bypassed: *objcgp is left untouched */
+
+ objcg = get_obj_cgroup_from_current();
+ if (!objcg)
+ return PCPU_CHUNK_ROOT; /* no objcg to charge: *objcgp is left untouched */
+
+ if (obj_cgroup_charge(objcg, gfp, size * num_possible_cpus())) { /* charge @size once per possible CPU */
+ obj_cgroup_put(objcg); /* presumably drops the reference obtained above */
+ return PCPU_FAIL_ALLOC; /* pcpu_alloc() returns NULL on this value */
+ }
+
+ *objcgp = objcg; /* charged objcg goes to the caller; the post-alloc hook commits or undoes it */
+ return PCPU_CHUNK_MEMCG;
+}
+
+static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
+ struct pcpu_chunk *chunk, int off,
+ size_t size)
+{ /* settle the charge made by the pre-alloc hook: keep it if @chunk is set, undo it if NULL */
+ if (!objcg)
+ return; /* pre-alloc hook charged nothing */
+
+ if (chunk) {
+ chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = objcg; /* remember the owner; the free hook reads this slot back */
+
+ rcu_read_lock();
+ mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B,
+ size * num_possible_cpus()); /* same byte count that was charged */
+ rcu_read_unlock();
+ } else { /* failure paths in pcpu_alloc() pass chunk == NULL */
+ obj_cgroup_uncharge(objcg, size * num_possible_cpus());
+ obj_cgroup_put(objcg);
+ }
+}
+
+static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
+{ /* reverse the accounting of the area at @off in @chunk when it is freed */
+ struct obj_cgroup *objcg;
+
+ if (!pcpu_is_memcg_chunk(pcpu_chunk_type(chunk)))
+ return; /* only memcg chunks get an obj_cgroups[] array in pcpu_alloc_chunk() */
+
+ objcg = chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT]; /* owner stored by the post-alloc hook */
+ chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = NULL;
+
+ obj_cgroup_uncharge(objcg, size * num_possible_cpus());
+
+ rcu_read_lock();
+ mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B,
+ -(size * num_possible_cpus())); /* negated counterpart of the post-alloc update */
+ rcu_read_unlock();
+
+ obj_cgroup_put(objcg);
+}
+
+#else /* CONFIG_MEMCG_KMEM */
+static enum pcpu_chunk_type
+pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, struct obj_cgroup **objcgp)
+{
+ return PCPU_CHUNK_ROOT; /* !CONFIG_MEMCG_KMEM stub: always the root chunk type, *objcgp untouched */
+}
+
+static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg,
+ struct pcpu_chunk *chunk, int off,
+ size_t size)
+{ /* !CONFIG_MEMCG_KMEM stub: nothing was charged, nothing to settle */
+}
+
+static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size)
+{ /* !CONFIG_MEMCG_KMEM stub: no accounting to undo */
+}
+#endif /* CONFIG_MEMCG_KMEM */
+
/**
* pcpu_alloc - the percpu allocator
* @size: size of area to allocate in bytes
@@ -1561,6 +1680,9 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
gfp_t pcpu_gfp;
bool is_atomic;
bool do_warn;
+ enum pcpu_chunk_type type;
+ struct list_head *pcpu_slot;
+ struct obj_cgroup *objcg = NULL;
static int warn_limit = 10;
struct pcpu_chunk *chunk, *next;
const char *err;
@@ -1595,16 +1717,23 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
return NULL;
}
+ type = pcpu_memcg_pre_alloc_hook(size, gfp, &objcg);
+ if (unlikely(type == PCPU_FAIL_ALLOC))
+ return NULL;
+ pcpu_slot = pcpu_chunk_list(type);
+
if (!is_atomic) {
/*
* pcpu_balance_workfn() allocates memory under this mutex,
* and it may wait for memory reclaim. Allow current task
* to become OOM victim, in case of memory pressure.
*/
- if (gfp & __GFP_NOFAIL)
+ if (gfp & __GFP_NOFAIL) {
mutex_lock(&pcpu_alloc_mutex);
- else if (mutex_lock_killable(&pcpu_alloc_mutex))
+ } else if (mutex_lock_killable(&pcpu_alloc_mutex)) {
+ pcpu_memcg_post_alloc_hook(objcg, NULL, 0, size);
return NULL;
+ }
}
spin_lock_irqsave(&pcpu_lock, flags);
@@ -1659,7 +1788,7 @@ restart:
}
if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
- chunk = pcpu_create_chunk(pcpu_gfp);
+ chunk = pcpu_create_chunk(type, pcpu_gfp);
if (!chunk) {
err = "failed to allocate new chunk";
goto fail;
@@ -1716,6 +1845,8 @@ area_found:
trace_percpu_alloc_percpu(reserved, is_atomic, size, align,
chunk->base_addr, off, ptr);
+ pcpu_memcg_post_alloc_hook(objcg, chunk, off, size);
+
return ptr;
fail_unlock:
@@ -1737,6 +1868,9 @@ fail:
} else {
mutex_unlock(&pcpu_alloc_mutex);
}
+
+ pcpu_memcg_post_alloc_hook(objcg, NULL, 0, size);
+
return NULL;
}
@@ -1796,8 +1930,8 @@ void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
}
/**
- * pcpu_balance_workfn - manage the amount of free chunks and populated pages
- * @work: unused
+ * __pcpu_balance_workfn - manage the amount of free chunks and populated pages
+ * @type: chunk type
*
* Reclaim all fully free chunks except for the first one. This is also
* responsible for maintaining the pool of empty populated pages. However,
@@ -1806,11 +1940,12 @@ void __percpu *__alloc_reserved_percpu(size_t size, size_t align)
* allocation causes the failure as it is possible that requests can be
* serviced from already backed regions.
*/
-static void pcpu_balance_workfn(struct work_struct *work)
+static void __pcpu_balance_workfn(enum pcpu_chunk_type type)
{
/* gfp flags passed to underlying allocators */
const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
LIST_HEAD(to_free);
+ struct list_head *pcpu_slot = pcpu_chunk_list(type);
struct list_head *free_head = &pcpu_slot[pcpu_nr_slots - 1];
struct pcpu_chunk *chunk, *next;
int slot, nr_to_pop, ret;
@@ -1908,7 +2043,7 @@ retry_pop:
if (nr_to_pop) {
/* ran out of chunks to populate, create a new one and retry */
- chunk = pcpu_create_chunk(gfp);
+ chunk = pcpu_create_chunk(type, gfp);
if (chunk) {
spin_lock_irq(&pcpu_lock);
pcpu_chunk_relocate(chunk, -1);
@@ -1921,6 +2056,20 @@ retry_pop:
}
/**
+ * pcpu_balance_workfn - manage the amount of free chunks and populated pages
+ * @work: unused
+ *
+ * Call __pcpu_balance_workfn() for each chunk type.
+ */
+static void pcpu_balance_workfn(struct work_struct *work)
+{
+ enum pcpu_chunk_type type;
+
+ for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++) /* every chunk type has its own slot lists */
+ __pcpu_balance_workfn(type);
+}
+
+/**
* free_percpu - free percpu area
* @ptr: pointer to area to free
*
@@ -1934,8 +2083,9 @@ void free_percpu(void __percpu *ptr)
void *addr;
struct pcpu_chunk *chunk;
unsigned long flags;
- int off;
+ int size, off;
bool need_balance = false;
+ struct list_head *pcpu_slot;
if (!ptr)
return;
@@ -1949,7 +2099,11 @@ void free_percpu(void __percpu *ptr)
chunk = pcpu_chunk_addr_search(addr);
off = addr - chunk->base_addr;
- pcpu_free_area(chunk, off);
+ size = pcpu_free_area(chunk, off);
+
+ pcpu_slot = pcpu_chunk_list(pcpu_chunk_type(chunk));
+
+ pcpu_memcg_free_hook(chunk, off, size);
/* if there are more than one fully free chunks, wake up grim reaper */
if (chunk->free_bytes == pcpu_unit_size) {
@@ -2260,6 +2414,7 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
int map_size;
unsigned long tmp_addr;
size_t alloc_size;
+ enum pcpu_chunk_type type;
#define PCPU_SETUP_BUG_ON(cond) do { \
if (unlikely(cond)) { \
@@ -2377,13 +2532,18 @@ void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
* empty chunks.
*/
pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
- pcpu_slot = memblock_alloc(pcpu_nr_slots * sizeof(pcpu_slot[0]),
- SMP_CACHE_BYTES);
- if (!pcpu_slot)
+ pcpu_chunk_lists = memblock_alloc(pcpu_nr_slots *
+ sizeof(pcpu_chunk_lists[0]) *
+ PCPU_NR_CHUNK_TYPES,
+ SMP_CACHE_BYTES);
+ if (!pcpu_chunk_lists)
panic("%s: Failed to allocate %zu bytes\n", __func__,
- pcpu_nr_slots * sizeof(pcpu_slot[0]));
- for (i = 0; i < pcpu_nr_slots; i++)
- INIT_LIST_HEAD(&pcpu_slot[i]);
+ pcpu_nr_slots * sizeof(pcpu_chunk_lists[0]) *
+ PCPU_NR_CHUNK_TYPES);
+
+ for (type = 0; type < PCPU_NR_CHUNK_TYPES; type++)
+ for (i = 0; i < pcpu_nr_slots; i++)
+ INIT_LIST_HEAD(&pcpu_chunk_list(type)[i]);
/*
* The end of the static region needs to be aligned with the
@@ -2513,7 +2673,7 @@ static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
const size_t static_size = __per_cpu_end - __per_cpu_start;
int nr_groups = 1, nr_units = 0;
size_t size_sum, min_unit_size, alloc_size;
- int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */
+ int upa, max_upa, best_upa; /* units_per_alloc */
int last_allocs, group, unit;
unsigned int cpu, tcpu;
struct pcpu_alloc_info *ai;
diff --git a/mm/pgalloc-track.h b/mm/pgalloc-track.h
new file mode 100644
index 000000000000..1dcc865029a2
--- /dev/null
+++ b/mm/pgalloc-track.h
@@ -0,0 +1,51 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _LINUX_PGALLLC_TRACK_H
+#define _LINUX_PGALLLC_TRACK_H
+
+#if defined(CONFIG_MMU)
+static inline p4d_t *p4d_alloc_track(struct mm_struct *mm, pgd_t *pgd,
+ unsigned long address,
+ pgtbl_mod_mask *mod_mask)
+{ /* return the p4d entry for @address, allocating the level first when *pgd is empty */
+ if (unlikely(pgd_none(*pgd))) {
+ if (__p4d_alloc(mm, pgd, address))
+ return NULL; /* __p4d_alloc() returned non-zero */
+ *mod_mask |= PGTBL_PGD_MODIFIED; /* set only when an allocation happened here; the flag names the parent (pgd) level */
+ }
+
+ return p4d_offset(pgd, address);
+}
+
+static inline pud_t *pud_alloc_track(struct mm_struct *mm, p4d_t *p4d,
+ unsigned long address,
+ pgtbl_mod_mask *mod_mask)
+{ /* return the pud entry for @address, allocating the level first when *p4d is empty */
+ if (unlikely(p4d_none(*p4d))) {
+ if (__pud_alloc(mm, p4d, address))
+ return NULL; /* __pud_alloc() returned non-zero */
+ *mod_mask |= PGTBL_P4D_MODIFIED; /* set only when an allocation happened here; the flag names the parent (p4d) level */
+ }
+
+ return pud_offset(p4d, address);
+}
+
+static inline pmd_t *pmd_alloc_track(struct mm_struct *mm, pud_t *pud,
+ unsigned long address,
+ pgtbl_mod_mask *mod_mask)
+{ /* return the pmd entry for @address, allocating the level first when *pud is empty */
+ if (unlikely(pud_none(*pud))) {
+ if (__pmd_alloc(mm, pud, address))
+ return NULL; /* __pmd_alloc() returned non-zero */
+ *mod_mask |= PGTBL_PUD_MODIFIED; /* set only when an allocation happened here; the flag names the parent (pud) level */
+ }
+
+ return pmd_offset(pud, address);
+}
+#endif /* CONFIG_MMU */
+
+#define pte_alloc_kernel_track(pmd, address, mask) /* NULL when __pte_alloc_kernel() returns non-zero; otherwise the pte for address, with PGTBL_PMD_MODIFIED set in *mask if an allocation was needed */ \
+ ((unlikely(pmd_none(*(pmd))) && \
+ (__pte_alloc_kernel(pmd) || ({*(mask)|=PGTBL_PMD_MODIFIED;0;})))?\
+ NULL: pte_offset_kernel(pmd, address))
+
+#endif /* _LINUX_PGALLLC_TRACK_H */
diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c
index 3d7c01e76efc..9578db83e312 100644
--- a/mm/pgtable-generic.c
+++ b/mm/pgtable-generic.c
@@ -2,15 +2,15 @@
/*
* mm/pgtable-generic.c
*
- * Generic pgtable methods declared in asm-generic/pgtable.h
+ * Generic pgtable methods declared in linux/pgtable.h
*
* Copyright (C) 2010 Linus Torvalds
*/
#include <linux/pagemap.h>
#include <linux/hugetlb.h>
+#include <linux/pgtable.h>
#include <asm/tlb.h>
-#include <asm-generic/pgtable.h>
/*
* If a p?d_bad entry is found while walking page tables, report
@@ -53,7 +53,7 @@ void pmd_clear_bad(pmd_t *pmd)
#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
/*
- * Only sets the access flags (dirty, accessed), as well as write
+ * Only sets the access flags (dirty, accessed), as well as write
* permission. Furthermore, we know it always gets set to a "more
* permissive" setting, which allows most architectures to optimize
* this. We return whether the PTE actually changed, which in turn
@@ -194,7 +194,7 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
pmd_t *pmdp)
{
- pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mknotpresent(*pmdp));
+ pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp));
flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
return old;
}
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index 74e957e302fe..29c052099aff 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -104,12 +104,12 @@ static int process_vm_rw_single_vec(unsigned long addr,
* access remotely because task/mm might not
* current/current->mm
*/
- down_read(&mm->mmap_sem);
- pinned_pages = pin_user_pages_remote(task, mm, pa, pinned_pages,
+ mmap_read_lock(mm);
+ pinned_pages = pin_user_pages_remote(mm, pa, pinned_pages,
flags, process_pages,
NULL, &locked);
if (locked)
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
if (pinned_pages <= 0)
return -EFAULT;
diff --git a/mm/ptdump.c b/mm/ptdump.c
index 26208d0d03b7..ba88ec43ff21 100644
--- a/mm/ptdump.c
+++ b/mm/ptdump.c
@@ -36,6 +36,9 @@ static int ptdump_pgd_entry(pgd_t *pgd, unsigned long addr,
return note_kasan_page_table(walk, addr);
#endif
+ if (st->effective_prot)
+ st->effective_prot(st, 0, pgd_val(val));
+
if (pgd_leaf(val))
st->note_page(st, addr, 0, pgd_val(val));
@@ -53,6 +56,9 @@ static int ptdump_p4d_entry(p4d_t *p4d, unsigned long addr,
return note_kasan_page_table(walk, addr);
#endif
+ if (st->effective_prot)
+ st->effective_prot(st, 1, p4d_val(val));
+
if (p4d_leaf(val))
st->note_page(st, addr, 1, p4d_val(val));
@@ -70,6 +76,9 @@ static int ptdump_pud_entry(pud_t *pud, unsigned long addr,
return note_kasan_page_table(walk, addr);
#endif
+ if (st->effective_prot)
+ st->effective_prot(st, 2, pud_val(val));
+
if (pud_leaf(val))
st->note_page(st, addr, 2, pud_val(val));
@@ -87,6 +96,8 @@ static int ptdump_pmd_entry(pmd_t *pmd, unsigned long addr,
return note_kasan_page_table(walk, addr);
#endif
+ if (st->effective_prot)
+ st->effective_prot(st, 3, pmd_val(val));
if (pmd_leaf(val))
st->note_page(st, addr, 3, pmd_val(val));
@@ -97,8 +108,12 @@ static int ptdump_pte_entry(pte_t *pte, unsigned long addr,
unsigned long next, struct mm_walk *walk)
{
struct ptdump_state *st = walk->private;
+ pte_t val = READ_ONCE(*pte);
+
+ if (st->effective_prot)
+ st->effective_prot(st, 4, pte_val(val));
- st->note_page(st, addr, 4, pte_val(READ_ONCE(*pte)));
+ st->note_page(st, addr, 4, pte_val(val));
return 0;
}
@@ -126,13 +141,13 @@ void ptdump_walk_pgd(struct ptdump_state *st, struct mm_struct *mm, pgd_t *pgd)
{
const struct ptdump_range *range = st->range;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
while (range->start != range->end) {
walk_page_range_novma(mm, range->start, range->end,
&ptdump_ops, pgd, st);
range++;
}
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
/* Flush out the last page */
st->note_page(st, 0, -1, 0);
diff --git a/mm/readahead.c b/mm/readahead.c
index 2fe72cd29b47..3c9a8dd7c56c 100644
--- a/mm/readahead.c
+++ b/mm/readahead.c
@@ -22,6 +22,7 @@
#include <linux/mm_inline.h>
#include <linux/blk-cgroup.h>
#include <linux/fadvise.h>
+#include <linux/sched/mm.h>
#include "internal.h"
@@ -113,94 +114,126 @@ int read_cache_pages(struct address_space *mapping, struct list_head *pages,
EXPORT_SYMBOL(read_cache_pages);
-static int read_pages(struct address_space *mapping, struct file *filp,
- struct list_head *pages, unsigned int nr_pages, gfp_t gfp)
+static void read_pages(struct readahead_control *rac, struct list_head *pages,
+ bool skip_page)
{
+ const struct address_space_operations *aops = rac->mapping->a_ops;
+ struct page *page;
struct blk_plug plug;
- unsigned page_idx;
- int ret;
+
+ if (!readahead_count(rac))
+ goto out;
blk_start_plug(&plug);
- if (mapping->a_ops->readpages) {
- ret = mapping->a_ops->readpages(filp, mapping, pages, nr_pages);
+ if (aops->readahead) {
+ aops->readahead(rac);
+ /* Clean up the remaining pages */
+ while ((page = readahead_page(rac))) {
+ unlock_page(page);
+ put_page(page);
+ }
+ } else if (aops->readpages) {
+ aops->readpages(rac->file, rac->mapping, pages,
+ readahead_count(rac));
/* Clean up the remaining pages */
put_pages_list(pages);
- goto out;
- }
-
- for (page_idx = 0; page_idx < nr_pages; page_idx++) {
- struct page *page = lru_to_page(pages);
- list_del(&page->lru);
- if (!add_to_page_cache_lru(page, mapping, page->index, gfp))
- mapping->a_ops->readpage(filp, page);
- put_page(page);
+ rac->_index += rac->_nr_pages;
+ rac->_nr_pages = 0;
+ } else {
+ while ((page = readahead_page(rac))) {
+ aops->readpage(rac->file, page);
+ put_page(page);
+ }
}
- ret = 0;
-out:
blk_finish_plug(&plug);
- return ret;
+ BUG_ON(!list_empty(pages));
+ BUG_ON(readahead_count(rac));
+
+out:
+ if (skip_page)
+ rac->_index++;
}
-/*
- * __do_page_cache_readahead() actually reads a chunk of disk. It allocates
- * the pages first, then submits them for I/O. This avoids the very bad
- * behaviour which would occur if page allocations are causing VM writeback.
- * We really don't want to intermingle reads and writes like that.
+/**
+ * page_cache_readahead_unbounded - Start unchecked readahead.
+ * @mapping: File address space.
+ * @file: This instance of the open file; used for authentication.
+ * @index: First page index to read.
+ * @nr_to_read: The number of pages to read.
+ * @lookahead_size: Where to start the next readahead.
*
- * Returns the number of pages requested, or the maximum amount of I/O allowed.
+ * This function is for filesystems to call when they want to start
+ * readahead beyond a file's stated i_size. This is almost certainly
+ * not the function you want to call. Use page_cache_async_readahead()
+ * or page_cache_sync_readahead() instead.
+ *
+ * Context: File is referenced by caller. Mutexes may be held by caller.
+ * May sleep, but will not reenter filesystem to reclaim memory.
*/
-unsigned int __do_page_cache_readahead(struct address_space *mapping,
- struct file *filp, pgoff_t offset, unsigned long nr_to_read,
+void page_cache_readahead_unbounded(struct address_space *mapping,
+ struct file *file, pgoff_t index, unsigned long nr_to_read,
unsigned long lookahead_size)
{
- struct inode *inode = mapping->host;
- struct page *page;
- unsigned long end_index; /* The last page we want to read */
LIST_HEAD(page_pool);
- int page_idx;
- unsigned int nr_pages = 0;
- loff_t isize = i_size_read(inode);
gfp_t gfp_mask = readahead_gfp_mask(mapping);
+ struct readahead_control rac = {
+ .mapping = mapping,
+ .file = file,
+ ._index = index,
+ };
+ unsigned long i;
- if (isize == 0)
- goto out;
-
- end_index = ((isize - 1) >> PAGE_SHIFT);
+ /*
+ * Partway through the readahead operation, we will have added
+ * locked pages to the page cache, but will not yet have submitted
+ * them for I/O. Adding another page may need to allocate memory,
+ * which can trigger memory reclaim. Telling the VM we're in
+ * the middle of a filesystem operation will cause it to not
+ * touch file-backed pages, preventing a deadlock. Most (all?)
+ * filesystems already specify __GFP_NOFS in their mapping's
+ * gfp_mask, but let's be explicit here.
+ */
+ unsigned int nofs = memalloc_nofs_save();
/*
* Preallocate as many pages as we will need.
*/
- for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
- pgoff_t page_offset = offset + page_idx;
+ for (i = 0; i < nr_to_read; i++) {
+ struct page *page = xa_load(&mapping->i_pages, index + i);
- if (page_offset > end_index)
- break;
+ BUG_ON(index + i != rac._index + rac._nr_pages);
- page = xa_load(&mapping->i_pages, page_offset);
if (page && !xa_is_value(page)) {
/*
- * Page already present? Kick off the current batch of
- * contiguous pages before continuing with the next
- * batch.
+ * Page already present? Kick off the current batch
+ * of contiguous pages before continuing with the
+ * next batch. This page may be the one we would
+ * have intended to mark as Readahead, but we don't
+ * have a stable reference to this page, and it's
+ * not worth getting one just for that.
*/
- if (nr_pages)
- read_pages(mapping, filp, &page_pool, nr_pages,
- gfp_mask);
- nr_pages = 0;
+ read_pages(&rac, &page_pool, true);
continue;
}
page = __page_cache_alloc(gfp_mask);
if (!page)
break;
- page->index = page_offset;
- list_add(&page->lru, &page_pool);
- if (page_idx == nr_to_read - lookahead_size)
+ if (mapping->a_ops->readpages) {
+ page->index = index + i;
+ list_add(&page->lru, &page_pool);
+ } else if (add_to_page_cache_lru(page, mapping, index + i,
+ gfp_mask) < 0) {
+ put_page(page);
+ read_pages(&rac, &page_pool, true);
+ continue;
+ }
+ if (i == nr_to_read - lookahead_size)
SetPageReadahead(page);
- nr_pages++;
+ rac._nr_pages++;
}
/*
@@ -208,26 +241,53 @@ unsigned int __do_page_cache_readahead(struct address_space *mapping,
* uptodate then the caller will launch readpage again, and
* will then handle the error.
*/
- if (nr_pages)
- read_pages(mapping, filp, &page_pool, nr_pages, gfp_mask);
- BUG_ON(!list_empty(&page_pool));
-out:
- return nr_pages;
+ read_pages(&rac, &page_pool, false);
+ memalloc_nofs_restore(nofs);
+}
+EXPORT_SYMBOL_GPL(page_cache_readahead_unbounded);
+
+/*
+ * __do_page_cache_readahead() actually reads a chunk of disk. It allocates
+ * the pages first, then submits them for I/O. This avoids the very bad
+ * behaviour which would occur if page allocations are causing VM writeback.
+ * We really don't want to intermingle reads and writes like that.
+ */
+void __do_page_cache_readahead(struct address_space *mapping,
+ struct file *file, pgoff_t index, unsigned long nr_to_read,
+ unsigned long lookahead_size)
+{ /* clamp the request to i_size, then hand it to page_cache_readahead_unbounded() */
+ struct inode *inode = mapping->host;
+ loff_t isize = i_size_read(inode);
+ pgoff_t end_index; /* The last page we want to read */
+
+ if (isize == 0)
+ return; /* empty file */
+
+ end_index = (isize - 1) >> PAGE_SHIFT;
+ if (index > end_index)
+ return; /* request starts past the last page of the file */
+ /* Don't read past the page containing the last byte of the file */
+ if (nr_to_read > end_index - index)
+ nr_to_read = end_index - index + 1; /* pages index..end_index inclusive */
+
+ page_cache_readahead_unbounded(mapping, file, index, nr_to_read,
+ lookahead_size);
+}
/*
* Chunk the readahead into 2 megabyte units, so that we don't pin too much
* memory at once.
*/
-int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
- pgoff_t offset, unsigned long nr_to_read)
+void force_page_cache_readahead(struct address_space *mapping,
+ struct file *filp, pgoff_t index, unsigned long nr_to_read)
{
struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
struct file_ra_state *ra = &filp->f_ra;
unsigned long max_pages;
- if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages))
- return -EINVAL;
+ if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages &&
+ !mapping->a_ops->readahead))
+ return;
/*
* If the request exceeds the readahead window, allow the read to
@@ -240,12 +300,11 @@ int force_page_cache_readahead(struct address_space *mapping, struct file *filp,
if (this_chunk > nr_to_read)
this_chunk = nr_to_read;
- __do_page_cache_readahead(mapping, filp, offset, this_chunk, 0);
+ __do_page_cache_readahead(mapping, filp, index, this_chunk, 0);
- offset += this_chunk;
+ index += this_chunk;
nr_to_read -= this_chunk;
}
- return 0;
}
/*
@@ -324,21 +383,21 @@ static unsigned long get_next_ra_size(struct file_ra_state *ra,
*/
/*
- * Count contiguously cached pages from @offset-1 to @offset-@max,
+ * Count contiguously cached pages from @index-1 to @index-@max,
* this count is a conservative estimation of
* - length of the sequential read sequence, or
* - thrashing threshold in memory tight systems
*/
static pgoff_t count_history_pages(struct address_space *mapping,
- pgoff_t offset, unsigned long max)
+ pgoff_t index, unsigned long max)
{
pgoff_t head;
rcu_read_lock();
- head = page_cache_prev_miss(mapping, offset - 1, max);
+ head = page_cache_prev_miss(mapping, index - 1, max);
rcu_read_unlock();
- return offset - 1 - head;
+ return index - 1 - head;
}
/*
@@ -346,13 +405,13 @@ static pgoff_t count_history_pages(struct address_space *mapping,
*/
static int try_context_readahead(struct address_space *mapping,
struct file_ra_state *ra,
- pgoff_t offset,
+ pgoff_t index,
unsigned long req_size,
unsigned long max)
{
pgoff_t size;
- size = count_history_pages(mapping, offset, max);
+ size = count_history_pages(mapping, index, max);
/*
* not enough history pages:
@@ -365,10 +424,10 @@ static int try_context_readahead(struct address_space *mapping,
* starts from beginning of file:
* it is a strong indication of long-run stream (or whole-file-read)
*/
- if (size >= offset)
+ if (size >= index)
size *= 2;
- ra->start = offset;
+ ra->start = index;
ra->size = min(size + req_size, max);
ra->async_size = 1;
@@ -378,16 +437,15 @@ static int try_context_readahead(struct address_space *mapping,
/*
* A minimal readahead algorithm for trivial sequential/random reads.
*/
-static unsigned long
-ondemand_readahead(struct address_space *mapping,
- struct file_ra_state *ra, struct file *filp,
- bool hit_readahead_marker, pgoff_t offset,
- unsigned long req_size)
+static void ondemand_readahead(struct address_space *mapping,
+ struct file_ra_state *ra, struct file *filp,
+ bool hit_readahead_marker, pgoff_t index,
+ unsigned long req_size)
{
struct backing_dev_info *bdi = inode_to_bdi(mapping->host);
unsigned long max_pages = ra->ra_pages;
unsigned long add_pages;
- pgoff_t prev_offset;
+ pgoff_t prev_index;
/*
* If the request exceeds the readahead window, allow the read to
@@ -399,15 +457,15 @@ ondemand_readahead(struct address_space *mapping,
/*
* start of file
*/
- if (!offset)
+ if (!index)
goto initial_readahead;
/*
- * It's the expected callback offset, assume sequential access.
+ * It's the expected callback index, assume sequential access.
* Ramp up sizes, and push forward the readahead window.
*/
- if ((offset == (ra->start + ra->size - ra->async_size) ||
- offset == (ra->start + ra->size))) {
+ if ((index == (ra->start + ra->size - ra->async_size) ||
+ index == (ra->start + ra->size))) {
ra->start += ra->size;
ra->size = get_next_ra_size(ra, max_pages);
ra->async_size = ra->size;
@@ -424,14 +482,14 @@ ondemand_readahead(struct address_space *mapping,
pgoff_t start;
rcu_read_lock();
- start = page_cache_next_miss(mapping, offset + 1, max_pages);
+ start = page_cache_next_miss(mapping, index + 1, max_pages);
rcu_read_unlock();
- if (!start || start - offset > max_pages)
- return 0;
+ if (!start || start - index > max_pages)
+ return;
ra->start = start;
- ra->size = start - offset; /* old async_size */
+ ra->size = start - index; /* old async_size */
ra->size += req_size;
ra->size = get_next_ra_size(ra, max_pages);
ra->async_size = ra->size;
@@ -446,28 +504,29 @@ ondemand_readahead(struct address_space *mapping,
/*
* sequential cache miss
- * trivial case: (offset - prev_offset) == 1
- * unaligned reads: (offset - prev_offset) == 0
+ * trivial case: (index - prev_index) == 1
+ * unaligned reads: (index - prev_index) == 0
*/
- prev_offset = (unsigned long long)ra->prev_pos >> PAGE_SHIFT;
- if (offset - prev_offset <= 1UL)
+ prev_index = (unsigned long long)ra->prev_pos >> PAGE_SHIFT;
+ if (index - prev_index <= 1UL)
goto initial_readahead;
/*
* Query the page cache and look for the traces(cached history pages)
* that a sequential stream would leave behind.
*/
- if (try_context_readahead(mapping, ra, offset, req_size, max_pages))
+ if (try_context_readahead(mapping, ra, index, req_size, max_pages))
goto readit;
/*
* standalone, small random read
* Read as is, and do not pollute the readahead state.
*/
- return __do_page_cache_readahead(mapping, filp, offset, req_size, 0);
+ __do_page_cache_readahead(mapping, filp, index, req_size, 0);
+ return;
initial_readahead:
- ra->start = offset;
+ ra->start = index;
ra->size = get_init_ra_size(req_size, max_pages);
ra->async_size = ra->size > req_size ? ra->size - req_size : ra->size;
@@ -478,7 +537,7 @@ readit:
* the resulted next readahead window into the current one.
* Take care of maximum IO pages as above.
*/
- if (offset == ra->start && ra->size == ra->async_size) {
+ if (index == ra->start && ra->size == ra->async_size) {
add_pages = get_next_ra_size(ra, max_pages);
if (ra->size + add_pages <= max_pages) {
ra->async_size = add_pages;
@@ -489,7 +548,7 @@ readit:
}
}
- return ra_submit(ra, mapping, filp);
+ ra_submit(ra, mapping, filp);
}
/**
@@ -497,9 +556,8 @@ readit:
* @mapping: address_space which holds the pagecache and I/O vectors
* @ra: file_ra_state which holds the readahead state
* @filp: passed on to ->readpage() and ->readpages()
- * @offset: start offset into @mapping, in pagecache page-sized units
- * @req_size: hint: total size of the read which the caller is performing in
- * pagecache pages
+ * @index: Index of first page to be read.
+ * @req_count: Total number of pages being read by the caller.
*
* page_cache_sync_readahead() should be called when a cache miss happened:
* it will submit the read. The readahead logic may decide to piggyback more
@@ -508,7 +566,7 @@ readit:
*/
void page_cache_sync_readahead(struct address_space *mapping,
struct file_ra_state *ra, struct file *filp,
- pgoff_t offset, unsigned long req_size)
+ pgoff_t index, unsigned long req_count)
{
/* no read-ahead */
if (!ra->ra_pages)
@@ -519,12 +577,12 @@ void page_cache_sync_readahead(struct address_space *mapping,
/* be dumb */
if (filp && (filp->f_mode & FMODE_RANDOM)) {
- force_page_cache_readahead(mapping, filp, offset, req_size);
+ force_page_cache_readahead(mapping, filp, index, req_count);
return;
}
/* do read-ahead */
- ondemand_readahead(mapping, ra, filp, false, offset, req_size);
+ ondemand_readahead(mapping, ra, filp, false, index, req_count);
}
EXPORT_SYMBOL_GPL(page_cache_sync_readahead);
@@ -533,21 +591,20 @@ EXPORT_SYMBOL_GPL(page_cache_sync_readahead);
* @mapping: address_space which holds the pagecache and I/O vectors
* @ra: file_ra_state which holds the readahead state
* @filp: passed on to ->readpage() and ->readpages()
- * @page: the page at @offset which has the PG_readahead flag set
- * @offset: start offset into @mapping, in pagecache page-sized units
- * @req_size: hint: total size of the read which the caller is performing in
- * pagecache pages
+ * @page: The page at @index which triggered the readahead call.
+ * @index: Index of first page to be read.
+ * @req_count: Total number of pages being read by the caller.
*
* page_cache_async_readahead() should be called when a page is used which
- * has the PG_readahead flag; this is a marker to suggest that the application
+ * is marked as PageReadahead; this is a marker to suggest that the application
* has used up enough of the readahead window that we should start pulling in
* more pages.
*/
void
page_cache_async_readahead(struct address_space *mapping,
struct file_ra_state *ra, struct file *filp,
- struct page *page, pgoff_t offset,
- unsigned long req_size)
+ struct page *page, pgoff_t index,
+ unsigned long req_count)
{
/* no read-ahead */
if (!ra->ra_pages)
@@ -571,7 +628,7 @@ page_cache_async_readahead(struct address_space *mapping,
return;
/* do read-ahead */
- ondemand_readahead(mapping, ra, filp, true, offset, req_size);
+ ondemand_readahead(mapping, ra, filp, true, index, req_count);
}
EXPORT_SYMBOL_GPL(page_cache_async_readahead);
diff --git a/mm/rmap.c b/mm/rmap.c
index f79a206b271a..9425260774a1 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -21,7 +21,7 @@
* Lock ordering in mm:
*
* inode->i_mutex (while writing or truncating, not reading or faulting)
- * mm->mmap_sem
+ * mm->mmap_lock
* page->flags PG_locked (lock_page) * (see huegtlbfs below)
* hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
* mapping->i_mmap_rwsem
@@ -177,7 +177,7 @@ static void anon_vma_chain_link(struct vm_area_struct *vma,
* to do any locking for the common case of already having
* an anon_vma.
*
- * This must be called with the mmap_sem held for reading.
+ * This must be called with the mmap_lock held for reading.
*/
int __anon_vma_prepare(struct vm_area_struct *vma)
{
@@ -672,7 +672,7 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
*/
void flush_tlb_batched_pending(struct mm_struct *mm)
{
- if (mm->tlb_flush_batched) {
+ if (data_race(mm->tlb_flush_batched)) {
flush_tlb_mm(mm);
/*
@@ -1114,6 +1114,11 @@ void do_page_add_anon_rmap(struct page *page,
bool compound = flags & RMAP_COMPOUND;
bool first;
+ if (unlikely(PageKsm(page)))
+ lock_page_memcg(page);
+ else
+ VM_BUG_ON_PAGE(!PageLocked(page), page);
+
if (compound) {
atomic_t *mapcount;
VM_BUG_ON_PAGE(!PageLocked(page), page);
@@ -1125,7 +1130,7 @@ void do_page_add_anon_rmap(struct page *page,
}
if (first) {
- int nr = compound ? hpage_nr_pages(page) : 1;
+ int nr = compound ? thp_nr_pages(page) : 1;
/*
* We use the irq-unsafe __{inc|mod}_zone_page_stat because
* these counters are not modified in interrupt context, and
@@ -1133,13 +1138,14 @@ void do_page_add_anon_rmap(struct page *page,
* disabled.
*/
if (compound)
- __inc_node_page_state(page, NR_ANON_THPS);
- __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, nr);
+ __inc_lruvec_page_state(page, NR_ANON_THPS);
+ __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr);
}
- if (unlikely(PageKsm(page)))
- return;
- VM_BUG_ON_PAGE(!PageLocked(page), page);
+ if (unlikely(PageKsm(page))) {
+ unlock_page_memcg(page);
+ return;
+ }
/* address might be in next vma when migration races vma_adjust */
if (first)
@@ -1163,7 +1169,7 @@ void do_page_add_anon_rmap(struct page *page,
void page_add_new_anon_rmap(struct page *page,
struct vm_area_struct *vma, unsigned long address, bool compound)
{
- int nr = compound ? hpage_nr_pages(page) : 1;
+ int nr = compound ? thp_nr_pages(page) : 1;
VM_BUG_ON_VMA(address < vma->vm_start || address >= vma->vm_end, vma);
__SetPageSwapBacked(page);
@@ -1174,14 +1180,14 @@ void page_add_new_anon_rmap(struct page *page,
if (hpage_pincount_available(page))
atomic_set(compound_pincount_ptr(page), 0);
- __inc_node_page_state(page, NR_ANON_THPS);
+ __inc_lruvec_page_state(page, NR_ANON_THPS);
} else {
/* Anon THP always mapped first with PMD */
VM_BUG_ON_PAGE(PageTransCompound(page), page);
/* increment count (starts at -1) */
atomic_set(&page->_mapcount, 0);
}
- __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, nr);
+ __mod_lruvec_page_state(page, NR_ANON_MAPPED, nr);
__page_set_anon_rmap(page, vma, address, 1);
}
@@ -1230,13 +1236,12 @@ static void page_remove_file_rmap(struct page *page, bool compound)
int i, nr = 1;
VM_BUG_ON_PAGE(compound && !PageHead(page), page);
- lock_page_memcg(page);
/* Hugepages are not counted in NR_FILE_MAPPED for now. */
if (unlikely(PageHuge(page))) {
/* hugetlb pages are always mapped with pmds */
atomic_dec(compound_mapcount_ptr(page));
- goto out;
+ return;
}
/* page still mapped by someone else? */
@@ -1246,14 +1251,14 @@ static void page_remove_file_rmap(struct page *page, bool compound)
nr++;
}
if (!atomic_add_negative(-1, compound_mapcount_ptr(page)))
- goto out;
+ return;
if (PageSwapBacked(page))
__dec_node_page_state(page, NR_SHMEM_PMDMAPPED);
else
__dec_node_page_state(page, NR_FILE_PMDMAPPED);
} else {
if (!atomic_add_negative(-1, &page->_mapcount))
- goto out;
+ return;
}
/*
@@ -1265,8 +1270,6 @@ static void page_remove_file_rmap(struct page *page, bool compound)
if (unlikely(PageMlocked(page)))
clear_page_mlock(page);
-out:
- unlock_page_memcg(page);
}
static void page_remove_anon_compound_rmap(struct page *page)
@@ -1283,7 +1286,7 @@ static void page_remove_anon_compound_rmap(struct page *page)
if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE))
return;
- __dec_node_page_state(page, NR_ANON_THPS);
+ __dec_lruvec_page_state(page, NR_ANON_THPS);
if (TestClearPageDoubleMap(page)) {
/*
@@ -1310,7 +1313,7 @@ static void page_remove_anon_compound_rmap(struct page *page)
clear_page_mlock(page);
if (nr)
- __mod_node_page_state(page_pgdat(page), NR_ANON_MAPPED, -nr);
+ __mod_lruvec_page_state(page, NR_ANON_MAPPED, -nr);
}
/**
@@ -1322,22 +1325,28 @@ static void page_remove_anon_compound_rmap(struct page *page)
*/
void page_remove_rmap(struct page *page, bool compound)
{
- if (!PageAnon(page))
- return page_remove_file_rmap(page, compound);
+ lock_page_memcg(page);
+
+ if (!PageAnon(page)) {
+ page_remove_file_rmap(page, compound);
+ goto out;
+ }
- if (compound)
- return page_remove_anon_compound_rmap(page);
+ if (compound) {
+ page_remove_anon_compound_rmap(page);
+ goto out;
+ }
/* page still mapped by someone else? */
if (!atomic_add_negative(-1, &page->_mapcount))
- return;
+ goto out;
/*
* We use the irq-unsafe __{inc|mod}_zone_page_stat because
* these counters are not modified in interrupt context, and
* pte lock(a spinlock) is held, which implies preemption disabled.
*/
- __dec_node_page_state(page, NR_ANON_MAPPED);
+ __dec_lruvec_page_state(page, NR_ANON_MAPPED);
if (unlikely(PageMlocked(page)))
clear_page_mlock(page);
@@ -1354,6 +1363,8 @@ void page_remove_rmap(struct page *page, bool compound)
* Leaving it set also helps swapoff to reinstate ptes
* faster for those pages still in swapcache.
*/
+out:
+ unlock_page_memcg(page);
}
/*
@@ -1433,7 +1444,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
if (!PageTransCompound(page)) {
/*
* Holding pte lock, we do *not* need
- * mmap_sem here
+ * mmap_lock here
*/
mlock_vma_page(page);
}
@@ -1458,7 +1469,7 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
* do this outside rmap routines.
*/
VM_BUG_ON(!(flags & TTU_RMAP_LOCKED));
- if (huge_pmd_unshare(mm, &address, pvmw.pte)) {
+ if (huge_pmd_unshare(mm, vma, &address, pvmw.pte)) {
/*
* huge_pmd_unshare unmapped an entire PMD
* page. There is no way of knowing exactly
@@ -1500,9 +1511,14 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
*/
entry = make_migration_entry(page, 0);
swp_pte = swp_entry_to_pte(entry);
- if (pte_soft_dirty(pteval))
+
+ /*
+ * pteval maps a zone device page and is therefore
+ * a swap pte.
+ */
+ if (pte_swp_soft_dirty(pteval))
swp_pte = pte_swp_mksoft_dirty(swp_pte);
- if (pte_uffd_wp(pteval))
+ if (pte_swp_uffd_wp(pteval))
swp_pte = pte_swp_mkuffd_wp(swp_pte);
set_pte_at(mm, pvmw.address, pvmw.pte, swp_pte);
/*
@@ -1806,7 +1822,7 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page,
/*
* Note: remove_migration_ptes() cannot use page_lock_anon_vma_read()
* because that depends on page_mapped(); but not all its usages
- * are holding mmap_sem. Users without mmap_sem are required to
+ * are holding mmap_lock. Users without mmap_lock are required to
* take a reference count to prevent the anon_vma disappearing
*/
anon_vma = page_anon_vma(page);
@@ -1826,7 +1842,7 @@ static struct anon_vma *rmap_walk_anon_lock(struct page *page,
* Find all the mappings of a page using the mapping pointer and the vma chains
* contained in the anon_vma struct it points to.
*
- * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
+ * When called from try_to_munlock(), the mmap_lock of the mm containing the vma
* where the page was found will be held for write. So, we won't recheck
* vm_flags for that VMA. That should be OK, because that vma shouldn't be
* LOCKED.
@@ -1849,7 +1865,7 @@ static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
return;
pgoff_start = page_to_pgoff(page);
- pgoff_end = pgoff_start + hpage_nr_pages(page) - 1;
+ pgoff_end = pgoff_start + thp_nr_pages(page) - 1;
anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
pgoff_start, pgoff_end) {
struct vm_area_struct *vma = avc->vma;
@@ -1878,7 +1894,7 @@ static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc,
* Find all the mappings of a page using the mapping pointer and the vma chains
* contained in the address_space struct it points to.
*
- * When called from try_to_munlock(), the mmap_sem of the mm containing the vma
+ * When called from try_to_munlock(), the mmap_lock of the mm containing the vma
* where the page was found will be held for write. So, we won't recheck
* vm_flags for that VMA. That should be OK, because that vma shouldn't be
* LOCKED.
@@ -1902,7 +1918,7 @@ static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc,
return;
pgoff_start = page_to_pgoff(page);
- pgoff_end = pgoff_start + hpage_nr_pages(page) - 1;
+ pgoff_end = pgoff_start + thp_nr_pages(page) - 1;
if (!locked)
i_mmap_lock_read(mapping);
vma_interval_tree_foreach(vma, &mapping->i_mmap,
diff --git a/mm/rodata_test.c b/mm/rodata_test.c
index 5e313fa93276..2613371945b7 100644
--- a/mm/rodata_test.c
+++ b/mm/rodata_test.c
@@ -7,6 +7,7 @@
*/
#define pr_fmt(fmt) "rodata_test: " fmt
+#include <linux/rodata_test.h>
#include <linux/uaccess.h>
#include <asm/sections.h>
@@ -25,7 +26,7 @@ void rodata_test(void)
}
/* test 2: write to the variable; this should fault */
- if (!probe_kernel_write((void *)&rodata_test_data,
+ if (!copy_to_kernel_nofault((void *)&rodata_test_data,
(void *)&zero, sizeof(zero))) {
pr_err("test data was not read only\n");
return;
diff --git a/mm/shmem.c b/mm/shmem.c
index bd8840082c94..8e2b35ba93ad 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -82,7 +82,6 @@ static struct vfsmount *shm_mnt;
#include <linux/uuid.h>
#include <linux/uaccess.h>
-#include <asm/pgtable.h>
#include "internal.h"
@@ -115,11 +114,13 @@ struct shmem_options {
kuid_t uid;
kgid_t gid;
umode_t mode;
+ bool full_inums;
int huge;
int seen;
#define SHMEM_SEEN_BLOCKS 1
#define SHMEM_SEEN_INODES 2
#define SHMEM_SEEN_HUGE 4
+#define SHMEM_SEEN_INUMS 8
};
#ifdef CONFIG_TMPFS
@@ -261,18 +262,78 @@ bool vma_is_shmem(struct vm_area_struct *vma)
static LIST_HEAD(shmem_swaplist);
static DEFINE_MUTEX(shmem_swaplist_mutex);
-static int shmem_reserve_inode(struct super_block *sb)
+/*
+ * shmem_reserve_inode() performs bookkeeping to reserve a shmem inode, and
+ * produces a novel ino for the newly allocated inode.
+ *
+ * It may also be called when making a hard link to permit the space needed by
+ * each dentry. However, in that case, no new inode number is needed since that
+ * internally draws from another pool of inode numbers (currently global
+ * get_next_ino()). This case is indicated by passing NULL as inop.
+ */
+#define SHMEM_INO_BATCH 1024
+static int shmem_reserve_inode(struct super_block *sb, ino_t *inop)
{
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
- if (sbinfo->max_inodes) {
+ ino_t ino;
+
+ if (!(sb->s_flags & SB_KERNMOUNT)) {
spin_lock(&sbinfo->stat_lock);
- if (!sbinfo->free_inodes) {
- spin_unlock(&sbinfo->stat_lock);
- return -ENOSPC;
+ if (sbinfo->max_inodes) {
+ if (!sbinfo->free_inodes) {
+ spin_unlock(&sbinfo->stat_lock);
+ return -ENOSPC;
+ }
+ sbinfo->free_inodes--;
+ }
+ if (inop) {
+ ino = sbinfo->next_ino++;
+ if (unlikely(is_zero_ino(ino)))
+ ino = sbinfo->next_ino++;
+ if (unlikely(!sbinfo->full_inums &&
+ ino > UINT_MAX)) {
+ /*
+ * Emulate get_next_ino uint wraparound for
+ * compatibility
+ */
+ if (IS_ENABLED(CONFIG_64BIT))
+ pr_warn("%s: inode number overflow on device %d, consider using inode64 mount option\n",
+ __func__, MINOR(sb->s_dev));
+ sbinfo->next_ino = 1;
+ ino = sbinfo->next_ino++;
+ }
+ *inop = ino;
}
- sbinfo->free_inodes--;
spin_unlock(&sbinfo->stat_lock);
+ } else if (inop) {
+ /*
+ * __shmem_file_setup, one of our callers, is lock-free: it
+ * doesn't hold stat_lock in shmem_reserve_inode since
+ * max_inodes is always 0, and is called from potentially
+ * unknown contexts. As such, use a per-cpu batched allocator
+ * which doesn't require the per-sb stat_lock unless we are at
+ * the batch boundary.
+ *
+ * We don't need to worry about inode{32,64} here: SB_KERNMOUNT
+ * shmem mounts are not exposed to userspace, so things like
+ * glibc compatibility are not a concern.
+ */
+ ino_t *next_ino;
+ next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu());
+ ino = *next_ino;
+ if (unlikely(ino % SHMEM_INO_BATCH == 0)) {
+ spin_lock(&sbinfo->stat_lock);
+ ino = sbinfo->next_ino;
+ sbinfo->next_ino += SHMEM_INO_BATCH;
+ spin_unlock(&sbinfo->stat_lock);
+ if (unlikely(is_zero_ino(ino)))
+ ino++;
+ }
+ *inop = ino;
+ *next_ino = ++ino;
+ put_cpu();
}
+
return 0;
}
@@ -605,11 +666,13 @@ static inline bool is_huge_enabled(struct shmem_sb_info *sbinfo)
*/
static int shmem_add_to_page_cache(struct page *page,
struct address_space *mapping,
- pgoff_t index, void *expected, gfp_t gfp)
+ pgoff_t index, void *expected, gfp_t gfp,
+ struct mm_struct *charge_mm)
{
XA_STATE_ORDER(xas, &mapping->i_pages, index, compound_order(page));
unsigned long i = 0;
unsigned long nr = compound_nr(page);
+ int error;
VM_BUG_ON_PAGE(PageTail(page), page);
VM_BUG_ON_PAGE(index != round_down(index, nr), page);
@@ -621,6 +684,18 @@ static int shmem_add_to_page_cache(struct page *page,
page->mapping = mapping;
page->index = index;
+ if (!PageSwapCache(page)) {
+ error = mem_cgroup_charge(page, charge_mm, gfp);
+ if (error) {
+ if (PageTransHuge(page)) {
+ count_vm_event(THP_FILE_FALLBACK);
+ count_vm_event(THP_FILE_FALLBACK_CHARGE);
+ }
+ goto error;
+ }
+ }
+ cgroup_throttle_swaprate(page, gfp);
+
do {
void *entry;
xas_lock_irq(&xas);
@@ -641,19 +716,22 @@ next:
__inc_node_page_state(page, NR_SHMEM_THPS);
}
mapping->nrpages += nr;
- __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
- __mod_node_page_state(page_pgdat(page), NR_SHMEM, nr);
+ __mod_lruvec_page_state(page, NR_FILE_PAGES, nr);
+ __mod_lruvec_page_state(page, NR_SHMEM, nr);
unlock:
xas_unlock_irq(&xas);
} while (xas_nomem(&xas, gfp));
if (xas_error(&xas)) {
- page->mapping = NULL;
- page_ref_sub(page, nr);
- return xas_error(&xas);
+ error = xas_error(&xas);
+ goto error;
}
return 0;
+error:
+ page->mapping = NULL;
+ page_ref_sub(page, nr);
+ return error;
}
/*
@@ -670,8 +748,8 @@ static void shmem_delete_from_page_cache(struct page *page, void *radswap)
error = shmem_replace_entry(mapping, page->index, page, radswap);
page->mapping = NULL;
mapping->nrpages--;
- __dec_node_page_state(page, NR_FILE_PAGES);
- __dec_node_page_state(page, NR_SHMEM);
+ __dec_lruvec_page_state(page, NR_FILE_PAGES);
+ __dec_lruvec_page_state(page, NR_SHMEM);
xa_unlock_irq(&mapping->i_pages);
put_page(page);
BUG_ON(error);
@@ -1358,7 +1436,8 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
list_add(&info->swaplist, &shmem_swaplist);
if (add_to_swap_cache(page, swap,
- __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN) == 0) {
+ __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN,
+ NULL) == 0) {
spin_lock_irq(&info->lock);
shmem_recalc_inode(inode);
info->swapped++;
@@ -1578,8 +1657,9 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
xa_lock_irq(&swap_mapping->i_pages);
error = shmem_replace_entry(swap_mapping, swap_index, oldpage, newpage);
if (!error) {
- __inc_node_page_state(newpage, NR_FILE_PAGES);
- __dec_node_page_state(oldpage, NR_FILE_PAGES);
+ mem_cgroup_migrate(oldpage, newpage);
+ __inc_lruvec_page_state(newpage, NR_FILE_PAGES);
+ __dec_lruvec_page_state(oldpage, NR_FILE_PAGES);
}
xa_unlock_irq(&swap_mapping->i_pages);
@@ -1591,8 +1671,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
*/
oldpage = newpage;
} else {
- mem_cgroup_migrate(oldpage, newpage);
- lru_cache_add_anon(newpage);
+ lru_cache_add(newpage);
*pagep = newpage;
}
@@ -1609,7 +1688,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp,
* Swap in the page pointed to by *pagep.
* Caller has to make sure that *pagep contains a valid swapped page.
* Returns 0 and the page in pagep if success. On failure, returns the
- * the error code and NULL in *pagep.
+ * error code and NULL in *pagep.
*/
static int shmem_swapin_page(struct inode *inode, pgoff_t index,
struct page **pagep, enum sgp_type sgp,
@@ -1619,7 +1698,6 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index,
struct address_space *mapping = inode->i_mapping;
struct shmem_inode_info *info = SHMEM_I(inode);
struct mm_struct *charge_mm = vma ? vma->vm_mm : current->mm;
- struct mem_cgroup *memcg;
struct page *page;
swp_entry_t swap;
int error;
@@ -1664,31 +1742,12 @@ static int shmem_swapin_page(struct inode *inode, pgoff_t index,
goto failed;
}
- error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
- false);
- if (!error) {
- error = shmem_add_to_page_cache(page, mapping, index,
- swp_to_radix_entry(swap), gfp);
- /*
- * We already confirmed swap under page lock, and make
- * no memory allocation here, so usually no possibility
- * of error; but free_swap_and_cache() only trylocks a
- * page, so it is just possible that the entry has been
- * truncated or holepunched since swap was confirmed.
- * shmem_undo_range() will have done some of the
- * unaccounting, now delete_from_swap_cache() will do
- * the rest.
- */
- if (error) {
- mem_cgroup_cancel_charge(page, memcg, false);
- delete_from_swap_cache(page);
- }
- }
+ error = shmem_add_to_page_cache(page, mapping, index,
+ swp_to_radix_entry(swap), gfp,
+ charge_mm);
if (error)
goto failed;
- mem_cgroup_commit_charge(page, memcg, true, false);
-
spin_lock_irq(&info->lock);
info->swapped--;
shmem_recalc_inode(inode);
@@ -1734,7 +1793,6 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index,
struct shmem_inode_info *info = SHMEM_I(inode);
struct shmem_sb_info *sbinfo;
struct mm_struct *charge_mm;
- struct mem_cgroup *memcg;
struct page *page;
enum sgp_type sgp_huge = sgp;
pgoff_t hindex = index;
@@ -1859,25 +1917,12 @@ alloc_nohuge:
if (sgp == SGP_WRITE)
__SetPageReferenced(page);
- error = mem_cgroup_try_charge_delay(page, charge_mm, gfp, &memcg,
- PageTransHuge(page));
- if (error) {
- if (PageTransHuge(page)) {
- count_vm_event(THP_FILE_FALLBACK);
- count_vm_event(THP_FILE_FALLBACK_CHARGE);
- }
- goto unacct;
- }
error = shmem_add_to_page_cache(page, mapping, hindex,
- NULL, gfp & GFP_RECLAIM_MASK);
- if (error) {
- mem_cgroup_cancel_charge(page, memcg,
- PageTransHuge(page));
+ NULL, gfp & GFP_RECLAIM_MASK,
+ charge_mm);
+ if (error)
goto unacct;
- }
- mem_cgroup_commit_charge(page, memcg, false,
- PageTransHuge(page));
- lru_cache_add_anon(page);
+ lru_cache_add(page);
spin_lock_irq(&info->lock);
info->alloced += compound_nr(page);
@@ -2240,13 +2285,14 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
struct inode *inode;
struct shmem_inode_info *info;
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
+ ino_t ino;
- if (shmem_reserve_inode(sb))
+ if (shmem_reserve_inode(sb, &ino))
return NULL;
inode = new_inode(sb);
if (inode) {
- inode->i_ino = get_next_ino();
+ inode->i_ino = ino;
inode_init_owner(inode, dir, mode);
inode->i_blocks = 0;
inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);
@@ -2314,7 +2360,6 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
struct address_space *mapping = inode->i_mapping;
gfp_t gfp = mapping_gfp_mask(mapping);
pgoff_t pgoff = linear_page_index(dst_vma, dst_addr);
- struct mem_cgroup *memcg;
spinlock_t *ptl;
void *page_kaddr;
struct page *page;
@@ -2338,7 +2383,7 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
PAGE_SIZE);
kunmap_atomic(page_kaddr);
- /* fallback to copy_from_user outside mmap_sem */
+ /* fallback to copy_from_user outside mmap_lock */
if (unlikely(ret)) {
*pagep = page;
shmem_inode_unacct_blocks(inode, 1);
@@ -2364,16 +2409,10 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
if (unlikely(offset >= max_off))
goto out_release;
- ret = mem_cgroup_try_charge_delay(page, dst_mm, gfp, &memcg, false);
- if (ret)
- goto out_release;
-
ret = shmem_add_to_page_cache(page, mapping, pgoff, NULL,
- gfp & GFP_RECLAIM_MASK);
+ gfp & GFP_RECLAIM_MASK, dst_mm);
if (ret)
- goto out_release_uncharge;
-
- mem_cgroup_commit_charge(page, memcg, false, false);
+ goto out_release;
_dst_pte = mk_pte(page, dst_vma->vm_page_prot);
if (dst_vma->vm_flags & VM_WRITE)
@@ -2394,13 +2433,13 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
ret = -EFAULT;
max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
if (unlikely(offset >= max_off))
- goto out_release_uncharge_unlock;
+ goto out_release_unlock;
ret = -EEXIST;
if (!pte_none(*dst_pte))
- goto out_release_uncharge_unlock;
+ goto out_release_unlock;
- lru_cache_add_anon(page);
+ lru_cache_add(page);
spin_lock_irq(&info->lock);
info->alloced++;
@@ -2419,12 +2458,10 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
ret = 0;
out:
return ret;
-out_release_uncharge_unlock:
+out_release_unlock:
pte_unmap_unlock(dst_pte, ptl);
ClearPageDirty(page);
delete_from_page_cache(page);
-out_release_uncharge:
- mem_cgroup_cancel_charge(page, memcg, false);
out_release:
unlock_page(page);
put_page(page);
@@ -2959,7 +2996,7 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr
* first link must skip that, to get the accounting right.
*/
if (inode->i_nlink) {
- ret = shmem_reserve_inode(inode->i_sb);
+ ret = shmem_reserve_inode(inode->i_sb, NULL);
if (ret)
goto out;
}
@@ -3205,7 +3242,7 @@ static int shmem_initxattrs(struct inode *inode,
new_xattr->name = kmalloc(XATTR_SECURITY_PREFIX_LEN + len,
GFP_KERNEL);
if (!new_xattr->name) {
- kfree(new_xattr);
+ kvfree(new_xattr);
return -ENOMEM;
}
@@ -3374,6 +3411,8 @@ enum shmem_param {
Opt_nr_inodes,
Opt_size,
Opt_uid,
+ Opt_inode32,
+ Opt_inode64,
};
static const struct constant_table shmem_param_enums_huge[] = {
@@ -3393,6 +3432,8 @@ const struct fs_parameter_spec shmem_fs_parameters[] = {
fsparam_string("nr_inodes", Opt_nr_inodes),
fsparam_string("size", Opt_size),
fsparam_u32 ("uid", Opt_uid),
+ fsparam_flag ("inode32", Opt_inode32),
+ fsparam_flag ("inode64", Opt_inode64),
{}
};
@@ -3464,6 +3505,18 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
break;
}
goto unsupported_parameter;
+ case Opt_inode32:
+ ctx->full_inums = false;
+ ctx->seen |= SHMEM_SEEN_INUMS;
+ break;
+ case Opt_inode64:
+ if (sizeof(ino_t) < 8) {
+ return invalfc(fc,
+ "Cannot use inode64 with <64bit inums in kernel\n");
+ }
+ ctx->full_inums = true;
+ ctx->seen |= SHMEM_SEEN_INUMS;
+ break;
}
return 0;
@@ -3555,8 +3608,16 @@ static int shmem_reconfigure(struct fs_context *fc)
}
}
+ if ((ctx->seen & SHMEM_SEEN_INUMS) && !ctx->full_inums &&
+ sbinfo->next_ino > UINT_MAX) {
+ err = "Current inum too high to switch to 32-bit inums";
+ goto out;
+ }
+
if (ctx->seen & SHMEM_SEEN_HUGE)
sbinfo->huge = ctx->huge;
+ if (ctx->seen & SHMEM_SEEN_INUMS)
+ sbinfo->full_inums = ctx->full_inums;
if (ctx->seen & SHMEM_SEEN_BLOCKS)
sbinfo->max_blocks = ctx->blocks;
if (ctx->seen & SHMEM_SEEN_INODES) {
@@ -3596,6 +3657,29 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root)
if (!gid_eq(sbinfo->gid, GLOBAL_ROOT_GID))
seq_printf(seq, ",gid=%u",
from_kgid_munged(&init_user_ns, sbinfo->gid));
+
+ /*
+ * Showing inode{64,32} might be useful even if it's the system default,
+ * since then people don't have to resort to checking both here and
+ * /proc/config.gz (which may not even exist if IKCONFIG_PROC isn't
+ * enabled) to confirm 64-bit inums were successfully applied.
+ *
+ * We hide it when inode64 isn't the default and we are using 32-bit
+ * inodes, since that probably just means the feature isn't even under
+ * consideration.
+ *
+ * As such:
+ *
+ * +-----------------+-----------------+
+ * | TMPFS_INODE64=y | TMPFS_INODE64=n |
+ * +------------------+-----------------+-----------------+
+ * | full_inums=true | show | show |
+ * | full_inums=false | show | hide |
+ * +------------------+-----------------+-----------------+
+ *
+ */
+ if (IS_ENABLED(CONFIG_TMPFS_INODE64) || sbinfo->full_inums)
+ seq_printf(seq, ",inode%d", (sbinfo->full_inums ? 64 : 32));
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* Rightly or wrongly, show huge mount option unmasked by shmem_huge */
if (sbinfo->huge)
@@ -3611,6 +3695,7 @@ static void shmem_put_super(struct super_block *sb)
{
struct shmem_sb_info *sbinfo = SHMEM_SB(sb);
+ free_percpu(sbinfo->ino_batch);
percpu_counter_destroy(&sbinfo->used_blocks);
mpol_put(sbinfo->mpol);
kfree(sbinfo);
@@ -3643,6 +3728,8 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
ctx->blocks = shmem_default_max_blocks();
if (!(ctx->seen & SHMEM_SEEN_INODES))
ctx->inodes = shmem_default_max_inodes();
+ if (!(ctx->seen & SHMEM_SEEN_INUMS))
+ ctx->full_inums = IS_ENABLED(CONFIG_TMPFS_INODE64);
} else {
sb->s_flags |= SB_NOUSER;
}
@@ -3653,8 +3740,14 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc)
#endif
sbinfo->max_blocks = ctx->blocks;
sbinfo->free_inodes = sbinfo->max_inodes = ctx->inodes;
+ if (sb->s_flags & SB_KERNMOUNT) {
+ sbinfo->ino_batch = alloc_percpu(ino_t);
+ if (!sbinfo->ino_batch)
+ goto failed;
+ }
sbinfo->uid = ctx->uid;
sbinfo->gid = ctx->gid;
+ sbinfo->full_inums = ctx->full_inums;
sbinfo->mode = ctx->mode;
sbinfo->huge = ctx->huge;
sbinfo->mpol = ctx->mpol;
@@ -4155,7 +4248,7 @@ EXPORT_SYMBOL_GPL(shmem_file_setup_with_mnt);
/**
* shmem_zero_setup - setup a shared anonymous mapping
- * @vma: the vma to be mmapped is prepared by do_mmap_pgoff
+ * @vma: the vma to be mmapped is prepared by do_mmap
*/
int shmem_zero_setup(struct vm_area_struct *vma)
{
@@ -4163,7 +4256,7 @@ int shmem_zero_setup(struct vm_area_struct *vma)
loff_t size = vma->vm_end - vma->vm_start;
/*
- * Cloning a new file under mmap_sem leads to a lock ordering conflict
+ * Cloning a new file under mmap_lock leads to a lock ordering conflict
* between XFS directory reading and selinux: since this file is only
* accessible to the user through its mapping, use S_PRIVATE flag to
* bypass file security, in the same way as shmem_kernel_file_setup().
diff --git a/mm/shuffle.c b/mm/shuffle.c
index 44406d9977c7..9b5cd4b004b0 100644
--- a/mm/shuffle.c
+++ b/mm/shuffle.c
@@ -10,33 +10,11 @@
#include "shuffle.h"
DEFINE_STATIC_KEY_FALSE(page_alloc_shuffle_key);
-static unsigned long shuffle_state __ro_after_init;
-
-/*
- * Depending on the architecture, module parameter parsing may run
- * before, or after the cache detection. SHUFFLE_FORCE_DISABLE prevents,
- * or reverts the enabling of the shuffle implementation. SHUFFLE_ENABLE
- * attempts to turn on the implementation, but aborts if it finds
- * SHUFFLE_FORCE_DISABLE already set.
- */
-__meminit void page_alloc_shuffle(enum mm_shuffle_ctl ctl)
-{
- if (ctl == SHUFFLE_FORCE_DISABLE)
- set_bit(SHUFFLE_FORCE_DISABLE, &shuffle_state);
-
- if (test_bit(SHUFFLE_FORCE_DISABLE, &shuffle_state)) {
- if (test_and_clear_bit(SHUFFLE_ENABLE, &shuffle_state))
- static_branch_disable(&page_alloc_shuffle_key);
- } else if (ctl == SHUFFLE_ENABLE
- && !test_and_set_bit(SHUFFLE_ENABLE, &shuffle_state))
- static_branch_enable(&page_alloc_shuffle_key);
-}
static bool shuffle_param;
static int shuffle_show(char *buffer, const struct kernel_param *kp)
{
- return sprintf(buffer, "%c\n", test_bit(SHUFFLE_ENABLE, &shuffle_state)
- ? 'Y' : 'N');
+ return sprintf(buffer, "%c\n", shuffle_param ? 'Y' : 'N');
}
static __meminit int shuffle_store(const char *val,
@@ -47,9 +25,7 @@ static __meminit int shuffle_store(const char *val,
if (rc < 0)
return rc;
if (shuffle_param)
- page_alloc_shuffle(SHUFFLE_ENABLE);
- else
- page_alloc_shuffle(SHUFFLE_FORCE_DISABLE);
+ static_branch_enable(&page_alloc_shuffle_key);
return 0;
}
module_param_call(shuffle, shuffle_store, shuffle_show, &shuffle_param, 0400);
@@ -58,25 +34,25 @@ module_param_call(shuffle, shuffle_store, shuffle_show, &shuffle_param, 0400);
* For two pages to be swapped in the shuffle, they must be free (on a
* 'free_area' lru), have the same order, and have the same migratetype.
*/
-static struct page * __meminit shuffle_valid_page(unsigned long pfn, int order)
+static struct page * __meminit shuffle_valid_page(struct zone *zone,
+ unsigned long pfn, int order)
{
- struct page *page;
+ struct page *page = pfn_to_online_page(pfn);
/*
* Given we're dealing with randomly selected pfns in a zone we
* need to ask questions like...
*/
- /* ...is the pfn even in the memmap? */
- if (!pfn_valid_within(pfn))
+ /* ... is the page managed by the buddy? */
+ if (!page)
return NULL;
- /* ...is the pfn in a present section or a hole? */
- if (!pfn_in_present_section(pfn))
+ /* ... is the page assigned to the same zone? */
+ if (page_zone(page) != zone)
return NULL;
/* ...is the page free and currently on a free_area list? */
- page = pfn_to_page(pfn);
if (!PageBuddy(page))
return NULL;
@@ -123,7 +99,7 @@ void __meminit __shuffle_zone(struct zone *z)
* page_j randomly selected in the span @zone_start_pfn to
* @spanned_pages.
*/
- page_i = shuffle_valid_page(i, order);
+ page_i = shuffle_valid_page(z, i, order);
if (!page_i)
continue;
@@ -137,7 +113,7 @@ void __meminit __shuffle_zone(struct zone *z)
j = z->zone_start_pfn +
ALIGN_DOWN(get_random_long() % z->spanned_pages,
order_pages);
- page_j = shuffle_valid_page(j, order);
+ page_j = shuffle_valid_page(z, j, order);
if (page_j && page_j != page_i)
break;
}
diff --git a/mm/shuffle.h b/mm/shuffle.h
index 4d79f03b6658..71b784f0b7c3 100644
--- a/mm/shuffle.h
+++ b/mm/shuffle.h
@@ -4,23 +4,10 @@
#define _MM_SHUFFLE_H
#include <linux/jump_label.h>
-/*
- * SHUFFLE_ENABLE is called from the command line enabling path, or by
- * platform-firmware enabling that indicates the presence of a
- * direct-mapped memory-side-cache. SHUFFLE_FORCE_DISABLE is called from
- * the command line path and overrides any previous or future
- * SHUFFLE_ENABLE.
- */
-enum mm_shuffle_ctl {
- SHUFFLE_ENABLE,
- SHUFFLE_FORCE_DISABLE,
-};
-
#define SHUFFLE_ORDER (MAX_ORDER-1)
#ifdef CONFIG_SHUFFLE_PAGE_ALLOCATOR
DECLARE_STATIC_KEY_FALSE(page_alloc_shuffle_key);
-extern void page_alloc_shuffle(enum mm_shuffle_ctl ctl);
extern void __shuffle_free_memory(pg_data_t *pgdat);
extern bool shuffle_pick_tail(void);
static inline void shuffle_free_memory(pg_data_t *pgdat)
@@ -58,10 +45,6 @@ static inline void shuffle_zone(struct zone *z)
{
}
-static inline void page_alloc_shuffle(enum mm_shuffle_ctl ctl)
-{
-}
-
static inline bool is_shuffle_order(int order)
{
return false;
diff --git a/mm/slab.c b/mm/slab.c
index a89633603b2d..f658e86ec8ce 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -588,6 +588,16 @@ static int transfer_objects(struct array_cache *to,
return nr;
}
+/* &alien->lock must be held by alien callers. */
+static __always_inline void __free_one(struct array_cache *ac, void *objp)
+{
+ /* Avoid trivial double-free. */
+ if (IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) &&
+ WARN_ON_ONCE(ac->avail > 0 && ac->entry[ac->avail - 1] == objp))
+ return;
+ ac->entry[ac->avail++] = objp;
+}
+
#ifndef CONFIG_NUMA
#define drain_alien_cache(cachep, alien) do { } while (0)
@@ -767,7 +777,7 @@ static int __cache_free_alien(struct kmem_cache *cachep, void *objp,
STATS_INC_ACOVERFLOW(cachep);
__drain_alien_cache(cachep, ac, page_node, &list);
}
- ac->entry[ac->avail++] = objp;
+ __free_one(ac, objp);
spin_unlock(&alien->lock);
slabs_destroy(cachep, &list);
} else {
@@ -1050,7 +1060,7 @@ int slab_prepare_cpu(unsigned int cpu)
* offline.
*
* Even if all the cpus of a node are down, we don't free the
- * kmem_list3 of any cache. This to avoid a race between cpu_down, and
+ * kmem_cache_node of any cache. This is to avoid a race between cpu_down, and
* a kmalloc allocation from another cpu for memory from the node of
* the cpu going down. The list3 structure is usually allocated from
* kmem_cache_create() and gets destroyed at kmem_cache_destroy().
@@ -1239,7 +1249,6 @@ void __init kmem_cache_init(void)
nr_node_ids * sizeof(struct kmem_cache_node *),
SLAB_HWCACHE_ALIGN, 0, 0);
list_add(&kmem_cache->list, &slab_caches);
- memcg_link_cache(kmem_cache, NULL);
slab_state = PARTIAL;
/*
@@ -1370,11 +1379,7 @@ static struct page *kmem_getpages(struct kmem_cache *cachep, gfp_t flags,
return NULL;
}
- if (charge_slab_page(page, flags, cachep->gfporder, cachep)) {
- __free_pages(page, cachep->gfporder);
- return NULL;
- }
-
+ account_slab_page(page, cachep->gfporder, cachep);
__SetPageSlab(page);
/* Record if ALLOC_NO_WATERMARKS was set when allocating the slab */
if (sk_memalloc_socks() && page_is_pfmemalloc(page))
@@ -1398,7 +1403,7 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page)
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += 1 << order;
- uncharge_slab_page(page, order, cachep);
+ unaccount_slab_page(page, order, cachep);
__free_pages(page, order);
}
@@ -1627,6 +1632,10 @@ static void slab_destroy(struct kmem_cache *cachep, struct page *page)
kmem_cache_free(cachep->freelist_cache, freelist);
}
+/*
+ * Update the size of the caches before calling slabs_destroy as it may
+ * recursively call kfree.
+ */
static void slabs_destroy(struct kmem_cache *cachep, struct list_head *list)
{
struct page *page, *n;
@@ -2148,8 +2157,8 @@ static void do_drain(void *arg)
spin_lock(&n->list_lock);
free_block(cachep, ac->entry, ac->avail, node, &list);
spin_unlock(&n->list_lock);
- slabs_destroy(cachep, &list);
ac->avail = 0;
+ slabs_destroy(cachep, &list);
}
static void drain_cpu_caches(struct kmem_cache *cachep)
@@ -2243,17 +2252,6 @@ int __kmem_cache_shrink(struct kmem_cache *cachep)
return (ret ? 1 : 0);
}
-#ifdef CONFIG_MEMCG
-void __kmemcg_cache_deactivate(struct kmem_cache *cachep)
-{
- __kmem_cache_shrink(cachep);
-}
-
-void __kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s)
-{
-}
-#endif
-
int __kmem_cache_shutdown(struct kmem_cache *cachep)
{
return __kmem_cache_shrink(cachep);
@@ -2579,13 +2577,9 @@ static struct page *cache_grow_begin(struct kmem_cache *cachep,
* Be lazy and only check for valid flags here, keeping it out of the
* critical path in kmem_cache_alloc().
*/
- if (unlikely(flags & GFP_SLAB_BUG_MASK)) {
- gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK;
- flags &= ~GFP_SLAB_BUG_MASK;
- pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n",
- invalid_mask, &invalid_mask, flags, &flags);
- dump_stack();
- }
+ if (unlikely(flags & GFP_SLAB_BUG_MASK))
+ flags = kmalloc_fix_flags(flags);
+
WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO));
local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK);
@@ -3106,7 +3100,7 @@ static void *fallback_alloc(struct kmem_cache *cache, gfp_t flags)
struct zonelist *zonelist;
struct zoneref *z;
struct zone *zone;
- enum zone_type high_zoneidx = gfp_zone(flags);
+ enum zone_type highest_zoneidx = gfp_zone(flags);
void *obj = NULL;
struct page *page;
int nid;
@@ -3124,7 +3118,7 @@ retry:
* Look through allowed nodes for objects available
* from existing per node queues.
*/
- for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
+ for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) {
nid = zone_to_nid(zone);
if (cpuset_zone_allowed(zone, flags) &&
@@ -3222,9 +3216,10 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
unsigned long save_flags;
void *ptr;
int slab_node = numa_mem_id();
+ struct obj_cgroup *objcg = NULL;
flags &= gfp_allowed_mask;
- cachep = slab_pre_alloc_hook(cachep, flags);
+ cachep = slab_pre_alloc_hook(cachep, &objcg, 1, flags);
if (unlikely(!cachep))
return NULL;
@@ -3260,7 +3255,7 @@ slab_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid,
if (unlikely(slab_want_init_on_alloc(flags, cachep)) && ptr)
memset(ptr, 0, cachep->object_size);
- slab_post_alloc_hook(cachep, flags, 1, &ptr);
+ slab_post_alloc_hook(cachep, objcg, flags, 1, &ptr);
return ptr;
}
@@ -3301,9 +3296,10 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
{
unsigned long save_flags;
void *objp;
+ struct obj_cgroup *objcg = NULL;
flags &= gfp_allowed_mask;
- cachep = slab_pre_alloc_hook(cachep, flags);
+ cachep = slab_pre_alloc_hook(cachep, &objcg, 1, flags);
if (unlikely(!cachep))
return NULL;
@@ -3317,7 +3313,7 @@ slab_alloc(struct kmem_cache *cachep, gfp_t flags, unsigned long caller)
if (unlikely(slab_want_init_on_alloc(flags, cachep)) && objp)
memset(objp, 0, cachep->object_size);
- slab_post_alloc_hook(cachep, flags, 1, &objp);
+ slab_post_alloc_hook(cachep, objcg, flags, 1, &objp);
return objp;
}
@@ -3410,9 +3406,9 @@ free_done:
}
#endif
spin_unlock(&n->list_lock);
- slabs_destroy(cachep, &list);
ac->avail -= batchcount;
memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail);
+ slabs_destroy(cachep, &list);
}
/*
@@ -3426,6 +3422,11 @@ static __always_inline void __cache_free(struct kmem_cache *cachep, void *objp,
if (kasan_slab_free(cachep, objp, _RET_IP_))
return;
+ /* Use KCSAN to help debug racy use-after-free. */
+ if (!(cachep->flags & SLAB_TYPESAFE_BY_RCU))
+ __kcsan_check_access(objp, cachep->object_size,
+ KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT);
+
___cache_free(cachep, objp, caller);
}
@@ -3439,6 +3440,7 @@ void ___cache_free(struct kmem_cache *cachep, void *objp,
memset(objp, 0, cachep->object_size);
kmemleak_free_recursive(objp, cachep->flags);
objp = cache_free_debugcheck(cachep, objp, caller);
+ memcg_slab_free_hook(cachep, virt_to_head_page(objp), objp);
/*
* Skip calling cache_free_alien() when the platform is not numa.
@@ -3466,7 +3468,7 @@ void ___cache_free(struct kmem_cache *cachep, void *objp,
}
}
- ac->entry[ac->avail++] = objp;
+ __free_one(ac, objp);
}
/**
@@ -3504,8 +3506,9 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
void **p)
{
size_t i;
+ struct obj_cgroup *objcg = NULL;
- s = slab_pre_alloc_hook(s, flags);
+ s = slab_pre_alloc_hook(s, &objcg, size, flags);
if (!s)
return 0;
@@ -3528,13 +3531,13 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
for (i = 0; i < size; i++)
memset(p[i], 0, s->object_size);
- slab_post_alloc_hook(s, flags, size, p);
+ slab_post_alloc_hook(s, objcg, flags, size, p);
/* FIXME: Trace call missing. Christoph would like a bulk variant */
return size;
error:
local_irq_enable();
cache_alloc_debugcheck_after_bulk(s, flags, i, p, _RET_IP_);
- slab_post_alloc_hook(s, flags, i, p);
+ slab_post_alloc_hook(s, objcg, flags, i, p);
__kmem_cache_free_bulk(s, i, p);
return 0;
}
@@ -3796,8 +3799,8 @@ fail:
}
/* Always called with the slab_mutex held */
-static int __do_tune_cpucache(struct kmem_cache *cachep, int limit,
- int batchcount, int shared, gfp_t gfp)
+static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
+ int batchcount, int shared, gfp_t gfp)
{
struct array_cache __percpu *cpu_cache, *prev;
int cpu;
@@ -3842,29 +3845,6 @@ setup_node:
return setup_kmem_cache_nodes(cachep, gfp);
}
-static int do_tune_cpucache(struct kmem_cache *cachep, int limit,
- int batchcount, int shared, gfp_t gfp)
-{
- int ret;
- struct kmem_cache *c;
-
- ret = __do_tune_cpucache(cachep, limit, batchcount, shared, gfp);
-
- if (slab_state < FULL)
- return ret;
-
- if ((ret < 0) || !is_root_cache(cachep))
- return ret;
-
- lockdep_assert_held(&slab_mutex);
- for_each_memcg_cache(c, cachep) {
- /* return value determined by the root cache only */
- __do_tune_cpucache(c, limit, batchcount, shared, gfp);
- }
-
- return ret;
-}
-
/* Called with slab_mutex held always */
static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
{
@@ -3877,13 +3857,6 @@ static int enable_cpucache(struct kmem_cache *cachep, gfp_t gfp)
if (err)
goto end;
- if (!is_root_cache(cachep)) {
- struct kmem_cache *root = memcg_root_cache(cachep);
- limit = root->limit;
- shared = root->shared;
- batchcount = root->batchcount;
- }
-
if (limit && shared && batchcount)
goto skip_setup;
/*
diff --git a/mm/slab.h b/mm/slab.h
index 207c83ef6e06..6cc323f1313a 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -30,69 +30,6 @@ struct kmem_cache {
struct list_head list; /* List of all slab caches on the system */
};
-#else /* !CONFIG_SLOB */
-
-struct memcg_cache_array {
- struct rcu_head rcu;
- struct kmem_cache *entries[0];
-};
-
-/*
- * This is the main placeholder for memcg-related information in kmem caches.
- * Both the root cache and the child caches will have it. For the root cache,
- * this will hold a dynamically allocated array large enough to hold
- * information about the currently limited memcgs in the system. To allow the
- * array to be accessed without taking any locks, on relocation we free the old
- * version only after a grace period.
- *
- * Root and child caches hold different metadata.
- *
- * @root_cache: Common to root and child caches. NULL for root, pointer to
- * the root cache for children.
- *
- * The following fields are specific to root caches.
- *
- * @memcg_caches: kmemcg ID indexed table of child caches. This table is
- * used to index child cachces during allocation and cleared
- * early during shutdown.
- *
- * @root_caches_node: List node for slab_root_caches list.
- *
- * @children: List of all child caches. While the child caches are also
- * reachable through @memcg_caches, a child cache remains on
- * this list until it is actually destroyed.
- *
- * The following fields are specific to child caches.
- *
- * @memcg: Pointer to the memcg this cache belongs to.
- *
- * @children_node: List node for @root_cache->children list.
- *
- * @kmem_caches_node: List node for @memcg->kmem_caches list.
- */
-struct memcg_cache_params {
- struct kmem_cache *root_cache;
- union {
- struct {
- struct memcg_cache_array __rcu *memcg_caches;
- struct list_head __root_caches_node;
- struct list_head children;
- bool dying;
- };
- struct {
- struct mem_cgroup *memcg;
- struct list_head children_node;
- struct list_head kmem_caches_node;
- struct percpu_ref refcnt;
-
- void (*work_fn)(struct kmem_cache *);
- union {
- struct rcu_head rcu_head;
- struct work_struct work;
- };
- };
- };
-};
#endif /* CONFIG_SLOB */
#ifdef CONFIG_SLAB
@@ -109,6 +46,7 @@ struct memcg_cache_params {
#include <linux/kmemleak.h>
#include <linux/random.h>
#include <linux/sched/mm.h>
+#include <linux/kmemleak.h>
/*
* State of the slab allocator.
@@ -152,6 +90,7 @@ void create_kmalloc_caches(slab_flags_t);
struct kmem_cache *kmalloc_slab(size_t, gfp_t);
#endif
+gfp_t kmalloc_fix_flags(gfp_t flags);
/* Functions provided by the slab allocators */
int __kmem_cache_create(struct kmem_cache *, slab_flags_t flags);
@@ -234,10 +173,7 @@ bool __kmem_cache_empty(struct kmem_cache *);
int __kmem_cache_shutdown(struct kmem_cache *);
void __kmem_cache_release(struct kmem_cache *);
int __kmem_cache_shrink(struct kmem_cache *);
-void __kmemcg_cache_deactivate(struct kmem_cache *s);
-void __kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s);
void slab_kmem_cache_release(struct kmem_cache *);
-void kmem_cache_shrink_all(struct kmem_cache *s);
struct seq_file;
struct file;
@@ -272,199 +208,208 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **);
static inline int cache_vmstat_idx(struct kmem_cache *s)
{
return (s->flags & SLAB_RECLAIM_ACCOUNT) ?
- NR_SLAB_RECLAIMABLE : NR_SLAB_UNRECLAIMABLE;
+ NR_SLAB_RECLAIMABLE_B : NR_SLAB_UNRECLAIMABLE_B;
}
-#ifdef CONFIG_MEMCG_KMEM
-
-/* List of all root caches. */
-extern struct list_head slab_root_caches;
-#define root_caches_node memcg_params.__root_caches_node
+#ifdef CONFIG_SLUB_DEBUG
+#ifdef CONFIG_SLUB_DEBUG_ON
+DECLARE_STATIC_KEY_TRUE(slub_debug_enabled);
+#else
+DECLARE_STATIC_KEY_FALSE(slub_debug_enabled);
+#endif
+extern void print_tracking(struct kmem_cache *s, void *object);
+#else
+static inline void print_tracking(struct kmem_cache *s, void *object)
+{
+}
+#endif
/*
- * Iterate over all memcg caches of the given root cache. The caller must hold
- * slab_mutex.
+ * Returns true if any of the specified slub_debug flags is enabled for the
+ * cache. Use only for flags parsed by setup_slub_debug() as it also enables
+ * the static key.
*/
-#define for_each_memcg_cache(iter, root) \
- list_for_each_entry(iter, &(root)->memcg_params.children, \
- memcg_params.children_node)
-
-static inline bool is_root_cache(struct kmem_cache *s)
+static inline bool kmem_cache_debug_flags(struct kmem_cache *s, slab_flags_t flags)
{
- return !s->memcg_params.root_cache;
+#ifdef CONFIG_SLUB_DEBUG
+ VM_WARN_ON_ONCE(!(flags & SLAB_DEBUG_FLAGS));
+ if (static_branch_unlikely(&slub_debug_enabled))
+ return s->flags & flags;
+#endif
+ return false;
}
-static inline bool slab_equal_or_root(struct kmem_cache *s,
- struct kmem_cache *p)
+#ifdef CONFIG_MEMCG_KMEM
+static inline struct obj_cgroup **page_obj_cgroups(struct page *page)
{
- return p == s || p == s->memcg_params.root_cache;
+ /*
+ * page->mem_cgroup and page->obj_cgroups are sharing the same
+ * space. To distinguish between them in case we don't know for sure
+ * that the page is a slab page (e.g. page_cgroup_ino()), let's
+ * always set the lowest bit of obj_cgroups.
+ */
+ return (struct obj_cgroup **)
+ ((unsigned long)page->obj_cgroups & ~0x1UL);
}
-/*
- * We use suffixes to the name in memcg because we can't have caches
- * created in the system with the same name. But when we print them
- * locally, better refer to them with the base name
- */
-static inline const char *cache_name(struct kmem_cache *s)
+static inline bool page_has_obj_cgroups(struct page *page)
{
- if (!is_root_cache(s))
- s = s->memcg_params.root_cache;
- return s->name;
+ return ((unsigned long)page->obj_cgroups & 0x1UL);
}
-static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
+int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
+ gfp_t gfp);
+
+static inline void memcg_free_page_obj_cgroups(struct page *page)
{
- if (is_root_cache(s))
- return s;
- return s->memcg_params.root_cache;
+ kfree(page_obj_cgroups(page));
+ page->obj_cgroups = NULL;
}
-/*
- * Expects a pointer to a slab page. Please note, that PageSlab() check
- * isn't sufficient, as it returns true also for tail compound slab pages,
- * which do not have slab_cache pointer set.
- * So this function assumes that the page can pass PageSlab() && !PageTail()
- * check.
- *
- * The kmem_cache can be reparented asynchronously. The caller must ensure
- * the memcg lifetime, e.g. by taking rcu_read_lock() or cgroup_mutex.
- */
-static inline struct mem_cgroup *memcg_from_slab_page(struct page *page)
+static inline size_t obj_full_size(struct kmem_cache *s)
{
- struct kmem_cache *s;
-
- s = READ_ONCE(page->slab_cache);
- if (s && !is_root_cache(s))
- return READ_ONCE(s->memcg_params.memcg);
-
- return NULL;
+ /*
+ * For each accounted object there is an extra space which is used
+ * to store obj_cgroup membership. Charge it too.
+ */
+ return s->size + sizeof(struct obj_cgroup *);
}
-/*
- * Charge the slab page belonging to the non-root kmem_cache.
- * Can be called for non-root kmem_caches only.
- */
-static __always_inline int memcg_charge_slab(struct page *page,
- gfp_t gfp, int order,
- struct kmem_cache *s)
+static inline struct obj_cgroup *memcg_slab_pre_alloc_hook(struct kmem_cache *s,
+ size_t objects,
+ gfp_t flags)
{
- unsigned int nr_pages = 1 << order;
- struct mem_cgroup *memcg;
- struct lruvec *lruvec;
- int ret;
-
- rcu_read_lock();
- memcg = READ_ONCE(s->memcg_params.memcg);
- while (memcg && !css_tryget_online(&memcg->css))
- memcg = parent_mem_cgroup(memcg);
- rcu_read_unlock();
+ struct obj_cgroup *objcg;
- if (unlikely(!memcg || mem_cgroup_is_root(memcg))) {
- mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
- nr_pages);
- percpu_ref_get_many(&s->memcg_params.refcnt, nr_pages);
- return 0;
- }
+ if (memcg_kmem_bypass())
+ return NULL;
- ret = memcg_kmem_charge(memcg, gfp, nr_pages);
- if (ret)
- goto out;
+ objcg = get_obj_cgroup_from_current();
+ if (!objcg)
+ return NULL;
- lruvec = mem_cgroup_lruvec(memcg, page_pgdat(page));
- mod_lruvec_state(lruvec, cache_vmstat_idx(s), nr_pages);
+ if (obj_cgroup_charge(objcg, flags, objects * obj_full_size(s))) {
+ obj_cgroup_put(objcg);
+ return NULL;
+ }
- /* transer try_charge() page references to kmem_cache */
- percpu_ref_get_many(&s->memcg_params.refcnt, nr_pages);
- css_put_many(&memcg->css, nr_pages);
-out:
- css_put(&memcg->css);
- return ret;
+ return objcg;
}
-/*
- * Uncharge a slab page belonging to a non-root kmem_cache.
- * Can be called for non-root kmem_caches only.
- */
-static __always_inline void memcg_uncharge_slab(struct page *page, int order,
- struct kmem_cache *s)
+static inline void mod_objcg_state(struct obj_cgroup *objcg,
+ struct pglist_data *pgdat,
+ int idx, int nr)
{
- unsigned int nr_pages = 1 << order;
struct mem_cgroup *memcg;
struct lruvec *lruvec;
rcu_read_lock();
- memcg = READ_ONCE(s->memcg_params.memcg);
- if (likely(!mem_cgroup_is_root(memcg))) {
- lruvec = mem_cgroup_lruvec(memcg, page_pgdat(page));
- mod_lruvec_state(lruvec, cache_vmstat_idx(s), -nr_pages);
- memcg_kmem_uncharge(memcg, nr_pages);
- } else {
- mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
- -nr_pages);
- }
+ memcg = obj_cgroup_memcg(objcg);
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ mod_memcg_lruvec_state(lruvec, idx, nr);
rcu_read_unlock();
+}
+
+static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
+ struct obj_cgroup *objcg,
+ gfp_t flags, size_t size,
+ void **p)
+{
+ struct page *page;
+ unsigned long off;
+ size_t i;
+
+ if (!objcg)
+ return;
- percpu_ref_put_many(&s->memcg_params.refcnt, nr_pages);
+ flags &= ~__GFP_ACCOUNT;
+ for (i = 0; i < size; i++) {
+ if (likely(p[i])) {
+ page = virt_to_head_page(p[i]);
+
+ if (!page_has_obj_cgroups(page) &&
+ memcg_alloc_page_obj_cgroups(page, s, flags)) {
+ obj_cgroup_uncharge(objcg, obj_full_size(s));
+ continue;
+ }
+
+ off = obj_to_index(s, page, p[i]);
+ obj_cgroup_get(objcg);
+ page_obj_cgroups(page)[off] = objcg;
+ mod_objcg_state(objcg, page_pgdat(page),
+ cache_vmstat_idx(s), obj_full_size(s));
+ } else {
+ obj_cgroup_uncharge(objcg, obj_full_size(s));
+ }
+ }
+ obj_cgroup_put(objcg);
}
-extern void slab_init_memcg_params(struct kmem_cache *);
-extern void memcg_link_cache(struct kmem_cache *s, struct mem_cgroup *memcg);
+static inline void memcg_slab_free_hook(struct kmem_cache *s, struct page *page,
+ void *p)
+{
+ struct obj_cgroup *objcg;
+ unsigned int off;
-#else /* CONFIG_MEMCG_KMEM */
+ if (!memcg_kmem_enabled())
+ return;
-/* If !memcg, all caches are root. */
-#define slab_root_caches slab_caches
-#define root_caches_node list
+ if (!page_has_obj_cgroups(page))
+ return;
-#define for_each_memcg_cache(iter, root) \
- for ((void)(iter), (void)(root); 0; )
+ off = obj_to_index(s, page, p);
+ objcg = page_obj_cgroups(page)[off];
+ page_obj_cgroups(page)[off] = NULL;
-static inline bool is_root_cache(struct kmem_cache *s)
-{
- return true;
-}
+ if (!objcg)
+ return;
-static inline bool slab_equal_or_root(struct kmem_cache *s,
- struct kmem_cache *p)
-{
- return s == p;
-}
+ obj_cgroup_uncharge(objcg, obj_full_size(s));
+ mod_objcg_state(objcg, page_pgdat(page), cache_vmstat_idx(s),
+ -obj_full_size(s));
-static inline const char *cache_name(struct kmem_cache *s)
-{
- return s->name;
+ obj_cgroup_put(objcg);
}
-static inline struct kmem_cache *memcg_root_cache(struct kmem_cache *s)
+#else /* CONFIG_MEMCG_KMEM */
+static inline bool page_has_obj_cgroups(struct page *page)
{
- return s;
+ return false;
}
-static inline struct mem_cgroup *memcg_from_slab_page(struct page *page)
+static inline struct mem_cgroup *memcg_from_slab_obj(void *ptr)
{
return NULL;
}
-static inline int memcg_charge_slab(struct page *page, gfp_t gfp, int order,
- struct kmem_cache *s)
+static inline int memcg_alloc_page_obj_cgroups(struct page *page,
+ struct kmem_cache *s, gfp_t gfp)
{
return 0;
}
-static inline void memcg_uncharge_slab(struct page *page, int order,
- struct kmem_cache *s)
+static inline void memcg_free_page_obj_cgroups(struct page *page)
{
}
-static inline void slab_init_memcg_params(struct kmem_cache *s)
+static inline struct obj_cgroup *memcg_slab_pre_alloc_hook(struct kmem_cache *s,
+ size_t objects,
+ gfp_t flags)
{
+ return NULL;
}
-static inline void memcg_link_cache(struct kmem_cache *s,
- struct mem_cgroup *memcg)
+static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
+ struct obj_cgroup *objcg,
+ gfp_t flags, size_t size,
+ void **p)
{
}
+static inline void memcg_slab_free_hook(struct kmem_cache *s, struct page *page,
+ void *p)
+{
+}
#endif /* CONFIG_MEMCG_KMEM */
static inline struct kmem_cache *virt_to_cache(const void *obj)
@@ -478,51 +423,36 @@ static inline struct kmem_cache *virt_to_cache(const void *obj)
return page->slab_cache;
}
-static __always_inline int charge_slab_page(struct page *page,
- gfp_t gfp, int order,
- struct kmem_cache *s)
+static __always_inline void account_slab_page(struct page *page, int order,
+ struct kmem_cache *s)
{
- if (is_root_cache(s)) {
- mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
- 1 << order);
- return 0;
- }
-
- return memcg_charge_slab(page, gfp, order, s);
+ mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
+ PAGE_SIZE << order);
}
-static __always_inline void uncharge_slab_page(struct page *page, int order,
- struct kmem_cache *s)
+static __always_inline void unaccount_slab_page(struct page *page, int order,
+ struct kmem_cache *s)
{
- if (is_root_cache(s)) {
- mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
- -(1 << order));
- return;
- }
+ if (memcg_kmem_enabled())
+ memcg_free_page_obj_cgroups(page);
- memcg_uncharge_slab(page, order, s);
+ mod_node_page_state(page_pgdat(page), cache_vmstat_idx(s),
+ -(PAGE_SIZE << order));
}
static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x)
{
struct kmem_cache *cachep;
- /*
- * When kmemcg is not being used, both assignments should return the
- * same value. but we don't want to pay the assignment price in that
- * case. If it is not compiled in, the compiler should be smart enough
- * to not do even the assignment. In that case, slab_equal_or_root
- * will also be a constant.
- */
- if (!memcg_kmem_enabled() &&
- !IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) &&
- !unlikely(s->flags & SLAB_CONSISTENCY_CHECKS))
+ if (!IS_ENABLED(CONFIG_SLAB_FREELIST_HARDENED) &&
+ !kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS))
return s;
cachep = virt_to_cache(x);
- WARN_ONCE(cachep && !slab_equal_or_root(cachep, s),
+ if (WARN(cachep && cachep != s,
"%s: Wrong slab cache. %s but object is from %s\n",
- __func__, s->name, cachep->name);
+ __func__, s->name, cachep->name))
+ print_tracking(cachep, x);
return cachep;
}
@@ -557,7 +487,8 @@ static inline size_t slab_ksize(const struct kmem_cache *s)
}
static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
- gfp_t flags)
+ struct obj_cgroup **objcgp,
+ size_t size, gfp_t flags)
{
flags &= gfp_allowed_mask;
@@ -571,13 +502,14 @@ static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
if (memcg_kmem_enabled() &&
((flags & __GFP_ACCOUNT) || (s->flags & SLAB_ACCOUNT)))
- return memcg_kmem_get_cache(s);
+ *objcgp = memcg_slab_pre_alloc_hook(s, size, flags);
return s;
}
-static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
- size_t size, void **p)
+static inline void slab_post_alloc_hook(struct kmem_cache *s,
+ struct obj_cgroup *objcg,
+ gfp_t flags, size_t size, void **p)
{
size_t i;
@@ -590,7 +522,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, gfp_t flags,
}
if (memcg_kmem_enabled())
- memcg_kmem_put_cache(s);
+ memcg_slab_post_alloc_hook(s, objcg, flags, size, p);
}
#ifndef CONFIG_SLOB
@@ -645,9 +577,6 @@ static inline struct kmem_cache_node *get_node(struct kmem_cache *s, int node)
void *slab_start(struct seq_file *m, loff_t *pos);
void *slab_next(struct seq_file *m, void *p, loff_t *pos);
void slab_stop(struct seq_file *m, void *p);
-void *memcg_slab_start(struct seq_file *m, loff_t *pos);
-void *memcg_slab_next(struct seq_file *m, void *p, loff_t *pos);
-void memcg_slab_stop(struct seq_file *m, void *p);
int memcg_slab_show(struct seq_file *m, void *p);
#if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)
diff --git a/mm/slab_common.c b/mm/slab_common.c
index 23c7500eea7d..f9ccd5dc13f3 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -26,6 +26,8 @@
#define CREATE_TRACE_POINTS
#include <trace/events/kmem.h>
+#include "internal.h"
+
#include "slab.h"
enum slab_state slab_state;
@@ -128,152 +130,6 @@ int __kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t nr,
return i;
}
-#ifdef CONFIG_MEMCG_KMEM
-
-LIST_HEAD(slab_root_caches);
-static DEFINE_SPINLOCK(memcg_kmem_wq_lock);
-
-static void kmemcg_cache_shutdown(struct percpu_ref *percpu_ref);
-
-void slab_init_memcg_params(struct kmem_cache *s)
-{
- s->memcg_params.root_cache = NULL;
- RCU_INIT_POINTER(s->memcg_params.memcg_caches, NULL);
- INIT_LIST_HEAD(&s->memcg_params.children);
- s->memcg_params.dying = false;
-}
-
-static int init_memcg_params(struct kmem_cache *s,
- struct kmem_cache *root_cache)
-{
- struct memcg_cache_array *arr;
-
- if (root_cache) {
- int ret = percpu_ref_init(&s->memcg_params.refcnt,
- kmemcg_cache_shutdown,
- 0, GFP_KERNEL);
- if (ret)
- return ret;
-
- s->memcg_params.root_cache = root_cache;
- INIT_LIST_HEAD(&s->memcg_params.children_node);
- INIT_LIST_HEAD(&s->memcg_params.kmem_caches_node);
- return 0;
- }
-
- slab_init_memcg_params(s);
-
- if (!memcg_nr_cache_ids)
- return 0;
-
- arr = kvzalloc(sizeof(struct memcg_cache_array) +
- memcg_nr_cache_ids * sizeof(void *),
- GFP_KERNEL);
- if (!arr)
- return -ENOMEM;
-
- RCU_INIT_POINTER(s->memcg_params.memcg_caches, arr);
- return 0;
-}
-
-static void destroy_memcg_params(struct kmem_cache *s)
-{
- if (is_root_cache(s)) {
- kvfree(rcu_access_pointer(s->memcg_params.memcg_caches));
- } else {
- mem_cgroup_put(s->memcg_params.memcg);
- WRITE_ONCE(s->memcg_params.memcg, NULL);
- percpu_ref_exit(&s->memcg_params.refcnt);
- }
-}
-
-static void free_memcg_params(struct rcu_head *rcu)
-{
- struct memcg_cache_array *old;
-
- old = container_of(rcu, struct memcg_cache_array, rcu);
- kvfree(old);
-}
-
-static int update_memcg_params(struct kmem_cache *s, int new_array_size)
-{
- struct memcg_cache_array *old, *new;
-
- new = kvzalloc(sizeof(struct memcg_cache_array) +
- new_array_size * sizeof(void *), GFP_KERNEL);
- if (!new)
- return -ENOMEM;
-
- old = rcu_dereference_protected(s->memcg_params.memcg_caches,
- lockdep_is_held(&slab_mutex));
- if (old)
- memcpy(new->entries, old->entries,
- memcg_nr_cache_ids * sizeof(void *));
-
- rcu_assign_pointer(s->memcg_params.memcg_caches, new);
- if (old)
- call_rcu(&old->rcu, free_memcg_params);
- return 0;
-}
-
-int memcg_update_all_caches(int num_memcgs)
-{
- struct kmem_cache *s;
- int ret = 0;
-
- mutex_lock(&slab_mutex);
- list_for_each_entry(s, &slab_root_caches, root_caches_node) {
- ret = update_memcg_params(s, num_memcgs);
- /*
- * Instead of freeing the memory, we'll just leave the caches
- * up to this point in an updated state.
- */
- if (ret)
- break;
- }
- mutex_unlock(&slab_mutex);
- return ret;
-}
-
-void memcg_link_cache(struct kmem_cache *s, struct mem_cgroup *memcg)
-{
- if (is_root_cache(s)) {
- list_add(&s->root_caches_node, &slab_root_caches);
- } else {
- css_get(&memcg->css);
- s->memcg_params.memcg = memcg;
- list_add(&s->memcg_params.children_node,
- &s->memcg_params.root_cache->memcg_params.children);
- list_add(&s->memcg_params.kmem_caches_node,
- &s->memcg_params.memcg->kmem_caches);
- }
-}
-
-static void memcg_unlink_cache(struct kmem_cache *s)
-{
- if (is_root_cache(s)) {
- list_del(&s->root_caches_node);
- } else {
- list_del(&s->memcg_params.children_node);
- list_del(&s->memcg_params.kmem_caches_node);
- }
-}
-#else
-static inline int init_memcg_params(struct kmem_cache *s,
- struct kmem_cache *root_cache)
-{
- return 0;
-}
-
-static inline void destroy_memcg_params(struct kmem_cache *s)
-{
-}
-
-static inline void memcg_unlink_cache(struct kmem_cache *s)
-{
-}
-#endif /* CONFIG_MEMCG_KMEM */
-
/*
* Figure out what the alignment of the objects will be given a set of
* flags, a user specified alignment and the size of the objects.
@@ -311,9 +167,6 @@ int slab_unmergeable(struct kmem_cache *s)
if (slab_nomerge || (s->flags & SLAB_NEVER_MERGE))
return 1;
- if (!is_root_cache(s))
- return 1;
-
if (s->ctor)
return 1;
@@ -348,7 +201,7 @@ struct kmem_cache *find_mergeable(unsigned int size, unsigned int align,
if (flags & SLAB_NEVER_MERGE)
return NULL;
- list_for_each_entry_reverse(s, &slab_root_caches, root_caches_node) {
+ list_for_each_entry_reverse(s, &slab_caches, list) {
if (slab_unmergeable(s))
continue;
@@ -380,7 +233,7 @@ static struct kmem_cache *create_cache(const char *name,
unsigned int object_size, unsigned int align,
slab_flags_t flags, unsigned int useroffset,
unsigned int usersize, void (*ctor)(void *),
- struct mem_cgroup *memcg, struct kmem_cache *root_cache)
+ struct kmem_cache *root_cache)
{
struct kmem_cache *s;
int err;
@@ -400,24 +253,18 @@ static struct kmem_cache *create_cache(const char *name,
s->useroffset = useroffset;
s->usersize = usersize;
- err = init_memcg_params(s, root_cache);
- if (err)
- goto out_free_cache;
-
err = __kmem_cache_create(s, flags);
if (err)
goto out_free_cache;
s->refcount = 1;
list_add(&s->list, &slab_caches);
- memcg_link_cache(s, memcg);
out:
if (err)
return ERR_PTR(err);
return s;
out_free_cache:
- destroy_memcg_params(s);
kmem_cache_free(kmem_cache, s);
goto out;
}
@@ -463,7 +310,6 @@ kmem_cache_create_usercopy(const char *name,
get_online_cpus();
get_online_mems();
- memcg_get_cache_ids();
mutex_lock(&slab_mutex);
@@ -504,7 +350,7 @@ kmem_cache_create_usercopy(const char *name,
s = create_cache(cache_name, size,
calculate_alignment(flags, align, size),
- flags, useroffset, usersize, ctor, NULL, NULL);
+ flags, useroffset, usersize, ctor, NULL);
if (IS_ERR(s)) {
err = PTR_ERR(s);
kfree_const(cache_name);
@@ -513,7 +359,6 @@ kmem_cache_create_usercopy(const char *name,
out_unlock:
mutex_unlock(&slab_mutex);
- memcg_put_cache_ids();
put_online_mems();
put_online_cpus();
@@ -574,7 +419,7 @@ static void slab_caches_to_rcu_destroy_workfn(struct work_struct *work)
/*
* On destruction, SLAB_TYPESAFE_BY_RCU kmem_caches are put on the
* @slab_caches_to_rcu_destroy list. The slab pages are freed
- * through RCU and and the associated kmem_cache are dereferenced
+ * through RCU and the associated kmem_cache are dereferenced
* while freeing the pages, so the kmem_caches should be freed only
* after the pending RCU operations are finished. As rcu_barrier()
* is a pretty slow operation, we batch all pending destructions
@@ -606,7 +451,6 @@ static int shutdown_cache(struct kmem_cache *s)
if (__kmem_cache_shutdown(s) != 0)
return -EBUSY;
- memcg_unlink_cache(s);
list_del(&s->list);
if (s->flags & SLAB_TYPESAFE_BY_RCU) {
@@ -627,312 +471,9 @@ static int shutdown_cache(struct kmem_cache *s)
return 0;
}
-#ifdef CONFIG_MEMCG_KMEM
-/*
- * memcg_create_kmem_cache - Create a cache for a memory cgroup.
- * @memcg: The memory cgroup the new cache is for.
- * @root_cache: The parent of the new cache.
- *
- * This function attempts to create a kmem cache that will serve allocation
- * requests going from @memcg to @root_cache. The new cache inherits properties
- * from its parent.
- */
-void memcg_create_kmem_cache(struct mem_cgroup *memcg,
- struct kmem_cache *root_cache)
-{
- static char memcg_name_buf[NAME_MAX + 1]; /* protected by slab_mutex */
- struct cgroup_subsys_state *css = &memcg->css;
- struct memcg_cache_array *arr;
- struct kmem_cache *s = NULL;
- char *cache_name;
- int idx;
-
- get_online_cpus();
- get_online_mems();
-
- mutex_lock(&slab_mutex);
-
- /*
- * The memory cgroup could have been offlined while the cache
- * creation work was pending.
- */
- if (memcg->kmem_state != KMEM_ONLINE)
- goto out_unlock;
-
- idx = memcg_cache_id(memcg);
- arr = rcu_dereference_protected(root_cache->memcg_params.memcg_caches,
- lockdep_is_held(&slab_mutex));
-
- /*
- * Since per-memcg caches are created asynchronously on first
- * allocation (see memcg_kmem_get_cache()), several threads can try to
- * create the same cache, but only one of them may succeed.
- */
- if (arr->entries[idx])
- goto out_unlock;
-
- cgroup_name(css->cgroup, memcg_name_buf, sizeof(memcg_name_buf));
- cache_name = kasprintf(GFP_KERNEL, "%s(%llu:%s)", root_cache->name,
- css->serial_nr, memcg_name_buf);
- if (!cache_name)
- goto out_unlock;
-
- s = create_cache(cache_name, root_cache->object_size,
- root_cache->align,
- root_cache->flags & CACHE_CREATE_MASK,
- root_cache->useroffset, root_cache->usersize,
- root_cache->ctor, memcg, root_cache);
- /*
- * If we could not create a memcg cache, do not complain, because
- * that's not critical at all as we can always proceed with the root
- * cache.
- */
- if (IS_ERR(s)) {
- kfree(cache_name);
- goto out_unlock;
- }
-
- /*
- * Since readers won't lock (see memcg_kmem_get_cache()), we need a
- * barrier here to ensure nobody will see the kmem_cache partially
- * initialized.
- */
- smp_wmb();
- arr->entries[idx] = s;
-
-out_unlock:
- mutex_unlock(&slab_mutex);
-
- put_online_mems();
- put_online_cpus();
-}
-
-static void kmemcg_workfn(struct work_struct *work)
-{
- struct kmem_cache *s = container_of(work, struct kmem_cache,
- memcg_params.work);
-
- get_online_cpus();
- get_online_mems();
-
- mutex_lock(&slab_mutex);
- s->memcg_params.work_fn(s);
- mutex_unlock(&slab_mutex);
-
- put_online_mems();
- put_online_cpus();
-}
-
-static void kmemcg_rcufn(struct rcu_head *head)
-{
- struct kmem_cache *s = container_of(head, struct kmem_cache,
- memcg_params.rcu_head);
-
- /*
- * We need to grab blocking locks. Bounce to ->work. The
- * work item shares the space with the RCU head and can't be
- * initialized earlier.
- */
- INIT_WORK(&s->memcg_params.work, kmemcg_workfn);
- queue_work(memcg_kmem_cache_wq, &s->memcg_params.work);
-}
-
-static void kmemcg_cache_shutdown_fn(struct kmem_cache *s)
-{
- WARN_ON(shutdown_cache(s));
-}
-
-static void kmemcg_cache_shutdown(struct percpu_ref *percpu_ref)
-{
- struct kmem_cache *s = container_of(percpu_ref, struct kmem_cache,
- memcg_params.refcnt);
- unsigned long flags;
-
- spin_lock_irqsave(&memcg_kmem_wq_lock, flags);
- if (s->memcg_params.root_cache->memcg_params.dying)
- goto unlock;
-
- s->memcg_params.work_fn = kmemcg_cache_shutdown_fn;
- INIT_WORK(&s->memcg_params.work, kmemcg_workfn);
- queue_work(memcg_kmem_cache_wq, &s->memcg_params.work);
-
-unlock:
- spin_unlock_irqrestore(&memcg_kmem_wq_lock, flags);
-}
-
-static void kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s)
-{
- __kmemcg_cache_deactivate_after_rcu(s);
- percpu_ref_kill(&s->memcg_params.refcnt);
-}
-
-static void kmemcg_cache_deactivate(struct kmem_cache *s)
-{
- if (WARN_ON_ONCE(is_root_cache(s)))
- return;
-
- __kmemcg_cache_deactivate(s);
- s->flags |= SLAB_DEACTIVATED;
-
- /*
- * memcg_kmem_wq_lock is used to synchronize memcg_params.dying
- * flag and make sure that no new kmem_cache deactivation tasks
- * are queued (see flush_memcg_workqueue() ).
- */
- spin_lock_irq(&memcg_kmem_wq_lock);
- if (s->memcg_params.root_cache->memcg_params.dying)
- goto unlock;
-
- s->memcg_params.work_fn = kmemcg_cache_deactivate_after_rcu;
- call_rcu(&s->memcg_params.rcu_head, kmemcg_rcufn);
-unlock:
- spin_unlock_irq(&memcg_kmem_wq_lock);
-}
-
-void memcg_deactivate_kmem_caches(struct mem_cgroup *memcg,
- struct mem_cgroup *parent)
-{
- int idx;
- struct memcg_cache_array *arr;
- struct kmem_cache *s, *c;
- unsigned int nr_reparented;
-
- idx = memcg_cache_id(memcg);
-
- get_online_cpus();
- get_online_mems();
-
- mutex_lock(&slab_mutex);
- list_for_each_entry(s, &slab_root_caches, root_caches_node) {
- arr = rcu_dereference_protected(s->memcg_params.memcg_caches,
- lockdep_is_held(&slab_mutex));
- c = arr->entries[idx];
- if (!c)
- continue;
-
- kmemcg_cache_deactivate(c);
- arr->entries[idx] = NULL;
- }
- nr_reparented = 0;
- list_for_each_entry(s, &memcg->kmem_caches,
- memcg_params.kmem_caches_node) {
- WRITE_ONCE(s->memcg_params.memcg, parent);
- css_put(&memcg->css);
- nr_reparented++;
- }
- if (nr_reparented) {
- list_splice_init(&memcg->kmem_caches,
- &parent->kmem_caches);
- css_get_many(&parent->css, nr_reparented);
- }
- mutex_unlock(&slab_mutex);
-
- put_online_mems();
- put_online_cpus();
-}
-
-static int shutdown_memcg_caches(struct kmem_cache *s)
-{
- struct memcg_cache_array *arr;
- struct kmem_cache *c, *c2;
- LIST_HEAD(busy);
- int i;
-
- BUG_ON(!is_root_cache(s));
-
- /*
- * First, shutdown active caches, i.e. caches that belong to online
- * memory cgroups.
- */
- arr = rcu_dereference_protected(s->memcg_params.memcg_caches,
- lockdep_is_held(&slab_mutex));
- for_each_memcg_cache_index(i) {
- c = arr->entries[i];
- if (!c)
- continue;
- if (shutdown_cache(c))
- /*
- * The cache still has objects. Move it to a temporary
- * list so as not to try to destroy it for a second
- * time while iterating over inactive caches below.
- */
- list_move(&c->memcg_params.children_node, &busy);
- else
- /*
- * The cache is empty and will be destroyed soon. Clear
- * the pointer to it in the memcg_caches array so that
- * it will never be accessed even if the root cache
- * stays alive.
- */
- arr->entries[i] = NULL;
- }
-
- /*
- * Second, shutdown all caches left from memory cgroups that are now
- * offline.
- */
- list_for_each_entry_safe(c, c2, &s->memcg_params.children,
- memcg_params.children_node)
- shutdown_cache(c);
-
- list_splice(&busy, &s->memcg_params.children);
-
- /*
- * A cache being destroyed must be empty. In particular, this means
- * that all per memcg caches attached to it must be empty too.
- */
- if (!list_empty(&s->memcg_params.children))
- return -EBUSY;
- return 0;
-}
-
-static void flush_memcg_workqueue(struct kmem_cache *s)
-{
- spin_lock_irq(&memcg_kmem_wq_lock);
- s->memcg_params.dying = true;
- spin_unlock_irq(&memcg_kmem_wq_lock);
-
- /*
- * SLAB and SLUB deactivate the kmem_caches through call_rcu. Make
- * sure all registered rcu callbacks have been invoked.
- */
- rcu_barrier();
-
- /*
- * SLAB and SLUB create memcg kmem_caches through workqueue and SLUB
- * deactivates the memcg kmem_caches through workqueue. Make sure all
- * previous workitems on workqueue are processed.
- */
- if (likely(memcg_kmem_cache_wq))
- flush_workqueue(memcg_kmem_cache_wq);
-
- /*
- * If we're racing with children kmem_cache deactivation, it might
- * take another rcu grace period to complete their destruction.
- * At this moment the corresponding percpu_ref_kill() call should be
- * done, but it might take another rcu grace period to complete
- * switching to the atomic mode.
- * Please, note that we check without grabbing the slab_mutex. It's safe
- * because at this moment the children list can't grow.
- */
- if (!list_empty(&s->memcg_params.children))
- rcu_barrier();
-}
-#else
-static inline int shutdown_memcg_caches(struct kmem_cache *s)
-{
- return 0;
-}
-
-static inline void flush_memcg_workqueue(struct kmem_cache *s)
-{
-}
-#endif /* CONFIG_MEMCG_KMEM */
-
void slab_kmem_cache_release(struct kmem_cache *s)
{
__kmem_cache_release(s);
- destroy_memcg_params(s);
kfree_const(s->name);
kmem_cache_free(kmem_cache, s);
}
@@ -944,8 +485,6 @@ void kmem_cache_destroy(struct kmem_cache *s)
if (unlikely(!s))
return;
- flush_memcg_workqueue(s);
-
get_online_cpus();
get_online_mems();
@@ -955,10 +494,7 @@ void kmem_cache_destroy(struct kmem_cache *s)
if (s->refcount)
goto out_unlock;
- err = shutdown_memcg_caches(s);
- if (!err)
- err = shutdown_cache(s);
-
+ err = shutdown_cache(s);
if (err) {
pr_err("kmem_cache_destroy %s: Slab cache still has objects\n",
s->name);
@@ -995,43 +531,6 @@ int kmem_cache_shrink(struct kmem_cache *cachep)
}
EXPORT_SYMBOL(kmem_cache_shrink);
-/**
- * kmem_cache_shrink_all - shrink a cache and all memcg caches for root cache
- * @s: The cache pointer
- */
-void kmem_cache_shrink_all(struct kmem_cache *s)
-{
- struct kmem_cache *c;
-
- if (!IS_ENABLED(CONFIG_MEMCG_KMEM) || !is_root_cache(s)) {
- kmem_cache_shrink(s);
- return;
- }
-
- get_online_cpus();
- get_online_mems();
- kasan_cache_shrink(s);
- __kmem_cache_shrink(s);
-
- /*
- * We have to take the slab_mutex to protect from the memcg list
- * modification.
- */
- mutex_lock(&slab_mutex);
- for_each_memcg_cache(c, s) {
- /*
- * Don't need to shrink deactivated memcg caches.
- */
- if (s->flags & SLAB_DEACTIVATED)
- continue;
- kasan_cache_shrink(c);
- __kmem_cache_shrink(c);
- }
- mutex_unlock(&slab_mutex);
- put_online_mems();
- put_online_cpus();
-}
-
bool slab_is_available(void)
{
return slab_state >= UP;
@@ -1060,8 +559,6 @@ void __init create_boot_cache(struct kmem_cache *s, const char *name,
s->useroffset = useroffset;
s->usersize = usersize;
- slab_init_memcg_params(s);
-
err = __kmem_cache_create(s, flags);
if (err)
@@ -1082,7 +579,6 @@ struct kmem_cache *__init create_kmalloc_cache(const char *name,
create_boot_cache(s, name, size, flags, useroffset, usersize);
list_add(&s->list, &slab_caches);
- memcg_link_cache(s, NULL);
s->refcount = 1;
return s;
}
@@ -1303,13 +799,26 @@ void __init create_kmalloc_caches(slab_flags_t flags)
kmalloc_caches[KMALLOC_DMA][i] = create_kmalloc_cache(
kmalloc_info[i].name[KMALLOC_DMA],
kmalloc_info[i].size,
- SLAB_CACHE_DMA | flags, 0, 0);
+ SLAB_CACHE_DMA | flags, 0,
+ kmalloc_info[i].size);
}
}
#endif
}
#endif /* !CONFIG_SLOB */
+gfp_t kmalloc_fix_flags(gfp_t flags)
+{
+ gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK;
+
+ flags &= ~GFP_SLAB_BUG_MASK;
+ pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n",
+ invalid_mask, &invalid_mask, flags, &flags);
+ dump_stack();
+
+ return flags;
+}
+
/*
* To avoid unnecessary overhead, we pass through large allocation requests
* directly to the page allocator. We use __GFP_COMP, because we will need to
@@ -1320,12 +829,15 @@ void *kmalloc_order(size_t size, gfp_t flags, unsigned int order)
void *ret = NULL;
struct page *page;
+ if (unlikely(flags & GFP_SLAB_BUG_MASK))
+ flags = kmalloc_fix_flags(flags);
+
flags |= __GFP_COMP;
page = alloc_pages(flags, order);
if (likely(page)) {
ret = page_address(page);
- mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE,
- 1 << order);
+ mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE_B,
+ PAGE_SIZE << order);
}
ret = kasan_kmalloc_large(ret, size, flags);
/* As ret might get tagged, call kmemleak hook after KASAN. */
@@ -1422,12 +934,12 @@ static void print_slabinfo_header(struct seq_file *m)
void *slab_start(struct seq_file *m, loff_t *pos)
{
mutex_lock(&slab_mutex);
- return seq_list_start(&slab_root_caches, *pos);
+ return seq_list_start(&slab_caches, *pos);
}
void *slab_next(struct seq_file *m, void *p, loff_t *pos)
{
- return seq_list_next(p, &slab_root_caches, pos);
+ return seq_list_next(p, &slab_caches, pos);
}
void slab_stop(struct seq_file *m, void *p)
@@ -1435,27 +947,6 @@ void slab_stop(struct seq_file *m, void *p)
mutex_unlock(&slab_mutex);
}
-static void
-memcg_accumulate_slabinfo(struct kmem_cache *s, struct slabinfo *info)
-{
- struct kmem_cache *c;
- struct slabinfo sinfo;
-
- if (!is_root_cache(s))
- return;
-
- for_each_memcg_cache(c, s) {
- memset(&sinfo, 0, sizeof(sinfo));
- get_slabinfo(c, &sinfo);
-
- info->active_slabs += sinfo.active_slabs;
- info->num_slabs += sinfo.num_slabs;
- info->shared_avail += sinfo.shared_avail;
- info->active_objs += sinfo.active_objs;
- info->num_objs += sinfo.num_objs;
- }
-}
-
static void cache_show(struct kmem_cache *s, struct seq_file *m)
{
struct slabinfo sinfo;
@@ -1463,10 +954,8 @@ static void cache_show(struct kmem_cache *s, struct seq_file *m)
memset(&sinfo, 0, sizeof(sinfo));
get_slabinfo(s, &sinfo);
- memcg_accumulate_slabinfo(s, &sinfo);
-
seq_printf(m, "%-17s %6lu %6lu %6u %4u %4d",
- cache_name(s), sinfo.active_objs, sinfo.num_objs, s->size,
+ s->name, sinfo.active_objs, sinfo.num_objs, s->size,
sinfo.objects_per_slab, (1 << sinfo.cache_order));
seq_printf(m, " : tunables %4u %4u %4u",
@@ -1479,9 +968,9 @@ static void cache_show(struct kmem_cache *s, struct seq_file *m)
static int slab_show(struct seq_file *m, void *p)
{
- struct kmem_cache *s = list_entry(p, struct kmem_cache, root_caches_node);
+ struct kmem_cache *s = list_entry(p, struct kmem_cache, list);
- if (p == slab_root_caches.next)
+ if (p == slab_caches.next)
print_slabinfo_header(m);
cache_show(s, m);
return 0;
@@ -1508,13 +997,13 @@ void dump_unreclaimable_slab(void)
pr_info("Name Used Total\n");
list_for_each_entry_safe(s, s2, &slab_caches, list) {
- if (!is_root_cache(s) || (s->flags & SLAB_RECLAIM_ACCOUNT))
+ if (s->flags & SLAB_RECLAIM_ACCOUNT)
continue;
get_slabinfo(s, &sinfo);
if (sinfo.num_objs > 0)
- pr_info("%-17s %10luKB %10luKB\n", cache_name(s),
+ pr_info("%-17s %10luKB %10luKB\n", s->name,
(sinfo.active_objs * s->size) / 1024,
(sinfo.num_objs * s->size) / 1024);
}
@@ -1522,35 +1011,12 @@ void dump_unreclaimable_slab(void)
}
#if defined(CONFIG_MEMCG_KMEM)
-void *memcg_slab_start(struct seq_file *m, loff_t *pos)
-{
- struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
-
- mutex_lock(&slab_mutex);
- return seq_list_start(&memcg->kmem_caches, *pos);
-}
-
-void *memcg_slab_next(struct seq_file *m, void *p, loff_t *pos)
-{
- struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
-
- return seq_list_next(p, &memcg->kmem_caches, pos);
-}
-
-void memcg_slab_stop(struct seq_file *m, void *p)
-{
- mutex_unlock(&slab_mutex);
-}
-
int memcg_slab_show(struct seq_file *m, void *p)
{
- struct kmem_cache *s = list_entry(p, struct kmem_cache,
- memcg_params.kmem_caches_node);
- struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
-
- if (p == memcg->kmem_caches.next)
- print_slabinfo_header(m);
- cache_show(s, m);
+ /*
+ * Deprecated.
+ * Please take a look at tools/cgroup/slabinfo.py.
+ */
return 0;
}
#endif
@@ -1596,73 +1062,15 @@ static int __init slab_proc_init(void)
}
module_init(slab_proc_init);
-#if defined(CONFIG_DEBUG_FS) && defined(CONFIG_MEMCG_KMEM)
-/*
- * Display information about kmem caches that have child memcg caches.
- */
-static int memcg_slabinfo_show(struct seq_file *m, void *unused)
-{
- struct kmem_cache *s, *c;
- struct slabinfo sinfo;
-
- mutex_lock(&slab_mutex);
- seq_puts(m, "# <name> <css_id[:dead|deact]> <active_objs> <num_objs>");
- seq_puts(m, " <active_slabs> <num_slabs>\n");
- list_for_each_entry(s, &slab_root_caches, root_caches_node) {
- /*
- * Skip kmem caches that don't have any memcg children.
- */
- if (list_empty(&s->memcg_params.children))
- continue;
-
- memset(&sinfo, 0, sizeof(sinfo));
- get_slabinfo(s, &sinfo);
- seq_printf(m, "%-17s root %6lu %6lu %6lu %6lu\n",
- cache_name(s), sinfo.active_objs, sinfo.num_objs,
- sinfo.active_slabs, sinfo.num_slabs);
-
- for_each_memcg_cache(c, s) {
- struct cgroup_subsys_state *css;
- char *status = "";
-
- css = &c->memcg_params.memcg->css;
- if (!(css->flags & CSS_ONLINE))
- status = ":dead";
- else if (c->flags & SLAB_DEACTIVATED)
- status = ":deact";
-
- memset(&sinfo, 0, sizeof(sinfo));
- get_slabinfo(c, &sinfo);
- seq_printf(m, "%-17s %4d%-6s %6lu %6lu %6lu %6lu\n",
- cache_name(c), css->id, status,
- sinfo.active_objs, sinfo.num_objs,
- sinfo.active_slabs, sinfo.num_slabs);
- }
- }
- mutex_unlock(&slab_mutex);
- return 0;
-}
-DEFINE_SHOW_ATTRIBUTE(memcg_slabinfo);
-
-static int __init memcg_slabinfo_init(void)
-{
- debugfs_create_file("memcg_slabinfo", S_IFREG | S_IRUGO,
- NULL, NULL, &memcg_slabinfo_fops);
- return 0;
-}
-
-late_initcall(memcg_slabinfo_init);
-#endif /* CONFIG_DEBUG_FS && CONFIG_MEMCG_KMEM */
#endif /* CONFIG_SLAB || CONFIG_SLUB_DEBUG */
static __always_inline void *__do_krealloc(const void *p, size_t new_size,
gfp_t flags)
{
void *ret;
- size_t ks = 0;
+ size_t ks;
- if (p)
- ks = ksize(p);
+ ks = ksize(p);
if (ks >= new_size) {
p = kasan_krealloc((void *)p, new_size, flags);
@@ -1707,28 +1115,27 @@ void *krealloc(const void *p, size_t new_size, gfp_t flags)
EXPORT_SYMBOL(krealloc);
/**
- * kzfree - like kfree but zero memory
+ * kfree_sensitive - Clear sensitive information in memory before freeing
* @p: object to free memory of
*
* The memory of the object @p points to is zeroed before freed.
- * If @p is %NULL, kzfree() does nothing.
+ * If @p is %NULL, kfree_sensitive() does nothing.
*
* Note: this function zeroes the whole allocated buffer which can be a good
* deal bigger than the requested buffer size passed to kmalloc(). So be
* careful when using this function in performance sensitive code.
*/
-void kzfree(const void *p)
+void kfree_sensitive(const void *p)
{
size_t ks;
void *mem = (void *)p;
- if (unlikely(ZERO_OR_NULL_PTR(mem)))
- return;
ks = ksize(mem);
- memset(mem, 0, ks);
+ if (ks)
+ memzero_explicit(mem, ks);
kfree(mem);
}
-EXPORT_SYMBOL(kzfree);
+EXPORT_SYMBOL(kfree_sensitive);
/**
* ksize - get the actual amount of memory allocated for a given object
@@ -1748,8 +1155,6 @@ size_t ksize(const void *objp)
{
size_t size;
- if (WARN_ON_ONCE(!objp))
- return 0;
/*
* We need to check that the pointed to object is valid, and only then
* unpoison the shadow memory below. We use __kasan_check_read(), to
@@ -1763,7 +1168,7 @@ size_t ksize(const void *objp)
* We want to perform the check before __ksize(), to avoid potentially
* crashing in __ksize() due to accessing invalid metadata.
*/
- if (unlikely(objp == ZERO_SIZE_PTR) || !__kasan_check_read(objp, 1))
+ if (unlikely(ZERO_OR_NULL_PTR(objp)) || !__kasan_check_read(objp, 1))
return 0;
size = __ksize(objp);
diff --git a/mm/slob.c b/mm/slob.c
index fa53e9f73893..7cc9805c8091 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -202,8 +202,8 @@ static void *slob_new_pages(gfp_t gfp, int order, int node)
if (!page)
return NULL;
- mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE,
- 1 << order);
+ mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE_B,
+ PAGE_SIZE << order);
return page_address(page);
}
@@ -214,8 +214,8 @@ static void slob_free_pages(void *b, int order)
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += 1 << order;
- mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE,
- -(1 << order));
+ mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE_B,
+ -(PAGE_SIZE << order));
__free_pages(sp, order);
}
@@ -524,6 +524,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfp, unsigned long caller)
{
return __do_kmalloc_node(size, gfp, NUMA_NO_NODE, caller);
}
+EXPORT_SYMBOL(__kmalloc_track_caller);
#ifdef CONFIG_NUMA
void *__kmalloc_node_track_caller(size_t size, gfp_t gfp,
@@ -531,6 +532,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfp,
{
return __do_kmalloc_node(size, gfp, node, caller);
}
+EXPORT_SYMBOL(__kmalloc_node_track_caller);
#endif
void kfree(const void *block)
@@ -550,8 +552,8 @@ void kfree(const void *block)
slob_free(m, *m + align);
} else {
unsigned int order = compound_order(sp);
- mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE,
- -(1 << order));
+ mod_node_page_state(page_pgdat(sp), NR_SLAB_UNRECLAIMABLE_B,
+ -(PAGE_SIZE << order));
__free_pages(sp, order);
}
diff --git a/mm/slub.c b/mm/slub.c
index b762450fc9f0..6d3574013b2f 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -114,18 +114,22 @@
* the fast path and disables lockless freelists.
*/
-static inline int kmem_cache_debug(struct kmem_cache *s)
-{
#ifdef CONFIG_SLUB_DEBUG
- return unlikely(s->flags & SLAB_DEBUG_FLAGS);
+#ifdef CONFIG_SLUB_DEBUG_ON
+DEFINE_STATIC_KEY_TRUE(slub_debug_enabled);
#else
- return 0;
+DEFINE_STATIC_KEY_FALSE(slub_debug_enabled);
+#endif
#endif
+
+static inline bool kmem_cache_debug(struct kmem_cache *s)
+{
+ return kmem_cache_debug_flags(s, SLAB_DEBUG_FLAGS);
}
void *fixup_red_left(struct kmem_cache *s, void *p)
{
- if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE)
+ if (kmem_cache_debug_flags(s, SLAB_RED_ZONE))
p += s->red_left_pad;
return p;
@@ -214,14 +218,10 @@ enum track_item { TRACK_ALLOC, TRACK_FREE };
#ifdef CONFIG_SYSFS
static int sysfs_slab_add(struct kmem_cache *);
static int sysfs_slab_alias(struct kmem_cache *, const char *);
-static void memcg_propagate_slab_attrs(struct kmem_cache *s);
-static void sysfs_slab_remove(struct kmem_cache *s);
#else
static inline int sysfs_slab_add(struct kmem_cache *s) { return 0; }
static inline int sysfs_slab_alias(struct kmem_cache *s, const char *p)
{ return 0; }
-static inline void memcg_propagate_slab_attrs(struct kmem_cache *s) { }
-static inline void sysfs_slab_remove(struct kmem_cache *s) { }
#endif
static inline void stat(const struct kmem_cache *s, enum stat_item si)
@@ -292,7 +292,7 @@ static inline void *get_freepointer_safe(struct kmem_cache *s, void *object)
return get_freepointer(s, object);
freepointer_addr = (unsigned long)object + s->offset;
- probe_kernel_read(&p, (void **)freepointer_addr, sizeof(p));
+ copy_from_kernel_nofault(&p, (void **)freepointer_addr, sizeof(p));
return freelist_ptr(s, p, freepointer_addr);
}
@@ -313,12 +313,6 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
__p < (__addr) + (__objects) * (__s)->size; \
__p += (__s)->size)
-/* Determine object index from a given position */
-static inline unsigned int slab_index(void *p, struct kmem_cache *s, void *addr)
-{
- return (kasan_reset_tag(p) - addr) / s->size;
-}
-
static inline unsigned int order_objects(unsigned int order, unsigned int size)
{
return ((unsigned int)PAGE_SIZE << order) / size;
@@ -461,7 +455,7 @@ static unsigned long *get_map(struct kmem_cache *s, struct page *page)
bitmap_zero(object_map, page->objects);
for (p = page->freelist; p; p = get_freepointer(s, p))
- set_bit(slab_index(p, s, addr), object_map);
+ set_bit(__obj_to_index(s, addr, p), object_map);
return object_map;
}
@@ -469,8 +463,6 @@ static unsigned long *get_map(struct kmem_cache *s, struct page *page)
static void put_map(unsigned long *map) __releases(&object_map_lock)
{
VM_BUG_ON(map != object_map);
- lockdep_assert_held(&object_map_lock);
-
spin_unlock(&object_map_lock);
}
@@ -499,7 +491,7 @@ static slab_flags_t slub_debug = DEBUG_DEFAULT_FLAGS;
static slab_flags_t slub_debug;
#endif
-static char *slub_debug_slabs;
+static char *slub_debug_string;
static int disable_higher_order_debug;
/*
@@ -634,7 +626,7 @@ static void print_track(const char *s, struct track *t, unsigned long pr_time)
#endif
}
-static void print_tracking(struct kmem_cache *s, void *object)
+void print_tracking(struct kmem_cache *s, void *object)
{
unsigned long pr_time = jiffies;
if (!(s->flags & SLAB_STORE_USER))
@@ -679,6 +671,20 @@ static void slab_fix(struct kmem_cache *s, char *fmt, ...)
va_end(args);
}
+static bool freelist_corrupted(struct kmem_cache *s, struct page *page,
+ void **freelist, void *nextfree)
+{
+ if ((s->flags & SLAB_CONSISTENCY_CHECKS) &&
+ !check_valid_pointer(s, page, nextfree) && freelist) {
+ object_err(s, page, *freelist, "Freechain corrupt");
+ *freelist = NULL;
+ slab_fix(s, "Isolate corrupted freechain");
+ return true;
+ }
+
+ return false;
+}
+
static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p)
{
unsigned int off; /* Offset of last byte */
@@ -1098,7 +1104,7 @@ static inline void dec_slabs_node(struct kmem_cache *s, int node, int objects)
static void setup_object_debug(struct kmem_cache *s, struct page *page,
void *object)
{
- if (!(s->flags & (SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON)))
+ if (!kmem_cache_debug_flags(s, SLAB_STORE_USER|SLAB_RED_ZONE|__OBJECT_POISON))
return;
init_object(s, object, SLUB_RED_INACTIVE);
@@ -1108,7 +1114,7 @@ static void setup_object_debug(struct kmem_cache *s, struct page *page,
static
void setup_page_debug(struct kmem_cache *s, struct page *page, void *addr)
{
- if (!(s->flags & SLAB_POISON))
+ if (!kmem_cache_debug_flags(s, SLAB_POISON))
return;
metadata_access_enable();
@@ -1204,7 +1210,7 @@ static noinline int free_debug_processing(
struct kmem_cache_node *n = get_node(s, page_to_nid(page));
void *object = head;
int cnt = 0;
- unsigned long uninitialized_var(flags);
+ unsigned long flags;
int ret = 0;
spin_lock_irqsave(&n->list_lock, flags);
@@ -1248,69 +1254,135 @@ out:
return ret;
}
-static int __init setup_slub_debug(char *str)
+/*
+ * Parse a block of slub_debug options. Blocks are delimited by ';'
+ *
+ * @str: start of block
+ * @flags: returns parsed flags, or DEBUG_DEFAULT_FLAGS if none specified
+ * @slabs: returns start of list of slabs, or NULL when there's no list
+ * @init: assume this is initial parsing and not per-kmem-create parsing
+ *
+ * returns the start of the next block if there is one, or NULL
+ */
+static char *
+parse_slub_debug_flags(char *str, slab_flags_t *flags, char **slabs, bool init)
{
- slub_debug = DEBUG_DEFAULT_FLAGS;
- if (*str++ != '=' || !*str)
- /*
- * No options specified. Switch on full debugging.
- */
- goto out;
+ bool higher_order_disable = false;
- if (*str == ',')
+ /* Skip any completely empty blocks */
+ while (*str && *str == ';')
+ str++;
+
+ if (*str == ',') {
/*
* No options but restriction on slabs. This means full
* debugging for slabs matching a pattern.
*/
+ *flags = DEBUG_DEFAULT_FLAGS;
goto check_slabs;
+ }
+ *flags = 0;
- slub_debug = 0;
- if (*str == '-')
- /*
- * Switch off all debugging measures.
- */
- goto out;
-
- /*
- * Determine which debug features should be switched on
- */
- for (; *str && *str != ','; str++) {
+ /* Determine which debug features should be switched on */
+ for (; *str && *str != ',' && *str != ';'; str++) {
switch (tolower(*str)) {
+ case '-':
+ *flags = 0;
+ break;
case 'f':
- slub_debug |= SLAB_CONSISTENCY_CHECKS;
+ *flags |= SLAB_CONSISTENCY_CHECKS;
break;
case 'z':
- slub_debug |= SLAB_RED_ZONE;
+ *flags |= SLAB_RED_ZONE;
break;
case 'p':
- slub_debug |= SLAB_POISON;
+ *flags |= SLAB_POISON;
break;
case 'u':
- slub_debug |= SLAB_STORE_USER;
+ *flags |= SLAB_STORE_USER;
break;
case 't':
- slub_debug |= SLAB_TRACE;
+ *flags |= SLAB_TRACE;
break;
case 'a':
- slub_debug |= SLAB_FAILSLAB;
+ *flags |= SLAB_FAILSLAB;
break;
case 'o':
/*
* Avoid enabling debugging on caches if its minimum
* order would increase as a result.
*/
- disable_higher_order_debug = 1;
+ higher_order_disable = true;
break;
default:
- pr_err("slub_debug option '%c' unknown. skipped\n",
- *str);
+ if (init)
+ pr_err("slub_debug option '%c' unknown. skipped\n", *str);
}
}
-
check_slabs:
if (*str == ',')
- slub_debug_slabs = str + 1;
+ *slabs = ++str;
+ else
+ *slabs = NULL;
+
+ /* Skip over the slab list */
+ while (*str && *str != ';')
+ str++;
+
+ /* Skip any completely empty blocks */
+ while (*str && *str == ';')
+ str++;
+
+ if (init && higher_order_disable)
+ disable_higher_order_debug = 1;
+
+ if (*str)
+ return str;
+ else
+ return NULL;
+}
+
+static int __init setup_slub_debug(char *str)
+{
+ slab_flags_t flags;
+ char *saved_str;
+ char *slab_list;
+ bool global_slub_debug_changed = false;
+ bool slab_list_specified = false;
+
+ slub_debug = DEBUG_DEFAULT_FLAGS;
+ if (*str++ != '=' || !*str)
+ /*
+ * No options specified. Switch on full debugging.
+ */
+ goto out;
+
+ saved_str = str;
+ while (str) {
+ str = parse_slub_debug_flags(str, &flags, &slab_list, true);
+
+ if (!slab_list) {
+ slub_debug = flags;
+ global_slub_debug_changed = true;
+ } else {
+ slab_list_specified = true;
+ }
+ }
+
+ /*
+ * For backwards compatibility, a single list of flags with list of
+ * slabs means debugging is only enabled for those slabs, so the global
+ * slub_debug should be 0. We can extend that to multiple lists as
+ * long as there is no option specifying flags without a slab list.
+ */
+ if (slab_list_specified) {
+ if (!global_slub_debug_changed)
+ slub_debug = 0;
+ slub_debug_string = saved_str;
+ }
out:
+ if (slub_debug != 0 || slub_debug_string)
+ static_branch_enable(&slub_debug_enabled);
if ((static_branch_unlikely(&init_on_alloc) ||
static_branch_unlikely(&init_on_free)) &&
(slub_debug & SLAB_POISON))
@@ -1338,36 +1410,43 @@ slab_flags_t kmem_cache_flags(unsigned int object_size,
{
char *iter;
size_t len;
-
- /* If slub_debug = 0, it folds into the if conditional. */
- if (!slub_debug_slabs)
- return flags | slub_debug;
+ char *next_block;
+ slab_flags_t block_flags;
len = strlen(name);
- iter = slub_debug_slabs;
- while (*iter) {
- char *end, *glob;
- size_t cmplen;
-
- end = strchrnul(iter, ',');
+ next_block = slub_debug_string;
+ /* Go through all blocks of debug options, see if any matches our slab's name */
+ while (next_block) {
+ next_block = parse_slub_debug_flags(next_block, &block_flags, &iter, false);
+ if (!iter)
+ continue;
+ /* Found a block that has a slab list, search it */
+ while (*iter) {
+ char *end, *glob;
+ size_t cmplen;
+
+ end = strchrnul(iter, ',');
+ if (next_block && next_block < end)
+ end = next_block - 1;
+
+ glob = strnchr(iter, end - iter, '*');
+ if (glob)
+ cmplen = glob - iter;
+ else
+ cmplen = max_t(size_t, len, (end - iter));
- glob = strnchr(iter, end - iter, '*');
- if (glob)
- cmplen = glob - iter;
- else
- cmplen = max_t(size_t, len, (end - iter));
+ if (!strncmp(name, iter, cmplen)) {
+ flags |= block_flags;
+ return flags;
+ }
- if (!strncmp(name, iter, cmplen)) {
- flags |= slub_debug;
- break;
+ if (!*end || *end == ';')
+ break;
+ iter = end + 1;
}
-
- if (!*end)
- break;
- iter = end + 1;
}
- return flags;
+ return flags | slub_debug;
}
#else /* !CONFIG_SLUB_DEBUG */
static inline void setup_object_debug(struct kmem_cache *s,
@@ -1410,6 +1489,11 @@ static inline void inc_slabs_node(struct kmem_cache *s, int node,
static inline void dec_slabs_node(struct kmem_cache *s, int node,
int objects) {}
+static bool freelist_corrupted(struct kmem_cache *s, struct page *page,
+ void **freelist, void *nextfree)
+{
+ return false;
+}
#endif /* CONFIG_SLUB_DEBUG */
/*
@@ -1451,6 +1535,11 @@ static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x)
if (!(s->flags & SLAB_DEBUG_OBJECTS))
debug_check_no_obj_freed(x, s->object_size);
+ /* Use KCSAN to help debug racy use-after-free. */
+ if (!(s->flags & SLAB_TYPESAFE_BY_RCU))
+ __kcsan_check_access(x, s->object_size,
+ KCSAN_ACCESS_WRITE | KCSAN_ACCESS_ASSERT);
+
/* KASAN might put x into memory quarantine, delaying its reuse */
return kasan_slab_free(s, x, _RET_IP_);
}
@@ -1527,10 +1616,8 @@ static inline struct page *alloc_slab_page(struct kmem_cache *s,
else
page = __alloc_pages_node(node, flags, order);
- if (page && charge_slab_page(page, flags, order, s)) {
- __free_pages(page, order);
- page = NULL;
- }
+ if (page)
+ account_slab_page(page, order, s);
return page;
}
@@ -1726,13 +1813,8 @@ out:
static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
{
- if (unlikely(flags & GFP_SLAB_BUG_MASK)) {
- gfp_t invalid_mask = flags & GFP_SLAB_BUG_MASK;
- flags &= ~GFP_SLAB_BUG_MASK;
- pr_warn("Unexpected gfp: %#x (%pGg). Fixing up to gfp: %#x (%pGg). Fix your code!\n",
- invalid_mask, &invalid_mask, flags, &flags);
- dump_stack();
- }
+ if (unlikely(flags & GFP_SLAB_BUG_MASK))
+ flags = kmalloc_fix_flags(flags);
return allocate_slab(s,
flags & (GFP_RECLAIM_MASK | GFP_CONSTRAINT_MASK), node);
@@ -1743,7 +1825,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
int order = compound_order(page);
int pages = 1 << order;
- if (s->flags & SLAB_CONSISTENCY_CHECKS) {
+ if (kmem_cache_debug_flags(s, SLAB_CONSISTENCY_CHECKS)) {
void *p;
slab_pad_check(s, page);
@@ -1758,7 +1840,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
page->mapping = NULL;
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += pages;
- uncharge_slab_page(page, order, s);
+ unaccount_slab_page(page, order, s);
__free_pages(page, order);
}
@@ -1919,7 +2001,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
struct zonelist *zonelist;
struct zoneref *z;
struct zone *zone;
- enum zone_type high_zoneidx = gfp_zone(flags);
+ enum zone_type highest_zoneidx = gfp_zone(flags);
void *object;
unsigned int cpuset_mems_cookie;
@@ -1948,7 +2030,7 @@ static void *get_any_partial(struct kmem_cache *s, gfp_t flags,
do {
cpuset_mems_cookie = read_mems_allowed_begin();
zonelist = node_zonelist(mempolicy_slab_node(), flags);
- for_each_zone_zonelist(zone, z, zonelist, high_zoneidx) {
+ for_each_zone_zonelist(zone, z, zonelist, highest_zoneidx) {
struct kmem_cache_node *n;
n = get_node(s, zone_to_nid(zone));
@@ -1994,7 +2076,7 @@ static void *get_partial(struct kmem_cache *s, gfp_t flags, int node,
#ifdef CONFIG_PREEMPTION
/*
- * Calculate the next globally unique transaction for disambiguiation
+ * Calculate the next globally unique transaction for disambiguation
* during cmpxchg. The transactions start with the cpu number and are then
* incremented by CONFIG_NR_CPUS.
*/
@@ -2093,6 +2175,14 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page,
void *prior;
unsigned long counters;
+ /*
+ * If 'nextfree' is invalid, it is possible that the object at
+ * 'freelist' is already corrupted. So isolate all objects
+ * starting at 'freelist'.
+ */
+ if (freelist_corrupted(s, page, &freelist, nextfree))
+ break;
+
do {
prior = page->freelist;
counters = page->counters;
@@ -2717,8 +2807,9 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s,
struct kmem_cache_cpu *c;
struct page *page;
unsigned long tid;
+ struct obj_cgroup *objcg = NULL;
- s = slab_pre_alloc_hook(s, gfpflags);
+ s = slab_pre_alloc_hook(s, &objcg, 1, gfpflags);
if (!s)
return NULL;
redo:
@@ -2794,7 +2885,7 @@ redo:
if (unlikely(slab_want_init_on_alloc(gfpflags, s)) && object)
memset(object, 0, s->object_size);
- slab_post_alloc_hook(s, gfpflags, 1, &object);
+ slab_post_alloc_hook(s, objcg, gfpflags, 1, &object);
return object;
}
@@ -2874,7 +2965,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page,
struct page new;
unsigned long counters;
struct kmem_cache_node *n = NULL;
- unsigned long uninitialized_var(flags);
+ unsigned long flags;
stat(s, FREE_SLOWPATH);
@@ -2999,6 +3090,8 @@ static __always_inline void do_slab_free(struct kmem_cache *s,
void *tail_obj = tail ? : head;
struct kmem_cache_cpu *c;
unsigned long tid;
+
+ memcg_slab_free_hook(s, page, head);
redo:
/*
* Determine the currently cpus per cpu slab.
@@ -3178,9 +3271,10 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
{
struct kmem_cache_cpu *c;
int i;
+ struct obj_cgroup *objcg = NULL;
/* memcg and kmem_cache debug support */
- s = slab_pre_alloc_hook(s, flags);
+ s = slab_pre_alloc_hook(s, &objcg, size, flags);
if (unlikely(!s))
return false;
/*
@@ -3234,11 +3328,11 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size,
}
/* memcg and kmem_cache debug support */
- slab_post_alloc_hook(s, flags, size, p);
+ slab_post_alloc_hook(s, objcg, flags, size, p);
return i;
error:
local_irq_enable();
- slab_post_alloc_hook(s, flags, i, p);
+ slab_post_alloc_hook(s, objcg, flags, i, p);
__kmem_cache_free_bulk(s, i, p);
return 0;
}
@@ -3648,6 +3742,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order)
*/
size = ALIGN(size, s->align);
s->size = size;
+ s->reciprocal_size = reciprocal_value(size);
if (forced_order >= 0)
order = forced_order;
else
@@ -3739,12 +3834,12 @@ error:
}
static void list_slab_objects(struct kmem_cache *s, struct page *page,
- const char *text)
+ const char *text)
{
#ifdef CONFIG_SLUB_DEBUG
void *addr = page_address(page);
- void *p;
unsigned long *map;
+ void *p;
slab_err(s, page, text, s->name);
slab_lock(page);
@@ -3752,13 +3847,12 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,
map = get_map(s, page);
for_each_object(p, s, addr, page->objects) {
- if (!test_bit(slab_index(p, s, addr), map)) {
+ if (!test_bit(__obj_to_index(s, addr, p), map)) {
pr_err("INFO: Object 0x%p @offset=%tu\n", p, p - addr);
print_tracking(s, p);
}
}
put_map(map);
-
slab_unlock(page);
#endif
}
@@ -3781,7 +3875,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
list_add(&page->slab_list, &discard);
} else {
list_slab_objects(s, page,
- "Objects remaining in %s on __kmem_cache_shutdown()");
+ "Objects remaining in %s on __kmem_cache_shutdown()");
}
}
spin_unlock_irq(&n->list_lock);
@@ -3816,7 +3910,6 @@ int __kmem_cache_shutdown(struct kmem_cache *s)
if (n->nr_partial || slabs_node(s, node))
return 1;
}
- sysfs_slab_remove(s);
return 0;
}
@@ -3886,8 +3979,8 @@ static void *kmalloc_large_node(size_t size, gfp_t flags, int node)
page = alloc_pages_node(node, flags, order);
if (page) {
ptr = page_address(page);
- mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE,
- 1 << order);
+ mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE_B,
+ PAGE_SIZE << order);
}
return kmalloc_large_node_hook(ptr, size, flags);
@@ -3954,7 +4047,7 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page,
offset = (ptr - page_address(page)) % s->size;
/* Adjust for redzone and reject if within the redzone. */
- if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE) {
+ if (kmem_cache_debug_flags(s, SLAB_RED_ZONE)) {
if (offset < s->red_left_pad)
usercopy_abort("SLUB object in left red zone",
s->name, to_user, offset, n);
@@ -4018,8 +4111,8 @@ void kfree(const void *x)
BUG_ON(!PageCompound(page));
kfree_hook(object);
- mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE,
- -(1 << order));
+ mod_node_page_state(page_pgdat(page), NR_SLAB_UNRECLAIMABLE_B,
+ -(PAGE_SIZE << order));
__free_pages(page, order);
return;
}
@@ -4100,36 +4193,6 @@ int __kmem_cache_shrink(struct kmem_cache *s)
return ret;
}
-#ifdef CONFIG_MEMCG
-void __kmemcg_cache_deactivate_after_rcu(struct kmem_cache *s)
-{
- /*
- * Called with all the locks held after a sched RCU grace period.
- * Even if @s becomes empty after shrinking, we can't know that @s
- * doesn't have allocations already in-flight and thus can't
- * destroy @s until the associated memcg is released.
- *
- * However, let's remove the sysfs files for empty caches here.
- * Each cache has a lot of interface files which aren't
- * particularly useful for empty draining caches; otherwise, we can
- * easily end up with millions of unnecessary sysfs files on
- * systems which have a lot of memory and transient cgroups.
- */
- if (!__kmem_cache_shrink(s))
- sysfs_slab_remove(s);
-}
-
-void __kmemcg_cache_deactivate(struct kmem_cache *s)
-{
- /*
- * Disable empty slabs caching. Used to avoid pinning offline
- * memory cgroups by kmem pages that can be freed.
- */
- slub_set_cpu_partial(s, 0);
- s->min_partial = 0;
-}
-#endif /* CONFIG_MEMCG */
-
static int slab_mem_going_offline_callback(void *arg)
{
struct kmem_cache *s;
@@ -4284,9 +4347,7 @@ static struct kmem_cache * __init bootstrap(struct kmem_cache *static_cache)
p->slab_cache = s;
#endif
}
- slab_init_memcg_params(s);
list_add(&s->list, &slab_caches);
- memcg_link_cache(s, NULL);
return s;
}
@@ -4341,7 +4402,7 @@ struct kmem_cache *
__kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
slab_flags_t flags, void (*ctor)(void *))
{
- struct kmem_cache *s, *c;
+ struct kmem_cache *s;
s = find_mergeable(size, align, flags, name, ctor);
if (s) {
@@ -4354,11 +4415,6 @@ __kmem_cache_alias(const char *name, unsigned int size, unsigned int align,
s->object_size = max(s->object_size, size);
s->inuse = max(s->inuse, ALIGN(size, sizeof(void *)));
- for_each_memcg_cache(c, s) {
- c->object_size = s->object_size;
- c->inuse = max(c->inuse, ALIGN(size, sizeof(void *)));
- }
-
if (sysfs_slab_alias(s, name)) {
s->refcount--;
s = NULL;
@@ -4380,7 +4436,6 @@ int __kmem_cache_create(struct kmem_cache *s, slab_flags_t flags)
if (slab_state <= UP)
return 0;
- memcg_propagate_slab_attrs(s);
err = sysfs_slab_add(s);
if (err)
__kmem_cache_release(s);
@@ -4408,6 +4463,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
return ret;
}
+EXPORT_SYMBOL(__kmalloc_track_caller);
#ifdef CONFIG_NUMA
void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
@@ -4438,6 +4494,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
return ret;
}
+EXPORT_SYMBOL(__kmalloc_node_track_caller);
#endif
#ifdef CONFIG_SYSFS
@@ -4467,7 +4524,7 @@ static void validate_slab(struct kmem_cache *s, struct page *page)
/* Now we know that a valid freelist exists */
map = get_map(s, page);
for_each_object(p, s, addr, page->objects) {
- u8 val = test_bit(slab_index(p, s, addr), map) ?
+ u8 val = test_bit(__obj_to_index(s, addr, p), map) ?
SLUB_RED_INACTIVE : SLUB_RED_ACTIVE;
if (!check_object(s, page, p, val))
@@ -4658,7 +4715,7 @@ static void process_slab(struct loc_track *t, struct kmem_cache *s,
map = get_map(s, page);
for_each_object(p, s, addr, page->objects)
- if (!test_bit(slab_index(p, s, addr), map))
+ if (!test_bit(__obj_to_index(s, addr, p), map))
add_location(t, s, get_track(s, p, alloc));
put_map(map);
}
@@ -4942,20 +4999,6 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
return x + sprintf(buf + x, "\n");
}
-#ifdef CONFIG_SLUB_DEBUG
-static int any_slab_objects(struct kmem_cache *s)
-{
- int node;
- struct kmem_cache_node *n;
-
- for_each_kmem_cache_node(s, node, n)
- if (atomic_long_read(&n->total_objects))
- return 1;
-
- return 0;
-}
-#endif
-
#define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
#define to_slab(n) container_of(n, struct kmem_cache, kobj)
@@ -4997,28 +5040,11 @@ static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf)
}
SLAB_ATTR_RO(objs_per_slab);
-static ssize_t order_store(struct kmem_cache *s,
- const char *buf, size_t length)
-{
- unsigned int order;
- int err;
-
- err = kstrtouint(buf, 10, &order);
- if (err)
- return err;
-
- if (order > slub_max_order || order < slub_min_order)
- return -EINVAL;
-
- calculate_sizes(s, order);
- return length;
-}
-
static ssize_t order_show(struct kmem_cache *s, char *buf)
{
return sprintf(buf, "%u\n", oo_order(s->oo));
}
-SLAB_ATTR(order);
+SLAB_ATTR_RO(order);
static ssize_t min_partial_show(struct kmem_cache *s, char *buf)
{
@@ -5140,16 +5166,7 @@ static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
{
return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
}
-
-static ssize_t reclaim_account_store(struct kmem_cache *s,
- const char *buf, size_t length)
-{
- s->flags &= ~SLAB_RECLAIM_ACCOUNT;
- if (buf[0] == '1')
- s->flags |= SLAB_RECLAIM_ACCOUNT;
- return length;
-}
-SLAB_ATTR(reclaim_account);
+SLAB_ATTR_RO(reclaim_account);
static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf)
{
@@ -5194,104 +5211,34 @@ static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
{
return sprintf(buf, "%d\n", !!(s->flags & SLAB_CONSISTENCY_CHECKS));
}
-
-static ssize_t sanity_checks_store(struct kmem_cache *s,
- const char *buf, size_t length)
-{
- s->flags &= ~SLAB_CONSISTENCY_CHECKS;
- if (buf[0] == '1') {
- s->flags &= ~__CMPXCHG_DOUBLE;
- s->flags |= SLAB_CONSISTENCY_CHECKS;
- }
- return length;
-}
-SLAB_ATTR(sanity_checks);
+SLAB_ATTR_RO(sanity_checks);
static ssize_t trace_show(struct kmem_cache *s, char *buf)
{
return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE));
}
-
-static ssize_t trace_store(struct kmem_cache *s, const char *buf,
- size_t length)
-{
- /*
- * Tracing a merged cache is going to give confusing results
- * as well as cause other issues like converting a mergeable
- * cache into an umergeable one.
- */
- if (s->refcount > 1)
- return -EINVAL;
-
- s->flags &= ~SLAB_TRACE;
- if (buf[0] == '1') {
- s->flags &= ~__CMPXCHG_DOUBLE;
- s->flags |= SLAB_TRACE;
- }
- return length;
-}
-SLAB_ATTR(trace);
+SLAB_ATTR_RO(trace);
static ssize_t red_zone_show(struct kmem_cache *s, char *buf)
{
return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE));
}
-static ssize_t red_zone_store(struct kmem_cache *s,
- const char *buf, size_t length)
-{
- if (any_slab_objects(s))
- return -EBUSY;
-
- s->flags &= ~SLAB_RED_ZONE;
- if (buf[0] == '1') {
- s->flags |= SLAB_RED_ZONE;
- }
- calculate_sizes(s, -1);
- return length;
-}
-SLAB_ATTR(red_zone);
+SLAB_ATTR_RO(red_zone);
static ssize_t poison_show(struct kmem_cache *s, char *buf)
{
return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON));
}
-static ssize_t poison_store(struct kmem_cache *s,
- const char *buf, size_t length)
-{
- if (any_slab_objects(s))
- return -EBUSY;
-
- s->flags &= ~SLAB_POISON;
- if (buf[0] == '1') {
- s->flags |= SLAB_POISON;
- }
- calculate_sizes(s, -1);
- return length;
-}
-SLAB_ATTR(poison);
+SLAB_ATTR_RO(poison);
static ssize_t store_user_show(struct kmem_cache *s, char *buf)
{
return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER));
}
-static ssize_t store_user_store(struct kmem_cache *s,
- const char *buf, size_t length)
-{
- if (any_slab_objects(s))
- return -EBUSY;
-
- s->flags &= ~SLAB_STORE_USER;
- if (buf[0] == '1') {
- s->flags &= ~__CMPXCHG_DOUBLE;
- s->flags |= SLAB_STORE_USER;
- }
- calculate_sizes(s, -1);
- return length;
-}
-SLAB_ATTR(store_user);
+SLAB_ATTR_RO(store_user);
static ssize_t validate_show(struct kmem_cache *s, char *buf)
{
@@ -5334,19 +5281,7 @@ static ssize_t failslab_show(struct kmem_cache *s, char *buf)
{
return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB));
}
-
-static ssize_t failslab_store(struct kmem_cache *s, const char *buf,
- size_t length)
-{
- if (s->refcount > 1)
- return -EINVAL;
-
- s->flags &= ~SLAB_FAILSLAB;
- if (buf[0] == '1')
- s->flags |= SLAB_FAILSLAB;
- return length;
-}
-SLAB_ATTR(failslab);
+SLAB_ATTR_RO(failslab);
#endif
static ssize_t shrink_show(struct kmem_cache *s, char *buf)
@@ -5358,7 +5293,7 @@ static ssize_t shrink_store(struct kmem_cache *s,
const char *buf, size_t length)
{
if (buf[0] == '1')
- kmem_cache_shrink_all(s);
+ kmem_cache_shrink(s);
else
return -EINVAL;
return length;
@@ -5582,97 +5517,9 @@ static ssize_t slab_attr_store(struct kobject *kobj,
return -EIO;
err = attribute->store(s, buf, len);
-#ifdef CONFIG_MEMCG
- if (slab_state >= FULL && err >= 0 && is_root_cache(s)) {
- struct kmem_cache *c;
-
- mutex_lock(&slab_mutex);
- if (s->max_attr_size < len)
- s->max_attr_size = len;
-
- /*
- * This is a best effort propagation, so this function's return
- * value will be determined by the parent cache only. This is
- * basically because not all attributes will have a well
- * defined semantics for rollbacks - most of the actions will
- * have permanent effects.
- *
- * Returning the error value of any of the children that fail
- * is not 100 % defined, in the sense that users seeing the
- * error code won't be able to know anything about the state of
- * the cache.
- *
- * Only returning the error code for the parent cache at least
- * has well defined semantics. The cache being written to
- * directly either failed or succeeded, in which case we loop
- * through the descendants with best-effort propagation.
- */
- for_each_memcg_cache(c, s)
- attribute->store(c, buf, len);
- mutex_unlock(&slab_mutex);
- }
-#endif
return err;
}
-static void memcg_propagate_slab_attrs(struct kmem_cache *s)
-{
-#ifdef CONFIG_MEMCG
- int i;
- char *buffer = NULL;
- struct kmem_cache *root_cache;
-
- if (is_root_cache(s))
- return;
-
- root_cache = s->memcg_params.root_cache;
-
- /*
- * This mean this cache had no attribute written. Therefore, no point
- * in copying default values around
- */
- if (!root_cache->max_attr_size)
- return;
-
- for (i = 0; i < ARRAY_SIZE(slab_attrs); i++) {
- char mbuf[64];
- char *buf;
- struct slab_attribute *attr = to_slab_attr(slab_attrs[i]);
- ssize_t len;
-
- if (!attr || !attr->store || !attr->show)
- continue;
-
- /*
- * It is really bad that we have to allocate here, so we will
- * do it only as a fallback. If we actually allocate, though,
- * we can just use the allocated buffer until the end.
- *
- * Most of the slub attributes will tend to be very small in
- * size, but sysfs allows buffers up to a page, so they can
- * theoretically happen.
- */
- if (buffer)
- buf = buffer;
- else if (root_cache->max_attr_size < ARRAY_SIZE(mbuf))
- buf = mbuf;
- else {
- buffer = (char *) get_zeroed_page(GFP_KERNEL);
- if (WARN_ON(!buffer))
- continue;
- buf = buffer;
- }
-
- len = attr->show(root_cache, buf);
- if (len > 0)
- attr->store(s, buf, len);
- }
-
- if (buffer)
- free_page((unsigned long)buffer);
-#endif /* CONFIG_MEMCG */
-}
-
static void kmem_cache_release(struct kobject *k)
{
slab_kmem_cache_release(to_slab(k));
@@ -5688,27 +5535,10 @@ static struct kobj_type slab_ktype = {
.release = kmem_cache_release,
};
-static int uevent_filter(struct kset *kset, struct kobject *kobj)
-{
- struct kobj_type *ktype = get_ktype(kobj);
-
- if (ktype == &slab_ktype)
- return 1;
- return 0;
-}
-
-static const struct kset_uevent_ops slab_uevent_ops = {
- .filter = uevent_filter,
-};
-
static struct kset *slab_kset;
static inline struct kset *cache_kset(struct kmem_cache *s)
{
-#ifdef CONFIG_MEMCG
- if (!is_root_cache(s))
- return s->memcg_params.root_cache->memcg_kset;
-#endif
return slab_kset;
}
@@ -5751,28 +5581,6 @@ static char *create_unique_id(struct kmem_cache *s)
return name;
}
-static void sysfs_slab_remove_workfn(struct work_struct *work)
-{
- struct kmem_cache *s =
- container_of(work, struct kmem_cache, kobj_remove_work);
-
- if (!s->kobj.state_in_sysfs)
- /*
- * For a memcg cache, this may be called during
- * deactivation and again on shutdown. Remove only once.
- * A cache is never shut down before deactivation is
- * complete, so no need to worry about synchronization.
- */
- goto out;
-
-#ifdef CONFIG_MEMCG
- kset_unregister(s->memcg_kset);
-#endif
- kobject_uevent(&s->kobj, KOBJ_REMOVE);
-out:
- kobject_put(&s->kobj);
-}
-
static int sysfs_slab_add(struct kmem_cache *s)
{
int err;
@@ -5780,8 +5588,6 @@ static int sysfs_slab_add(struct kmem_cache *s)
struct kset *kset = cache_kset(s);
int unmergeable = slab_unmergeable(s);
- INIT_WORK(&s->kobj_remove_work, sysfs_slab_remove_workfn);
-
if (!kset) {
kobject_init(&s->kobj, &slab_ktype);
return 0;
@@ -5809,24 +5615,15 @@ static int sysfs_slab_add(struct kmem_cache *s)
s->kobj.kset = kset;
err = kobject_init_and_add(&s->kobj, &slab_ktype, NULL, "%s", name);
- if (err)
+ if (err) {
+ kobject_put(&s->kobj);
goto out;
+ }
err = sysfs_create_group(&s->kobj, &slab_attr_group);
if (err)
goto out_del_kobj;
-#ifdef CONFIG_MEMCG
- if (is_root_cache(s) && memcg_sysfs_enabled) {
- s->memcg_kset = kset_create_and_add("cgroup", NULL, &s->kobj);
- if (!s->memcg_kset) {
- err = -ENOMEM;
- goto out_del_kobj;
- }
- }
-#endif
-
- kobject_uevent(&s->kobj, KOBJ_ADD);
if (!unmergeable) {
/* Setup first alias */
sysfs_slab_alias(s, s->name);
@@ -5840,19 +5637,6 @@ out_del_kobj:
goto out;
}
-static void sysfs_slab_remove(struct kmem_cache *s)
-{
- if (slab_state < FULL)
- /*
- * Sysfs has not been setup yet so no need to remove the
- * cache from sysfs.
- */
- return;
-
- kobject_get(&s->kobj);
- schedule_work(&s->kobj_remove_work);
-}
-
void sysfs_slab_unlink(struct kmem_cache *s)
{
if (slab_state >= FULL)
@@ -5907,7 +5691,7 @@ static int __init slab_sysfs_init(void)
mutex_lock(&slab_mutex);
- slab_kset = kset_create_and_add("slab", &slab_uevent_ops, kernel_kobj);
+ slab_kset = kset_create_and_add("slab", NULL, kernel_kobj);
if (!slab_kset) {
mutex_unlock(&slab_mutex);
pr_err("Cannot register slab subsystem.\n");
diff --git a/mm/sparse-vmemmap.c b/mm/sparse-vmemmap.c
index 200aef686722..16183d85a7d5 100644
--- a/mm/sparse-vmemmap.c
+++ b/mm/sparse-vmemmap.c
@@ -29,7 +29,6 @@
#include <linux/sched.h>
#include <asm/dma.h>
#include <asm/pgalloc.h>
-#include <asm/pgtable.h>
/*
* Allocate a block of memory to be used to back the virtual memory map
@@ -70,11 +69,19 @@ void * __meminit vmemmap_alloc_block(unsigned long size, int node)
__pa(MAX_DMA_ADDRESS));
}
+static void * __meminit altmap_alloc_block_buf(unsigned long size,
+ struct vmem_altmap *altmap);
+
/* need to make sure size is all the same during early stage */
-void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node)
+void * __meminit vmemmap_alloc_block_buf(unsigned long size, int node,
+ struct vmem_altmap *altmap)
{
- void *ptr = sparse_buffer_alloc(size);
+ void *ptr;
+
+ if (altmap)
+ return altmap_alloc_block_buf(size, altmap);
+ ptr = sparse_buffer_alloc(size);
if (!ptr)
ptr = vmemmap_alloc_block(size, node);
return ptr;
@@ -95,15 +102,8 @@ static unsigned long __meminit vmem_altmap_nr_free(struct vmem_altmap *altmap)
return 0;
}
-/**
- * altmap_alloc_block_buf - allocate pages from the device page map
- * @altmap: device page map
- * @size: size (in bytes) of the allocation
- *
- * Allocations are aligned to the size of the request.
- */
-void * __meminit altmap_alloc_block_buf(unsigned long size,
- struct vmem_altmap *altmap)
+static void * __meminit altmap_alloc_block_buf(unsigned long size,
+ struct vmem_altmap *altmap)
{
unsigned long pfn, nr_pfns, nr_align;
@@ -140,12 +140,15 @@ void __meminit vmemmap_verify(pte_t *pte, int node,
start, end - 1);
}
-pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node)
+pte_t * __meminit vmemmap_pte_populate(pmd_t *pmd, unsigned long addr, int node,
+ struct vmem_altmap *altmap)
{
pte_t *pte = pte_offset_kernel(pmd, addr);
if (pte_none(*pte)) {
pte_t entry;
- void *p = vmemmap_alloc_block_buf(PAGE_SIZE, node);
+ void *p;
+
+ p = vmemmap_alloc_block_buf(PAGE_SIZE, node, altmap);
if (!p)
return NULL;
entry = pfn_pte(__pa(p) >> PAGE_SHIFT, PAGE_KERNEL);
@@ -213,8 +216,8 @@ pgd_t * __meminit vmemmap_pgd_populate(unsigned long addr, int node)
return pgd;
}
-int __meminit vmemmap_populate_basepages(unsigned long start,
- unsigned long end, int node)
+int __meminit vmemmap_populate_basepages(unsigned long start, unsigned long end,
+ int node, struct vmem_altmap *altmap)
{
unsigned long addr = start;
pgd_t *pgd;
@@ -236,7 +239,7 @@ int __meminit vmemmap_populate_basepages(unsigned long start,
pmd = vmemmap_pmd_populate(pud, addr, node);
if (!pmd)
return -ENOMEM;
- pte = vmemmap_pte_populate(pmd, addr, node);
+ pte = vmemmap_pte_populate(pmd, addr, node, altmap);
if (!pte)
return -ENOMEM;
vmemmap_verify(pte, node, addr, addr + PAGE_SIZE);
@@ -248,20 +251,12 @@ int __meminit vmemmap_populate_basepages(unsigned long start,
struct page * __meminit __populate_section_memmap(unsigned long pfn,
unsigned long nr_pages, int nid, struct vmem_altmap *altmap)
{
- unsigned long start;
- unsigned long end;
-
- /*
- * The minimum granularity of memmap extensions is
- * PAGES_PER_SUBSECTION as allocations are tracked in the
- * 'subsection_map' bitmap of the section.
- */
- end = ALIGN(pfn + nr_pages, PAGES_PER_SUBSECTION);
- pfn &= PAGE_SUBSECTION_MASK;
- nr_pages = end - pfn;
-
- start = (unsigned long) pfn_to_page(pfn);
- end = start + nr_pages * sizeof(struct page);
+ unsigned long start = (unsigned long) pfn_to_page(pfn);
+ unsigned long end = start + nr_pages * sizeof(struct page);
+
+ if (WARN_ON_ONCE(!IS_ALIGNED(pfn, PAGES_PER_SUBSECTION) ||
+ !IS_ALIGNED(nr_pages, PAGES_PER_SUBSECTION)))
+ return NULL;
if (vmemmap_populate(start, end, nid, altmap))
return NULL;
diff --git a/mm/sparse.c b/mm/sparse.c
index 1aee5a481571..fcc3d176f1ea 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -16,8 +16,6 @@
#include "internal.h"
#include <asm/dma.h>
-#include <asm/pgalloc.h>
-#include <asm/pgtable.h>
/*
* Permanent SPARSEMEM data:
@@ -251,7 +249,7 @@ void __init subsection_map_init(unsigned long pfn, unsigned long nr_pages)
#endif
/* Record a memory area against a node. */
-void __init memory_present(int nid, unsigned long start, unsigned long end)
+static void __init memory_present(int nid, unsigned long start, unsigned long end)
{
unsigned long pfn;
@@ -287,11 +285,11 @@ void __init memory_present(int nid, unsigned long start, unsigned long end)
}
/*
- * Mark all memblocks as present using memory_present(). This is a
- * convienence function that is useful for a number of arches
- * to mark all of the systems memory as present during initialization.
+ * Mark all memblocks as present using memory_present().
+ * This is a convenience function that is useful to mark all of the systems
+ * memory as present during initialization.
*/
-void __init memblocks_present(void)
+static void __init memblocks_present(void)
{
struct memblock_region *reg;
@@ -576,9 +574,13 @@ failed:
*/
void __init sparse_init(void)
{
- unsigned long pnum_begin = first_present_section_nr();
- int nid_begin = sparse_early_nid(__nr_to_section(pnum_begin));
- unsigned long pnum_end, map_count = 1;
+ unsigned long pnum_end, pnum_begin, map_count = 1;
+ int nid_begin;
+
+ memblocks_present();
+
+ pnum_begin = first_present_section_nr();
+ nid_begin = sparse_early_nid(__nr_to_section(pnum_begin));
/* Setup pageblock_order for HUGETLB_PAGE_SIZE_VARIABLE */
set_pageblock_order();
@@ -826,10 +828,14 @@ static void section_deactivate(unsigned long pfn, unsigned long nr_pages,
ms->section_mem_map &= ~SECTION_HAS_MEM_MAP;
}
- if (section_is_early && memmap)
- free_map_bootmem(memmap);
- else
+ /*
+ * The memmap of early sections is always fully populated. See
+ * section_activate() and pfn_valid() .
+ */
+ if (!section_is_early)
depopulate_section_memmap(pfn, nr_pages, altmap);
+ else if (memmap)
+ free_map_bootmem(memmap);
if (empty)
ms->section_mem_map = (unsigned long)NULL;
diff --git a/mm/swap.c b/mm/swap.c
index bf9a79fed62d..e7bdf094f76a 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -35,6 +35,7 @@
#include <linux/uio.h>
#include <linux/hugetlb.h>
#include <linux/page_idle.h>
+#include <linux/local_lock.h>
#include "internal.h"
@@ -44,14 +45,32 @@
/* How many pages do we try to swap or page in/out together? */
int page_cluster;
-static DEFINE_PER_CPU(struct pagevec, lru_add_pvec);
-static DEFINE_PER_CPU(struct pagevec, lru_rotate_pvecs);
-static DEFINE_PER_CPU(struct pagevec, lru_deactivate_file_pvecs);
-static DEFINE_PER_CPU(struct pagevec, lru_deactivate_pvecs);
-static DEFINE_PER_CPU(struct pagevec, lru_lazyfree_pvecs);
+/* Protecting only lru_rotate.pvec which requires disabling interrupts */
+struct lru_rotate {
+ local_lock_t lock;
+ struct pagevec pvec;
+};
+static DEFINE_PER_CPU(struct lru_rotate, lru_rotate) = {
+ .lock = INIT_LOCAL_LOCK(lock),
+};
+
+/*
+ * The following struct pagevec are grouped together because they are protected
+ * by disabling preemption (and interrupts remain enabled).
+ */
+struct lru_pvecs {
+ local_lock_t lock;
+ struct pagevec lru_add;
+ struct pagevec lru_deactivate_file;
+ struct pagevec lru_deactivate;
+ struct pagevec lru_lazyfree;
#ifdef CONFIG_SMP
-static DEFINE_PER_CPU(struct pagevec, activate_page_pvecs);
+ struct pagevec activate_page;
#endif
+};
+static DEFINE_PER_CPU(struct lru_pvecs, lru_pvecs) = {
+ .lock = INIT_LOCAL_LOCK(lock),
+};
/*
* This path almost never happens for VM activity - pages are normally
@@ -83,8 +102,6 @@ static void __put_single_page(struct page *page)
static void __put_compound_page(struct page *page)
{
- compound_page_dtor *dtor;
-
/*
* __page_cache_release() is supposed to be called for thp, not for
* hugetlb. This is because hugetlb page does never have PageLRU set
@@ -93,8 +110,7 @@ static void __put_compound_page(struct page *page)
*/
if (!PageHuge(page))
__page_cache_release(page);
- dtor = get_compound_page_dtor(page);
- (*dtor)(page);
+ destroy_compound_page(page);
}
void __put_page(struct page *page)
@@ -225,7 +241,7 @@ static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec,
del_page_from_lru_list(page, lruvec, page_lru(page));
ClearPageActive(page);
add_page_to_lru_list_tail(page, lruvec, page_lru(page));
- (*pgmoved)++;
+ (*pgmoved) += thp_nr_pages(page);
}
}
@@ -254,30 +270,57 @@ void rotate_reclaimable_page(struct page *page)
unsigned long flags;
get_page(page);
- local_irq_save(flags);
- pvec = this_cpu_ptr(&lru_rotate_pvecs);
+ local_lock_irqsave(&lru_rotate.lock, flags);
+ pvec = this_cpu_ptr(&lru_rotate.pvec);
if (!pagevec_add(pvec, page) || PageCompound(page))
pagevec_move_tail(pvec);
- local_irq_restore(flags);
+ local_unlock_irqrestore(&lru_rotate.lock, flags);
}
}
-static void update_page_reclaim_stat(struct lruvec *lruvec,
- int file, int rotated)
+void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
{
- struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
+ do {
+ unsigned long lrusize;
- reclaim_stat->recent_scanned[file]++;
- if (rotated)
- reclaim_stat->recent_rotated[file]++;
+ /* Record cost event */
+ if (file)
+ lruvec->file_cost += nr_pages;
+ else
+ lruvec->anon_cost += nr_pages;
+
+ /*
+ * Decay previous events
+ *
+ * Because workloads change over time (and to avoid
+ * overflow) we keep these statistics as a floating
+ * average, which ends up weighing recent refaults
+ * more than old ones.
+ */
+ lrusize = lruvec_page_state(lruvec, NR_INACTIVE_ANON) +
+ lruvec_page_state(lruvec, NR_ACTIVE_ANON) +
+ lruvec_page_state(lruvec, NR_INACTIVE_FILE) +
+ lruvec_page_state(lruvec, NR_ACTIVE_FILE);
+
+ if (lruvec->file_cost + lruvec->anon_cost > lrusize / 4) {
+ lruvec->file_cost /= 2;
+ lruvec->anon_cost /= 2;
+ }
+ } while ((lruvec = parent_lruvec(lruvec)));
+}
+
+void lru_note_cost_page(struct page *page)
+{
+ lru_note_cost(mem_cgroup_page_lruvec(page, page_pgdat(page)),
+ page_is_file_lru(page), thp_nr_pages(page));
}
static void __activate_page(struct page *page, struct lruvec *lruvec,
void *arg)
{
if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
- int file = page_is_file_lru(page);
int lru = page_lru_base_type(page);
+ int nr_pages = thp_nr_pages(page);
del_page_from_lru_list(page, lruvec, lru);
SetPageActive(page);
@@ -285,15 +328,16 @@ static void __activate_page(struct page *page, struct lruvec *lruvec,
add_page_to_lru_list(page, lruvec, lru);
trace_mm_lru_activate(page);
- __count_vm_event(PGACTIVATE);
- update_page_reclaim_stat(lruvec, file, 1);
+ __count_vm_events(PGACTIVATE, nr_pages);
+ __count_memcg_events(lruvec_memcg(lruvec), PGACTIVATE,
+ nr_pages);
}
}
#ifdef CONFIG_SMP
static void activate_page_drain(int cpu)
{
- struct pagevec *pvec = &per_cpu(activate_page_pvecs, cpu);
+ struct pagevec *pvec = &per_cpu(lru_pvecs.activate_page, cpu);
if (pagevec_count(pvec))
pagevec_lru_move_fn(pvec, __activate_page, NULL);
@@ -301,19 +345,21 @@ static void activate_page_drain(int cpu)
static bool need_activate_page_drain(int cpu)
{
- return pagevec_count(&per_cpu(activate_page_pvecs, cpu)) != 0;
+ return pagevec_count(&per_cpu(lru_pvecs.activate_page, cpu)) != 0;
}
void activate_page(struct page *page)
{
page = compound_head(page);
if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
- struct pagevec *pvec = &get_cpu_var(activate_page_pvecs);
+ struct pagevec *pvec;
+ local_lock(&lru_pvecs.lock);
+ pvec = this_cpu_ptr(&lru_pvecs.activate_page);
get_page(page);
if (!pagevec_add(pvec, page) || PageCompound(page))
pagevec_lru_move_fn(pvec, __activate_page, NULL);
- put_cpu_var(activate_page_pvecs);
+ local_unlock(&lru_pvecs.lock);
}
}
@@ -335,9 +381,12 @@ void activate_page(struct page *page)
static void __lru_cache_activate_page(struct page *page)
{
- struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
+ struct pagevec *pvec;
int i;
+ local_lock(&lru_pvecs.lock);
+ pvec = this_cpu_ptr(&lru_pvecs.lru_add);
+
/*
* Search backwards on the optimistic assumption that the page being
* activated has just been added to this pagevec. Note that only
@@ -357,7 +406,7 @@ static void __lru_cache_activate_page(struct page *page)
}
}
- put_cpu_var(lru_add_pvec);
+ local_unlock(&lru_pvecs.lock);
}
/*
@@ -385,7 +434,7 @@ void mark_page_accessed(struct page *page)
} else if (!PageActive(page)) {
/*
* If the page is on the LRU, queue it for activation via
- * activate_page_pvecs. Otherwise, assume the page is on a
+ * lru_pvecs.activate_page. Otherwise, assume the page is on a
* pagevec, mark it active and it'll be moved to the active
* LRU on the next drain.
*/
@@ -394,43 +443,13 @@ void mark_page_accessed(struct page *page)
else
__lru_cache_activate_page(page);
ClearPageReferenced(page);
- if (page_is_file_lru(page))
- workingset_activation(page);
+ workingset_activation(page);
}
if (page_is_idle(page))
clear_page_idle(page);
}
EXPORT_SYMBOL(mark_page_accessed);
-static void __lru_cache_add(struct page *page)
-{
- struct pagevec *pvec = &get_cpu_var(lru_add_pvec);
-
- get_page(page);
- if (!pagevec_add(pvec, page) || PageCompound(page))
- __pagevec_lru_add(pvec);
- put_cpu_var(lru_add_pvec);
-}
-
-/**
- * lru_cache_add_anon - add a page to the page lists
- * @page: the page to add
- */
-void lru_cache_add_anon(struct page *page)
-{
- if (PageActive(page))
- ClearPageActive(page);
- __lru_cache_add(page);
-}
-
-void lru_cache_add_file(struct page *page)
-{
- if (PageActive(page))
- ClearPageActive(page);
- __lru_cache_add(page);
-}
-EXPORT_SYMBOL(lru_cache_add_file);
-
/**
* lru_cache_add - add a page to a page list
* @page: the page to be added to the LRU.
@@ -442,37 +461,47 @@ EXPORT_SYMBOL(lru_cache_add_file);
*/
void lru_cache_add(struct page *page)
{
+ struct pagevec *pvec;
+
VM_BUG_ON_PAGE(PageActive(page) && PageUnevictable(page), page);
VM_BUG_ON_PAGE(PageLRU(page), page);
- __lru_cache_add(page);
+
+ get_page(page);
+ local_lock(&lru_pvecs.lock);
+ pvec = this_cpu_ptr(&lru_pvecs.lru_add);
+ if (!pagevec_add(pvec, page) || PageCompound(page))
+ __pagevec_lru_add(pvec);
+ local_unlock(&lru_pvecs.lock);
}
+EXPORT_SYMBOL(lru_cache_add);
/**
- * lru_cache_add_active_or_unevictable
+ * lru_cache_add_inactive_or_unevictable
* @page: the page to be added to LRU
* @vma: vma in which page is mapped for determining reclaimability
*
- * Place @page on the active or unevictable LRU list, depending on its
+ * Place @page on the inactive or unevictable LRU list, depending on its
* evictability. Note that if the page is not evictable, it goes
* directly back onto it's zone's unevictable list, it does NOT use a
* per cpu pagevec.
*/
-void lru_cache_add_active_or_unevictable(struct page *page,
+void lru_cache_add_inactive_or_unevictable(struct page *page,
struct vm_area_struct *vma)
{
+ bool unevictable;
+
VM_BUG_ON_PAGE(PageLRU(page), page);
- if (likely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) != VM_LOCKED))
- SetPageActive(page);
- else if (!TestSetPageMlocked(page)) {
+ unevictable = (vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED;
+ if (unlikely(unevictable) && !TestSetPageMlocked(page)) {
+ int nr_pages = thp_nr_pages(page);
/*
* We use the irq-unsafe __mod_zone_page_stat because this
* counter is not modified from interrupt context, and the pte
* lock is held(spinlock), which implies preemption disabled.
*/
- __mod_zone_page_state(page_zone(page), NR_MLOCK,
- hpage_nr_pages(page));
- count_vm_event(UNEVICTABLE_PGMLOCKED);
+ __mod_zone_page_state(page_zone(page), NR_MLOCK, nr_pages);
+ count_vm_events(UNEVICTABLE_PGMLOCKED, nr_pages);
}
lru_cache_add(page);
}
@@ -501,8 +530,9 @@ void lru_cache_add_active_or_unevictable(struct page *page,
static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
void *arg)
{
- int lru, file;
+ int lru;
bool active;
+ int nr_pages = thp_nr_pages(page);
if (!PageLRU(page))
return;
@@ -515,7 +545,6 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
return;
active = PageActive(page);
- file = page_is_file_lru(page);
lru = page_lru_base_type(page);
del_page_from_lru_list(page, lruvec, lru + active);
@@ -536,28 +565,31 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
* We moves tha page into tail of inactive.
*/
add_page_to_lru_list_tail(page, lruvec, lru);
- __count_vm_event(PGROTATED);
+ __count_vm_events(PGROTATED, nr_pages);
}
- if (active)
- __count_vm_event(PGDEACTIVATE);
- update_page_reclaim_stat(lruvec, file, 0);
+ if (active) {
+ __count_vm_events(PGDEACTIVATE, nr_pages);
+ __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE,
+ nr_pages);
+ }
}
static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
void *arg)
{
if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
- int file = page_is_file_lru(page);
int lru = page_lru_base_type(page);
+ int nr_pages = thp_nr_pages(page);
del_page_from_lru_list(page, lruvec, lru + LRU_ACTIVE);
ClearPageActive(page);
ClearPageReferenced(page);
add_page_to_lru_list(page, lruvec, lru);
- __count_vm_events(PGDEACTIVATE, hpage_nr_pages(page));
- update_page_reclaim_stat(lruvec, file, 0);
+ __count_vm_events(PGDEACTIVATE, nr_pages);
+ __count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE,
+ nr_pages);
}
}
@@ -567,6 +599,7 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec,
if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) &&
!PageSwapCache(page) && !PageUnevictable(page)) {
bool active = PageActive(page);
+ int nr_pages = thp_nr_pages(page);
del_page_from_lru_list(page, lruvec,
LRU_INACTIVE_ANON + active);
@@ -580,9 +613,9 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec,
ClearPageSwapBacked(page);
add_page_to_lru_list(page, lruvec, LRU_INACTIVE_FILE);
- __count_vm_events(PGLAZYFREE, hpage_nr_pages(page));
- count_memcg_page_event(page, PGLAZYFREE);
- update_page_reclaim_stat(lruvec, 1, 0);
+ __count_vm_events(PGLAZYFREE, nr_pages);
+ __count_memcg_events(lruvec_memcg(lruvec), PGLAZYFREE,
+ nr_pages);
}
}
@@ -593,30 +626,31 @@ static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec,
*/
void lru_add_drain_cpu(int cpu)
{
- struct pagevec *pvec = &per_cpu(lru_add_pvec, cpu);
+ struct pagevec *pvec = &per_cpu(lru_pvecs.lru_add, cpu);
if (pagevec_count(pvec))
__pagevec_lru_add(pvec);
- pvec = &per_cpu(lru_rotate_pvecs, cpu);
- if (pagevec_count(pvec)) {
+ pvec = &per_cpu(lru_rotate.pvec, cpu);
+ /* Disabling interrupts below acts as a compiler barrier. */
+ if (data_race(pagevec_count(pvec))) {
unsigned long flags;
/* No harm done if a racing interrupt already did this */
- local_irq_save(flags);
+ local_lock_irqsave(&lru_rotate.lock, flags);
pagevec_move_tail(pvec);
- local_irq_restore(flags);
+ local_unlock_irqrestore(&lru_rotate.lock, flags);
}
- pvec = &per_cpu(lru_deactivate_file_pvecs, cpu);
+ pvec = &per_cpu(lru_pvecs.lru_deactivate_file, cpu);
if (pagevec_count(pvec))
pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
- pvec = &per_cpu(lru_deactivate_pvecs, cpu);
+ pvec = &per_cpu(lru_pvecs.lru_deactivate, cpu);
if (pagevec_count(pvec))
pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
- pvec = &per_cpu(lru_lazyfree_pvecs, cpu);
+ pvec = &per_cpu(lru_pvecs.lru_lazyfree, cpu);
if (pagevec_count(pvec))
pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);
@@ -641,11 +675,14 @@ void deactivate_file_page(struct page *page)
return;
if (likely(get_page_unless_zero(page))) {
- struct pagevec *pvec = &get_cpu_var(lru_deactivate_file_pvecs);
+ struct pagevec *pvec;
+
+ local_lock(&lru_pvecs.lock);
+ pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate_file);
if (!pagevec_add(pvec, page) || PageCompound(page))
pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
- put_cpu_var(lru_deactivate_file_pvecs);
+ local_unlock(&lru_pvecs.lock);
}
}
@@ -660,12 +697,14 @@ void deactivate_file_page(struct page *page)
void deactivate_page(struct page *page)
{
if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
- struct pagevec *pvec = &get_cpu_var(lru_deactivate_pvecs);
+ struct pagevec *pvec;
+ local_lock(&lru_pvecs.lock);
+ pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate);
get_page(page);
if (!pagevec_add(pvec, page) || PageCompound(page))
pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
- put_cpu_var(lru_deactivate_pvecs);
+ local_unlock(&lru_pvecs.lock);
}
}
@@ -680,19 +719,30 @@ void mark_page_lazyfree(struct page *page)
{
if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) &&
!PageSwapCache(page) && !PageUnevictable(page)) {
- struct pagevec *pvec = &get_cpu_var(lru_lazyfree_pvecs);
+ struct pagevec *pvec;
+ local_lock(&lru_pvecs.lock);
+ pvec = this_cpu_ptr(&lru_pvecs.lru_lazyfree);
get_page(page);
if (!pagevec_add(pvec, page) || PageCompound(page))
pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);
- put_cpu_var(lru_lazyfree_pvecs);
+ local_unlock(&lru_pvecs.lock);
}
}
void lru_add_drain(void)
{
- lru_add_drain_cpu(get_cpu());
- put_cpu();
+ local_lock(&lru_pvecs.lock);
+ lru_add_drain_cpu(smp_processor_id());
+ local_unlock(&lru_pvecs.lock);
+}
+
+void lru_add_drain_cpu_zone(struct zone *zone)
+{
+ local_lock(&lru_pvecs.lock);
+ lru_add_drain_cpu(smp_processor_id());
+ drain_local_pages(zone);
+ local_unlock(&lru_pvecs.lock);
}
#ifdef CONFIG_SMP
@@ -743,11 +793,11 @@ void lru_add_drain_all(void)
for_each_online_cpu(cpu) {
struct work_struct *work = &per_cpu(lru_add_drain_work, cpu);
- if (pagevec_count(&per_cpu(lru_add_pvec, cpu)) ||
- pagevec_count(&per_cpu(lru_rotate_pvecs, cpu)) ||
- pagevec_count(&per_cpu(lru_deactivate_file_pvecs, cpu)) ||
- pagevec_count(&per_cpu(lru_deactivate_pvecs, cpu)) ||
- pagevec_count(&per_cpu(lru_lazyfree_pvecs, cpu)) ||
+ if (pagevec_count(&per_cpu(lru_pvecs.lru_add, cpu)) ||
+ data_race(pagevec_count(&per_cpu(lru_rotate.pvec, cpu))) ||
+ pagevec_count(&per_cpu(lru_pvecs.lru_deactivate_file, cpu)) ||
+ pagevec_count(&per_cpu(lru_pvecs.lru_deactivate, cpu)) ||
+ pagevec_count(&per_cpu(lru_pvecs.lru_lazyfree, cpu)) ||
need_activate_page_drain(cpu)) {
INIT_WORK(work, lru_add_drain_per_cpu);
queue_work_on(cpu, mm_percpu_wq, work);
@@ -782,8 +832,8 @@ void release_pages(struct page **pages, int nr)
LIST_HEAD(pages_to_free);
struct pglist_data *locked_pgdat = NULL;
struct lruvec *lruvec;
- unsigned long uninitialized_var(flags);
- unsigned int uninitialized_var(lock_batch);
+ unsigned long flags;
+ unsigned int lock_batch;
for (i = 0; i < nr; i++) {
struct page *page = pages[i];
@@ -890,8 +940,6 @@ EXPORT_SYMBOL(__pagevec_release);
void lru_add_page_tail(struct page *page, struct page *page_tail,
struct lruvec *lruvec, struct list_head *list)
{
- const int file = 0;
-
VM_BUG_ON_PAGE(!PageHead(page), page);
VM_BUG_ON_PAGE(PageCompound(page_tail), page);
VM_BUG_ON_PAGE(PageLRU(page_tail), page);
@@ -917,9 +965,6 @@ void lru_add_page_tail(struct page *page, struct page *page_tail,
add_page_to_lru_list_tail(page_tail, lruvec,
page_lru(page_tail));
}
-
- if (!PageUnevictable(page))
- update_page_reclaim_stat(lruvec, file, PageActive(page_tail));
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -928,6 +973,7 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
{
enum lru_list lru;
int was_unevictable = TestClearPageUnevictable(page);
+ int nr_pages = thp_nr_pages(page);
VM_BUG_ON_PAGE(PageLRU(page), page);
@@ -962,16 +1008,14 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
if (page_evictable(page)) {
lru = page_lru(page);
- update_page_reclaim_stat(lruvec, page_is_file_lru(page),
- PageActive(page));
if (was_unevictable)
- count_vm_event(UNEVICTABLE_PGRESCUED);
+ __count_vm_events(UNEVICTABLE_PGRESCUED, nr_pages);
} else {
lru = LRU_UNEVICTABLE;
ClearPageActive(page);
SetPageUnevictable(page);
if (!was_unevictable)
- count_vm_event(UNEVICTABLE_PGCULLED);
+ __count_vm_events(UNEVICTABLE_PGCULLED, nr_pages);
}
add_page_to_lru_list(page, lruvec, lru);
diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c
index 45affaef3bc6..7f34343c075a 100644
--- a/mm/swap_cgroup.c
+++ b/mm/swap_cgroup.c
@@ -171,9 +171,6 @@ int swap_cgroup_swapon(int type, unsigned long max_pages)
unsigned long length;
struct swap_cgroup_ctrl *ctrl;
- if (!do_swap_account)
- return 0;
-
length = DIV_ROUND_UP(max_pages, SC_PER_PAGE);
array_size = length * sizeof(void *);
@@ -209,9 +206,6 @@ void swap_cgroup_swapoff(int type)
unsigned long i, length;
struct swap_cgroup_ctrl *ctrl;
- if (!do_swap_account)
- return;
-
mutex_lock(&swap_cgroup_mutex);
ctrl = &swap_cgroup_ctrl[type];
map = ctrl->map;
diff --git a/mm/swap_slots.c b/mm/swap_slots.c
index 0975adc72253..3e6453573a89 100644
--- a/mm/swap_slots.c
+++ b/mm/swap_slots.c
@@ -46,8 +46,7 @@ static void __drain_swap_slots_cache(unsigned int type);
static void deactivate_swap_slots_cache(void);
static void reactivate_swap_slots_cache(void);
-#define use_swap_slot_cache (swap_slot_cache_active && \
- swap_slot_cache_enabled && swap_slot_cache_initialized)
+#define use_swap_slot_cache (swap_slot_cache_active && swap_slot_cache_enabled)
#define SLOTS_CACHE 0x1
#define SLOTS_CACHE_RET 0x2
@@ -94,7 +93,7 @@ static bool check_cache_active(void)
{
long pages;
- if (!swap_slot_cache_enabled || !swap_slot_cache_initialized)
+ if (!swap_slot_cache_enabled)
return false;
pages = get_nr_swap_pages();
@@ -136,9 +135,16 @@ static int alloc_swap_slot_cache(unsigned int cpu)
mutex_lock(&swap_slots_cache_mutex);
cache = &per_cpu(swp_slots, cpu);
- if (cache->slots || cache->slots_ret)
+ if (cache->slots || cache->slots_ret) {
/* cache already allocated */
- goto out;
+ mutex_unlock(&swap_slots_cache_mutex);
+
+ kvfree(slots);
+ kvfree(slots_ret);
+
+ return 0;
+ }
+
if (!cache->lock_initialized) {
mutex_init(&cache->alloc_lock);
spin_lock_init(&cache->free_lock);
@@ -155,15 +161,8 @@ static int alloc_swap_slot_cache(unsigned int cpu)
*/
mb();
cache->slots = slots;
- slots = NULL;
cache->slots_ret = slots_ret;
- slots_ret = NULL;
-out:
mutex_unlock(&swap_slots_cache_mutex);
- if (slots)
- kvfree(slots);
- if (slots_ret)
- kvfree(slots_ret);
return 0;
}
@@ -240,21 +239,19 @@ static int free_slot_cache(unsigned int cpu)
int enable_swap_slots_cache(void)
{
- int ret = 0;
-
mutex_lock(&swap_slots_cache_enable_mutex);
- if (swap_slot_cache_initialized) {
- __reenable_swap_slots_cache();
- goto out_unlock;
- }
+ if (!swap_slot_cache_initialized) {
+ int ret;
- ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "swap_slots_cache",
- alloc_swap_slot_cache, free_slot_cache);
- if (WARN_ONCE(ret < 0, "Cache allocation failed (%s), operating "
- "without swap slots cache.\n", __func__))
- goto out_unlock;
+ ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "swap_slots_cache",
+ alloc_swap_slot_cache, free_slot_cache);
+ if (WARN_ONCE(ret < 0, "Cache allocation failed (%s), operating "
+ "without swap slots cache.\n", __func__))
+ goto out_unlock;
+
+ swap_slot_cache_initialized = true;
+ }
- swap_slot_cache_initialized = true;
__reenable_swap_slots_cache();
out_unlock:
mutex_unlock(&swap_slots_cache_enable_mutex);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index ebed37bbf7a3..c16eebb81d8b 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -21,8 +21,7 @@
#include <linux/vmalloc.h>
#include <linux/swap_slots.h>
#include <linux/huge_mm.h>
-
-#include <asm/pgtable.h>
+#include "internal.h"
/*
* swapper_space is a fiction, retained to simplify the path through
@@ -58,8 +57,8 @@ static bool enable_vma_readahead __read_mostly = true;
#define GET_SWAP_RA_VAL(vma) \
(atomic_long_read(&(vma)->swap_readahead_info) ? : 4)
-#define INC_CACHE_INFO(x) do { swap_cache_info.x++; } while (0)
-#define ADD_CACHE_INFO(x, nr) do { swap_cache_info.x += (nr); } while (0)
+#define INC_CACHE_INFO(x) data_race(swap_cache_info.x++)
+#define ADD_CACHE_INFO(x, nr) data_race(swap_cache_info.x += (nr))
static struct {
unsigned long add_total;
@@ -107,16 +106,32 @@ void show_swap_cache_info(void)
printk("Total swap = %lukB\n", total_swap_pages << (PAGE_SHIFT - 10));
}
+void *get_shadow_from_swap_cache(swp_entry_t entry)
+{
+ struct address_space *address_space = swap_address_space(entry);
+ pgoff_t idx = swp_offset(entry);
+ struct page *page;
+
+ page = find_get_entry(address_space, idx);
+ if (xa_is_value(page))
+ return page;
+ if (page)
+ put_page(page);
+ return NULL;
+}
+
/*
* add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
* but sets SwapCache flag and private instead of mapping and index.
*/
-int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp)
+int add_to_swap_cache(struct page *page, swp_entry_t entry,
+ gfp_t gfp, void **shadowp)
{
struct address_space *address_space = swap_address_space(entry);
pgoff_t idx = swp_offset(entry);
XA_STATE_ORDER(xas, &address_space->i_pages, idx, compound_order(page));
- unsigned long i, nr = hpage_nr_pages(page);
+ unsigned long i, nr = thp_nr_pages(page);
+ void *old;
VM_BUG_ON_PAGE(!PageLocked(page), page);
VM_BUG_ON_PAGE(PageSwapCache(page), page);
@@ -126,16 +141,25 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp)
SetPageSwapCache(page);
do {
+ unsigned long nr_shadows = 0;
+
xas_lock_irq(&xas);
xas_create_range(&xas);
if (xas_error(&xas))
goto unlock;
for (i = 0; i < nr; i++) {
VM_BUG_ON_PAGE(xas.xa_index != idx + i, page);
+ old = xas_load(&xas);
+ if (xa_is_value(old)) {
+ nr_shadows++;
+ if (shadowp)
+ *shadowp = old;
+ }
set_page_private(page + i, entry.val + i);
xas_store(&xas, page);
xas_next(&xas);
}
+ address_space->nrexceptional -= nr_shadows;
address_space->nrpages += nr;
__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr);
ADD_CACHE_INFO(add_total, nr);
@@ -155,10 +179,11 @@ unlock:
* This must be called only on pages that have
* been verified to be in the swap cache.
*/
-void __delete_from_swap_cache(struct page *page, swp_entry_t entry)
+void __delete_from_swap_cache(struct page *page,
+ swp_entry_t entry, void *shadow)
{
struct address_space *address_space = swap_address_space(entry);
- int i, nr = hpage_nr_pages(page);
+ int i, nr = thp_nr_pages(page);
pgoff_t idx = swp_offset(entry);
XA_STATE(xas, &address_space->i_pages, idx);
@@ -167,12 +192,14 @@ void __delete_from_swap_cache(struct page *page, swp_entry_t entry)
VM_BUG_ON_PAGE(PageWriteback(page), page);
for (i = 0; i < nr; i++) {
- void *entry = xas_store(&xas, NULL);
+ void *entry = xas_store(&xas, shadow);
VM_BUG_ON_PAGE(entry != page, entry);
set_page_private(page + i, 0);
xas_next(&xas);
}
ClearPageSwapCache(page);
+ if (shadow)
+ address_space->nrexceptional += nr;
address_space->nrpages -= nr;
__mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, -nr);
ADD_CACHE_INFO(del_total, nr);
@@ -209,7 +236,7 @@ int add_to_swap(struct page *page)
* Add it to the swap cache.
*/
err = add_to_swap_cache(page, entry,
- __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN);
+ __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN, NULL);
if (err)
/*
* add_to_swap_cache() doesn't return -EEXIST, so we can safely
@@ -247,11 +274,42 @@ void delete_from_swap_cache(struct page *page)
struct address_space *address_space = swap_address_space(entry);
xa_lock_irq(&address_space->i_pages);
- __delete_from_swap_cache(page, entry);
+ __delete_from_swap_cache(page, entry, NULL);
xa_unlock_irq(&address_space->i_pages);
put_swap_page(page, entry);
- page_ref_sub(page, hpage_nr_pages(page));
+ page_ref_sub(page, thp_nr_pages(page));
+}
+
+void clear_shadow_from_swap_cache(int type, unsigned long begin,
+ unsigned long end)
+{
+ unsigned long curr = begin;
+ void *old;
+
+ for (;;) {
+ unsigned long nr_shadows = 0;
+ swp_entry_t entry = swp_entry(type, curr);
+ struct address_space *address_space = swap_address_space(entry);
+ XA_STATE(xas, &address_space->i_pages, curr);
+
+ xa_lock_irq(&address_space->i_pages);
+ xas_for_each(&xas, old, end) {
+ if (!xa_is_value(old))
+ continue;
+ xas_store(&xas, NULL);
+ nr_shadows++;
+ }
+ address_space->nrexceptional -= nr_shadows;
+ xa_unlock_irq(&address_space->i_pages);
+
+ /* search the next swapcache until we meet end */
+ curr >>= SWAP_ADDRESS_SPACE_SHIFT;
+ curr++;
+ curr <<= SWAP_ADDRESS_SPACE_SHIFT;
+ if (curr > end)
+ break;
+ }
}
/*
@@ -360,12 +418,14 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
struct vm_area_struct *vma, unsigned long addr,
bool *new_page_allocated)
{
- struct page *found_page = NULL, *new_page = NULL;
struct swap_info_struct *si;
- int err;
+ struct page *page;
+ void *shadow = NULL;
+
*new_page_allocated = false;
- do {
+ for (;;) {
+ int err;
/*
* First check the swap cache. Since this is normally
* called after lookup_swap_cache() failed, re-calling
@@ -373,12 +433,12 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
*/
si = get_swap_device(entry);
if (!si)
- break;
- found_page = find_get_page(swap_address_space(entry),
- swp_offset(entry));
+ return NULL;
+ page = find_get_page(swap_address_space(entry),
+ swp_offset(entry));
put_swap_device(si);
- if (found_page)
- break;
+ if (page)
+ return page;
/*
* Just skip read ahead for unused swap slot.
@@ -389,54 +449,69 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
* else swap_off will be aborted if we return NULL.
*/
if (!__swp_swapcount(entry) && swap_slot_cache_enabled)
- break;
+ return NULL;
/*
- * Get a new page to read into from swap.
+ * Get a new page to read into from swap. Allocate it now,
+ * before marking swap_map SWAP_HAS_CACHE, when -EEXIST will
+ * cause any racers to loop around until we add it to cache.
*/
- if (!new_page) {
- new_page = alloc_page_vma(gfp_mask, vma, addr);
- if (!new_page)
- break; /* Out of memory */
- }
+ page = alloc_page_vma(gfp_mask, vma, addr);
+ if (!page)
+ return NULL;
/*
* Swap entry may have been freed since our caller observed it.
*/
err = swapcache_prepare(entry);
- if (err == -EEXIST) {
- /*
- * We might race against get_swap_page() and stumble
- * across a SWAP_HAS_CACHE swap_map entry whose page
- * has not been brought into the swapcache yet.
- */
- cond_resched();
- continue;
- } else if (err) /* swp entry is obsolete ? */
+ if (!err)
break;
- /* May fail (-ENOMEM) if XArray node allocation failed. */
- __SetPageLocked(new_page);
- __SetPageSwapBacked(new_page);
- err = add_to_swap_cache(new_page, entry, gfp_mask & GFP_KERNEL);
- if (likely(!err)) {
- /* Initiate read into locked page */
- SetPageWorkingset(new_page);
- lru_cache_add_anon(new_page);
- *new_page_allocated = true;
- return new_page;
- }
- __ClearPageLocked(new_page);
+ put_page(page);
+ if (err != -EEXIST)
+ return NULL;
+
/*
- * add_to_swap_cache() doesn't return -EEXIST, so we can safely
- * clear SWAP_HAS_CACHE flag.
+ * We might race against __delete_from_swap_cache(), and
+ * stumble across a swap_map entry whose SWAP_HAS_CACHE
+ * has not yet been cleared. Or race against another
+ * __read_swap_cache_async(), which has set SWAP_HAS_CACHE
+ * in swap_map, but not yet added its page to swap cache.
*/
- put_swap_page(new_page, entry);
- } while (err != -ENOMEM);
+ cond_resched();
+ }
- if (new_page)
- put_page(new_page);
- return found_page;
+ /*
+ * The swap entry is ours to swap in. Prepare the new page.
+ */
+
+ __SetPageLocked(page);
+ __SetPageSwapBacked(page);
+
+ /* May fail (-ENOMEM) if XArray node allocation failed. */
+ if (add_to_swap_cache(page, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) {
+ put_swap_page(page, entry);
+ goto fail_unlock;
+ }
+
+ if (mem_cgroup_charge(page, NULL, gfp_mask)) {
+ delete_from_swap_cache(page);
+ goto fail_unlock;
+ }
+
+ if (shadow)
+ workingset_refault(page, shadow);
+
+ /* Caller will initiate read into locked page */
+ SetPageWorkingset(page);
+ lru_cache_add(page);
+ *new_page_allocated = true;
+ return page;
+
+fail_unlock:
+ unlock_page(page);
+ put_page(page);
+ return NULL;
}
/*
@@ -509,10 +584,11 @@ static unsigned long swapin_nr_pages(unsigned long offset)
return 1;
hits = atomic_xchg(&swapin_readahead_hits, 0);
- pages = __swapin_nr_pages(prev_offset, offset, hits, max_pages,
+ pages = __swapin_nr_pages(READ_ONCE(prev_offset), offset, hits,
+ max_pages,
atomic_read(&last_readahead_pages));
if (!hits)
- prev_offset = offset;
+ WRITE_ONCE(prev_offset, offset);
atomic_set(&last_readahead_pages, pages);
return pages;
@@ -534,7 +610,7 @@ static unsigned long swapin_nr_pages(unsigned long offset)
* This has been extended to use the NUMA policies from the mm triggering
* the readahead.
*
- * Caller must hold read mmap_sem if vmf->vma is not NULL.
+ * Caller must hold read mmap_lock if vmf->vma is not NULL.
*/
struct page *swap_cluster_readahead(swp_entry_t entry, gfp_t gfp_mask,
struct vm_fault *vmf)
@@ -707,7 +783,7 @@ static void swap_ra_info(struct vm_fault *vmf,
/**
* swap_vma_readahead - swap in pages in hope we need them soon
- * @entry: swap entry of this memory
+ * @fentry: swap entry of this memory
* @gfp_mask: memory allocation flags
* @vmf: fault information
*
@@ -716,7 +792,7 @@ static void swap_ra_info(struct vm_fault *vmf,
* Primitive swap readahead code. We simply read in a few pages whoes
* virtual addresses are around the fault address in the same vma.
*
- * Caller must hold read mmap_sem if vmf->vma is not NULL.
+ * Caller must hold read mmap_lock if vmf->vma is not NULL.
*
*/
static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 5871a2aa86a5..debc94155f74 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -40,7 +40,6 @@
#include <linux/swap_slots.h>
#include <linux/sort.h>
-#include <asm/pgtable.h>
#include <asm/tlbflush.h>
#include <linux/swapops.h>
#include <linux/swap_cgroup.h>
@@ -601,7 +600,6 @@ static bool scan_swap_map_try_ssd_cluster(struct swap_info_struct *si,
{
struct percpu_cluster *cluster;
struct swap_cluster_info *ci;
- bool found_free;
unsigned long tmp, max;
new_cluster:
@@ -614,17 +612,17 @@ new_cluster:
} else if (!cluster_list_empty(&si->discard_clusters)) {
/*
* we don't have free cluster but have some clusters in
- * discarding, do discard now and reclaim them
+ * discarding, do discard now and reclaim them, then
+ * reread cluster_next_cpu since we dropped si->lock
*/
swap_do_scheduled_discard(si);
- *scan_base = *offset = si->cluster_next;
+ *scan_base = this_cpu_read(*si->cluster_next_cpu);
+ *offset = *scan_base;
goto new_cluster;
} else
return false;
}
- found_free = false;
-
/*
* Other CPUs can use our cluster if they can't find a free cluster,
* check if there is still free entry in the cluster
@@ -632,27 +630,23 @@ new_cluster:
tmp = cluster->next;
max = min_t(unsigned long, si->max,
(cluster_next(&cluster->index) + 1) * SWAPFILE_CLUSTER);
- if (tmp >= max) {
- cluster_set_null(&cluster->index);
- goto new_cluster;
- }
- ci = lock_cluster(si, tmp);
- while (tmp < max) {
- if (!si->swap_map[tmp]) {
- found_free = true;
- break;
+ if (tmp < max) {
+ ci = lock_cluster(si, tmp);
+ while (tmp < max) {
+ if (!si->swap_map[tmp])
+ break;
+ tmp++;
}
- tmp++;
+ unlock_cluster(ci);
}
- unlock_cluster(ci);
- if (!found_free) {
+ if (tmp >= max) {
cluster_set_null(&cluster->index);
goto new_cluster;
}
cluster->next = tmp + 1;
*offset = tmp;
*scan_base = tmp;
- return found_free;
+ return true;
}
static void __del_from_avail_list(struct swap_info_struct *p)
@@ -678,7 +672,7 @@ static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset,
if (offset == si->lowest_bit)
si->lowest_bit += nr_entries;
if (end == si->highest_bit)
- si->highest_bit -= nr_entries;
+ WRITE_ONCE(si->highest_bit, si->highest_bit - nr_entries);
si->inuse_pages += nr_entries;
if (si->inuse_pages == si->pages) {
si->lowest_bit = si->max;
@@ -702,6 +696,7 @@ static void add_to_avail_list(struct swap_info_struct *p)
static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
unsigned int nr_entries)
{
+ unsigned long begin = offset;
unsigned long end = offset + nr_entries - 1;
void (*swap_slot_free_notify)(struct block_device *, unsigned long);
@@ -710,7 +705,7 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
if (end > si->highest_bit) {
bool was_full = !si->highest_bit;
- si->highest_bit = end;
+ WRITE_ONCE(si->highest_bit, end);
if (was_full && (si->flags & SWP_WRITEOK))
add_to_avail_list(si);
}
@@ -727,6 +722,35 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset,
swap_slot_free_notify(si->bdev, offset);
offset++;
}
+ clear_shadow_from_swap_cache(si->type, begin, end);
+}
+
+static void set_cluster_next(struct swap_info_struct *si, unsigned long next)
+{
+ unsigned long prev;
+
+ if (!(si->flags & SWP_SOLIDSTATE)) {
+ si->cluster_next = next;
+ return;
+ }
+
+ prev = this_cpu_read(*si->cluster_next_cpu);
+ /*
+ * Cross the swap address space size aligned trunk, choose
+ * another trunk randomly to avoid lock contention on swap
+ * address space if possible.
+ */
+ if ((prev >> SWAP_ADDRESS_SPACE_SHIFT) !=
+ (next >> SWAP_ADDRESS_SPACE_SHIFT)) {
+ /* No free swap slots available */
+ if (si->highest_bit <= si->lowest_bit)
+ return;
+ next = si->lowest_bit +
+ prandom_u32_max(si->highest_bit - si->lowest_bit + 1);
+ next = ALIGN_DOWN(next, SWAP_ADDRESS_SPACE_PAGES);
+ next = max_t(unsigned int, next, si->lowest_bit);
+ }
+ this_cpu_write(*si->cluster_next_cpu, next);
}
static int scan_swap_map_slots(struct swap_info_struct *si,
@@ -739,9 +763,7 @@ static int scan_swap_map_slots(struct swap_info_struct *si,
unsigned long last_in_cluster = 0;
int latency_ration = LATENCY_LIMIT;
int n_ret = 0;
-
- if (nr > SWAP_BATCH)
- nr = SWAP_BATCH;
+ bool scanned_many = false;
/*
* We try to cluster swap pages by allocating them sequentially
@@ -755,17 +777,22 @@ static int scan_swap_map_slots(struct swap_info_struct *si,
*/
si->flags += SWP_SCANNING;
- scan_base = offset = si->cluster_next;
+ /*
+ * Use percpu scan base for SSD to reduce lock contention on
+ * cluster and swap cache. For HDD, sequential access is more
+ * important.
+ */
+ if (si->flags & SWP_SOLIDSTATE)
+ scan_base = this_cpu_read(*si->cluster_next_cpu);
+ else
+ scan_base = si->cluster_next;
+ offset = scan_base;
/* SSD algorithm */
if (si->cluster_info) {
- if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
- goto checks;
- else
+ if (!scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
goto scan;
- }
-
- if (unlikely(!si->cluster_nr--)) {
+ } else if (unlikely(!si->cluster_nr--)) {
if (si->pages - si->inuse_pages < SWAPFILE_CLUSTER) {
si->cluster_nr = SWAPFILE_CLUSTER - 1;
goto checks;
@@ -843,12 +870,11 @@ checks:
else
goto done;
}
- si->swap_map[offset] = usage;
+ WRITE_ONCE(si->swap_map[offset], usage);
inc_cluster_info_page(si, si->cluster_info, offset);
unlock_cluster(ci);
swap_range_alloc(si, offset, 1);
- si->cluster_next = offset + 1;
slots[n_ret++] = swp_entry(si->type, offset);
/* got enough slots or reach max slots? */
@@ -871,51 +897,69 @@ checks:
if (si->cluster_info) {
if (scan_swap_map_try_ssd_cluster(si, &offset, &scan_base))
goto checks;
- else
- goto done;
- }
- /* non-ssd case */
- ++offset;
-
- /* non-ssd case, still more slots in cluster? */
- if (si->cluster_nr && !si->swap_map[offset]) {
+ } else if (si->cluster_nr && !si->swap_map[++offset]) {
+ /* non-ssd case, still more slots in cluster? */
--si->cluster_nr;
goto checks;
}
+ /*
+ * Even if there's no free clusters available (fragmented),
+ * try to scan a little more quickly with lock held unless we
+ * have scanned too many slots already.
+ */
+ if (!scanned_many) {
+ unsigned long scan_limit;
+
+ if (offset < scan_base)
+ scan_limit = scan_base;
+ else
+ scan_limit = si->highest_bit;
+ for (; offset <= scan_limit && --latency_ration > 0;
+ offset++) {
+ if (!si->swap_map[offset])
+ goto checks;
+ }
+ }
+
done:
+ set_cluster_next(si, offset + 1);
si->flags -= SWP_SCANNING;
return n_ret;
scan:
spin_unlock(&si->lock);
- while (++offset <= si->highest_bit) {
- if (!si->swap_map[offset]) {
+ while (++offset <= READ_ONCE(si->highest_bit)) {
+ if (data_race(!si->swap_map[offset])) {
spin_lock(&si->lock);
goto checks;
}
- if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
+ if (vm_swap_full() &&
+ READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) {
spin_lock(&si->lock);
goto checks;
}
if (unlikely(--latency_ration < 0)) {
cond_resched();
latency_ration = LATENCY_LIMIT;
+ scanned_many = true;
}
}
offset = si->lowest_bit;
while (offset < scan_base) {
- if (!si->swap_map[offset]) {
+ if (data_race(!si->swap_map[offset])) {
spin_lock(&si->lock);
goto checks;
}
- if (vm_swap_full() && si->swap_map[offset] == SWAP_HAS_CACHE) {
+ if (vm_swap_full() &&
+ READ_ONCE(si->swap_map[offset]) == SWAP_HAS_CACHE) {
spin_lock(&si->lock);
goto checks;
}
if (unlikely(--latency_ration < 0)) {
cond_resched();
latency_ration = LATENCY_LIMIT;
+ scanned_many = true;
}
offset++;
}
@@ -1004,11 +1048,7 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
if (avail_pgs <= 0)
goto noswap;
- if (n_goal > SWAP_BATCH)
- n_goal = SWAP_BATCH;
-
- if (n_goal > avail_pgs)
- n_goal = avail_pgs;
+ n_goal = min3((long)n_goal, (long)SWAP_BATCH, avail_pgs);
atomic_long_sub(n_goal * size, &nr_swap_pages);
@@ -1038,7 +1078,7 @@ start_over:
goto nextsi;
}
if (size == SWAPFILE_CLUSTER) {
- if (!(si->flags & SWP_FS))
+ if (si->flags & SWP_BLKDEV)
n_ret = swap_alloc_cluster(si, swp_entries);
} else
n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE,
@@ -1111,7 +1151,7 @@ static struct swap_info_struct *__swap_info_get(swp_entry_t entry)
p = swp_swap_info(entry);
if (!p)
goto bad_nofile;
- if (!(p->flags & SWP_USED))
+ if (data_race(!(p->flags & SWP_USED)))
goto bad_device;
offset = swp_offset(entry);
if (offset >= p->max)
@@ -1137,7 +1177,7 @@ static struct swap_info_struct *_swap_info_get(swp_entry_t entry)
p = __swap_info_get(entry);
if (!p)
goto out;
- if (!p->swap_map[swp_offset(entry)])
+ if (data_race(!p->swap_map[swp_offset(entry)]))
goto bad_free;
return p;
@@ -1206,7 +1246,10 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *p,
}
usage = count | has_cache;
- p->swap_map[offset] = usage ? : SWAP_HAS_CACHE;
+ if (usage)
+ WRITE_ONCE(p->swap_map[offset], usage);
+ else
+ WRITE_ONCE(p->swap_map[offset], SWAP_HAS_CACHE);
return usage;
}
@@ -1258,7 +1301,7 @@ struct swap_info_struct *get_swap_device(swp_entry_t entry)
goto bad_nofile;
rcu_read_lock();
- if (!(si->flags & SWP_VALID))
+ if (data_race(!(si->flags & SWP_VALID)))
goto unlock_out;
offset = swp_offset(entry);
if (offset >= si->max)
@@ -1275,13 +1318,14 @@ unlock_out:
}
static unsigned char __swap_entry_free(struct swap_info_struct *p,
- swp_entry_t entry, unsigned char usage)
+ swp_entry_t entry)
{
struct swap_cluster_info *ci;
unsigned long offset = swp_offset(entry);
+ unsigned char usage;
ci = lock_cluster_or_swap_info(p, offset);
- usage = __swap_entry_free_locked(p, offset, usage);
+ usage = __swap_entry_free_locked(p, offset, 1);
unlock_cluster_or_swap_info(p, ci);
if (!usage)
free_swap_slot(entry);
@@ -1316,7 +1360,7 @@ void swap_free(swp_entry_t entry)
p = _swap_info_get(entry);
if (p)
- __swap_entry_free(p, entry, 1);
+ __swap_entry_free(p, entry);
}
/*
@@ -1331,7 +1375,7 @@ void put_swap_page(struct page *page, swp_entry_t entry)
unsigned char *map;
unsigned int i, free_entries = 0;
unsigned char val;
- int size = swap_entry_size(hpage_nr_pages(page));
+ int size = swap_entry_size(thp_nr_pages(page));
si = _swap_info_get(entry);
if (!si)
@@ -1739,7 +1783,7 @@ int free_swap_and_cache(swp_entry_t entry)
p = _swap_info_get(entry);
if (p) {
- count = __swap_entry_free(p, entry, 1);
+ count = __swap_entry_free(p, entry);
if (count == SWAP_HAS_CACHE &&
!swap_page_trans_huge_swapped(p, entry))
__try_to_reclaim_swap(p, swp_offset(entry),
@@ -1854,7 +1898,6 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
unsigned long addr, swp_entry_t entry, struct page *page)
{
struct page *swapcache;
- struct mem_cgroup *memcg;
spinlock_t *ptl;
pte_t *pte;
int ret = 1;
@@ -1864,15 +1907,8 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
if (unlikely(!page))
return -ENOMEM;
- if (mem_cgroup_try_charge(page, vma->vm_mm, GFP_KERNEL,
- &memcg, false)) {
- ret = -ENOMEM;
- goto out_nolock;
- }
-
pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
if (unlikely(!pte_same_as_swp(*pte, swp_entry_to_pte(entry)))) {
- mem_cgroup_cancel_charge(page, memcg, false);
ret = 0;
goto out;
}
@@ -1884,11 +1920,9 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
pte_mkold(mk_pte(page, vma->vm_page_prot)));
if (page == swapcache) {
page_add_anon_rmap(page, vma, addr, false);
- mem_cgroup_commit_charge(page, memcg, true, false);
} else { /* ksm created a completely new copy */
page_add_new_anon_rmap(page, vma, addr, false);
- mem_cgroup_commit_charge(page, memcg, false, false);
- lru_cache_add_active_or_unevictable(page, vma);
+ lru_cache_add_inactive_or_unevictable(page, vma);
}
swap_free(entry);
/*
@@ -1898,7 +1932,6 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd,
activate_page(page);
out:
pte_unmap_unlock(pte, ptl);
-out_nolock:
if (page != swapcache) {
unlock_page(page);
put_page(page);
@@ -1937,10 +1970,14 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
pte_unmap(pte);
swap_map = &si->swap_map[offset];
- vmf.vma = vma;
- vmf.address = addr;
- vmf.pmd = pmd;
- page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, &vmf);
+ page = lookup_swap_cache(entry, vma, addr);
+ if (!page) {
+ vmf.vma = vma;
+ vmf.address = addr;
+ vmf.pmd = pmd;
+ page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE,
+ &vmf);
+ }
if (!page) {
if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD)
goto try_next;
@@ -2070,7 +2107,7 @@ static int unuse_mm(struct mm_struct *mm, unsigned int type,
struct vm_area_struct *vma;
int ret = 0;
- down_read(&mm->mmap_sem);
+ mmap_read_lock(mm);
for (vma = mm->mmap; vma; vma = vma->vm_next) {
if (vma->anon_vma) {
ret = unuse_vma(vma, type, frontswap,
@@ -2080,7 +2117,7 @@ static int unuse_mm(struct mm_struct *mm, unsigned int type,
}
cond_resched();
}
- up_read(&mm->mmap_sem);
+ mmap_read_unlock(mm);
return ret;
}
@@ -2650,6 +2687,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
mutex_unlock(&swapon_mutex);
free_percpu(p->percpu_cluster);
p->percpu_cluster = NULL;
+ free_percpu(p->cluster_next_cpu);
+ p->cluster_next_cpu = NULL;
vfree(swap_map);
kvfree(cluster_info);
kvfree(frontswap_map);
@@ -2757,20 +2796,24 @@ static int swap_show(struct seq_file *swap, void *v)
struct swap_info_struct *si = v;
struct file *file;
int len;
+ unsigned int bytes, inuse;
if (si == SEQ_START_TOKEN) {
- seq_puts(swap,"Filename\t\t\t\tType\t\tSize\tUsed\tPriority\n");
+ seq_puts(swap,"Filename\t\t\t\tType\t\tSize\t\tUsed\t\tPriority\n");
return 0;
}
+ bytes = si->pages << (PAGE_SHIFT - 10);
+ inuse = si->inuse_pages << (PAGE_SHIFT - 10);
+
file = si->swap_file;
len = seq_file_path(swap, file, " \t\n\\");
- seq_printf(swap, "%*s%s\t%u\t%u\t%d\n",
+ seq_printf(swap, "%*s%s\t%u\t%s%u\t%s%d\n",
len < 40 ? 40 - len : 1, " ",
S_ISBLK(file_inode(file)->i_mode) ?
"partition" : "file\t",
- si->pages << (PAGE_SHIFT - 10),
- si->inuse_pages << (PAGE_SHIFT - 10),
+ bytes, bytes < 10000000 ? "\t" : "",
+ inuse, inuse < 10000000 ? "\t" : "",
si->prio);
return 0;
}
@@ -2893,7 +2936,7 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode)
* write only restriction. Hence zoned block devices are not
* suitable for swapping. Disallow them here.
*/
- if (blk_queue_is_zoned(p->bdev->bd_queue))
+ if (blk_queue_is_zoned(p->bdev->bd_disk->queue))
return -EINVAL;
p->flags |= SWP_BLKDEV;
} else if (S_ISREG(inode->i_mode)) {
@@ -3202,11 +3245,19 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
unsigned long ci, nr_cluster;
p->flags |= SWP_SOLIDSTATE;
+ p->cluster_next_cpu = alloc_percpu(unsigned int);
+ if (!p->cluster_next_cpu) {
+ error = -ENOMEM;
+ goto bad_swap_unlock_inode;
+ }
/*
* select a random position to start with to help wear leveling
* SSD
*/
- p->cluster_next = 1 + (prandom_u32() % p->highest_bit);
+ for_each_possible_cpu(cpu) {
+ per_cpu(*p->cluster_next_cpu, cpu) =
+ 1 + prandom_u32_max(p->highest_bit);
+ }
nr_cluster = DIV_ROUND_UP(maxpages, SWAPFILE_CLUSTER);
cluster_info = kvcalloc(nr_cluster, sizeof(*cluster_info),
@@ -3322,6 +3373,8 @@ bad_swap_unlock_inode:
bad_swap:
free_percpu(p->percpu_cluster);
p->percpu_cluster = NULL;
+ free_percpu(p->cluster_next_cpu);
+ p->cluster_next_cpu = NULL;
if (inode && S_ISBLK(inode->i_mode) && p->bdev) {
set_blocksize(p->bdev, p->old_block_size);
blkdev_put(p->bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL);
@@ -3436,7 +3489,7 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
} else
err = -ENOENT; /* unused swap entry */
- p->swap_map[offset] = count | has_cache;
+ WRITE_ONCE(p->swap_map[offset], count | has_cache);
unlock_out:
unlock_cluster_or_swap_info(p, ci);
@@ -3654,7 +3707,7 @@ static bool swap_count_continued(struct swap_info_struct *si,
spin_lock(&si->cont_lock);
offset &= ~PAGE_MASK;
- page = list_entry(head->lru.next, struct page, lru);
+ page = list_next_entry(head, lru);
map = kmap_atomic(page) + offset;
if (count == SWAP_MAP_MAX) /* initial increment from swap_map */
@@ -3666,13 +3719,13 @@ static bool swap_count_continued(struct swap_info_struct *si,
*/
while (*map == (SWAP_CONT_MAX | COUNT_CONTINUED)) {
kunmap_atomic(map);
- page = list_entry(page->lru.next, struct page, lru);
+ page = list_next_entry(page, lru);
BUG_ON(page == head);
map = kmap_atomic(page) + offset;
}
if (*map == SWAP_CONT_MAX) {
kunmap_atomic(map);
- page = list_entry(page->lru.next, struct page, lru);
+ page = list_next_entry(page, lru);
if (page == head) {
ret = false; /* add count continuation */
goto out;
@@ -3682,12 +3735,10 @@ init_map: *map = 0; /* we didn't zero the page */
}
*map += 1;
kunmap_atomic(map);
- page = list_entry(page->lru.prev, struct page, lru);
- while (page != head) {
+ while ((page = list_prev_entry(page, lru)) != head) {
map = kmap_atomic(page) + offset;
*map = COUNT_CONTINUED;
kunmap_atomic(map);
- page = list_entry(page->lru.prev, struct page, lru);
}
ret = true; /* incremented */
@@ -3698,7 +3749,7 @@ init_map: *map = 0; /* we didn't zero the page */
BUG_ON(count != COUNT_CONTINUED);
while (*map == COUNT_CONTINUED) {
kunmap_atomic(map);
- page = list_entry(page->lru.next, struct page, lru);
+ page = list_next_entry(page, lru);
BUG_ON(page == head);
map = kmap_atomic(page) + offset;
}
@@ -3707,13 +3758,11 @@ init_map: *map = 0; /* we didn't zero the page */
if (*map == 0)
count = 0;
kunmap_atomic(map);
- page = list_entry(page->lru.prev, struct page, lru);
- while (page != head) {
+ while ((page = list_prev_entry(page, lru)) != head) {
map = kmap_atomic(page) + offset;
*map = SWAP_CONT_MAX | count;
count = COUNT_CONTINUED;
kunmap_atomic(map);
- page = list_entry(page->lru.prev, struct page, lru);
}
ret = count == COUNT_CONTINUED;
}
@@ -3745,11 +3794,12 @@ static void free_swap_count_continuations(struct swap_info_struct *si)
}
#if defined(CONFIG_MEMCG) && defined(CONFIG_BLK_CGROUP)
-void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node,
- gfp_t gfp_mask)
+void cgroup_throttle_swaprate(struct page *page, gfp_t gfp_mask)
{
struct swap_info_struct *si, *next;
- if (!(gfp_mask & __GFP_IO) || !memcg)
+ int nid = page_to_nid(page);
+
+ if (!(gfp_mask & __GFP_IO))
return;
if (!blk_cgroup_congested())
@@ -3763,11 +3813,10 @@ void mem_cgroup_throttle_swaprate(struct mem_cgroup *memcg, int node,
return;
spin_lock(&swap_avail_lock);
- plist_for_each_entry_safe(si, next, &swap_avail_heads[node],
- avail_lists[node]) {
+ plist_for_each_entry_safe(si, next, &swap_avail_heads[nid],
+ avail_lists[nid]) {
if (si->bdev) {
- blkcg_schedule_throttle(bdev_get_queue(si->bdev),
- true);
+ blkcg_schedule_throttle(bdev_get_queue(si->bdev), true);
break;
}
}
diff --git a/mm/usercopy.c b/mm/usercopy.c
index 660717a1ea5c..b3de3c4eefba 100644
--- a/mm/usercopy.c
+++ b/mm/usercopy.c
@@ -43,7 +43,7 @@ static noinline int check_stack_object(const void *obj, unsigned long len)
/*
* Reject: object partially overlaps the stack (passing the
- * the check above means at least one end is within the stack,
+ * check above means at least one end is within the stack,
* so if this check fails, the other end is outside the stack).
*/
if (obj < stack || stackend < obj + len)
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index 512576e171ce..9a3d451402d7 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -56,7 +56,6 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
struct page **pagep,
bool wp_copy)
{
- struct mem_cgroup *memcg;
pte_t _dst_pte, *dst_pte;
spinlock_t *ptl;
void *page_kaddr;
@@ -77,7 +76,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
PAGE_SIZE);
kunmap_atomic(page_kaddr);
- /* fallback to copy_from_user outside mmap_sem */
+ /* fallback to copy_from_user outside mmap_lock */
if (unlikely(ret)) {
ret = -ENOENT;
*pagep = page;
@@ -97,7 +96,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
__SetPageUptodate(page);
ret = -ENOMEM;
- if (mem_cgroup_try_charge(page, dst_mm, GFP_KERNEL, &memcg, false))
+ if (mem_cgroup_charge(page, dst_mm, GFP_KERNEL))
goto out_release;
_dst_pte = pte_mkdirty(mk_pte(page, dst_vma->vm_page_prot));
@@ -124,8 +123,7 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm,
inc_mm_counter(dst_mm, MM_ANONPAGES);
page_add_new_anon_rmap(page, dst_vma, dst_addr, false);
- mem_cgroup_commit_charge(page, memcg, false, false);
- lru_cache_add_active_or_unevictable(page, dst_vma);
+ lru_cache_add_inactive_or_unevictable(page, dst_vma);
set_pte_at(dst_mm, dst_addr, dst_pte, _dst_pte);
@@ -138,7 +136,6 @@ out:
return ret;
out_release_uncharge_unlock:
pte_unmap_unlock(dst_pte, ptl);
- mem_cgroup_cancel_charge(page, memcg, false);
out_release:
put_page(page);
goto out;
@@ -203,7 +200,7 @@ static pmd_t *mm_alloc_pmd(struct mm_struct *mm, unsigned long address)
#ifdef CONFIG_HUGETLB_PAGE
/*
* __mcopy_atomic processing for HUGETLB vmas. Note that this routine is
- * called with mmap_sem held, it will release mmap_sem before returning.
+ * called with mmap_lock held, it will release mmap_lock before returning.
*/
static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
struct vm_area_struct *dst_vma,
@@ -231,7 +228,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
* feature is not supported.
*/
if (zeropage) {
- up_read(&dst_mm->mmap_sem);
+ mmap_read_unlock(dst_mm);
return -EINVAL;
}
@@ -250,7 +247,7 @@ static __always_inline ssize_t __mcopy_atomic_hugetlb(struct mm_struct *dst_mm,
retry:
/*
- * On routine entry dst_vma is set. If we had to drop mmap_sem and
+ * On routine entry dst_vma is set. If we had to drop mmap_lock and
* retry, dst_vma will be set to NULL and we must lookup again.
*/
if (!dst_vma) {
@@ -318,7 +315,7 @@ retry:
cond_resched();
if (unlikely(err == -ENOENT)) {
- up_read(&dst_mm->mmap_sem);
+ mmap_read_unlock(dst_mm);
BUG_ON(!page);
err = copy_huge_page_from_user(page,
@@ -329,7 +326,7 @@ retry:
err = -EFAULT;
goto out;
}
- down_read(&dst_mm->mmap_sem);
+ mmap_read_lock(dst_mm);
dst_vma = NULL;
goto retry;
@@ -349,7 +346,7 @@ retry:
}
out_unlock:
- up_read(&dst_mm->mmap_sem);
+ mmap_read_unlock(dst_mm);
out:
if (page) {
/*
@@ -360,7 +357,7 @@ out:
* private and shared mappings. See the routine
* restore_reserve_on_error for details. Unfortunately, we
* can not call restore_reserve_on_error now as it would
- * require holding mmap_sem.
+ * require holding mmap_lock.
*
* If a reservation for the page existed in the reservation
* map of a private mapping, the map was modified to indicate
@@ -488,7 +485,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
copied = 0;
page = NULL;
retry:
- down_read(&dst_mm->mmap_sem);
+ mmap_read_lock(dst_mm);
/*
* If memory mappings are changing because of non-cooperative
@@ -586,7 +583,7 @@ retry:
if (unlikely(err == -ENOENT)) {
void *page_kaddr;
- up_read(&dst_mm->mmap_sem);
+ mmap_read_unlock(dst_mm);
BUG_ON(!page);
page_kaddr = kmap(page);
@@ -615,7 +612,7 @@ retry:
}
out_unlock:
- up_read(&dst_mm->mmap_sem);
+ mmap_read_unlock(dst_mm);
out:
if (page)
put_page(page);
@@ -655,7 +652,7 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
/* Does the address range wrap, or is the span zero-sized? */
BUG_ON(start + len <= start);
- down_read(&dst_mm->mmap_sem);
+ mmap_read_lock(dst_mm);
/*
* If memory mappings are changing because of non-cooperative
@@ -689,6 +686,6 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start,
err = 0;
out_unlock:
- up_read(&dst_mm->mmap_sem);
+ mmap_read_unlock(dst_mm);
return err;
}
diff --git a/mm/util.c b/mm/util.c
index 988d11e6c17c..5ef378a2a038 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -425,7 +425,7 @@ void arch_pick_mmap_layout(struct mm_struct *mm, struct rlimit *rlim_stack)
* @bypass_rlim: %true if checking RLIMIT_MEMLOCK should be skipped
*
* Assumes @task and @mm are valid (i.e. at least one reference on each), and
- * that mmap_sem is held as writer.
+ * that mmap_lock is held as writer.
*
* Return:
* * 0 on success
@@ -437,7 +437,7 @@ int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
unsigned long locked_vm, limit;
int ret = 0;
- lockdep_assert_held_write(&mm->mmap_sem);
+ mmap_assert_write_locked(mm);
locked_vm = mm->locked_vm;
if (inc) {
@@ -481,10 +481,10 @@ int account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc)
if (pages == 0 || !mm)
return 0;
- down_write(&mm->mmap_sem);
+ mmap_write_lock(mm);
ret = __account_locked_vm(mm, pages, inc, current,
capable(CAP_IPC_LOCK));
- up_write(&mm->mmap_sem);
+ mmap_write_unlock(mm);
return ret;
}
@@ -501,11 +501,11 @@ unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
ret = security_mmap_file(file, prot, flag);
if (!ret) {
- if (down_write_killable(&mm->mmap_sem))
+ if (mmap_write_lock_killable(mm))
return -EINTR;
- ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff,
- &populate, &uf);
- up_write(&mm->mmap_sem);
+ ret = do_mmap(file, addr, len, prot, flag, pgoff, &populate,
+ &uf);
+ mmap_write_unlock(mm);
userfaultfd_unmap_complete(mm, &uf);
if (populate)
mm_populate(ret, populate);
@@ -580,7 +580,7 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node)
if (ret || size <= PAGE_SIZE)
return ret;
- return __vmalloc_node_flags_caller(size, node, flags,
+ return __vmalloc_node(size, 1, flags, node,
__builtin_return_address(0));
}
EXPORT_SYMBOL(kvmalloc_node);
@@ -604,6 +604,24 @@ void kvfree(const void *addr)
}
EXPORT_SYMBOL(kvfree);
+/**
+ * kvfree_sensitive - Free a data object containing sensitive information.
+ * @addr: address of the data object to be freed.
+ * @len: length of the data object.
+ *
+ * Use the special memzero_explicit() function to clear the content of a
+ * kvmalloc'ed object containing sensitive data to make sure that the
+ * compiler won't optimize out the data clearing.
+ */
+void kvfree_sensitive(const void *addr, size_t len)
+{
+ if (likely(!ZERO_OR_NULL_PTR(addr))) {
+ memzero_explicit((void *)addr, len);
+ kvfree(addr);
+ }
+}
+EXPORT_SYMBOL(kvfree_sensitive);
+
static inline void *__page_rmapping(struct page *page)
{
unsigned long mapping;
@@ -717,9 +735,8 @@ int sysctl_max_map_count __read_mostly = DEFAULT_MAX_MAP_COUNT;
unsigned long sysctl_user_reserve_kbytes __read_mostly = 1UL << 17; /* 128MB */
unsigned long sysctl_admin_reserve_kbytes __read_mostly = 1UL << 13; /* 8MB */
-int overcommit_ratio_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+int overcommit_ratio_handler(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
{
int ret;
@@ -729,9 +746,49 @@ int overcommit_ratio_handler(struct ctl_table *table, int write,
return ret;
}
-int overcommit_kbytes_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp,
- loff_t *ppos)
+static void sync_overcommit_as(struct work_struct *dummy)
+{
+ percpu_counter_sync(&vm_committed_as);
+}
+
+int overcommit_policy_handler(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table t;
+ int new_policy;
+ int ret;
+
+ /*
+ * The deviation of vm_committed_as could be big with loose policy
+ * like OVERCOMMIT_ALWAYS/OVERCOMMIT_GUESS. When changing policy to
+ * strict OVERCOMMIT_NEVER, we need to reduce the deviation to comply
+ * with the strict "NEVER", and to avoid possible race condition (even
+ * though user usually won't too frequently do the switching to policy
+ * OVERCOMMIT_NEVER), the switch is done in the following order:
+ * 1. changing the batch
+ * 2. sync percpu count on each CPU
+ * 3. switch the policy
+ */
+ if (write) {
+ t = *table;
+ t.data = &new_policy;
+ ret = proc_dointvec_minmax(&t, write, buffer, lenp, ppos);
+ if (ret)
+ return ret;
+
+ mm_compute_batch(new_policy);
+ if (new_policy == OVERCOMMIT_NEVER)
+ schedule_on_each_cpu(sync_overcommit_as);
+ sysctl_overcommit_memory = new_policy;
+ } else {
+ ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+ }
+
+ return ret;
+}
+
+int overcommit_kbytes_handler(struct ctl_table *table, int write, void *buffer,
+ size_t *lenp, loff_t *ppos)
{
int ret;
@@ -771,10 +828,15 @@ struct percpu_counter vm_committed_as ____cacheline_aligned_in_smp;
* balancing memory across competing virtual machines that are hosted.
* Several metrics drive this policy engine including the guest reported
* memory commitment.
+ *
+ * The time cost of this is very low for small platforms, and for a big
+ * platform like a 2S/36C/72T Skylake server, in the worst case where
+ * vm_committed_as's spinlock is under severe contention, the time cost
+ * could be about 30~40 microseconds.
*/
unsigned long vm_memory_committed(void)
{
- return percpu_counter_read_positive(&vm_committed_as);
+ return percpu_counter_sum_positive(&vm_committed_as);
}
EXPORT_SYMBOL_GPL(vm_memory_committed);
@@ -798,10 +860,6 @@ int __vm_enough_memory(struct mm_struct *mm, long pages, int cap_sys_admin)
{
long allowed;
- VM_WARN_ONCE(percpu_counter_read(&vm_committed_as) <
- -(s64)vm_committed_as_batch * num_online_cpus(),
- "memory commitment underflow");
-
vm_acct_memory(pages);
/*
diff --git a/mm/vmacache.c b/mm/vmacache.c
index cdc32a3b02fa..01a6e6688ec1 100644
--- a/mm/vmacache.c
+++ b/mm/vmacache.c
@@ -6,7 +6,6 @@
#include <linux/sched/task.h>
#include <linux/mm.h>
#include <linux/vmacache.h>
-#include <asm/pgtable.h>
/*
* Hash based on the pmd of addr if configured with MMU, which provides a good
@@ -25,8 +24,8 @@
* task's vmacache pertains to a different mm (ie, its own). There is
* nothing we can do here.
*
- * Also handle the case where a kernel thread has adopted this mm via use_mm().
- * That kernel thread's vmacache is not applicable to this mm.
+ * Also handle the case where a kernel thread has adopted this mm via
+ * kthread_use_mm(). That kernel thread's vmacache is not applicable to this mm.
*/
static inline bool vmacache_valid_mm(struct mm_struct *mm)
{
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 9a8227afa073..be4724b916b3 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -7,6 +7,7 @@
* SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian <tigran@veritas.com>, May 2000
* Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
* Numa awareness, Christoph Lameter, SGI, June 2005
+ * Improving global KVA allocator, Uladzislau Rezki, Sony, May 2019
*/
#include <linux/vmalloc.h>
@@ -25,7 +26,7 @@
#include <linux/list.h>
#include <linux/notifier.h>
#include <linux/rbtree.h>
-#include <linux/radix-tree.h>
+#include <linux/xarray.h>
#include <linux/rcupdate.h>
#include <linux/pfn.h>
#include <linux/kmemleak.h>
@@ -41,6 +42,7 @@
#include <asm/shmparam.h>
#include "internal.h"
+#include "pgalloc-track.h"
bool is_vmalloc_addr(const void *x)
{
@@ -69,7 +71,8 @@ static void free_work(struct work_struct *w)
/*** Page table manipulation functions ***/
-static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
+static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
+ pgtbl_mod_mask *mask)
{
pte_t *pte;
@@ -78,73 +81,119 @@ static void vunmap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end)
pte_t ptent = ptep_get_and_clear(&init_mm, addr, pte);
WARN_ON(!pte_none(ptent) && !pte_present(ptent));
} while (pte++, addr += PAGE_SIZE, addr != end);
+ *mask |= PGTBL_PTE_MODIFIED;
}
-static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end)
+static void vunmap_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
+ pgtbl_mod_mask *mask)
{
pmd_t *pmd;
unsigned long next;
+ int cleared;
pmd = pmd_offset(pud, addr);
do {
next = pmd_addr_end(addr, end);
- if (pmd_clear_huge(pmd))
+
+ cleared = pmd_clear_huge(pmd);
+ if (cleared || pmd_bad(*pmd))
+ *mask |= PGTBL_PMD_MODIFIED;
+
+ if (cleared)
continue;
if (pmd_none_or_clear_bad(pmd))
continue;
- vunmap_pte_range(pmd, addr, next);
+ vunmap_pte_range(pmd, addr, next, mask);
+
+ cond_resched();
} while (pmd++, addr = next, addr != end);
}
-static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end)
+static void vunmap_pud_range(p4d_t *p4d, unsigned long addr, unsigned long end,
+ pgtbl_mod_mask *mask)
{
pud_t *pud;
unsigned long next;
+ int cleared;
pud = pud_offset(p4d, addr);
do {
next = pud_addr_end(addr, end);
- if (pud_clear_huge(pud))
+
+ cleared = pud_clear_huge(pud);
+ if (cleared || pud_bad(*pud))
+ *mask |= PGTBL_PUD_MODIFIED;
+
+ if (cleared)
continue;
if (pud_none_or_clear_bad(pud))
continue;
- vunmap_pmd_range(pud, addr, next);
+ vunmap_pmd_range(pud, addr, next, mask);
} while (pud++, addr = next, addr != end);
}
-static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end)
+static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end,
+ pgtbl_mod_mask *mask)
{
p4d_t *p4d;
unsigned long next;
+ int cleared;
p4d = p4d_offset(pgd, addr);
do {
next = p4d_addr_end(addr, end);
- if (p4d_clear_huge(p4d))
+
+ cleared = p4d_clear_huge(p4d);
+ if (cleared || p4d_bad(*p4d))
+ *mask |= PGTBL_P4D_MODIFIED;
+
+ if (cleared)
continue;
if (p4d_none_or_clear_bad(p4d))
continue;
- vunmap_pud_range(p4d, addr, next);
+ vunmap_pud_range(p4d, addr, next, mask);
} while (p4d++, addr = next, addr != end);
}
-static void vunmap_page_range(unsigned long addr, unsigned long end)
+/**
+ * unmap_kernel_range_noflush - unmap kernel VM area
+ * @start: start of the VM area to unmap
+ * @size: size of the VM area to unmap
+ *
+ * Unmap PFN_UP(@size) pages at @start. The VM area @start and @size specify
+ * should have been allocated using get_vm_area() and its friends.
+ *
+ * NOTE:
+ * This function does NOT do any cache flushing. The caller is responsible
+ * for calling flush_cache_vunmap() on to-be-unmapped areas before calling this
+ * function and flush_tlb_kernel_range() after.
+ */
+void unmap_kernel_range_noflush(unsigned long start, unsigned long size)
{
- pgd_t *pgd;
+ unsigned long end = start + size;
unsigned long next;
+ pgd_t *pgd;
+ unsigned long addr = start;
+ pgtbl_mod_mask mask = 0;
BUG_ON(addr >= end);
pgd = pgd_offset_k(addr);
do {
next = pgd_addr_end(addr, end);
+ if (pgd_bad(*pgd))
+ mask |= PGTBL_PGD_MODIFIED;
if (pgd_none_or_clear_bad(pgd))
continue;
- vunmap_p4d_range(pgd, addr, next);
+ vunmap_p4d_range(pgd, addr, next, &mask);
} while (pgd++, addr = next, addr != end);
+
+ if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
+ arch_sync_kernel_mappings(start, end);
}
static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
- unsigned long end, pgprot_t prot, struct page **pages, int *nr)
+ unsigned long end, pgprot_t prot, struct page **pages, int *nr,
+ pgtbl_mod_mask *mask)
{
pte_t *pte;
@@ -153,7 +202,7 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
* callers keep track of where we're up to.
*/
- pte = pte_alloc_kernel(pmd, addr);
+ pte = pte_alloc_kernel_track(pmd, addr, mask);
if (!pte)
return -ENOMEM;
do {
@@ -166,94 +215,117 @@ static int vmap_pte_range(pmd_t *pmd, unsigned long addr,
set_pte_at(&init_mm, addr, pte, mk_pte(page, prot));
(*nr)++;
} while (pte++, addr += PAGE_SIZE, addr != end);
+ *mask |= PGTBL_PTE_MODIFIED;
return 0;
}
static int vmap_pmd_range(pud_t *pud, unsigned long addr,
- unsigned long end, pgprot_t prot, struct page **pages, int *nr)
+ unsigned long end, pgprot_t prot, struct page **pages, int *nr,
+ pgtbl_mod_mask *mask)
{
pmd_t *pmd;
unsigned long next;
- pmd = pmd_alloc(&init_mm, pud, addr);
+ pmd = pmd_alloc_track(&init_mm, pud, addr, mask);
if (!pmd)
return -ENOMEM;
do {
next = pmd_addr_end(addr, end);
- if (vmap_pte_range(pmd, addr, next, prot, pages, nr))
+ if (vmap_pte_range(pmd, addr, next, prot, pages, nr, mask))
return -ENOMEM;
} while (pmd++, addr = next, addr != end);
return 0;
}
static int vmap_pud_range(p4d_t *p4d, unsigned long addr,
- unsigned long end, pgprot_t prot, struct page **pages, int *nr)
+ unsigned long end, pgprot_t prot, struct page **pages, int *nr,
+ pgtbl_mod_mask *mask)
{
pud_t *pud;
unsigned long next;
- pud = pud_alloc(&init_mm, p4d, addr);
+ pud = pud_alloc_track(&init_mm, p4d, addr, mask);
if (!pud)
return -ENOMEM;
do {
next = pud_addr_end(addr, end);
- if (vmap_pmd_range(pud, addr, next, prot, pages, nr))
+ if (vmap_pmd_range(pud, addr, next, prot, pages, nr, mask))
return -ENOMEM;
} while (pud++, addr = next, addr != end);
return 0;
}
static int vmap_p4d_range(pgd_t *pgd, unsigned long addr,
- unsigned long end, pgprot_t prot, struct page **pages, int *nr)
+ unsigned long end, pgprot_t prot, struct page **pages, int *nr,
+ pgtbl_mod_mask *mask)
{
p4d_t *p4d;
unsigned long next;
- p4d = p4d_alloc(&init_mm, pgd, addr);
+ p4d = p4d_alloc_track(&init_mm, pgd, addr, mask);
if (!p4d)
return -ENOMEM;
do {
next = p4d_addr_end(addr, end);
- if (vmap_pud_range(p4d, addr, next, prot, pages, nr))
+ if (vmap_pud_range(p4d, addr, next, prot, pages, nr, mask))
return -ENOMEM;
} while (p4d++, addr = next, addr != end);
return 0;
}
-/*
- * Set up page tables in kva (addr, end). The ptes shall have prot "prot", and
- * will have pfns corresponding to the "pages" array.
+/**
+ * map_kernel_range_noflush - map kernel VM area with the specified pages
+ * @addr: start of the VM area to map
+ * @size: size of the VM area to map
+ * @prot: page protection flags to use
+ * @pages: pages to map
+ *
+ * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size specify should
+ * have been allocated using get_vm_area() and its friends.
+ *
+ * NOTE:
+ * This function does NOT do any cache flushing. The caller is responsible for
+ * calling flush_cache_vmap() on to-be-mapped areas before calling this
+ * function.
*
- * Ie. pte at addr+N*PAGE_SIZE shall point to pfn corresponding to pages[N]
+ * RETURNS:
+ * 0 on success, -errno on failure.
*/
-static int vmap_page_range_noflush(unsigned long start, unsigned long end,
- pgprot_t prot, struct page **pages)
+int map_kernel_range_noflush(unsigned long addr, unsigned long size,
+ pgprot_t prot, struct page **pages)
{
- pgd_t *pgd;
+ unsigned long start = addr;
+ unsigned long end = addr + size;
unsigned long next;
- unsigned long addr = start;
+ pgd_t *pgd;
int err = 0;
int nr = 0;
+ pgtbl_mod_mask mask = 0;
BUG_ON(addr >= end);
pgd = pgd_offset_k(addr);
do {
next = pgd_addr_end(addr, end);
- err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr);
+ if (pgd_bad(*pgd))
+ mask |= PGTBL_PGD_MODIFIED;
+ err = vmap_p4d_range(pgd, addr, next, prot, pages, &nr, &mask);
if (err)
return err;
} while (pgd++, addr = next, addr != end);
- return nr;
+ if (mask & ARCH_PAGE_TABLE_SYNC_MASK)
+ arch_sync_kernel_mappings(start, end);
+
+ return 0;
}
-static int vmap_page_range(unsigned long start, unsigned long end,
- pgprot_t prot, struct page **pages)
+int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot,
+ struct page **pages)
{
int ret;
- ret = vmap_page_range_noflush(start, end, prot, pages);
- flush_cache_vmap(start, end);
+ ret = map_kernel_range_noflush(start, size, prot, pages);
+ flush_cache_vmap(start, start + size);
return ret;
}
@@ -442,6 +514,10 @@ static struct vmap_area *__find_vmap_area(unsigned long addr)
/*
* This function returns back addresses of parent node
* and its left or right link for further processing.
+ *
+ * Otherwise NULL is returned. In that case all further
+ * steps regarding inserting of conflicting overlap range
+ * have to be declined and actually considered as a bug.
*/
static __always_inline struct rb_node **
find_va_links(struct vmap_area *va,
@@ -480,8 +556,12 @@ find_va_links(struct vmap_area *va,
else if (va->va_end > tmp_va->va_start &&
va->va_start >= tmp_va->va_end)
link = &(*link)->rb_right;
- else
- BUG();
+ else {
+ WARN(1, "vmalloc bug: 0x%lx-0x%lx overlaps with 0x%lx-0x%lx\n",
+ va->va_start, va->va_end, tmp_va->va_start, tmp_va->va_end);
+
+ return NULL;
+ }
} while (*link);
*parent = &tmp_va->rb_node;
@@ -563,43 +643,17 @@ unlink_va(struct vmap_area *va, struct rb_root *root)
#if DEBUG_AUGMENT_PROPAGATE_CHECK
static void
-augment_tree_propagate_check(struct rb_node *n)
+augment_tree_propagate_check(void)
{
struct vmap_area *va;
- struct rb_node *node;
- unsigned long size;
- bool found = false;
-
- if (n == NULL)
- return;
-
- va = rb_entry(n, struct vmap_area, rb_node);
- size = va->subtree_max_size;
- node = n;
-
- while (node) {
- va = rb_entry(node, struct vmap_area, rb_node);
-
- if (get_subtree_max_size(node->rb_left) == size) {
- node = node->rb_left;
- } else {
- if (va_size(va) == size) {
- found = true;
- break;
- }
-
- node = node->rb_right;
- }
- }
+ unsigned long computed_size;
- if (!found) {
- va = rb_entry(n, struct vmap_area, rb_node);
- pr_emerg("tree is corrupted: %lu, %lu\n",
- va_size(va), va->subtree_max_size);
+ list_for_each_entry(va, &free_vmap_area_list, list) {
+ computed_size = compute_subtree_max_size(va);
+ if (computed_size != va->subtree_max_size)
+ pr_emerg("tree is corrupted: %lu, %lu\n",
+ va_size(va), va->subtree_max_size);
}
-
- augment_tree_propagate_check(n->rb_left);
- augment_tree_propagate_check(n->rb_right);
}
#endif
@@ -633,28 +687,15 @@ augment_tree_propagate_check(struct rb_node *n)
static __always_inline void
augment_tree_propagate_from(struct vmap_area *va)
{
- struct rb_node *node = &va->rb_node;
- unsigned long new_va_sub_max_size;
-
- while (node) {
- va = rb_entry(node, struct vmap_area, rb_node);
- new_va_sub_max_size = compute_subtree_max_size(va);
-
- /*
- * If the newly calculated maximum available size of the
- * subtree is equal to the current one, then it means that
- * the tree is propagated correctly. So we have to stop at
- * this point to save cycles.
- */
- if (va->subtree_max_size == new_va_sub_max_size)
- break;
-
- va->subtree_max_size = new_va_sub_max_size;
- node = rb_parent(&va->rb_node);
- }
+ /*
+ * Populate the tree from bottom towards the root until
+ * the calculated maximum available size of checked node
+ * is equal to its current one.
+ */
+ free_vmap_area_rb_augment_cb_propagate(&va->rb_node, NULL);
#if DEBUG_AUGMENT_PROPAGATE_CHECK
- augment_tree_propagate_check(free_vmap_area_root.rb_node);
+ augment_tree_propagate_check();
#endif
}
@@ -666,7 +707,8 @@ insert_vmap_area(struct vmap_area *va,
struct rb_node *parent;
link = find_va_links(va, root, NULL, &parent);
- link_va(va, root, parent, link, head);
+ if (link)
+ link_va(va, root, parent, link, head);
}
static void
@@ -682,8 +724,10 @@ insert_vmap_area_augment(struct vmap_area *va,
else
link = find_va_links(va, root, NULL, &parent);
- link_va(va, root, parent, link, head);
- augment_tree_propagate_from(va);
+ if (link) {
+ link_va(va, root, parent, link, head);
+ augment_tree_propagate_from(va);
+ }
}
/*
@@ -691,6 +735,11 @@ insert_vmap_area_augment(struct vmap_area *va,
* and next free blocks. If coalesce is not done a new
* free area is inserted. If VA has been merged, it is
* freed.
+ *
+ * Please note, it can return NULL if the ranges overlap,
+ * in which case a WARN() report is issued. Although this
+ * indicates a bug, the system can stay alive and keep
+ * running.
*/
static __always_inline struct vmap_area *
merge_or_add_vmap_area(struct vmap_area *va,
@@ -707,6 +756,8 @@ merge_or_add_vmap_area(struct vmap_area *va,
* inserted, unless it is merged with its sibling/siblings.
*/
link = find_va_links(va, root, NULL, &parent);
+ if (!link)
+ return NULL;
/*
* Get next node of VA to check if merging can be done.
@@ -727,9 +778,6 @@ merge_or_add_vmap_area(struct vmap_area *va,
if (sibling->va_start == va->va_end) {
sibling->va_start = va->va_start;
- /* Check and update the tree if needed. */
- augment_tree_propagate_from(sibling);
-
/* Free vmap_area object. */
kmem_cache_free(vmap_area_cachep, va);
@@ -749,14 +797,18 @@ merge_or_add_vmap_area(struct vmap_area *va,
if (next->prev != head) {
sibling = list_entry(next->prev, struct vmap_area, list);
if (sibling->va_end == va->va_start) {
- sibling->va_end = va->va_end;
-
- /* Check and update the tree if needed. */
- augment_tree_propagate_from(sibling);
-
+ /*
+ * If both neighbors are coalesced, it is important
+ * to unlink the "next" node first, followed by merging
+ * with "previous" one. Otherwise the tree might not be
+ * fully populated if a sibling's augmented value is
+ * "normalized" because of rotation operations.
+ */
if (merged)
unlink_va(va, root);
+ sibling->va_end = va->va_end;
+
/* Free vmap_area object. */
kmem_cache_free(vmap_area_cachep, va);
@@ -767,11 +819,13 @@ merge_or_add_vmap_area(struct vmap_area *va,
}
insert:
- if (!merged) {
+ if (!merged)
link_va(va, root, parent, link, head);
- augment_tree_propagate_from(va);
- }
+ /*
+ * Last step is to check and update the tree.
+ */
+ augment_tree_propagate_from(va);
return va;
}
@@ -1223,14 +1277,6 @@ int unregister_vmap_purge_notifier(struct notifier_block *nb)
EXPORT_SYMBOL_GPL(unregister_vmap_purge_notifier);
/*
- * Clear the pagetable entries of a given vmap_area
- */
-static void unmap_vmap_area(struct vmap_area *va)
-{
- vunmap_page_range(va->va_start, va->va_end);
-}
-
-/*
* lazy_max_pages is the maximum amount of virtual address space we gather up
* before attempting to purge with a TLB flush.
*
@@ -1293,12 +1339,6 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
return false;
/*
- * First make sure the mappings are removed from all page-tables
- * before they are freed.
- */
- vmalloc_sync_unmappings();
-
- /*
* TODO: to calculate a flush range without looping.
* The list can be up to lazy_max_pages() elements.
*/
@@ -1326,6 +1366,9 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
va = merge_or_add_vmap_area(va, &free_vmap_area_root,
&free_vmap_area_list);
+ if (!va)
+ continue;
+
if (is_vmalloc_or_module_addr((void *)orig_start))
kasan_release_vmalloc(orig_start, orig_end,
va->va_start, va->va_end);
@@ -1391,7 +1434,7 @@ static void free_vmap_area_noflush(struct vmap_area *va)
static void free_unmap_vmap_area(struct vmap_area *va)
{
flush_cache_vunmap(va->va_start, va->va_end);
- unmap_vmap_area(va);
+ unmap_kernel_range_noflush(va->va_start, va->va_end - va->va_start);
if (debug_pagealloc_enabled_static())
flush_tlb_kernel_range(va->va_start, va->va_end);
@@ -1458,12 +1501,11 @@ struct vmap_block {
static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);
/*
- * Radix tree of vmap blocks, indexed by address, to quickly find a vmap block
+ * XArray of vmap blocks, indexed by address, to quickly find a vmap block
* in the free path. Could get rid of this if we change the API to return a
* "cookie" from alloc, to be passed to free. But no big deal yet.
*/
-static DEFINE_SPINLOCK(vmap_block_tree_lock);
-static RADIX_TREE(vmap_block_tree, GFP_ATOMIC);
+static DEFINE_XARRAY(vmap_blocks);
/*
* We should probably have a fallback mechanism to allocate virtual memory
@@ -1520,13 +1562,6 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
return ERR_CAST(va);
}
- err = radix_tree_preload(gfp_mask);
- if (unlikely(err)) {
- kfree(vb);
- free_vmap_area(va);
- return ERR_PTR(err);
- }
-
vaddr = vmap_block_vaddr(va->va_start, 0);
spin_lock_init(&vb->lock);
vb->va = va;
@@ -1539,11 +1574,12 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
INIT_LIST_HEAD(&vb->free_list);
vb_idx = addr_to_vb_idx(va->va_start);
- spin_lock(&vmap_block_tree_lock);
- err = radix_tree_insert(&vmap_block_tree, vb_idx, vb);
- spin_unlock(&vmap_block_tree_lock);
- BUG_ON(err);
- radix_tree_preload_end();
+ err = xa_insert(&vmap_blocks, vb_idx, vb, gfp_mask);
+ if (err) {
+ kfree(vb);
+ free_vmap_area(va);
+ return ERR_PTR(err);
+ }
vbq = &get_cpu_var(vmap_block_queue);
spin_lock(&vbq->lock);
@@ -1557,12 +1593,8 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
static void free_vmap_block(struct vmap_block *vb)
{
struct vmap_block *tmp;
- unsigned long vb_idx;
- vb_idx = addr_to_vb_idx(vb->va->va_start);
- spin_lock(&vmap_block_tree_lock);
- tmp = radix_tree_delete(&vmap_block_tree, vb_idx);
- spin_unlock(&vmap_block_tree_lock);
+ tmp = xa_erase(&vmap_blocks, addr_to_vb_idx(vb->va->va_start));
BUG_ON(tmp != vb);
free_vmap_area_noflush(vb->va);
@@ -1665,34 +1697,25 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
return vaddr;
}
-static void vb_free(const void *addr, unsigned long size)
+static void vb_free(unsigned long addr, unsigned long size)
{
unsigned long offset;
- unsigned long vb_idx;
unsigned int order;
struct vmap_block *vb;
BUG_ON(offset_in_page(size));
BUG_ON(size > PAGE_SIZE*VMAP_MAX_ALLOC);
- flush_cache_vunmap((unsigned long)addr, (unsigned long)addr + size);
+ flush_cache_vunmap(addr, addr + size);
order = get_order(size);
+ offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT;
+ vb = xa_load(&vmap_blocks, addr_to_vb_idx(addr));
- offset = (unsigned long)addr & (VMAP_BLOCK_SIZE - 1);
- offset >>= PAGE_SHIFT;
-
- vb_idx = addr_to_vb_idx((unsigned long)addr);
- rcu_read_lock();
- vb = radix_tree_lookup(&vmap_block_tree, vb_idx);
- rcu_read_unlock();
- BUG_ON(!vb);
-
- vunmap_page_range((unsigned long)addr, (unsigned long)addr + size);
+ unmap_kernel_range_noflush(addr, size);
if (debug_pagealloc_enabled_static())
- flush_tlb_kernel_range((unsigned long)addr,
- (unsigned long)addr + size);
+ flush_tlb_kernel_range(addr, addr + size);
spin_lock(&vb->lock);
@@ -1792,7 +1815,7 @@ void vm_unmap_ram(const void *mem, unsigned int count)
if (likely(count <= VMAP_MAX_ALLOC)) {
debug_check_no_locks_freed(mem, size);
- vb_free(mem, size);
+ vb_free(addr, size);
return;
}
@@ -1809,7 +1832,6 @@ EXPORT_SYMBOL(vm_unmap_ram);
* @pages: an array of pointers to the pages to be mapped
* @count: number of pages
* @node: prefer to allocate data structures on this node
- * @prot: memory protection to use. PAGE_KERNEL for regular RAM
*
* If you use this function for less than VMAP_MAX_ALLOC pages, it could be
* faster than vmap so it's good. But if you mix long-life and short-life
@@ -1819,7 +1841,7 @@ EXPORT_SYMBOL(vm_unmap_ram);
*
* Returns: a pointer to the address that has been mapped, or %NULL on failure
*/
-void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t prot)
+void *vm_map_ram(struct page **pages, unsigned int count, int node)
{
unsigned long size = (unsigned long)count << PAGE_SHIFT;
unsigned long addr;
@@ -1843,7 +1865,7 @@ void *vm_map_ram(struct page **pages, unsigned int count, int node, pgprot_t pro
kasan_unpoison_vmalloc(mem, size);
- if (vmap_page_range(addr, addr + size, prot, pages) < 0) {
+ if (map_kernel_range(addr, size, PAGE_KERNEL, pages) < 0) {
vm_unmap_ram(mem, count);
return NULL;
}
@@ -1988,51 +2010,6 @@ void __init vmalloc_init(void)
}
/**
- * map_kernel_range_noflush - map kernel VM area with the specified pages
- * @addr: start of the VM area to map
- * @size: size of the VM area to map
- * @prot: page protection flags to use
- * @pages: pages to map
- *
- * Map PFN_UP(@size) pages at @addr. The VM area @addr and @size
- * specify should have been allocated using get_vm_area() and its
- * friends.
- *
- * NOTE:
- * This function does NOT do any cache flushing. The caller is
- * responsible for calling flush_cache_vmap() on to-be-mapped areas
- * before calling this function.
- *
- * RETURNS:
- * The number of pages mapped on success, -errno on failure.
- */
-int map_kernel_range_noflush(unsigned long addr, unsigned long size,
- pgprot_t prot, struct page **pages)
-{
- return vmap_page_range_noflush(addr, addr + size, prot, pages);
-}
-
-/**
- * unmap_kernel_range_noflush - unmap kernel VM area
- * @addr: start of the VM area to unmap
- * @size: size of the VM area to unmap
- *
- * Unmap PFN_UP(@size) pages at @addr. The VM area @addr and @size
- * specify should have been allocated using get_vm_area() and its
- * friends.
- *
- * NOTE:
- * This function does NOT do any cache flushing. The caller is
- * responsible for calling flush_cache_vunmap() on to-be-mapped areas
- * before calling this function and flush_tlb_kernel_range() after.
- */
-void unmap_kernel_range_noflush(unsigned long addr, unsigned long size)
-{
- vunmap_page_range(addr, addr + size);
-}
-EXPORT_SYMBOL_GPL(unmap_kernel_range_noflush);
-
-/**
* unmap_kernel_range - unmap kernel VM area and flush cache and TLB
* @addr: start of the VM area to unmap
* @size: size of the VM area to unmap
@@ -2045,22 +2022,9 @@ void unmap_kernel_range(unsigned long addr, unsigned long size)
unsigned long end = addr + size;
flush_cache_vunmap(addr, end);
- vunmap_page_range(addr, end);
+ unmap_kernel_range_noflush(addr, size);
flush_tlb_kernel_range(addr, end);
}
-EXPORT_SYMBOL_GPL(unmap_kernel_range);
-
-int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page **pages)
-{
- unsigned long addr = (unsigned long)area->addr;
- unsigned long end = addr + get_vm_area_size(area);
- int err;
-
- err = vmap_page_range(addr, end, prot, pages);
-
- return err > 0 ? 0 : err;
-}
-EXPORT_SYMBOL_GPL(map_vm_area);
static inline void setup_vmalloc_vm_locked(struct vm_struct *vm,
struct vmap_area *va, unsigned long flags, const void *caller)
@@ -2128,14 +2092,6 @@ static struct vm_struct *__get_vm_area_node(unsigned long size,
return area;
}
-struct vm_struct *__get_vm_area(unsigned long size, unsigned long flags,
- unsigned long start, unsigned long end)
-{
- return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE,
- GFP_KERNEL, __builtin_return_address(0));
-}
-EXPORT_SYMBOL_GPL(__get_vm_area);
-
struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags,
unsigned long start, unsigned long end,
const void *caller)
@@ -2330,7 +2286,7 @@ static inline void __vfree_deferred(const void *addr)
* Use raw_cpu_ptr() because this can be called from preemptible
* context. Preemption is absolutely fine here, because the llist_add()
* implementation is lockless, so it works even if we are adding to
- * nother cpu's list. schedule_work() should be fine with this too.
+ * another cpu's list. schedule_work() should be fine with this too.
*/
struct vfree_deferred *p = raw_cpu_ptr(&vfree_deferred);
@@ -2441,7 +2397,8 @@ void *vmap(struct page **pages, unsigned int count,
if (!area)
return NULL;
- if (map_vm_area(area, prot, pages)) {
+ if (map_kernel_range((unsigned long)area->addr, size, pgprot_nx(prot),
+ pages) < 0) {
vunmap(area->addr);
return NULL;
}
@@ -2450,9 +2407,6 @@ void *vmap(struct page **pages, unsigned int count,
}
EXPORT_SYMBOL(vmap);
-static void *__vmalloc_node(unsigned long size, unsigned long align,
- gfp_t gfp_mask, pgprot_t prot,
- int node, const void *caller);
static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
pgprot_t prot, int node)
{
@@ -2470,7 +2424,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
/* Please note that the recursion is strictly bounded. */
if (array_size > PAGE_SIZE) {
pages = __vmalloc_node(array_size, 1, nested_gfp|highmem_mask,
- PAGE_KERNEL, node, area->caller);
+ node, area->caller);
} else {
pages = kmalloc_node(array_size, nested_gfp, node);
}
@@ -2504,8 +2458,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
}
atomic_long_add(area->nr_pages, &nr_vmalloc_pages);
- if (map_vm_area(area, prot, pages))
+ if (map_kernel_range((unsigned long)area->addr, get_vm_area_size(area),
+ prot, pages) < 0)
goto fail;
+
return area->addr;
fail:
@@ -2573,27 +2529,16 @@ fail:
return NULL;
}
-/*
- * This is only for performance analysis of vmalloc and stress purpose.
- * It is required by vmalloc test module, therefore do not use it other
- * than that.
- */
-#ifdef CONFIG_TEST_VMALLOC_MODULE
-EXPORT_SYMBOL_GPL(__vmalloc_node_range);
-#endif
-
/**
* __vmalloc_node - allocate virtually contiguous memory
* @size: allocation size
* @align: desired alignment
* @gfp_mask: flags for the page level allocator
- * @prot: protection mask for the allocated pages
* @node: node to use for allocation or NUMA_NO_NODE
* @caller: caller's return address
*
- * Allocate enough pages to cover @size from the page level
- * allocator with @gfp_mask flags. Map them into contiguous
- * kernel virtual space, using a pagetable protection of @prot.
+ * Allocate enough pages to cover @size from the page level allocator with
+ * @gfp_mask flags. Map them into contiguous kernel virtual space.
*
* Reclaim modifiers in @gfp_mask - __GFP_NORETRY, __GFP_RETRY_MAYFAIL
* and __GFP_NOFAIL are not supported
@@ -2603,35 +2548,28 @@ EXPORT_SYMBOL_GPL(__vmalloc_node_range);
*
* Return: pointer to the allocated memory or %NULL on error
*/
-static void *__vmalloc_node(unsigned long size, unsigned long align,
- gfp_t gfp_mask, pgprot_t prot,
- int node, const void *caller)
+void *__vmalloc_node(unsigned long size, unsigned long align,
+ gfp_t gfp_mask, int node, const void *caller)
{
return __vmalloc_node_range(size, align, VMALLOC_START, VMALLOC_END,
- gfp_mask, prot, 0, node, caller);
+ gfp_mask, PAGE_KERNEL, 0, node, caller);
}
+/*
+ * This is only for performance analysis of vmalloc and stress purpose.
+ * It is required by vmalloc test module, therefore do not use it other
+ * than that.
+ */
+#ifdef CONFIG_TEST_VMALLOC_MODULE
+EXPORT_SYMBOL_GPL(__vmalloc_node);
+#endif
-void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot)
+void *__vmalloc(unsigned long size, gfp_t gfp_mask)
{
- return __vmalloc_node(size, 1, gfp_mask, prot, NUMA_NO_NODE,
+ return __vmalloc_node(size, 1, gfp_mask, NUMA_NO_NODE,
__builtin_return_address(0));
}
EXPORT_SYMBOL(__vmalloc);
-static inline void *__vmalloc_node_flags(unsigned long size,
- int node, gfp_t flags)
-{
- return __vmalloc_node(size, 1, flags, PAGE_KERNEL,
- node, __builtin_return_address(0));
-}
-
-
-void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags,
- void *caller)
-{
- return __vmalloc_node(size, 1, flags, PAGE_KERNEL, node, caller);
-}
-
/**
* vmalloc - allocate virtually contiguous memory
* @size: allocation size
@@ -2646,8 +2584,8 @@ void *__vmalloc_node_flags_caller(unsigned long size, int node, gfp_t flags,
*/
void *vmalloc(unsigned long size)
{
- return __vmalloc_node_flags(size, NUMA_NO_NODE,
- GFP_KERNEL);
+ return __vmalloc_node(size, 1, GFP_KERNEL, NUMA_NO_NODE,
+ __builtin_return_address(0));
}
EXPORT_SYMBOL(vmalloc);
@@ -2666,8 +2604,8 @@ EXPORT_SYMBOL(vmalloc);
*/
void *vzalloc(unsigned long size)
{
- return __vmalloc_node_flags(size, NUMA_NO_NODE,
- GFP_KERNEL | __GFP_ZERO);
+ return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, NUMA_NO_NODE,
+ __builtin_return_address(0));
}
EXPORT_SYMBOL(vzalloc);
@@ -2704,8 +2642,8 @@ EXPORT_SYMBOL(vmalloc_user);
*/
void *vmalloc_node(unsigned long size, int node)
{
- return __vmalloc_node(size, 1, GFP_KERNEL, PAGE_KERNEL,
- node, __builtin_return_address(0));
+ return __vmalloc_node(size, 1, GFP_KERNEL, node,
+ __builtin_return_address(0));
}
EXPORT_SYMBOL(vmalloc_node);
@@ -2718,58 +2656,15 @@ EXPORT_SYMBOL(vmalloc_node);
* allocator and map them into contiguous kernel virtual space.
* The memory allocated is set to zero.
*
- * For tight control over page level allocator and protection flags
- * use __vmalloc_node() instead.
- *
* Return: pointer to the allocated memory or %NULL on error
*/
void *vzalloc_node(unsigned long size, int node)
{
- return __vmalloc_node_flags(size, node,
- GFP_KERNEL | __GFP_ZERO);
+ return __vmalloc_node(size, 1, GFP_KERNEL | __GFP_ZERO, node,
+ __builtin_return_address(0));
}
EXPORT_SYMBOL(vzalloc_node);
-/**
- * vmalloc_user_node_flags - allocate memory for userspace on a specific node
- * @size: allocation size
- * @node: numa node
- * @flags: flags for the page level allocator
- *
- * The resulting memory area is zeroed so it can be mapped to userspace
- * without leaking data.
- *
- * Return: pointer to the allocated memory or %NULL on error
- */
-void *vmalloc_user_node_flags(unsigned long size, int node, gfp_t flags)
-{
- return __vmalloc_node_range(size, SHMLBA, VMALLOC_START, VMALLOC_END,
- flags | __GFP_ZERO, PAGE_KERNEL,
- VM_USERMAP, node,
- __builtin_return_address(0));
-}
-EXPORT_SYMBOL(vmalloc_user_node_flags);
-
-/**
- * vmalloc_exec - allocate virtually contiguous, executable memory
- * @size: allocation size
- *
- * Kernel-internal function to allocate enough pages to cover @size
- * the page level allocator and map them into contiguous and
- * executable kernel virtual space.
- *
- * For tight control over page level allocator and protection flags
- * use __vmalloc() instead.
- *
- * Return: pointer to the allocated memory or %NULL on error
- */
-void *vmalloc_exec(unsigned long size)
-{
- return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END,
- GFP_KERNEL, PAGE_KERNEL_EXEC, VM_FLUSH_RESET_PERMS,
- NUMA_NO_NODE, __builtin_return_address(0));
-}
-
#if defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA32)
#define GFP_VMALLOC32 (GFP_DMA32 | GFP_KERNEL)
#elif defined(CONFIG_64BIT) && defined(CONFIG_ZONE_DMA)
@@ -2793,8 +2688,8 @@ void *vmalloc_exec(unsigned long size)
*/
void *vmalloc_32(unsigned long size)
{
- return __vmalloc_node(size, 1, GFP_VMALLOC32, PAGE_KERNEL,
- NUMA_NO_NODE, __builtin_return_address(0));
+ return __vmalloc_node(size, 1, GFP_VMALLOC32, NUMA_NO_NODE,
+ __builtin_return_address(0));
}
EXPORT_SYMBOL(vmalloc_32);
@@ -3137,21 +3032,6 @@ int remap_vmalloc_range(struct vm_area_struct *vma, void *addr,
}
EXPORT_SYMBOL(remap_vmalloc_range);
-/*
- * Implement stubs for vmalloc_sync_[un]mappings () if the architecture chose
- * not to have one.
- *
- * The purpose of this function is to make sure the vmalloc area
- * mappings are identical in all page-tables in the system.
- */
-void __weak vmalloc_sync_mappings(void)
-{
-}
-
-void __weak vmalloc_sync_unmappings(void)
-{
-}
-
static int f(pte_t *pte, unsigned long addr, void *data)
{
pte_t ***p = data;
@@ -3473,8 +3353,9 @@ recovery:
orig_end = vas[area]->va_end;
va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root,
&free_vmap_area_list);
- kasan_release_vmalloc(orig_start, orig_end,
- va->va_start, va->va_end);
+ if (va)
+ kasan_release_vmalloc(orig_start, orig_end,
+ va->va_start, va->va_end);
vas[area] = NULL;
}
@@ -3522,8 +3403,9 @@ err_free_shadow:
orig_end = vas[area]->va_end;
va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root,
&free_vmap_area_list);
- kasan_release_vmalloc(orig_start, orig_end,
- va->va_start, va->va_end);
+ if (va)
+ kasan_release_vmalloc(orig_start, orig_end,
+ va->va_start, va->va_end);
vas[area] = NULL;
kfree(vms[area]);
}
diff --git a/mm/vmscan.c b/mm/vmscan.c
index a37c87b5aee2..466fc3144fff 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -79,6 +79,12 @@ struct scan_control {
*/
struct mem_cgroup *target_mem_cgroup;
+ /*
+ * Scan pressure balancing between anon and file LRUs
+ */
+ unsigned long anon_cost;
+ unsigned long file_cost;
+
/* Can active pages be deactivated as part of reclaim? */
#define DEACTIVATE_ANON 1
#define DEACTIVATE_FILE 2
@@ -161,14 +167,9 @@ struct scan_control {
#endif
/*
- * From 0 .. 100. Higher means more swappy.
+ * From 0 .. 200. Higher means more swappy.
*/
int vm_swappiness = 60;
-/*
- * The total number of pages which are beyond the high watermark within all
- * zones.
- */
-unsigned long vm_total_pages;
static void set_task_reclaim_state(struct task_struct *task,
struct reclaim_state *rs)
@@ -676,7 +677,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid,
freed += ret;
/*
* Bail out if someone want to register a new shrinker to
- * prevent the regsitration from being stalled for long periods
+ * prevent the registration from being stalled for long periods
* by parallel ongoing shrinking.
*/
if (rwsem_is_contended(&shrinker_rwsem)) {
@@ -853,6 +854,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
{
unsigned long flags;
int refcount;
+ void *shadow = NULL;
BUG_ON(!PageLocked(page));
BUG_ON(mapping != page_mapping(page));
@@ -895,12 +897,13 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
if (PageSwapCache(page)) {
swp_entry_t swap = { .val = page_private(page) };
mem_cgroup_swapout(page, swap);
- __delete_from_swap_cache(page, swap);
+ if (reclaimed && !mapping_exiting(mapping))
+ shadow = workingset_eviction(page, target_memcg);
+ __delete_from_swap_cache(page, swap, shadow);
xa_unlock_irqrestore(&mapping->i_pages, flags);
put_swap_page(page, swap);
} else {
void (*freepage)(struct page *);
- void *shadow = NULL;
freepage = mapping->a_ops->freepage;
/*
@@ -908,7 +911,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
* order to detect refaults, thus thrashing, later on.
*
* But don't store shadows in an address space that is
- * already exiting. This is not just an optizimation,
+ * already exiting. This is not just an optimization,
* inode reclaim needs to empty out the radix tree or
* the nodes are lost. Don't plant shadows behind its
* back.
@@ -996,8 +999,6 @@ static enum page_references page_check_references(struct page *page,
return PAGEREF_RECLAIM;
if (referenced_ptes) {
- if (PageSwapBacked(page))
- return PAGEREF_ACTIVATE;
/*
* All mapped pages start out with page table
* references from the instantiating fault, so we need
@@ -1020,7 +1021,7 @@ static enum page_references page_check_references(struct page *page,
/*
* Activate file-backed executable pages after first usage.
*/
- if (vm_flags & VM_EXEC)
+ if ((vm_flags & VM_EXEC) && !PageSwapBacked(page))
return PAGEREF_ACTIVATE;
return PAGEREF_KEEP;
@@ -1066,17 +1067,17 @@ static void page_check_dirty_writeback(struct page *page,
/*
* shrink_page_list() returns the number of reclaimed pages
*/
-static unsigned long shrink_page_list(struct list_head *page_list,
- struct pglist_data *pgdat,
- struct scan_control *sc,
- enum ttu_flags ttu_flags,
- struct reclaim_stat *stat,
- bool ignore_references)
+static unsigned int shrink_page_list(struct list_head *page_list,
+ struct pglist_data *pgdat,
+ struct scan_control *sc,
+ enum ttu_flags ttu_flags,
+ struct reclaim_stat *stat,
+ bool ignore_references)
{
LIST_HEAD(ret_pages);
LIST_HEAD(free_pages);
- unsigned nr_reclaimed = 0;
- unsigned pgactivate = 0;
+ unsigned int nr_reclaimed = 0;
+ unsigned int pgactivate = 0;
memset(stat, 0, sizeof(*stat));
cond_resched();
@@ -1295,11 +1296,15 @@ static unsigned long shrink_page_list(struct list_head *page_list,
*/
if (page_mapped(page)) {
enum ttu_flags flags = ttu_flags | TTU_BATCH_FLUSH;
+ bool was_swapbacked = PageSwapBacked(page);
if (unlikely(PageTransHuge(page)))
flags |= TTU_SPLIT_HUGE_PMD;
+
if (!try_to_unmap(page, flags)) {
stat->nr_unmap_fail += nr_pages;
+ if (!was_swapbacked && PageSwapBacked(page))
+ stat->nr_lazyfree_fail += nr_pages;
goto activate_locked;
}
}
@@ -1349,6 +1354,8 @@ static unsigned long shrink_page_list(struct list_head *page_list,
case PAGE_ACTIVATE:
goto activate_locked;
case PAGE_SUCCESS:
+ stat->nr_pageout += thp_nr_pages(page);
+
if (PageWriteback(page))
goto keep;
if (PageDirty(page))
@@ -1438,7 +1445,7 @@ free_it:
* appear not as the counts should be low
*/
if (unlikely(PageTransHuge(page)))
- (*get_compound_page_dtor(page))(page);
+ destroy_compound_page(page);
else
list_add(&page->lru, &free_pages);
continue;
@@ -1483,7 +1490,7 @@ keep:
return nr_reclaimed;
}
-unsigned long reclaim_clean_pages_from_list(struct zone *zone,
+unsigned int reclaim_clean_pages_from_list(struct zone *zone,
struct list_head *page_list)
{
struct scan_control sc = {
@@ -1491,8 +1498,8 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
.priority = DEF_PRIORITY,
.may_unmap = 1,
};
- struct reclaim_stat dummy_stat;
- unsigned long ret;
+ struct reclaim_stat stat;
+ unsigned int nr_reclaimed;
struct page *page, *next;
LIST_HEAD(clean_pages);
@@ -1504,11 +1511,21 @@ unsigned long reclaim_clean_pages_from_list(struct zone *zone,
}
}
- ret = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
- TTU_IGNORE_ACCESS, &dummy_stat, true);
+ nr_reclaimed = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
+ TTU_IGNORE_ACCESS, &stat, true);
list_splice(&clean_pages, page_list);
- mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -ret);
- return ret;
+ mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE, -nr_reclaimed);
+ /*
+ * Lazyfree pages are isolated from the file LRU to begin with, but
+ * they rotate back to the anonymous LRU if discarding them fails, so
+ * the isolated counts would be mismatched.
+ * Compensate the isolated count for both LRU lists.
+ */
+ mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_ANON,
+ stat.nr_lazyfree_fail);
+ mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE,
+ -stat.nr_lazyfree_fail);
+ return nr_reclaimed;
}
/*
@@ -1591,7 +1608,7 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode)
/*
* Update LRU sizes after isolating pages. The LRU size updates must
- * be complete before mem_cgroup_update_lru_size due to a santity check.
+ * be complete before mem_cgroup_update_lru_size due to a sanity check.
*/
static __always_inline void update_lru_sizes(struct lruvec *lruvec,
enum lru_list lru, unsigned long *nr_zone_taken)
@@ -1602,10 +1619,7 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec,
if (!nr_zone_taken[zid])
continue;
- __update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
-#ifdef CONFIG_MEMCG
- mem_cgroup_update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
-#endif
+ update_lru_size(lruvec, lru, zid, -nr_zone_taken[zid]);
}
}
@@ -1848,7 +1862,7 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
SetPageLRU(page);
lru = page_lru(page);
- nr_pages = hpage_nr_pages(page);
+ nr_pages = thp_nr_pages(page);
update_lru_size(lruvec, lru, page_zonenum(page), nr_pages);
list_move(&page->lru, &lruvec->lists[lru]);
@@ -1859,12 +1873,14 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
if (unlikely(PageCompound(page))) {
spin_unlock_irq(&pgdat->lru_lock);
- (*get_compound_page_dtor(page))(page);
+ destroy_compound_page(page);
spin_lock_irq(&pgdat->lru_lock);
} else
list_add(&page->lru, &pages_to_free);
} else {
nr_moved += nr_pages;
+ if (PageActive(page))
+ workingset_age_nonresident(lruvec, nr_pages);
}
}
@@ -1878,13 +1894,13 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
/*
* If a kernel thread (such as nfsd for loop-back mounts) services
- * a backing device by writing to the page cache it sets PF_LESS_THROTTLE.
+ * a backing device by writing to the page cache it sets PF_LOCAL_THROTTLE.
* In that case we should only throttle if the backing device it is
* writing to is congested. In other cases it is safe to throttle.
*/
static int current_may_throttle(void)
{
- return !(current->flags & PF_LESS_THROTTLE) ||
+ return !(current->flags & PF_LOCAL_THROTTLE) ||
current->backing_dev_info == NULL ||
bdi_write_congested(current->backing_dev_info);
}
@@ -1899,13 +1915,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
{
LIST_HEAD(page_list);
unsigned long nr_scanned;
- unsigned long nr_reclaimed = 0;
+ unsigned int nr_reclaimed = 0;
unsigned long nr_taken;
struct reclaim_stat stat;
- int file = is_file_lru(lru);
+ bool file = is_file_lru(lru);
enum vm_event_item item;
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
- struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
bool stalled = false;
while (unlikely(too_many_isolated(pgdat, file, sc))) {
@@ -1929,12 +1944,12 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
&nr_scanned, sc, lru);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
- reclaim_stat->recent_scanned[file] += nr_taken;
-
item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
if (!cgroup_reclaim(sc))
__count_vm_events(item, nr_scanned);
__count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
+ __count_vm_events(PGSCAN_ANON + file, nr_scanned);
+
spin_unlock_irq(&pgdat->lru_lock);
if (nr_taken == 0)
@@ -1945,16 +1960,15 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
spin_lock_irq(&pgdat->lru_lock);
+ move_pages_to_lru(lruvec, &page_list);
+
+ __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
+ lru_note_cost(lruvec, file, stat.nr_pageout);
item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
if (!cgroup_reclaim(sc))
__count_vm_events(item, nr_reclaimed);
__count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
- reclaim_stat->recent_rotated[0] += stat.nr_activate[0];
- reclaim_stat->recent_rotated[1] += stat.nr_activate[1];
-
- move_pages_to_lru(lruvec, &page_list);
-
- __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
+ __count_vm_events(PGSTEAL_ANON + file, nr_reclaimed);
spin_unlock_irq(&pgdat->lru_lock);
@@ -2001,7 +2015,6 @@ static void shrink_active_list(unsigned long nr_to_scan,
LIST_HEAD(l_active);
LIST_HEAD(l_inactive);
struct page *page;
- struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
unsigned nr_deactivate, nr_activate;
unsigned nr_rotated = 0;
int file = is_file_lru(lru);
@@ -2015,9 +2028,9 @@ static void shrink_active_list(unsigned long nr_to_scan,
&nr_scanned, sc, lru);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
- reclaim_stat->recent_scanned[file] += nr_taken;
- __count_vm_events(PGREFILL, nr_scanned);
+ if (!cgroup_reclaim(sc))
+ __count_vm_events(PGREFILL, nr_scanned);
__count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
spin_unlock_irq(&pgdat->lru_lock);
@@ -2042,7 +2055,6 @@ static void shrink_active_list(unsigned long nr_to_scan,
if (page_referenced(page, 0, sc->target_mem_cgroup,
&vm_flags)) {
- nr_rotated += hpage_nr_pages(page);
/*
* Identify referenced, file-backed active pages and
* give them one more trip around the active list. So
@@ -2053,6 +2065,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
* so we ignore them here.
*/
if ((vm_flags & VM_EXEC) && page_is_file_lru(page)) {
+ nr_rotated += thp_nr_pages(page);
list_add(&page->lru, &l_active);
continue;
}
@@ -2067,13 +2080,6 @@ static void shrink_active_list(unsigned long nr_to_scan,
* Move pages back to the lru list.
*/
spin_lock_irq(&pgdat->lru_lock);
- /*
- * Count referenced pages from currently used mappings as rotated,
- * even though only some of them are actually re-activated. This
- * helps balance scan pressure between file and anonymous pages in
- * get_scan_count.
- */
- reclaim_stat->recent_rotated[file] += nr_rotated;
nr_activate = move_pages_to_lru(lruvec, &l_active);
nr_deactivate = move_pages_to_lru(lruvec, &l_inactive);
@@ -2095,7 +2101,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
unsigned long reclaim_pages(struct list_head *page_list)
{
int nid = NUMA_NO_NODE;
- unsigned long nr_reclaimed = 0;
+ unsigned int nr_reclaimed = 0;
LIST_HEAD(node_page_list);
struct reclaim_stat dummy_stat;
struct page *page;
@@ -2229,14 +2235,11 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
unsigned long *nr)
{
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
+ unsigned long anon_cost, file_cost, total_cost;
int swappiness = mem_cgroup_swappiness(memcg);
- struct zone_reclaim_stat *reclaim_stat = &lruvec->reclaim_stat;
u64 fraction[2];
u64 denominator = 0; /* gcc */
- struct pglist_data *pgdat = lruvec_pgdat(lruvec);
- unsigned long anon_prio, file_prio;
enum scan_balance scan_balance;
- unsigned long anon, file;
unsigned long ap, fp;
enum lru_list lru;
@@ -2286,57 +2289,35 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc,
}
scan_balance = SCAN_FRACT;
-
/*
- * With swappiness at 100, anonymous and file have the same priority.
- * This scanning priority is essentially the inverse of IO cost.
- */
- anon_prio = swappiness;
- file_prio = 200 - anon_prio;
-
- /*
- * OK, so we have swap space and a fair amount of page cache
- * pages. We use the recently rotated / recently scanned
- * ratios to determine how valuable each cache is.
+ * Calculate the pressure balance between anon and file pages.
+ *
+ * The amount of pressure we put on each LRU is inversely
+ * proportional to the cost of reclaiming each list, as
+ * determined by the share of pages that are refaulting, times
+ * the relative IO cost of bringing back a swapped out
+ * anonymous page vs reloading a filesystem page (swappiness).
*
- * Because workloads change over time (and to avoid overflow)
- * we keep these statistics as a floating average, which ends
- * up weighing recent references more than old ones.
+ * Although we limit that influence to ensure no list gets
+ * left behind completely: at least a third of the pressure is
+ * applied, before swappiness.
*
- * anon in [0], file in [1]
+ * With swappiness at 100, anon and file have equal IO cost.
*/
+ total_cost = sc->anon_cost + sc->file_cost;
+ anon_cost = total_cost + sc->anon_cost;
+ file_cost = total_cost + sc->file_cost;
+ total_cost = anon_cost + file_cost;
- anon = lruvec_lru_size(lruvec, LRU_ACTIVE_ANON, MAX_NR_ZONES) +
- lruvec_lru_size(lruvec, LRU_INACTIVE_ANON, MAX_NR_ZONES);
- file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE, MAX_NR_ZONES) +
- lruvec_lru_size(lruvec, LRU_INACTIVE_FILE, MAX_NR_ZONES);
-
- spin_lock_irq(&pgdat->lru_lock);
- if (unlikely(reclaim_stat->recent_scanned[0] > anon / 4)) {
- reclaim_stat->recent_scanned[0] /= 2;
- reclaim_stat->recent_rotated[0] /= 2;
- }
-
- if (unlikely(reclaim_stat->recent_scanned[1] > file / 4)) {
- reclaim_stat->recent_scanned[1] /= 2;
- reclaim_stat->recent_rotated[1] /= 2;
- }
-
- /*
- * The amount of pressure on anon vs file pages is inversely
- * proportional to the fraction of recently scanned pages on
- * each list that were recently referenced and in active use.
- */
- ap = anon_prio * (reclaim_stat->recent_scanned[0] + 1);
- ap /= reclaim_stat->recent_rotated[0] + 1;
+ ap = swappiness * (total_cost + 1);
+ ap /= anon_cost + 1;
- fp = file_prio * (reclaim_stat->recent_scanned[1] + 1);
- fp /= reclaim_stat->recent_rotated[1] + 1;
- spin_unlock_irq(&pgdat->lru_lock);
+ fp = (200 - swappiness) * (total_cost + 1);
+ fp /= file_cost + 1;
fraction[0] = ap;
fraction[1] = fp;
- denominator = ap + fp + 1;
+ denominator = ap + fp;
out:
for_each_evictable_lru(lru) {
int file = is_file_lru(lru);
@@ -2345,7 +2326,8 @@ out:
unsigned long protection;
lruvec_size = lruvec_lru_size(lruvec, lru, sc->reclaim_idx);
- protection = mem_cgroup_protection(memcg,
+ protection = mem_cgroup_protection(sc->target_mem_cgroup,
+ memcg,
sc->memcg_low_reclaim);
if (protection) {
@@ -2388,7 +2370,7 @@ out:
/*
* Minimally target SWAP_CLUSTER_MAX pages to keep
- * reclaim moving forwards, avoiding decremeting
+ * reclaim moving forwards, avoiding decrementing
* sc->priority further than desirable.
*/
scan = max(scan, SWAP_CLUSTER_MAX);
@@ -2566,7 +2548,7 @@ static bool in_reclaim_compaction(struct scan_control *sc)
* Reclaim/compaction is used for high-order allocation requests. It reclaims
* order-0 pages before compacting the zone. should_continue_reclaim() returns
* true if more pages should be reclaimed such that when the page allocator
- * calls try_to_compact_zone() that it will have enough free pages to succeed.
+ * calls try_to_compact_pages() that it will have enough free pages to succeed.
* It will give up earlier than that if there is difficulty reclaiming pages.
*/
static inline bool should_continue_reclaim(struct pglist_data *pgdat,
@@ -2633,14 +2615,23 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
unsigned long reclaimed;
unsigned long scanned;
- switch (mem_cgroup_protected(target_memcg, memcg)) {
- case MEMCG_PROT_MIN:
+ /*
+ * This loop can become CPU-bound when target memcgs
+ * aren't eligible for reclaim - either because they
+ * don't have any reclaimable pages, or because their
+ * memory is explicitly protected. Avoid soft lockups.
+ */
+ cond_resched();
+
+ mem_cgroup_calculate_protection(target_memcg, memcg);
+
+ if (mem_cgroup_below_min(memcg)) {
/*
* Hard protection.
* If there is no reclaimable memory, OOM.
*/
continue;
- case MEMCG_PROT_LOW:
+ } else if (mem_cgroup_below_low(memcg)) {
/*
* Soft protection.
* Respect the protection only as long as
@@ -2652,16 +2643,6 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
continue;
}
memcg_memory_event(memcg, MEMCG_LOW);
- break;
- case MEMCG_PROT_NONE:
- /*
- * All protection thresholds breached. We may
- * still choose to vary the scan pressure
- * applied based on by how much the cgroup in
- * question has exceeded its protection
- * thresholds (see get_scan_count).
- */
- break;
}
reclaimed = sc->nr_reclaimed;
@@ -2697,13 +2678,24 @@ again:
nr_scanned = sc->nr_scanned;
/*
+ * Determine the scan balance between anon and file LRUs.
+ */
+ spin_lock_irq(&pgdat->lru_lock);
+ sc->anon_cost = target_lruvec->anon_cost;
+ sc->file_cost = target_lruvec->file_cost;
+ spin_unlock_irq(&pgdat->lru_lock);
+
+ /*
* Target desirable inactive:active list ratios for the anon
* and file LRU lists.
*/
if (!sc->force_deactivate) {
unsigned long refaults;
- if (inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
+ refaults = lruvec_page_state(target_lruvec,
+ WORKINGSET_ACTIVATE_ANON);
+ if (refaults != target_lruvec->refaults[0] ||
+ inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
sc->may_deactivate |= DEACTIVATE_ANON;
else
sc->may_deactivate &= ~DEACTIVATE_ANON;
@@ -2714,8 +2706,8 @@ again:
* rid of any stale active pages quickly.
*/
refaults = lruvec_page_state(target_lruvec,
- WORKINGSET_ACTIVATE);
- if (refaults != target_lruvec->refaults ||
+ WORKINGSET_ACTIVATE_FILE);
+ if (refaults != target_lruvec->refaults[1] ||
inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
sc->may_deactivate |= DEACTIVATE_FILE;
else
@@ -2814,7 +2806,7 @@ again:
set_bit(PGDAT_DIRTY, &pgdat->flags);
/*
- * If kswapd scans pages marked marked for immediate
+ * If kswapd scans pages marked for immediate
* reclaim and under writeback (nr_immediate), it
* implies that pages are cycling through the LRU
* faster than they are written so also forcibly stall.
@@ -2992,8 +2984,10 @@ static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat)
unsigned long refaults;
target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
- refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE);
- target_lruvec->refaults = refaults;
+ refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON);
+ target_lruvec->refaults[0] = refaults;
+ refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_FILE);
+ target_lruvec->refaults[1] = refaults;
}
/*
@@ -3131,8 +3125,8 @@ static bool allow_direct_reclaim(pg_data_t *pgdat)
/* kswapd must be awake if processes are being throttled */
if (!wmark_ok && waitqueue_active(&pgdat->kswapd_wait)) {
- if (READ_ONCE(pgdat->kswapd_classzone_idx) > ZONE_NORMAL)
- WRITE_ONCE(pgdat->kswapd_classzone_idx, ZONE_NORMAL);
+ if (READ_ONCE(pgdat->kswapd_highest_zoneidx) > ZONE_NORMAL)
+ WRITE_ONCE(pgdat->kswapd_highest_zoneidx, ZONE_NORMAL);
wake_up_interruptible(&pgdat->kswapd_wait);
}
@@ -3324,7 +3318,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
bool may_swap)
{
unsigned long nr_reclaimed;
- unsigned long pflags;
unsigned int noreclaim_flag;
struct scan_control sc = {
.nr_to_reclaim = max(nr_pages, SWAP_CLUSTER_MAX),
@@ -3345,17 +3338,12 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
struct zonelist *zonelist = node_zonelist(numa_node_id(), sc.gfp_mask);
set_task_reclaim_state(current, &sc.reclaim_state);
-
trace_mm_vmscan_memcg_reclaim_begin(0, sc.gfp_mask);
-
- psi_memstall_enter(&pflags);
noreclaim_flag = memalloc_noreclaim_save();
nr_reclaimed = do_try_to_free_pages(zonelist, &sc);
memalloc_noreclaim_restore(noreclaim_flag);
- psi_memstall_leave(&pflags);
-
trace_mm_vmscan_memcg_reclaim_end(nr_reclaimed);
set_task_reclaim_state(current, NULL);
@@ -3385,7 +3373,7 @@ static void age_active_anon(struct pglist_data *pgdat,
} while (memcg);
}
-static bool pgdat_watermark_boosted(pg_data_t *pgdat, int classzone_idx)
+static bool pgdat_watermark_boosted(pg_data_t *pgdat, int highest_zoneidx)
{
int i;
struct zone *zone;
@@ -3393,11 +3381,11 @@ static bool pgdat_watermark_boosted(pg_data_t *pgdat, int classzone_idx)
/*
* Check for watermark boosts top-down as the higher zones
* are more likely to be boosted. Both watermarks and boosts
- * should not be checked at the time time as reclaim would
+ * should not be checked at the same time as reclaim would
* start prematurely when there is no boosting and a lower
* zone is balanced.
*/
- for (i = classzone_idx; i >= 0; i--) {
+ for (i = highest_zoneidx; i >= 0; i--) {
zone = pgdat->node_zones + i;
if (!managed_zone(zone))
continue;
@@ -3411,9 +3399,9 @@ static bool pgdat_watermark_boosted(pg_data_t *pgdat, int classzone_idx)
/*
* Returns true if there is an eligible zone balanced for the request order
- * and classzone_idx
+ * and highest_zoneidx
*/
-static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
+static bool pgdat_balanced(pg_data_t *pgdat, int order, int highest_zoneidx)
{
int i;
unsigned long mark = -1;
@@ -3423,19 +3411,19 @@ static bool pgdat_balanced(pg_data_t *pgdat, int order, int classzone_idx)
* Check watermarks bottom-up as lower zones are more likely to
* meet watermarks.
*/
- for (i = 0; i <= classzone_idx; i++) {
+ for (i = 0; i <= highest_zoneidx; i++) {
zone = pgdat->node_zones + i;
if (!managed_zone(zone))
continue;
mark = high_wmark_pages(zone);
- if (zone_watermark_ok_safe(zone, order, mark, classzone_idx))
+ if (zone_watermark_ok_safe(zone, order, mark, highest_zoneidx))
return true;
}
/*
- * If a node has no populated zone within classzone_idx, it does not
+ * If a node has no populated zone within highest_zoneidx, it does not
* need balancing by definition. This can happen if a zone-restricted
* allocation tries to wake a remote kswapd.
*/
@@ -3461,7 +3449,8 @@ static void clear_pgdat_congested(pg_data_t *pgdat)
*
* Returns true if kswapd is ready to sleep
*/
-static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
+static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order,
+ int highest_zoneidx)
{
/*
* The throttled processes are normally woken up in balance_pgdat() as
@@ -3483,7 +3472,7 @@ static bool prepare_kswapd_sleep(pg_data_t *pgdat, int order, int classzone_idx)
if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES)
return true;
- if (pgdat_balanced(pgdat, order, classzone_idx)) {
+ if (pgdat_balanced(pgdat, order, highest_zoneidx)) {
clear_pgdat_congested(pgdat);
return true;
}
@@ -3547,7 +3536,7 @@ static bool kswapd_shrink_node(pg_data_t *pgdat,
* or lower is eligible for reclaim until at least one usable zone is
* balanced.
*/
-static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
+static int balance_pgdat(pg_data_t *pgdat, int order, int highest_zoneidx)
{
int i;
unsigned long nr_soft_reclaimed;
@@ -3575,7 +3564,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx)
* stall or direct reclaim until kswapd is finished.
*/
nr_boost_reclaim = 0;
- for (i = 0; i <= classzone_idx; i++) {
+ for (i = 0; i <= highest_zoneidx; i++) {
zone = pgdat->node_zones + i;
if (!managed_zone(zone))
continue;
@@ -3593,7 +3582,7 @@ restart:
bool balanced;
bool ret;
- sc.reclaim_idx = classzone_idx;
+ sc.reclaim_idx = highest_zoneidx;
/*
* If the number of buffer_heads exceeds the maximum allowed
@@ -3623,7 +3612,7 @@ restart:
* on the grounds that the normal reclaim should be enough to
* re-evaluate if boosting is required when kswapd next wakes.
*/
- balanced = pgdat_balanced(pgdat, sc.order, classzone_idx);
+ balanced = pgdat_balanced(pgdat, sc.order, highest_zoneidx);
if (!balanced && nr_boost_reclaim) {
nr_boost_reclaim = 0;
goto restart;
@@ -3723,7 +3712,7 @@ out:
if (boosted) {
unsigned long flags;
- for (i = 0; i <= classzone_idx; i++) {
+ for (i = 0; i <= highest_zoneidx; i++) {
if (!zone_boosts[i])
continue;
@@ -3738,7 +3727,7 @@ out:
* As there is now likely space, wakeup kcompact to defragment
* pageblocks.
*/
- wakeup_kcompactd(pgdat, pageblock_order, classzone_idx);
+ wakeup_kcompactd(pgdat, pageblock_order, highest_zoneidx);
}
snapshot_refaults(NULL, pgdat);
@@ -3756,22 +3745,22 @@ out:
}
/*
- * The pgdat->kswapd_classzone_idx is used to pass the highest zone index to be
- * reclaimed by kswapd from the waker. If the value is MAX_NR_ZONES which is not
- * a valid index then either kswapd runs for first time or kswapd couldn't sleep
- * after previous reclaim attempt (node is still unbalanced). In that case
- * return the zone index of the previous kswapd reclaim cycle.
+ * The pgdat->kswapd_highest_zoneidx is used to pass the highest zone index to
+ * be reclaimed by kswapd from the waker. If the value is MAX_NR_ZONES which is
+ * not a valid index then either kswapd runs for first time or kswapd couldn't
+ * sleep after previous reclaim attempt (node is still unbalanced). In that
+ * case return the zone index of the previous kswapd reclaim cycle.
*/
-static enum zone_type kswapd_classzone_idx(pg_data_t *pgdat,
- enum zone_type prev_classzone_idx)
+static enum zone_type kswapd_highest_zoneidx(pg_data_t *pgdat,
+ enum zone_type prev_highest_zoneidx)
{
- enum zone_type curr_idx = READ_ONCE(pgdat->kswapd_classzone_idx);
+ enum zone_type curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx);
- return curr_idx == MAX_NR_ZONES ? prev_classzone_idx : curr_idx;
+ return curr_idx == MAX_NR_ZONES ? prev_highest_zoneidx : curr_idx;
}
static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_order,
- unsigned int classzone_idx)
+ unsigned int highest_zoneidx)
{
long remaining = 0;
DEFINE_WAIT(wait);
@@ -3788,7 +3777,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
* eligible zone balanced that it's also unlikely that compaction will
* succeed.
*/
- if (prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) {
+ if (prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
/*
* Compaction records what page blocks it recently failed to
* isolate pages from and skips them in the future scanning.
@@ -3801,18 +3790,19 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
* We have freed the memory, now we should compact it to make
* allocation of the requested order possible.
*/
- wakeup_kcompactd(pgdat, alloc_order, classzone_idx);
+ wakeup_kcompactd(pgdat, alloc_order, highest_zoneidx);
remaining = schedule_timeout(HZ/10);
/*
- * If woken prematurely then reset kswapd_classzone_idx and
+ * If woken prematurely then reset kswapd_highest_zoneidx and
* order. The values will either be from a wakeup request or
* the previous request that slept prematurely.
*/
if (remaining) {
- WRITE_ONCE(pgdat->kswapd_classzone_idx,
- kswapd_classzone_idx(pgdat, classzone_idx));
+ WRITE_ONCE(pgdat->kswapd_highest_zoneidx,
+ kswapd_highest_zoneidx(pgdat,
+ highest_zoneidx));
if (READ_ONCE(pgdat->kswapd_order) < reclaim_order)
WRITE_ONCE(pgdat->kswapd_order, reclaim_order);
@@ -3827,7 +3817,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
* go fully to sleep until explicitly woken up.
*/
if (!remaining &&
- prepare_kswapd_sleep(pgdat, reclaim_order, classzone_idx)) {
+ prepare_kswapd_sleep(pgdat, reclaim_order, highest_zoneidx)) {
trace_mm_vmscan_kswapd_sleep(pgdat->node_id);
/*
@@ -3869,7 +3859,7 @@ static void kswapd_try_to_sleep(pg_data_t *pgdat, int alloc_order, int reclaim_o
static int kswapd(void *p)
{
unsigned int alloc_order, reclaim_order;
- unsigned int classzone_idx = MAX_NR_ZONES - 1;
+ unsigned int highest_zoneidx = MAX_NR_ZONES - 1;
pg_data_t *pgdat = (pg_data_t*)p;
struct task_struct *tsk = current;
const struct cpumask *cpumask = cpumask_of_node(pgdat->node_id);
@@ -3893,22 +3883,24 @@ static int kswapd(void *p)
set_freezable();
WRITE_ONCE(pgdat->kswapd_order, 0);
- WRITE_ONCE(pgdat->kswapd_classzone_idx, MAX_NR_ZONES);
+ WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
for ( ; ; ) {
bool ret;
alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order);
- classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);
+ highest_zoneidx = kswapd_highest_zoneidx(pgdat,
+ highest_zoneidx);
kswapd_try_sleep:
kswapd_try_to_sleep(pgdat, alloc_order, reclaim_order,
- classzone_idx);
+ highest_zoneidx);
- /* Read the new order and classzone_idx */
+ /* Read the new order and highest_zoneidx */
alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order);
- classzone_idx = kswapd_classzone_idx(pgdat, classzone_idx);
+ highest_zoneidx = kswapd_highest_zoneidx(pgdat,
+ highest_zoneidx);
WRITE_ONCE(pgdat->kswapd_order, 0);
- WRITE_ONCE(pgdat->kswapd_classzone_idx, MAX_NR_ZONES);
+ WRITE_ONCE(pgdat->kswapd_highest_zoneidx, MAX_NR_ZONES);
ret = try_to_freeze();
if (kthread_should_stop())
@@ -3929,9 +3921,10 @@ kswapd_try_sleep:
* but kcompactd is woken to compact for the original
* request (alloc_order).
*/
- trace_mm_vmscan_kswapd_wake(pgdat->node_id, classzone_idx,
+ trace_mm_vmscan_kswapd_wake(pgdat->node_id, highest_zoneidx,
alloc_order);
- reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx);
+ reclaim_order = balance_pgdat(pgdat, alloc_order,
+ highest_zoneidx);
if (reclaim_order < alloc_order)
goto kswapd_try_sleep;
}
@@ -3949,7 +3942,7 @@ kswapd_try_sleep:
* needed.
*/
void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
- enum zone_type classzone_idx)
+ enum zone_type highest_zoneidx)
{
pg_data_t *pgdat;
enum zone_type curr_idx;
@@ -3961,10 +3954,10 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
return;
pgdat = zone->zone_pgdat;
- curr_idx = READ_ONCE(pgdat->kswapd_classzone_idx);
+ curr_idx = READ_ONCE(pgdat->kswapd_highest_zoneidx);
- if (curr_idx == MAX_NR_ZONES || curr_idx < classzone_idx)
- WRITE_ONCE(pgdat->kswapd_classzone_idx, classzone_idx);
+ if (curr_idx == MAX_NR_ZONES || curr_idx < highest_zoneidx)
+ WRITE_ONCE(pgdat->kswapd_highest_zoneidx, highest_zoneidx);
if (READ_ONCE(pgdat->kswapd_order) < order)
WRITE_ONCE(pgdat->kswapd_order, order);
@@ -3974,8 +3967,8 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
/* Hopeless node, leave it to direct reclaim if possible */
if (pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ||
- (pgdat_balanced(pgdat, order, classzone_idx) &&
- !pgdat_watermark_boosted(pgdat, classzone_idx))) {
+ (pgdat_balanced(pgdat, order, highest_zoneidx) &&
+ !pgdat_watermark_boosted(pgdat, highest_zoneidx))) {
/*
* There may be plenty of free memory available, but it's too
* fragmented for high-order allocations. Wake up kcompactd
@@ -3984,11 +3977,11 @@ void wakeup_kswapd(struct zone *zone, gfp_t gfp_flags, int order,
* ratelimit its work.
*/
if (!(gfp_flags & __GFP_DIRECT_RECLAIM))
- wakeup_kcompactd(pgdat, order, classzone_idx);
+ wakeup_kcompactd(pgdat, order, highest_zoneidx);
return;
}
- trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, classzone_idx, order,
+ trace_mm_vmscan_wakeup_kswapd(pgdat->node_id, highest_zoneidx, order,
gfp_flags);
wake_up_interruptible(&pgdat->kswapd_wait);
}
@@ -4223,7 +4216,8 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
* unmapped file backed pages.
*/
if (node_pagecache_reclaimable(pgdat) <= pgdat->min_unmapped_pages &&
- node_page_state(pgdat, NR_SLAB_RECLAIMABLE) <= pgdat->min_slab_pages)
+ node_page_state_pages(pgdat, NR_SLAB_RECLAIMABLE_B) <=
+ pgdat->min_slab_pages)
return NODE_RECLAIM_FULL;
/*
@@ -4274,8 +4268,14 @@ void check_move_unevictable_pages(struct pagevec *pvec)
for (i = 0; i < pvec->nr; i++) {
struct page *page = pvec->pages[i];
struct pglist_data *pagepgdat = page_pgdat(page);
+ int nr_pages;
+
+ if (PageTransTail(page))
+ continue;
+
+ nr_pages = thp_nr_pages(page);
+ pgscanned += nr_pages;
- pgscanned++;
if (pagepgdat != pgdat) {
if (pgdat)
spin_unlock_irq(&pgdat->lru_lock);
@@ -4294,7 +4294,7 @@ void check_move_unevictable_pages(struct pagevec *pvec)
ClearPageUnevictable(page);
del_page_from_lru_list(page, lruvec, LRU_UNEVICTABLE);
add_page_to_lru_list(page, lruvec, lru);
- pgrescued++;
+ pgrescued += nr_pages;
}
}
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 96d21a792b57..4f7b4ee6aa12 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -76,7 +76,7 @@ static void invalid_numa_statistics(void)
static DEFINE_MUTEX(vm_numa_stat_lock);
int sysctl_vm_numa_stat_handler(struct ctl_table *table, int write,
- void __user *buffer, size_t *length, loff_t *ppos)
+ void *buffer, size_t *length, loff_t *ppos)
{
int ret, oldval;
@@ -341,6 +341,11 @@ void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
long x;
long t;
+ if (vmstat_item_in_bytes(item)) {
+ VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
+ delta >>= PAGE_SHIFT;
+ }
+
x = delta + __this_cpu_read(*p);
t = __this_cpu_read(pcp->stat_threshold);
@@ -398,6 +403,8 @@ void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
s8 __percpu *p = pcp->vm_node_stat_diff + item;
s8 v, t;
+ VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
+
v = __this_cpu_inc_return(*p);
t = __this_cpu_read(pcp->stat_threshold);
if (unlikely(v > t)) {
@@ -442,6 +449,8 @@ void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
s8 __percpu *p = pcp->vm_node_stat_diff + item;
s8 v, t;
+ VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
+
v = __this_cpu_dec_return(*p);
t = __this_cpu_read(pcp->stat_threshold);
if (unlikely(v < - t)) {
@@ -541,6 +550,11 @@ static inline void mod_node_state(struct pglist_data *pgdat,
s8 __percpu *p = pcp->vm_node_stat_diff + item;
long o, n, t, z;
+ if (vmstat_item_in_bytes(item)) {
+ VM_WARN_ON_ONCE(delta & (PAGE_SIZE - 1));
+ delta >>= PAGE_SHIFT;
+ }
+
do {
z = 0; /* overflow to node counters */
@@ -989,8 +1003,8 @@ unsigned long sum_zone_numa_state(int node,
/*
* Determine the per node value of a stat item.
*/
-unsigned long node_page_state(struct pglist_data *pgdat,
- enum node_stat_item item)
+unsigned long node_page_state_pages(struct pglist_data *pgdat,
+ enum node_stat_item item)
{
long x = atomic_long_read(&pgdat->vm_stat[item]);
#ifdef CONFIG_SMP
@@ -999,6 +1013,14 @@ unsigned long node_page_state(struct pglist_data *pgdat,
#endif
return x;
}
+
+unsigned long node_page_state(struct pglist_data *pgdat,
+ enum node_stat_item item)
+{
+ VM_WARN_ON_ONCE(vmstat_item_in_bytes(item));
+
+ return node_page_state_pages(pgdat, item);
+}
#endif
#ifdef CONFIG_COMPACTION
@@ -1074,6 +1096,24 @@ static int __fragmentation_index(unsigned int order, struct contig_page_info *in
return 1000 - div_u64( (1000+(div_u64(info->free_pages * 1000ULL, requested))), info->free_blocks_total);
}
+/*
+ * Calculates external fragmentation within a zone wrt the given order.
+ * It is defined as the percentage of pages found in blocks of size
+ * less than 1 << order. It returns values in range [0, 100].
+ */
+unsigned int extfrag_for_order(struct zone *zone, unsigned int order)
+{
+ struct contig_page_info info;
+
+ fill_contig_page_info(zone, order, &info);
+ if (info.free_pages == 0)
+ return 0;
+
+ return div_u64((info.free_pages -
+ (info.free_blocks_suitable << order)) * 100,
+ info.free_pages);
+}
+
/* Same as __fragmentation index but allocs contig_page_info on stack */
int fragmentation_index(struct zone *zone, unsigned int order)
{
@@ -1108,7 +1148,7 @@ int fragmentation_index(struct zone *zone, unsigned int order)
TEXT_FOR_HIGHMEM(xx) xx "_movable",
const char * const vmstat_text[] = {
- /* enum zone_stat_item countes */
+ /* enum zone_stat_item counters */
"nr_free_pages",
"nr_zone_inactive_anon",
"nr_zone_active_anon",
@@ -1118,7 +1158,6 @@ const char * const vmstat_text[] = {
"nr_zone_write_pending",
"nr_mlock",
"nr_page_table_pages",
- "nr_kernel_stack",
"nr_bounce",
#if IS_ENABLED(CONFIG_ZSMALLOC)
"nr_zspages",
@@ -1146,9 +1185,12 @@ const char * const vmstat_text[] = {
"nr_isolated_anon",
"nr_isolated_file",
"workingset_nodes",
- "workingset_refault",
- "workingset_activate",
- "workingset_restore",
+ "workingset_refault_anon",
+ "workingset_refault_file",
+ "workingset_activate_anon",
+ "workingset_activate_file",
+ "workingset_restore_anon",
+ "workingset_restore_file",
"workingset_nodereclaim",
"nr_anon_pages",
"nr_mapped",
@@ -1162,7 +1204,6 @@ const char * const vmstat_text[] = {
"nr_file_hugepages",
"nr_file_pmdmapped",
"nr_anon_transparent_hugepages",
- "nr_unstable",
"nr_vmscan_write",
"nr_vmscan_immediate_reclaim",
"nr_dirtied",
@@ -1170,6 +1211,10 @@ const char * const vmstat_text[] = {
"nr_kernel_misc_reclaimable",
"nr_foll_pin_acquired",
"nr_foll_pin_released",
+ "nr_kernel_stack",
+#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
+ "nr_shadow_call_stack",
+#endif
/* enum writeback_stat_item counters */
"nr_dirty_threshold",
@@ -1196,11 +1241,16 @@ const char * const vmstat_text[] = {
"pglazyfreed",
"pgrefill",
+ "pgreuse",
"pgsteal_kswapd",
"pgsteal_direct",
"pgscan_kswapd",
"pgscan_direct",
"pgscan_direct_throttle",
+ "pgscan_anon",
+ "pgscan_file",
+ "pgsteal_anon",
+ "pgsteal_file",
#ifdef CONFIG_NUMA
"zone_reclaim_failed",
@@ -1228,6 +1278,9 @@ const char * const vmstat_text[] = {
#ifdef CONFIG_MIGRATION
"pgmigrate_success",
"pgmigrate_fail",
+ "thp_migration_success",
+ "thp_migration_fail",
+ "thp_migration_split",
#endif
#ifdef CONFIG_COMPACTION
"compact_migrate_scanned",
@@ -1571,7 +1624,7 @@ static void zoneinfo_show_print(struct seq_file *m, pg_data_t *pgdat,
seq_printf(m, "\n per-node stats");
for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
seq_printf(m, "\n %-12s %lu", node_stat_name(i),
- node_page_state(pgdat, i));
+ node_page_state_pages(pgdat, i));
}
}
seq_printf(m,
@@ -1692,7 +1745,7 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
#endif
for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
- v[i] = global_node_page_state(i);
+ v[i] = global_node_page_state_pages(i);
v += NR_VM_NODE_STAT_ITEMS;
global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
@@ -1723,6 +1776,14 @@ static int vmstat_show(struct seq_file *m, void *arg)
seq_puts(m, vmstat_text[off]);
seq_put_decimal_ull(m, " ", *l);
seq_putc(m, '\n');
+
+ if (off == NR_VMSTAT_ITEMS - 1) {
+ /*
+ * We've come to the end - add any deprecated counters to avoid
+ * breaking userspace which might depend on them being present.
+ */
+ seq_puts(m, "nr_unstable 0\n");
+ }
return 0;
}
@@ -1751,7 +1812,7 @@ static void refresh_vm_stats(struct work_struct *work)
}
int vmstat_refresh(struct ctl_table *table, int write,
- void __user *buffer, size_t *lenp, loff_t *ppos)
+ void *buffer, size_t *lenp, loff_t *ppos)
{
long val;
int err;
@@ -2055,24 +2116,14 @@ static int unusable_show(struct seq_file *m, void *arg)
return 0;
}
-static const struct seq_operations unusable_op = {
+static const struct seq_operations unusable_sops = {
.start = frag_start,
.next = frag_next,
.stop = frag_stop,
.show = unusable_show,
};
-static int unusable_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &unusable_op);
-}
-
-static const struct file_operations unusable_file_ops = {
- .open = unusable_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
+DEFINE_SEQ_ATTRIBUTE(unusable);
static void extfrag_show_print(struct seq_file *m,
pg_data_t *pgdat, struct zone *zone)
@@ -2107,24 +2158,14 @@ static int extfrag_show(struct seq_file *m, void *arg)
return 0;
}
-static const struct seq_operations extfrag_op = {
+static const struct seq_operations extfrag_sops = {
.start = frag_start,
.next = frag_next,
.stop = frag_stop,
.show = extfrag_show,
};
-static int extfrag_open(struct inode *inode, struct file *file)
-{
- return seq_open(file, &extfrag_op);
-}
-
-static const struct file_operations extfrag_file_ops = {
- .open = extfrag_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = seq_release,
-};
+DEFINE_SEQ_ATTRIBUTE(extfrag);
static int __init extfrag_debug_init(void)
{
@@ -2133,10 +2174,10 @@ static int __init extfrag_debug_init(void)
extfrag_debug_root = debugfs_create_dir("extfrag", NULL);
debugfs_create_file("unusable_index", 0444, extfrag_debug_root, NULL,
- &unusable_file_ops);
+ &unusable_fops);
debugfs_create_file("extfrag_index", 0444, extfrag_debug_root, NULL,
- &extfrag_file_ops);
+ &extfrag_fops);
return 0;
}
diff --git a/mm/workingset.c b/mm/workingset.c
index 474186b76ced..92e66113a577 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -6,6 +6,7 @@
*/
#include <linux/memcontrol.h>
+#include <linux/mm_inline.h>
#include <linux/writeback.h>
#include <linux/shmem_fs.h>
#include <linux/pagemap.h>
@@ -156,8 +157,8 @@
*
* Implementation
*
- * For each node's file LRU lists, a counter for inactive evictions
- * and activations is maintained (node->inactive_age).
+ * For each node's LRU lists, a counter for inactive evictions and
+ * activations is maintained (node->nonresident_age).
*
* On eviction, a snapshot of this counter (along with some bits to
* identify the node) is stored in the now empty page cache
@@ -213,7 +214,17 @@ static void unpack_shadow(void *shadow, int *memcgidp, pg_data_t **pgdat,
*workingsetp = workingset;
}
-static void advance_inactive_age(struct mem_cgroup *memcg, pg_data_t *pgdat)
+/**
+ * workingset_age_nonresident - age non-resident entries as LRU ages
+ * @lruvec: the lruvec that was aged
+ * @nr_pages: the number of pages to count
+ *
+ * As in-memory pages are aged, non-resident pages need to be aged as
+ * well, in order for the refault distances later on to be comparable
+ * to the in-memory dimensions. This function allows reclaim and LRU
+ * operations to drive the non-resident aging along in parallel.
+ */
+void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages)
{
/*
* Reclaiming a cgroup means reclaiming all its children in a
@@ -227,11 +238,8 @@ static void advance_inactive_age(struct mem_cgroup *memcg, pg_data_t *pgdat)
* the root cgroup's, age as well.
*/
do {
- struct lruvec *lruvec;
-
- lruvec = mem_cgroup_lruvec(memcg, pgdat);
- atomic_long_inc(&lruvec->inactive_age);
- } while (memcg && (memcg = parent_mem_cgroup(memcg)));
+ atomic_long_add(nr_pages, &lruvec->nonresident_age);
+ } while ((lruvec = parent_lruvec(lruvec)));
}
/**
@@ -254,12 +262,11 @@ void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg)
VM_BUG_ON_PAGE(page_count(page), page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
- advance_inactive_age(page_memcg(page), pgdat);
-
lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
+ workingset_age_nonresident(lruvec, thp_nr_pages(page));
/* XXX: target_memcg can be NULL, go through lruvec */
memcgid = mem_cgroup_id(lruvec_memcg(lruvec));
- eviction = atomic_long_read(&lruvec->inactive_age);
+ eviction = atomic_long_read(&lruvec->nonresident_age);
return pack_shadow(memcgid, pgdat, eviction, PageWorkingset(page));
}
@@ -274,11 +281,12 @@ void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg)
*/
void workingset_refault(struct page *page, void *shadow)
{
+ bool file = page_is_file_lru(page);
struct mem_cgroup *eviction_memcg;
struct lruvec *eviction_lruvec;
unsigned long refault_distance;
+ unsigned long workingset_size;
struct pglist_data *pgdat;
- unsigned long active_file;
struct mem_cgroup *memcg;
unsigned long eviction;
struct lruvec *lruvec;
@@ -309,21 +317,20 @@ void workingset_refault(struct page *page, void *shadow)
if (!mem_cgroup_disabled() && !eviction_memcg)
goto out;
eviction_lruvec = mem_cgroup_lruvec(eviction_memcg, pgdat);
- refault = atomic_long_read(&eviction_lruvec->inactive_age);
- active_file = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE);
+ refault = atomic_long_read(&eviction_lruvec->nonresident_age);
/*
* Calculate the refault distance
*
* The unsigned subtraction here gives an accurate distance
- * across inactive_age overflows in most cases. There is a
+ * across nonresident_age overflows in most cases. There is a
* special case: usually, shadow entries have a short lifetime
* and are either refaulted or reclaimed along with the inode
* before they get too old. But it is not impossible for the
- * inactive_age to lap a shadow entry in the field, which can
- * then result in a false small refault distance, leading to a
- * false activation should this old entry actually refault
- * again. However, earlier kernels used to deactivate
+ * nonresident_age to lap a shadow entry in the field, which
+ * can then result in a false small refault distance, leading
+ * to a false activation should this old entry actually
+ * refault again. However, earlier kernels used to deactivate
* unconditionally with *every* reclaim invocation for the
* longest time, so the occasional inappropriate activation
* leading to pressure on the active list is not a problem.
@@ -341,24 +348,43 @@ void workingset_refault(struct page *page, void *shadow)
memcg = page_memcg(page);
lruvec = mem_cgroup_lruvec(memcg, pgdat);
- inc_lruvec_state(lruvec, WORKINGSET_REFAULT);
+ inc_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file);
/*
* Compare the distance to the existing workingset size. We
- * don't act on pages that couldn't stay resident even if all
- * the memory was available to the page cache.
+ * don't activate pages that couldn't stay resident even if
+ * all the memory was available to the workingset. Whether
+ * workingset competition needs to consider anon or not depends
+ * on having swap.
*/
- if (refault_distance > active_file)
+ workingset_size = lruvec_page_state(eviction_lruvec, NR_ACTIVE_FILE);
+ if (!file) {
+ workingset_size += lruvec_page_state(eviction_lruvec,
+ NR_INACTIVE_FILE);
+ }
+ if (mem_cgroup_get_nr_swap_pages(memcg) > 0) {
+ workingset_size += lruvec_page_state(eviction_lruvec,
+ NR_ACTIVE_ANON);
+ if (file) {
+ workingset_size += lruvec_page_state(eviction_lruvec,
+ NR_INACTIVE_ANON);
+ }
+ }
+ if (refault_distance > workingset_size)
goto out;
SetPageActive(page);
- advance_inactive_age(memcg, pgdat);
- inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE);
+ workingset_age_nonresident(lruvec, thp_nr_pages(page));
+ inc_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + file);
/* Page was active prior to eviction */
if (workingset) {
SetPageWorkingset(page);
- inc_lruvec_state(lruvec, WORKINGSET_RESTORE);
+ /* XXX: Move to lru_cache_add() when it supports new vs putback */
+ spin_lock_irq(&page_pgdat(page)->lru_lock);
+ lru_note_cost_page(page);
+ spin_unlock_irq(&page_pgdat(page)->lru_lock);
+ inc_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file);
}
out:
rcu_read_unlock();
@@ -371,6 +397,7 @@ out:
void workingset_activation(struct page *page)
{
struct mem_cgroup *memcg;
+ struct lruvec *lruvec;
rcu_read_lock();
/*
@@ -383,7 +410,8 @@ void workingset_activation(struct page *page)
memcg = page_memcg_rcu(page);
if (!mem_cgroup_disabled() && !memcg)
goto out;
- advance_inactive_age(memcg, page_pgdat(page));
+ lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
+ workingset_age_nonresident(lruvec, thp_nr_pages(page));
out:
rcu_read_unlock();
}
@@ -467,8 +495,10 @@ static unsigned long count_shadow_nodes(struct shrinker *shrinker,
for (pages = 0, i = 0; i < NR_LRU_LISTS; i++)
pages += lruvec_page_state_local(lruvec,
NR_LRU_BASE + i);
- pages += lruvec_page_state_local(lruvec, NR_SLAB_RECLAIMABLE);
- pages += lruvec_page_state_local(lruvec, NR_SLAB_UNRECLAIMABLE);
+ pages += lruvec_page_state_local(
+ lruvec, NR_SLAB_RECLAIMABLE_B) >> PAGE_SHIFT;
+ pages += lruvec_page_state_local(
+ lruvec, NR_SLAB_UNRECLAIMABLE_B) >> PAGE_SHIFT;
} else
#endif
pages = node_present_pages(sc->nid);
diff --git a/mm/zbud.c b/mm/zbud.c
index de5dd4ddaa82..bc93aa4e46fc 100644
--- a/mm/zbud.c
+++ b/mm/zbud.c
@@ -243,7 +243,7 @@ static struct zbud_header *init_zbud_page(struct page *page)
zhdr->last_chunks = 0;
INIT_LIST_HEAD(&zhdr->buddy);
INIT_LIST_HEAD(&zhdr->lru);
- zhdr->under_reclaim = 0;
+ zhdr->under_reclaim = false;
return zhdr;
}
diff --git a/mm/zpool.c b/mm/zpool.c
index 863669212070..3744a2d1a624 100644
--- a/mm/zpool.c
+++ b/mm/zpool.c
@@ -239,15 +239,15 @@ const char *zpool_get_type(struct zpool *zpool)
}
/**
- * zpool_malloc_support_movable() - Check if the zpool support
- * allocate movable memory
+ * zpool_malloc_support_movable() - Check if the zpool supports
+ * allocating movable memory
* @zpool: The zpool to check
*
- * This returns if the zpool support allocate movable memory.
+ * This returns if the zpool supports allocating movable memory.
*
* Implementations must guarantee this to be thread-safe.
*
- * Returns: true if if the zpool support allocate movable memory, false if not
+ * Returns: true if the zpool supports allocating movable memory, false if not
*/
bool zpool_malloc_support_movable(struct zpool *zpool)
{
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 2f836a2b993f..c36fdff9a371 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -39,8 +39,8 @@
#include <linux/highmem.h>
#include <linux/string.h>
#include <linux/slab.h>
+#include <linux/pgtable.h>
#include <asm/tlbflush.h>
-#include <asm/pgtable.h>
#include <linux/cpumask.h>
#include <linux/cpu.h>
#include <linux/vmalloc.h>
@@ -79,7 +79,7 @@
/*
* Object location (<PFN>, <obj_idx>) is encoded as
- * as single (unsigned long) handle value.
+ * a single (unsigned long) handle value.
*
* Note that object index <obj_idx> starts from 0.
*
@@ -293,7 +293,7 @@ struct zspage {
};
struct mapping_area {
-#ifdef CONFIG_PGTABLE_MAPPING
+#ifdef CONFIG_ZSMALLOC_PGTABLE_MAPPING
struct vm_struct *vm; /* vm area for mapping object that span pages */
#else
char *vm_buf; /* copy buffer for objects that span pages */
@@ -1113,7 +1113,7 @@ static struct zspage *find_get_zspage(struct size_class *class)
return zspage;
}
-#ifdef CONFIG_PGTABLE_MAPPING
+#ifdef CONFIG_ZSMALLOC_PGTABLE_MAPPING
static inline int __zs_cpu_up(struct mapping_area *area)
{
/*
@@ -1138,7 +1138,9 @@ static inline void __zs_cpu_down(struct mapping_area *area)
static inline void *__zs_map_object(struct mapping_area *area,
struct page *pages[2], int off, int size)
{
- BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, pages));
+ unsigned long addr = (unsigned long)area->vm->addr;
+
+ BUG_ON(map_kernel_range(addr, PAGE_SIZE * 2, PAGE_KERNEL, pages) < 0);
area->vm_addr = area->vm->addr;
return area->vm_addr + off;
}
@@ -1151,7 +1153,7 @@ static inline void __zs_unmap_object(struct mapping_area *area,
unmap_kernel_range(addr, PAGE_SIZE * 2);
}
-#else /* CONFIG_PGTABLE_MAPPING */
+#else /* CONFIG_ZSMALLOC_PGTABLE_MAPPING */
static inline int __zs_cpu_up(struct mapping_area *area)
{
@@ -1233,7 +1235,7 @@ out:
pagefault_enable();
}
-#endif /* CONFIG_PGTABLE_MAPPING */
+#endif /* CONFIG_ZSMALLOC_PGTABLE_MAPPING */
static int zs_cpu_prepare(unsigned int cpu)
{