summaryrefslogtreecommitdiffstats
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig55
-rw-r--r--mm/Kconfig.debug28
-rw-r--r--mm/Makefile4
-rw-r--r--mm/backing-dev.c8
-rw-r--r--mm/cma.c6
-rw-r--r--mm/compaction.c123
-rw-r--r--mm/debug.c4
-rw-r--r--mm/filemap.c631
-rw-r--r--mm/gup.c384
-rw-r--r--mm/gup_test.c (renamed from mm/gup_benchmark.c)111
-rw-r--r--mm/gup_test.h32
-rw-r--r--mm/highmem.c324
-rw-r--r--mm/huge_memory.c113
-rw-r--r--mm/hugetlb.c29
-rw-r--r--mm/hugetlb_cgroup.c8
-rw-r--r--mm/init-mm.c1
-rw-r--r--mm/internal.h5
-rw-r--r--mm/kasan/Makefile25
-rw-r--r--mm/kasan/common.c822
-rw-r--r--mm/kasan/generic.c75
-rw-r--r--mm/kasan/generic_report.c165
-rw-r--r--mm/kasan/hw_tags.c204
-rw-r--r--mm/kasan/init.c17
-rw-r--r--mm/kasan/kasan.h173
-rw-r--r--mm/kasan/quarantine.c68
-rw-r--r--mm/kasan/report.c317
-rw-r--r--mm/kasan/report_generic.c327
-rw-r--r--mm/kasan/report_hw_tags.c42
-rw-r--r--mm/kasan/report_sw_tags.c (renamed from mm/kasan/tags_report.c)29
-rw-r--r--mm/kasan/shadow.c504
-rw-r--r--mm/kasan/sw_tags.c (renamed from mm/kasan/tags.c)39
-rw-r--r--mm/khugepaged.c60
-rw-r--r--mm/ksm.c50
-rw-r--r--mm/list_lru.c10
-rw-r--r--mm/madvise.c17
-rw-r--r--mm/mapping_dirty_helpers.c6
-rw-r--r--mm/memblock.c85
-rw-r--r--mm/memcontrol.c415
-rw-r--r--mm/memory-failure.c226
-rw-r--r--mm/memory.c60
-rw-r--r--mm/memory_hotplug.c139
-rw-r--r--mm/mempolicy.c8
-rw-r--r--mm/mempool.c4
-rw-r--r--mm/migrate.c185
-rw-r--r--mm/mlock.c63
-rw-r--r--mm/mm_init.c1
-rw-r--r--mm/mmap.c50
-rw-r--r--mm/mmap_lock.c230
-rw-r--r--mm/mmu_notifier.c7
-rw-r--r--mm/mmzone.c15
-rw-r--r--mm/mprotect.c7
-rw-r--r--mm/mremap.c280
-rw-r--r--mm/nommu.c8
-rw-r--r--mm/oom_kill.c14
-rw-r--r--mm/page_alloc.c487
-rw-r--r--mm/page_counter.c4
-rw-r--r--mm/page_ext.c12
-rw-r--r--mm/page_idle.c4
-rw-r--r--mm/page_io.c6
-rw-r--r--mm/page_isolation.c12
-rw-r--r--mm/page_owner.c17
-rw-r--r--mm/page_poison.c58
-rw-r--r--mm/page_vma_mapped.c9
-rw-r--r--mm/process_vm_access.c2
-rw-r--r--mm/ptdump.c13
-rw-r--r--mm/rmap.c21
-rw-r--r--mm/shmem.c39
-rw-r--r--mm/slab.c10
-rw-r--r--mm/slab.h87
-rw-r--r--mm/slab_common.c15
-rw-r--r--mm/slob.c6
-rw-r--r--mm/slub.c189
-rw-r--r--mm/swap.c220
-rw-r--r--mm/swap_state.c7
-rw-r--r--mm/swapfile.c29
-rw-r--r--mm/truncate.c12
-rw-r--r--mm/util.c12
-rw-r--r--mm/vmalloc.c105
-rw-r--r--mm/vmscan.c228
-rw-r--r--mm/vmstat.c6
-rw-r--r--mm/workingset.c12
-rw-r--r--mm/z3fold.c191
-rw-r--r--mm/zsmalloc.c65
-rw-r--r--mm/zswap.c189
84 files changed, 5048 insertions, 3632 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index d42423f884a7..f730605b8dcf 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -707,45 +707,30 @@ config ZSMALLOC
returned by an alloc(). This handle must be mapped in order to
access the allocated space.
-config ZSMALLOC_PGTABLE_MAPPING
- bool "Use page table mapping to access object in zsmalloc"
- depends on ZSMALLOC=y
- help
- By default, zsmalloc uses a copy-based object mapping method to
- access allocations that span two pages. However, if a particular
- architecture (ex, ARM) performs VM mapping faster than copying,
- then you should select this. This causes zsmalloc to use page table
- mapping rather than copying for object mapping.
-
- You can check speed with zsmalloc benchmark:
- https://github.com/spartacus06/zsmapbench
-
config ZSMALLOC_STAT
bool "Export zsmalloc statistics"
depends on ZSMALLOC
select DEBUG_FS
help
This option enables code in the zsmalloc to collect various
- statistics about whats happening in zsmalloc and exports that
+ statistics about what's happening in zsmalloc and exports that
information to userspace via debugfs.
If unsure, say N.
config GENERIC_EARLY_IOREMAP
bool
-config MAX_STACK_SIZE_MB
- int "Maximum user stack size for 32-bit processes (MB)"
- default 80
+config STACK_MAX_DEFAULT_SIZE_MB
+ int "Default maximum user stack size for 32-bit processes (MB)"
+ default 100
range 8 2048
depends on STACK_GROWSUP && (!64BIT || COMPAT)
help
This is the maximum stack size in Megabytes in the VM layout of 32-bit
user processes when the stack grows upwards (currently only on parisc
- arch). The stack will be located at the highest memory address minus
- the given value, unless the RLIMIT_STACK hard limit is changed to a
- smaller value in which case that is used.
+ arch) when the RLIMIT_STACK hard limit is unlimited.
- A sane initial value is 80 MB.
+ A sane initial value is 100 MB.
config DEFERRED_STRUCT_PAGE_INIT
bool "Defer initialisation of struct pages to kthreads"
@@ -834,13 +819,28 @@ config PERCPU_STATS
information includes global and per chunk statistics, which can
be used to help understand percpu memory usage.
-config GUP_BENCHMARK
- bool "Enable infrastructure for get_user_pages() and related calls benchmarking"
+config GUP_TEST
+ bool "Enable infrastructure for get_user_pages()-related unit tests"
+ depends on DEBUG_FS
help
- Provides /sys/kernel/debug/gup_benchmark that helps with testing
- performance of get_user_pages() and related calls.
+ Provides /sys/kernel/debug/gup_test, which in turn provides a way
+ to make ioctl calls that can launch kernel-based unit tests for
+ the get_user_pages*() and pin_user_pages*() family of API calls.
+
+ These tests include benchmark testing of the _fast variants of
+ get_user_pages*() and pin_user_pages*(), as well as smoke tests of
+ the non-_fast variants.
- See tools/testing/selftests/vm/gup_benchmark.c
+ There is also a sub-test that allows running dump_page() on any
+ of up to eight pages (selected by command line args) within the
+ range of user-space addresses. These pages are either pinned via
+ pin_user_pages*(), or pinned via get_user_pages*(), as specified
+ by other command line arguments.
+
+ See tools/testing/selftests/vm/gup_test.c
+
+comment "GUP_TEST needs to have DEBUG_FS enabled"
+ depends on !GUP_TEST && !DEBUG_FS
config GUP_GET_PTE_LOW_HIGH
bool
@@ -872,4 +872,7 @@ config ARCH_HAS_HUGEPD
config MAPPING_DIRTY_HELPERS
bool
+config KMAP_LOCAL
+ bool
+
endmenu
diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug
index 864f129f1937..1e73717802f8 100644
--- a/mm/Kconfig.debug
+++ b/mm/Kconfig.debug
@@ -64,7 +64,6 @@ config PAGE_OWNER
config PAGE_POISONING
bool "Poison pages after freeing"
- select PAGE_POISONING_NO_SANITY if HIBERNATION
help
Fill the pages with poison patterns after free_pages() and verify
the patterns before alloc_pages. The filling of the memory helps
@@ -75,30 +74,11 @@ config PAGE_POISONING
Note that "poison" here is not the same thing as the "HWPoison"
for CONFIG_MEMORY_FAILURE. This is software poisoning only.
- If unsure, say N
-
-config PAGE_POISONING_NO_SANITY
- depends on PAGE_POISONING
- bool "Only poison, don't sanity check"
- help
- Skip the sanity checking on alloc, only fill the pages with
- poison on free. This reduces some of the overhead of the
- poisoning feature.
-
- If you are only interested in sanitization, say Y. Otherwise
- say N.
+ If you are only interested in sanitization of freed pages without
+ checking the poison pattern on alloc, you can boot the kernel with
+ "init_on_free=1" instead of enabling this.
-config PAGE_POISONING_ZERO
- bool "Use zero for poisoning instead of debugging value"
- depends on PAGE_POISONING
- help
- Instead of using the existing poison value, fill the pages with
- zeros. This makes it harder to detect when errors are occurring
- due to sanitization but the zeroing at free means that it is
- no longer necessary to write zeros when GFP_ZERO is used on
- allocation.
-
- If unsure, say N
+ If unsure, say N
config DEBUG_PAGE_REF
bool "Enable tracepoint to track down page reference manipulation"
diff --git a/mm/Makefile b/mm/Makefile
index d73aed0fc99c..b6cd2fffa492 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -52,7 +52,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \
mm_init.o percpu.o slab_common.o \
compaction.o vmacache.o \
interval_tree.o list_lru.o workingset.o \
- debug.o gup.o $(mmu-y)
+ debug.o gup.o mmap_lock.o $(mmu-y)
# Give 'page_alloc' its own module-parameter namespace
page-alloc-y := page_alloc.o
@@ -90,7 +90,7 @@ obj-$(CONFIG_PAGE_COUNTER) += page_counter.o
obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o
obj-$(CONFIG_MEMCG_SWAP) += swap_cgroup.o
obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o
-obj-$(CONFIG_GUP_BENCHMARK) += gup_benchmark.o
+obj-$(CONFIG_GUP_TEST) += gup_test.o
obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o
obj-$(CONFIG_HWPOISON_INJECT) += hwpoison-inject.o
obj-$(CONFIG_DEBUG_KMEMLEAK) += kmemleak.o
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 408d5051d05b..e33797579338 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -150,11 +150,11 @@ static ssize_t read_ahead_kb_store(struct device *dev,
#define BDI_SHOW(name, expr) \
static ssize_t name##_show(struct device *dev, \
- struct device_attribute *attr, char *page) \
+ struct device_attribute *attr, char *buf) \
{ \
struct backing_dev_info *bdi = dev_get_drvdata(dev); \
\
- return snprintf(page, PAGE_SIZE-1, "%lld\n", (long long)expr); \
+ return sysfs_emit(buf, "%lld\n", (long long)expr); \
} \
static DEVICE_ATTR_RW(name);
@@ -200,11 +200,11 @@ BDI_SHOW(max_ratio, bdi->max_ratio)
static ssize_t stable_pages_required_show(struct device *dev,
struct device_attribute *attr,
- char *page)
+ char *buf)
{
dev_warn_once(dev,
"the stable_pages_required attribute has been removed. Use the stable_writes queue attribute instead.\n");
- return snprintf(page, PAGE_SIZE-1, "%d\n", 0);
+ return sysfs_emit(buf, "%d\n", 0);
}
static DEVICE_ATTR_RO(stable_pages_required);
diff --git a/mm/cma.c b/mm/cma.c
index 7f415d7cda9f..20c4f6f40037 100644
--- a/mm/cma.c
+++ b/mm/cma.c
@@ -38,7 +38,6 @@
struct cma cma_areas[MAX_CMA_AREAS];
unsigned cma_area_count;
-static DEFINE_MUTEX(cma_mutex);
phys_addr_t cma_get_base(const struct cma *cma)
{
@@ -454,10 +453,9 @@ struct page *cma_alloc(struct cma *cma, size_t count, unsigned int align,
mutex_unlock(&cma->lock);
pfn = cma->base_pfn + (bitmap_no << cma->order_per_bit);
- mutex_lock(&cma_mutex);
ret = alloc_contig_range(pfn, pfn + count, MIGRATE_CMA,
GFP_KERNEL | (no_warn ? __GFP_NOWARN : 0));
- mutex_unlock(&cma_mutex);
+
if (ret == 0) {
page = pfn_to_page(pfn);
break;
@@ -512,7 +510,7 @@ bool cma_release(struct cma *cma, const struct page *pages, unsigned int count)
if (!cma || !pages)
return false;
- pr_debug("%s(page %p)\n", __func__, (void *)pages);
+ pr_debug("%s(page %p, count %u)\n", __func__, (void *)pages, count);
pfn = page_to_pfn(pages);
diff --git a/mm/compaction.c b/mm/compaction.c
index 13cb7a961b31..e5acb9714436 100644
--- a/mm/compaction.c
+++ b/mm/compaction.c
@@ -157,7 +157,7 @@ EXPORT_SYMBOL(__ClearPageMovable);
* allocation success. 1 << compact_defer_shift, compactions are skipped up
* to a limit of 1 << COMPACT_MAX_DEFER_SHIFT
*/
-void defer_compaction(struct zone *zone, int order)
+static void defer_compaction(struct zone *zone, int order)
{
zone->compact_considered = 0;
zone->compact_defer_shift++;
@@ -172,7 +172,7 @@ void defer_compaction(struct zone *zone, int order)
}
/* Returns true if compaction should be skipped this time */
-bool compaction_deferred(struct zone *zone, int order)
+static bool compaction_deferred(struct zone *zone, int order)
{
unsigned long defer_limit = 1UL << zone->compact_defer_shift;
@@ -209,7 +209,7 @@ void compaction_defer_reset(struct zone *zone, int order,
}
/* Returns true if restarting compaction after many failures */
-bool compaction_restarting(struct zone *zone, int order)
+static bool compaction_restarting(struct zone *zone, int order)
{
if (order < zone->compact_order_failed)
return false;
@@ -237,7 +237,7 @@ static void reset_cached_positions(struct zone *zone)
}
/*
- * Compound pages of >= pageblock_order should consistenly be skipped until
+ * Compound pages of >= pageblock_order should consistently be skipped until
* released. It is always pointless to compact pages of such order (if they are
* migratable), and the pageblocks they occupy cannot contain any free pages.
*/
@@ -804,7 +804,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
unsigned long nr_scanned = 0, nr_isolated = 0;
struct lruvec *lruvec;
unsigned long flags = 0;
- bool locked = false;
+ struct lruvec *locked = NULL;
struct page *page = NULL, *valid_page = NULL;
unsigned long start_pfn = low_pfn;
bool skip_on_failure = false;
@@ -868,11 +868,20 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
* contention, to give chance to IRQs. Abort completely if
* a fatal signal is pending.
*/
- if (!(low_pfn % SWAP_CLUSTER_MAX)
- && compact_unlock_should_abort(&pgdat->lru_lock,
- flags, &locked, cc)) {
- low_pfn = 0;
- goto fatal_pending;
+ if (!(low_pfn % SWAP_CLUSTER_MAX)) {
+ if (locked) {
+ unlock_page_lruvec_irqrestore(locked, flags);
+ locked = NULL;
+ }
+
+ if (fatal_signal_pending(current)) {
+ cc->contended = true;
+
+ low_pfn = 0;
+ goto fatal_pending;
+ }
+
+ cond_resched();
}
if (!pfn_valid_within(low_pfn))
@@ -890,6 +899,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
if (!valid_page && IS_ALIGNED(low_pfn, pageblock_nr_pages)) {
if (!cc->ignore_skip_hint && get_pageblock_skip(page)) {
low_pfn = end_pfn;
+ page = NULL;
goto isolate_abort;
}
valid_page = page;
@@ -943,9 +953,8 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
if (unlikely(__PageMovable(page)) &&
!PageIsolated(page)) {
if (locked) {
- spin_unlock_irqrestore(&pgdat->lru_lock,
- flags);
- locked = false;
+ unlock_page_lruvec_irqrestore(locked, flags);
+ locked = NULL;
}
if (!isolate_movable_page(page, isolate_mode))
@@ -971,10 +980,34 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
if (!(cc->gfp_mask & __GFP_FS) && page_mapping(page))
goto isolate_fail;
+ /*
+ * Be careful not to clear PageLRU until after we're
+ * sure the page is not being freed elsewhere -- the
+ * page release code relies on it.
+ */
+ if (unlikely(!get_page_unless_zero(page)))
+ goto isolate_fail;
+
+ if (__isolate_lru_page_prepare(page, isolate_mode) != 0)
+ goto isolate_fail_put;
+
+ /* Try isolate the page */
+ if (!TestClearPageLRU(page))
+ goto isolate_fail_put;
+
+ rcu_read_lock();
+ lruvec = mem_cgroup_page_lruvec(page, pgdat);
+
/* If we already hold the lock, we can skip some rechecking */
- if (!locked) {
- locked = compact_lock_irqsave(&pgdat->lru_lock,
- &flags, cc);
+ if (lruvec != locked) {
+ if (locked)
+ unlock_page_lruvec_irqrestore(locked, flags);
+
+ compact_lock_irqsave(&lruvec->lru_lock, &flags, cc);
+ locked = lruvec;
+ rcu_read_unlock();
+
+ lruvec_memcg_debug(lruvec, page);
/* Try get exclusive access under lock */
if (!skip_updated) {
@@ -983,10 +1016,6 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
goto isolate_abort;
}
- /* Recheck PageLRU and PageCompound under lock */
- if (!PageLRU(page))
- goto isolate_fail;
-
/*
* Page become compound since the non-locked check,
* and it's on LRU. It can only be a THP so the order
@@ -994,15 +1023,11 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn,
*/
if (unlikely(PageCompound(page) && !cc->alloc_contig)) {
low_pfn += compound_nr(page) - 1;
- goto isolate_fail;
+ SetPageLRU(page);
+ goto isolate_fail_put;
}
- }
-
- lruvec = mem_cgroup_page_lruvec(page, pgdat);
-
- /* Try isolate the page */
- if (__isolate_lru_page(page, isolate_mode) != 0)
- goto isolate_fail;
+ } else
+ rcu_read_unlock();
/* The whole page is taken off the LRU; skip the tail pages. */
if (PageCompound(page))
@@ -1032,6 +1057,15 @@ isolate_success:
}
continue;
+
+isolate_fail_put:
+ /* Avoid potential deadlock in freeing page under lru_lock */
+ if (locked) {
+ unlock_page_lruvec_irqrestore(locked, flags);
+ locked = NULL;
+ }
+ put_page(page);
+
isolate_fail:
if (!skip_on_failure)
continue;
@@ -1043,8 +1077,8 @@ isolate_fail:
*/
if (nr_isolated) {
if (locked) {
- spin_unlock_irqrestore(&pgdat->lru_lock, flags);
- locked = false;
+ unlock_page_lruvec_irqrestore(locked, flags);
+ locked = NULL;
}
putback_movable_pages(&cc->migratepages);
cc->nr_migratepages = 0;
@@ -1068,9 +1102,15 @@ isolate_fail:
if (unlikely(low_pfn > end_pfn))
low_pfn = end_pfn;
+ page = NULL;
+
isolate_abort:
if (locked)
- spin_unlock_irqrestore(&pgdat->lru_lock, flags);
+ unlock_page_lruvec_irqrestore(locked, flags);
+ if (page) {
+ SetPageLRU(page);
+ put_page(page);
+ }
/*
* Updated the cached scanner pfn once the pageblock has been scanned
@@ -2070,13 +2110,6 @@ static enum compact_result compact_finished(struct compact_control *cc)
return ret;
}
-/*
- * compaction_suitable: Is this suitable to run compaction on this zone now?
- * Returns
- * COMPACT_SKIPPED - If there are too few free pages for compaction
- * COMPACT_SUCCESS - If the allocation would succeed without compaction
- * COMPACT_CONTINUE - If compaction should run now
- */
static enum compact_result __compaction_suitable(struct zone *zone, int order,
unsigned int alloc_flags,
int highest_zoneidx,
@@ -2120,6 +2153,13 @@ static enum compact_result __compaction_suitable(struct zone *zone, int order,
return COMPACT_CONTINUE;
}
+/*
+ * compaction_suitable: Is this suitable to run compaction on this zone now?
+ * Returns
+ * COMPACT_SKIPPED - If there are too few free pages for compaction
+ * COMPACT_SUCCESS - If the allocation would succeed without compaction
+ * COMPACT_CONTINUE - If compaction should run now
+ */
enum compact_result compaction_suitable(struct zone *zone, int order,
unsigned int alloc_flags,
int highest_zoneidx)
@@ -2275,7 +2315,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
while ((ret = compact_finished(cc)) == COMPACT_CONTINUE) {
int err;
- unsigned long start_pfn = cc->migrate_pfn;
+ unsigned long iteration_start_pfn = cc->migrate_pfn;
/*
* Avoid multiple rescans which can happen if a page cannot be
@@ -2287,7 +2327,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
*/
cc->rescan = false;
if (pageblock_start_pfn(last_migrated_pfn) ==
- pageblock_start_pfn(start_pfn)) {
+ pageblock_start_pfn(iteration_start_pfn)) {
cc->rescan = true;
}
@@ -2311,8 +2351,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
goto check_drain;
case ISOLATE_SUCCESS:
update_cached = false;
- last_migrated_pfn = start_pfn;
- ;
+ last_migrated_pfn = iteration_start_pfn;
}
err = migrate_pages(&cc->migratepages, compaction_alloc,
diff --git a/mm/debug.c b/mm/debug.c
index ccca576b2899..8a40b3fefbeb 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -182,8 +182,8 @@ hex_only:
pr_warn("page dumped because: %s\n", reason);
#ifdef CONFIG_MEMCG
- if (!page_poisoned && page->mem_cgroup)
- pr_warn("page->mem_cgroup:%px\n", page->mem_cgroup);
+ if (!page_poisoned && page->memcg_data)
+ pr_warn("pages's memcg:%lx\n", page->memcg_data);
#endif
}
diff --git a/mm/filemap.c b/mm/filemap.c
index 0b2067b3c328..5c9d564317a5 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -102,8 +102,8 @@
* ->swap_lock (try_to_unmap_one)
* ->private_lock (try_to_unmap_one)
* ->i_pages lock (try_to_unmap_one)
- * ->pgdat->lru_lock (follow_page->mark_page_accessed)
- * ->pgdat->lru_lock (check_pte_range->isolate_lru_page)
+ * ->lruvec->lru_lock (follow_page->mark_page_accessed)
+ * ->lruvec->lru_lock (check_pte_range->isolate_lru_page)
* ->private_lock (page_remove_rmap->set_page_dirty)
* ->i_pages lock (page_remove_rmap->set_page_dirty)
* bdi.wb->list_lock (page_remove_rmap->set_page_dirty)
@@ -204,9 +204,9 @@ static void unaccount_page_cache_page(struct address_space *mapping,
if (PageSwapBacked(page)) {
__mod_lruvec_page_state(page, NR_SHMEM, -nr);
if (PageTransHuge(page))
- __dec_node_page_state(page, NR_SHMEM_THPS);
+ __dec_lruvec_page_state(page, NR_SHMEM_THPS);
} else if (PageTransHuge(page)) {
- __dec_node_page_state(page, NR_FILE_THPS);
+ __dec_lruvec_page_state(page, NR_FILE_THPS);
filemap_nr_thps_dec(mapping);
}
@@ -1359,7 +1359,7 @@ static int __wait_on_page_locked_async(struct page *page,
else
ret = PageLocked(page);
/*
- * If we were succesful now, we know we're still on the
+ * If we were successful now, we know we're still on the
* waitqueue as we're still under the lock. This means it's
* safe to remove and return success, we know the callback
* isn't going to trigger.
@@ -1583,19 +1583,20 @@ int __lock_page_or_retry(struct page *page, struct mm_struct *mm,
else
wait_on_page_locked(page);
return 0;
- } else {
- if (flags & FAULT_FLAG_KILLABLE) {
- int ret;
+ }
+ if (flags & FAULT_FLAG_KILLABLE) {
+ int ret;
- ret = __lock_page_killable(page);
- if (ret) {
- mmap_read_unlock(mm);
- return 0;
- }
- } else
- __lock_page(page);
- return 1;
+ ret = __lock_page_killable(page);
+ if (ret) {
+ mmap_read_unlock(mm);
+ return 0;
+ }
+ } else {
+ __lock_page(page);
}
+ return 1;
+
}
/**
@@ -2166,6 +2167,259 @@ static void shrink_readahead_size_eio(struct file_ra_state *ra)
ra->ra_pages /= 4;
}
+static int lock_page_for_iocb(struct kiocb *iocb, struct page *page)
+{
+ if (iocb->ki_flags & IOCB_WAITQ)
+ return lock_page_async(page, iocb->ki_waitq);
+ else if (iocb->ki_flags & IOCB_NOWAIT)
+ return trylock_page(page) ? 0 : -EAGAIN;
+ else
+ return lock_page_killable(page);
+}
+
+static struct page *
+generic_file_buffered_read_readpage(struct kiocb *iocb,
+ struct file *filp,
+ struct address_space *mapping,
+ struct page *page)
+{
+ struct file_ra_state *ra = &filp->f_ra;
+ int error;
+
+ if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT)) {
+ unlock_page(page);
+ put_page(page);
+ return ERR_PTR(-EAGAIN);
+ }
+
+ /*
+ * A previous I/O error may have been due to temporary
+ * failures, eg. multipath errors.
+ * PG_error will be set again if readpage fails.
+ */
+ ClearPageError(page);
+ /* Start the actual read. The read will unlock the page. */
+ error = mapping->a_ops->readpage(filp, page);
+
+ if (unlikely(error)) {
+ put_page(page);
+ return error != AOP_TRUNCATED_PAGE ? ERR_PTR(error) : NULL;
+ }
+
+ if (!PageUptodate(page)) {
+ error = lock_page_for_iocb(iocb, page);
+ if (unlikely(error)) {
+ put_page(page);
+ return ERR_PTR(error);
+ }
+ if (!PageUptodate(page)) {
+ if (page->mapping == NULL) {
+ /*
+ * invalidate_mapping_pages got it
+ */
+ unlock_page(page);
+ put_page(page);
+ return NULL;
+ }
+ unlock_page(page);
+ shrink_readahead_size_eio(ra);
+ put_page(page);
+ return ERR_PTR(-EIO);
+ }
+ unlock_page(page);
+ }
+
+ return page;
+}
+
+static struct page *
+generic_file_buffered_read_pagenotuptodate(struct kiocb *iocb,
+ struct file *filp,
+ struct iov_iter *iter,
+ struct page *page,
+ loff_t pos, loff_t count)
+{
+ struct address_space *mapping = filp->f_mapping;
+ struct inode *inode = mapping->host;
+ int error;
+
+ /*
+ * See comment in do_read_cache_page on why
+ * wait_on_page_locked is used to avoid unnecessarily
+ * serialisations and why it's safe.
+ */
+ if (iocb->ki_flags & IOCB_WAITQ) {
+ error = wait_on_page_locked_async(page,
+ iocb->ki_waitq);
+ } else {
+ error = wait_on_page_locked_killable(page);
+ }
+ if (unlikely(error)) {
+ put_page(page);
+ return ERR_PTR(error);
+ }
+ if (PageUptodate(page))
+ return page;
+
+ if (inode->i_blkbits == PAGE_SHIFT ||
+ !mapping->a_ops->is_partially_uptodate)
+ goto page_not_up_to_date;
+ /* pipes can't handle partially uptodate pages */
+ if (unlikely(iov_iter_is_pipe(iter)))
+ goto page_not_up_to_date;
+ if (!trylock_page(page))
+ goto page_not_up_to_date;
+ /* Did it get truncated before we got the lock? */
+ if (!page->mapping)
+ goto page_not_up_to_date_locked;
+ if (!mapping->a_ops->is_partially_uptodate(page,
+ pos & ~PAGE_MASK, count))
+ goto page_not_up_to_date_locked;
+ unlock_page(page);
+ return page;
+
+page_not_up_to_date:
+ /* Get exclusive access to the page ... */
+ error = lock_page_for_iocb(iocb, page);
+ if (unlikely(error)) {
+ put_page(page);
+ return ERR_PTR(error);
+ }
+
+page_not_up_to_date_locked:
+ /* Did it get truncated before we got the lock? */
+ if (!page->mapping) {
+ unlock_page(page);
+ put_page(page);
+ return NULL;
+ }
+
+ /* Did somebody else fill it already? */
+ if (PageUptodate(page)) {
+ unlock_page(page);
+ return page;
+ }
+
+ return generic_file_buffered_read_readpage(iocb, filp, mapping, page);
+}
+
+static struct page *
+generic_file_buffered_read_no_cached_page(struct kiocb *iocb,
+ struct iov_iter *iter)
+{
+ struct file *filp = iocb->ki_filp;
+ struct address_space *mapping = filp->f_mapping;
+ pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
+ struct page *page;
+ int error;
+
+ if (iocb->ki_flags & IOCB_NOIO)
+ return ERR_PTR(-EAGAIN);
+
+ /*
+ * Ok, it wasn't cached, so we need to create a new
+ * page..
+ */
+ page = page_cache_alloc(mapping);
+ if (!page)
+ return ERR_PTR(-ENOMEM);
+
+ error = add_to_page_cache_lru(page, mapping, index,
+ mapping_gfp_constraint(mapping, GFP_KERNEL));
+ if (error) {
+ put_page(page);
+ return error != -EEXIST ? ERR_PTR(error) : NULL;
+ }
+
+ return generic_file_buffered_read_readpage(iocb, filp, mapping, page);
+}
+
+static int generic_file_buffered_read_get_pages(struct kiocb *iocb,
+ struct iov_iter *iter,
+ struct page **pages,
+ unsigned int nr)
+{
+ struct file *filp = iocb->ki_filp;
+ struct address_space *mapping = filp->f_mapping;
+ struct file_ra_state *ra = &filp->f_ra;
+ pgoff_t index = iocb->ki_pos >> PAGE_SHIFT;
+ pgoff_t last_index = (iocb->ki_pos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT;
+ int i, j, nr_got, err = 0;
+
+ nr = min_t(unsigned long, last_index - index, nr);
+find_page:
+ if (fatal_signal_pending(current))
+ return -EINTR;
+
+ nr_got = find_get_pages_contig(mapping, index, nr, pages);
+ if (nr_got)
+ goto got_pages;
+
+ if (iocb->ki_flags & IOCB_NOIO)
+ return -EAGAIN;
+
+ page_cache_sync_readahead(mapping, ra, filp, index, last_index - index);
+
+ nr_got = find_get_pages_contig(mapping, index, nr, pages);
+ if (nr_got)
+ goto got_pages;
+
+ pages[0] = generic_file_buffered_read_no_cached_page(iocb, iter);
+ err = PTR_ERR_OR_ZERO(pages[0]);
+ if (!IS_ERR_OR_NULL(pages[0]))
+ nr_got = 1;
+got_pages:
+ for (i = 0; i < nr_got; i++) {
+ struct page *page = pages[i];
+ pgoff_t pg_index = index + i;
+ loff_t pg_pos = max(iocb->ki_pos,
+ (loff_t) pg_index << PAGE_SHIFT);
+ loff_t pg_count = iocb->ki_pos + iter->count - pg_pos;
+
+ if (PageReadahead(page)) {
+ if (iocb->ki_flags & IOCB_NOIO) {
+ for (j = i; j < nr_got; j++)
+ put_page(pages[j]);
+ nr_got = i;
+ err = -EAGAIN;
+ break;
+ }
+ page_cache_async_readahead(mapping, ra, filp, page,
+ pg_index, last_index - pg_index);
+ }
+
+ if (!PageUptodate(page)) {
+ if ((iocb->ki_flags & IOCB_NOWAIT) ||
+ ((iocb->ki_flags & IOCB_WAITQ) && i)) {
+ for (j = i; j < nr_got; j++)
+ put_page(pages[j]);
+ nr_got = i;
+ err = -EAGAIN;
+ break;
+ }
+
+ page = generic_file_buffered_read_pagenotuptodate(iocb,
+ filp, iter, page, pg_pos, pg_count);
+ if (IS_ERR_OR_NULL(page)) {
+ for (j = i + 1; j < nr_got; j++)
+ put_page(pages[j]);
+ nr_got = i;
+ err = PTR_ERR_OR_ZERO(page);
+ break;
+ }
+ }
+ }
+
+ if (likely(nr_got))
+ return nr_got;
+ if (err)
+ return err;
+ /*
+ * No pages and no error means we raced and should retry:
+ */
+ goto find_page;
+}
+
/**
* generic_file_buffered_read - generic file read routine
* @iocb: the iocb to read
@@ -2186,294 +2440,120 @@ ssize_t generic_file_buffered_read(struct kiocb *iocb,
struct iov_iter *iter, ssize_t written)
{
struct file *filp = iocb->ki_filp;
+ struct file_ra_state *ra = &filp->f_ra;
struct address_space *mapping = filp->f_mapping;
struct inode *inode = mapping->host;
- struct file_ra_state *ra = &filp->f_ra;
- loff_t *ppos = &iocb->ki_pos;
- pgoff_t index;
- pgoff_t last_index;
- pgoff_t prev_index;
- unsigned long offset; /* offset into pagecache page */
- unsigned int prev_offset;
- int error = 0;
-
- if (unlikely(*ppos >= inode->i_sb->s_maxbytes))
+ struct page *pages_onstack[PAGEVEC_SIZE], **pages = NULL;
+ unsigned int nr_pages = min_t(unsigned int, 512,
+ ((iocb->ki_pos + iter->count + PAGE_SIZE - 1) >> PAGE_SHIFT) -
+ (iocb->ki_pos >> PAGE_SHIFT));
+ int i, pg_nr, error = 0;
+ bool writably_mapped;
+ loff_t isize, end_offset;
+
+ if (unlikely(iocb->ki_pos >= inode->i_sb->s_maxbytes))
return 0;
+ if (unlikely(!iov_iter_count(iter)))
+ return 0;
+
iov_iter_truncate(iter, inode->i_sb->s_maxbytes);
- index = *ppos >> PAGE_SHIFT;
- prev_index = ra->prev_pos >> PAGE_SHIFT;
- prev_offset = ra->prev_pos & (PAGE_SIZE-1);
- last_index = (*ppos + iter->count + PAGE_SIZE-1) >> PAGE_SHIFT;
- offset = *ppos & ~PAGE_MASK;
+ if (nr_pages > ARRAY_SIZE(pages_onstack))
+ pages = kmalloc_array(nr_pages, sizeof(void *), GFP_KERNEL);
- /*
- * If we've already successfully copied some data, then we
- * can no longer safely return -EIOCBQUEUED. Hence mark
- * an async read NOWAIT at that point.
- */
- if (written && (iocb->ki_flags & IOCB_WAITQ))
- iocb->ki_flags |= IOCB_NOWAIT;
-
- for (;;) {
- struct page *page;
- pgoff_t end_index;
- loff_t isize;
- unsigned long nr, ret;
+ if (!pages) {
+ pages = pages_onstack;
+ nr_pages = min_t(unsigned int, nr_pages, ARRAY_SIZE(pages_onstack));
+ }
+ do {
cond_resched();
-find_page:
- if (fatal_signal_pending(current)) {
- error = -EINTR;
- goto out;
- }
- page = find_get_page(mapping, index);
- if (!page) {
- if (iocb->ki_flags & IOCB_NOIO)
- goto would_block;
- page_cache_sync_readahead(mapping,
- ra, filp,
- index, last_index - index);
- page = find_get_page(mapping, index);
- if (unlikely(page == NULL))
- goto no_cached_page;
- }
- if (PageReadahead(page)) {
- if (iocb->ki_flags & IOCB_NOIO) {
- put_page(page);
- goto out;
- }
- page_cache_async_readahead(mapping,
- ra, filp, page,
- index, last_index - index);
- }
- if (!PageUptodate(page)) {
- /*
- * See comment in do_read_cache_page on why
- * wait_on_page_locked is used to avoid unnecessarily
- * serialisations and why it's safe.
- */
- if (iocb->ki_flags & IOCB_WAITQ) {
- if (written) {
- put_page(page);
- goto out;
- }
- error = wait_on_page_locked_async(page,
- iocb->ki_waitq);
- } else {
- if (iocb->ki_flags & IOCB_NOWAIT) {
- put_page(page);
- goto would_block;
- }
- error = wait_on_page_locked_killable(page);
- }
- if (unlikely(error))
- goto readpage_error;
- if (PageUptodate(page))
- goto page_ok;
-
- if (inode->i_blkbits == PAGE_SHIFT ||
- !mapping->a_ops->is_partially_uptodate)
- goto page_not_up_to_date;
- /* pipes can't handle partially uptodate pages */
- if (unlikely(iov_iter_is_pipe(iter)))
- goto page_not_up_to_date;
- if (!trylock_page(page))
- goto page_not_up_to_date;
- /* Did it get truncated before we got the lock? */
- if (!page->mapping)
- goto page_not_up_to_date_locked;
- if (!mapping->a_ops->is_partially_uptodate(page,
- offset, iter->count))
- goto page_not_up_to_date_locked;
- unlock_page(page);
+ /*
+ * If we've already successfully copied some data, then we
+ * can no longer safely return -EIOCBQUEUED. Hence mark
+ * an async read NOWAIT at that point.
+ */
+ if ((iocb->ki_flags & IOCB_WAITQ) && written)
+ iocb->ki_flags |= IOCB_NOWAIT;
+
+ i = 0;
+ pg_nr = generic_file_buffered_read_get_pages(iocb, iter,
+ pages, nr_pages);
+ if (pg_nr < 0) {
+ error = pg_nr;
+ break;
}
-page_ok:
+
/*
- * i_size must be checked after we know the page is Uptodate.
+ * i_size must be checked after we know the pages are Uptodate.
*
* Checking i_size after the check allows us to calculate
* the correct value for "nr", which means the zero-filled
* part of the page is not copied back to userspace (unless
* another truncate extends the file - this is desired though).
*/
-
isize = i_size_read(inode);
- end_index = (isize - 1) >> PAGE_SHIFT;
- if (unlikely(!isize || index > end_index)) {
- put_page(page);
- goto out;
- }
+ if (unlikely(iocb->ki_pos >= isize))
+ goto put_pages;
- /* nr is the maximum number of bytes to copy from this page */
- nr = PAGE_SIZE;
- if (index == end_index) {
- nr = ((isize - 1) & ~PAGE_MASK) + 1;
- if (nr <= offset) {
- put_page(page);
- goto out;
- }
- }
- nr = nr - offset;
+ end_offset = min_t(loff_t, isize, iocb->ki_pos + iter->count);
- /* If users can be writing to this page using arbitrary
- * virtual addresses, take care about potential aliasing
- * before reading the page on the kernel side.
- */
- if (mapping_writably_mapped(mapping))
- flush_dcache_page(page);
+ while ((iocb->ki_pos >> PAGE_SHIFT) + pg_nr >
+ (end_offset + PAGE_SIZE - 1) >> PAGE_SHIFT)
+ put_page(pages[--pg_nr]);
/*
- * When a sequential read accesses a page several times,
- * only mark it as accessed the first time.
+ * Once we start copying data, we don't want to be touching any
+ * cachelines that might be contended:
*/
- if (prev_index != index || offset != prev_offset)
- mark_page_accessed(page);
- prev_index = index;
+ writably_mapped = mapping_writably_mapped(mapping);
/*
- * Ok, we have the page, and it's up-to-date, so
- * now we can copy it to user space...
+ * When a sequential read accesses a page several times, only
+ * mark it as accessed the first time.
*/
+ if (iocb->ki_pos >> PAGE_SHIFT !=
+ ra->prev_pos >> PAGE_SHIFT)
+ mark_page_accessed(pages[0]);
+ for (i = 1; i < pg_nr; i++)
+ mark_page_accessed(pages[i]);
+
+ for (i = 0; i < pg_nr; i++) {
+ unsigned int offset = iocb->ki_pos & ~PAGE_MASK;
+ unsigned int bytes = min_t(loff_t, end_offset - iocb->ki_pos,
+ PAGE_SIZE - offset);
+ unsigned int copied;
- ret = copy_page_to_iter(page, offset, nr, iter);
- offset += ret;
- index += offset >> PAGE_SHIFT;
- offset &= ~PAGE_MASK;
- prev_offset = offset;
-
- put_page(page);
- written += ret;
- if (!iov_iter_count(iter))
- goto out;
- if (ret < nr) {
- error = -EFAULT;
- goto out;
- }
- continue;
-
-page_not_up_to_date:
- /* Get exclusive access to the page ... */
- if (iocb->ki_flags & IOCB_WAITQ) {
- if (written) {
- put_page(page);
- goto out;
- }
- error = lock_page_async(page, iocb->ki_waitq);
- } else {
- error = lock_page_killable(page);
- }
- if (unlikely(error))
- goto readpage_error;
-
-page_not_up_to_date_locked:
- /* Did it get truncated before we got the lock? */
- if (!page->mapping) {
- unlock_page(page);
- put_page(page);
- continue;
- }
-
- /* Did somebody else fill it already? */
- if (PageUptodate(page)) {
- unlock_page(page);
- goto page_ok;
- }
-
-readpage:
- if (iocb->ki_flags & (IOCB_NOIO | IOCB_NOWAIT)) {
- unlock_page(page);
- put_page(page);
- goto would_block;
- }
- /*
- * A previous I/O error may have been due to temporary
- * failures, eg. multipath errors.
- * PG_error will be set again if readpage fails.
- */
- ClearPageError(page);
- /* Start the actual read. The read will unlock the page. */
- error = mapping->a_ops->readpage(filp, page);
+ /*
+ * If users can be writing to this page using arbitrary
+ * virtual addresses, take care about potential aliasing
+ * before reading the page on the kernel side.
+ */
+ if (writably_mapped)
+ flush_dcache_page(pages[i]);
- if (unlikely(error)) {
- if (error == AOP_TRUNCATED_PAGE) {
- put_page(page);
- error = 0;
- goto find_page;
- }
- goto readpage_error;
- }
+ copied = copy_page_to_iter(pages[i], offset, bytes, iter);
- if (!PageUptodate(page)) {
- if (iocb->ki_flags & IOCB_WAITQ) {
- if (written) {
- put_page(page);
- goto out;
- }
- error = lock_page_async(page, iocb->ki_waitq);
- } else {
- error = lock_page_killable(page);
- }
+ written += copied;
+ iocb->ki_pos += copied;
+ ra->prev_pos = iocb->ki_pos;
- if (unlikely(error))
- goto readpage_error;
- if (!PageUptodate(page)) {
- if (page->mapping == NULL) {
- /*
- * invalidate_mapping_pages got it
- */
- unlock_page(page);
- put_page(page);
- goto find_page;
- }
- unlock_page(page);
- shrink_readahead_size_eio(ra);
- error = -EIO;
- goto readpage_error;
+ if (copied < bytes) {
+ error = -EFAULT;
+ break;
}
- unlock_page(page);
}
+put_pages:
+ for (i = 0; i < pg_nr; i++)
+ put_page(pages[i]);
+ } while (iov_iter_count(iter) && iocb->ki_pos < isize && !error);
- goto page_ok;
-
-readpage_error:
- /* UHHUH! A synchronous read error occurred. Report it */
- put_page(page);
- goto out;
-
-no_cached_page:
- /*
- * Ok, it wasn't cached, so we need to create a new
- * page..
- */
- page = page_cache_alloc(mapping);
- if (!page) {
- error = -ENOMEM;
- goto out;
- }
- error = add_to_page_cache_lru(page, mapping, index,
- mapping_gfp_constraint(mapping, GFP_KERNEL));
- if (error) {
- put_page(page);
- if (error == -EEXIST) {
- error = 0;
- goto find_page;
- }
- goto out;
- }
- goto readpage;
- }
+ file_accessed(filp);
-would_block:
- error = -EAGAIN;
-out:
- ra->prev_pos = prev_index;
- ra->prev_pos <<= PAGE_SHIFT;
- ra->prev_pos |= prev_offset;
+ if (pages != pages_onstack)
+ kfree(pages);
- *ppos = ((loff_t)index << PAGE_SHIFT) + offset;
- file_accessed(filp);
return written ? written : error;
}
EXPORT_SYMBOL_GPL(generic_file_buffered_read);
@@ -2904,14 +2984,14 @@ EXPORT_SYMBOL(filemap_map_pages);
vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
{
+ struct address_space *mapping = vmf->vma->vm_file->f_mapping;
struct page *page = vmf->page;
- struct inode *inode = file_inode(vmf->vma->vm_file);
vm_fault_t ret = VM_FAULT_LOCKED;
- sb_start_pagefault(inode->i_sb);
+ sb_start_pagefault(mapping->host->i_sb);
file_update_time(vmf->vma->vm_file);
lock_page(page);
- if (page->mapping != inode->i_mapping) {
+ if (page->mapping != mapping) {
unlock_page(page);
ret = VM_FAULT_NOPAGE;
goto out;
@@ -2924,7 +3004,7 @@ vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf)
set_page_dirty(page);
wait_for_stable_page(page);
out:
- sb_end_pagefault(inode->i_sb);
+ sb_end_pagefault(mapping->host->i_sb);
return ret;
}
@@ -3167,10 +3247,9 @@ void dio_warn_stale_pagecache(struct file *filp)
{
static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST);
char pathname[128];
- struct inode *inode = file_inode(filp);
char *path;
- errseq_set(&inode->i_mapping->wb_err, -EIO);
+ errseq_set(&filp->f_mapping->wb_err, -EIO);
if (__ratelimit(&_rs)) {
path = file_path(filp, pathname, sizeof(pathname));
if (IS_ERR(path))
@@ -3197,7 +3276,7 @@ generic_file_direct_write(struct kiocb *iocb, struct iov_iter *from)
if (iocb->ki_flags & IOCB_NOWAIT) {
/* If there are pages to writeback, return */
- if (filemap_range_has_page(inode->i_mapping, pos,
+ if (filemap_range_has_page(file->f_mapping, pos,
pos + write_len - 1))
return -EAGAIN;
} else {
diff --git a/mm/gup.c b/mm/gup.c
index 98eb8e6d2609..e4c224cd9661 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -123,6 +123,28 @@ static __maybe_unused struct page *try_grab_compound_head(struct page *page,
return NULL;
}
+static void put_compound_head(struct page *page, int refs, unsigned int flags)
+{
+ if (flags & FOLL_PIN) {
+ mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_RELEASED,
+ refs);
+
+ if (hpage_pincount_available(page))
+ hpage_pincount_sub(page, refs);
+ else
+ refs *= GUP_PIN_COUNTING_BIAS;
+ }
+
+ VM_BUG_ON_PAGE(page_ref_count(page) < refs, page);
+ /*
+ * Calling put_page() for each ref is unnecessarily slow. Only the last
+ * ref needs a put_page().
+ */
+ if (refs > 1)
+ page_ref_sub(page, refs - 1);
+ put_page(page);
+}
+
/**
* try_grab_page() - elevate a page's refcount by a flag-dependent amount
*
@@ -177,41 +199,6 @@ bool __must_check try_grab_page(struct page *page, unsigned int flags)
return true;
}
-#ifdef CONFIG_DEV_PAGEMAP_OPS
-static bool __unpin_devmap_managed_user_page(struct page *page)
-{
- int count, refs = 1;
-
- if (!page_is_devmap_managed(page))
- return false;
-
- if (hpage_pincount_available(page))
- hpage_pincount_sub(page, 1);
- else
- refs = GUP_PIN_COUNTING_BIAS;
-
- count = page_ref_sub_return(page, refs);
-
- mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_RELEASED, 1);
- /*
- * devmap page refcounts are 1-based, rather than 0-based: if
- * refcount is 1, then the page is free and the refcount is
- * stable because nobody holds a reference on the page.
- */
- if (count == 1)
- free_devmap_managed_page(page);
- else if (!count)
- __put_page(page);
-
- return true;
-}
-#else
-static bool __unpin_devmap_managed_user_page(struct page *page)
-{
- return false;
-}
-#endif /* CONFIG_DEV_PAGEMAP_OPS */
-
/**
* unpin_user_page() - release a dma-pinned page
* @page: pointer to page to be released
@@ -223,28 +210,7 @@ static bool __unpin_devmap_managed_user_page(struct page *page)
*/
void unpin_user_page(struct page *page)
{
- int refs = 1;
-
- page = compound_head(page);
-
- /*
- * For devmap managed pages we need to catch refcount transition from
- * GUP_PIN_COUNTING_BIAS to 1, when refcount reach one it means the
- * page is free and we need to inform the device driver through
- * callback. See include/linux/memremap.h and HMM for details.
- */
- if (__unpin_devmap_managed_user_page(page))
- return;
-
- if (hpage_pincount_available(page))
- hpage_pincount_sub(page, 1);
- else
- refs = GUP_PIN_COUNTING_BIAS;
-
- if (page_ref_sub_and_test(page, refs))
- __put_page(page);
-
- mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_RELEASED, 1);
+ put_compound_head(compound_head(page), 1, FOLL_PIN);
}
EXPORT_SYMBOL(unpin_user_page);
@@ -923,6 +889,9 @@ static int check_vma_flags(struct vm_area_struct *vma, unsigned long gup_flags)
if (gup_flags & FOLL_ANON && !vma_is_anonymous(vma))
return -EFAULT;
+ if ((gup_flags & FOLL_LONGTERM) && vma_is_fsdax(vma))
+ return -EOPNOTSUPP;
+
if (write) {
if (!(vm_flags & VM_WRITE)) {
if (!(gup_flags & FOLL_FORCE))
@@ -1060,10 +1029,14 @@ static long __get_user_pages(struct mm_struct *mm,
goto next_page;
}
- if (!vma || check_vma_flags(vma, gup_flags)) {
+ if (!vma) {
ret = -EFAULT;
goto out;
}
+ ret = check_vma_flags(vma, gup_flags);
+ if (ret)
+ goto out;
+
if (is_vm_hugetlb_page(vma)) {
i = follow_hugetlb_page(mm, vma, pages, vmas,
&start, &nr_pages, i,
@@ -1567,26 +1540,6 @@ struct page *get_dump_page(unsigned long addr)
}
#endif /* CONFIG_ELF_CORE */
-#if defined(CONFIG_FS_DAX) || defined (CONFIG_CMA)
-static bool check_dax_vmas(struct vm_area_struct **vmas, long nr_pages)
-{
- long i;
- struct vm_area_struct *vma_prev = NULL;
-
- for (i = 0; i < nr_pages; i++) {
- struct vm_area_struct *vma = vmas[i];
-
- if (vma == vma_prev)
- continue;
-
- vma_prev = vma;
-
- if (vma_is_fsdax(vma))
- return true;
- }
- return false;
-}
-
#ifdef CONFIG_CMA
static long check_and_migrate_cma_pages(struct mm_struct *mm,
unsigned long start,
@@ -1705,63 +1658,23 @@ static long __gup_longterm_locked(struct mm_struct *mm,
struct vm_area_struct **vmas,
unsigned int gup_flags)
{
- struct vm_area_struct **vmas_tmp = vmas;
unsigned long flags = 0;
- long rc, i;
+ long rc;
- if (gup_flags & FOLL_LONGTERM) {
- if (!pages)
- return -EINVAL;
-
- if (!vmas_tmp) {
- vmas_tmp = kcalloc(nr_pages,
- sizeof(struct vm_area_struct *),
- GFP_KERNEL);
- if (!vmas_tmp)
- return -ENOMEM;
- }
+ if (gup_flags & FOLL_LONGTERM)
flags = memalloc_nocma_save();
- }
- rc = __get_user_pages_locked(mm, start, nr_pages, pages,
- vmas_tmp, NULL, gup_flags);
+ rc = __get_user_pages_locked(mm, start, nr_pages, pages, vmas, NULL,
+ gup_flags);
if (gup_flags & FOLL_LONGTERM) {
- if (rc < 0)
- goto out;
-
- if (check_dax_vmas(vmas_tmp, rc)) {
- if (gup_flags & FOLL_PIN)
- unpin_user_pages(pages, rc);
- else
- for (i = 0; i < rc; i++)
- put_page(pages[i]);
- rc = -EOPNOTSUPP;
- goto out;
- }
-
- rc = check_and_migrate_cma_pages(mm, start, rc, pages,
- vmas_tmp, gup_flags);
-out:
+ if (rc > 0)
+ rc = check_and_migrate_cma_pages(mm, start, rc, pages,
+ vmas, gup_flags);
memalloc_nocma_restore(flags);
}
-
- if (vmas_tmp != vmas)
- kfree(vmas_tmp);
return rc;
}
-#else /* !CONFIG_FS_DAX && !CONFIG_CMA */
-static __always_inline long __gup_longterm_locked(struct mm_struct *mm,
- unsigned long start,
- unsigned long nr_pages,
- struct page **pages,
- struct vm_area_struct **vmas,
- unsigned int flags)
-{
- return __get_user_pages_locked(mm, start, nr_pages, pages, vmas,
- NULL, flags);
-}
-#endif /* CONFIG_FS_DAX || CONFIG_CMA */
static bool is_valid_gup_flags(unsigned int gup_flags)
{
@@ -1932,7 +1845,19 @@ long get_user_pages(unsigned long start, unsigned long nr_pages,
EXPORT_SYMBOL(get_user_pages);
/**
- * get_user_pages_locked() is suitable to replace the form:
+ * get_user_pages_locked() - variant of get_user_pages()
+ *
+ * @start: starting user address
+ * @nr_pages: number of pages from start to pin
+ * @gup_flags: flags modifying lookup behaviour
+ * @pages: array that receives pointers to the pages pinned.
+ * Should be at least nr_pages long. Or NULL, if caller
+ * only intends to ensure the pages are faulted in.
+ * @locked: pointer to lock flag indicating whether lock is held and
+ * subsequently whether VM_FAULT_RETRY functionality can be
+ * utilised. Lock must initially be held.
+ *
+ * It is suitable to replace the form:
*
* mmap_read_lock(mm);
* do_something()
@@ -1948,16 +1873,6 @@ EXPORT_SYMBOL(get_user_pages);
* if (locked)
* mmap_read_unlock(mm);
*
- * @start: starting user address
- * @nr_pages: number of pages from start to pin
- * @gup_flags: flags modifying lookup behaviour
- * @pages: array that receives pointers to the pages pinned.
- * Should be at least nr_pages long. Or NULL, if caller
- * only intends to ensure the pages are faulted in.
- * @locked: pointer to lock flag indicating whether lock is held and
- * subsequently whether VM_FAULT_RETRY functionality can be
- * utilised. Lock must initially be held.
- *
* We can leverage the VM_FAULT_RETRY functionality in the page fault
* paths better by using either get_user_pages_locked() or
* get_user_pages_unlocked().
@@ -2063,84 +1978,6 @@ EXPORT_SYMBOL(get_user_pages_unlocked);
*/
#ifdef CONFIG_HAVE_FAST_GUP
-static void put_compound_head(struct page *page, int refs, unsigned int flags)
-{
- if (flags & FOLL_PIN) {
- mod_node_page_state(page_pgdat(page), NR_FOLL_PIN_RELEASED,
- refs);
-
- if (hpage_pincount_available(page))
- hpage_pincount_sub(page, refs);
- else
- refs *= GUP_PIN_COUNTING_BIAS;
- }
-
- VM_BUG_ON_PAGE(page_ref_count(page) < refs, page);
- /*
- * Calling put_page() for each ref is unnecessarily slow. Only the last
- * ref needs a put_page().
- */
- if (refs > 1)
- page_ref_sub(page, refs - 1);
- put_page(page);
-}
-
-#ifdef CONFIG_GUP_GET_PTE_LOW_HIGH
-
-/*
- * WARNING: only to be used in the get_user_pages_fast() implementation.
- *
- * With get_user_pages_fast(), we walk down the pagetables without taking any
- * locks. For this we would like to load the pointers atomically, but sometimes
- * that is not possible (e.g. without expensive cmpxchg8b on x86_32 PAE). What
- * we do have is the guarantee that a PTE will only either go from not present
- * to present, or present to not present or both -- it will not switch to a
- * completely different present page without a TLB flush in between; something
- * that we are blocking by holding interrupts off.
- *
- * Setting ptes from not present to present goes:
- *
- * ptep->pte_high = h;
- * smp_wmb();
- * ptep->pte_low = l;
- *
- * And present to not present goes:
- *
- * ptep->pte_low = 0;
- * smp_wmb();
- * ptep->pte_high = 0;
- *
- * We must ensure here that the load of pte_low sees 'l' IFF pte_high sees 'h'.
- * We load pte_high *after* loading pte_low, which ensures we don't see an older
- * value of pte_high. *Then* we recheck pte_low, which ensures that we haven't
- * picked up a changed pte high. We might have gotten rubbish values from
- * pte_low and pte_high, but we are guaranteed that pte_low will not have the
- * present bit set *unless* it is 'l'. Because get_user_pages_fast() only
- * operates on present ptes we're safe.
- */
-static inline pte_t gup_get_pte(pte_t *ptep)
-{
- pte_t pte;
-
- do {
- pte.pte_low = ptep->pte_low;
- smp_rmb();
- pte.pte_high = ptep->pte_high;
- smp_rmb();
- } while (unlikely(pte.pte_low != ptep->pte_low));
-
- return pte;
-}
-#else /* CONFIG_GUP_GET_PTE_LOW_HIGH */
-/*
- * We require that the PTE can be read atomically.
- */
-static inline pte_t gup_get_pte(pte_t *ptep)
-{
- return ptep_get(ptep);
-}
-#endif /* CONFIG_GUP_GET_PTE_LOW_HIGH */
-
static void __maybe_unused undo_dev_pagemap(int *nr, int nr_start,
unsigned int flags,
struct page **pages)
@@ -2166,7 +2003,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end,
ptem = ptep = pte_offset_map(&pmd, addr);
do {
- pte_t pte = gup_get_pte(ptep);
+ pte_t pte = ptep_get_lockless(ptep);
struct page *head, *page;
/*
@@ -2677,13 +2514,61 @@ static int __gup_longterm_unlocked(unsigned long start, int nr_pages,
return ret;
}
-static int internal_get_user_pages_fast(unsigned long start, int nr_pages,
+static unsigned long lockless_pages_from_mm(unsigned long start,
+ unsigned long end,
+ unsigned int gup_flags,
+ struct page **pages)
+{
+ unsigned long flags;
+ int nr_pinned = 0;
+ unsigned seq;
+
+ if (!IS_ENABLED(CONFIG_HAVE_FAST_GUP) ||
+ !gup_fast_permitted(start, end))
+ return 0;
+
+ if (gup_flags & FOLL_PIN) {
+ seq = raw_read_seqcount(&current->mm->write_protect_seq);
+ if (seq & 1)
+ return 0;
+ }
+
+ /*
+ * Disable interrupts. The nested form is used, in order to allow full,
+ * general purpose use of this routine.
+ *
+ * With interrupts disabled, we block page table pages from being freed
+ * from under us. See struct mmu_table_batch comments in
+ * include/asm-generic/tlb.h for more details.
+ *
+ * We do not adopt an rcu_read_lock() here as we also want to block IPIs
+ * that come from THPs splitting.
+ */
+ local_irq_save(flags);
+ gup_pgd_range(start, end, gup_flags, pages, &nr_pinned);
+ local_irq_restore(flags);
+
+ /*
+ * When pinning pages for DMA there could be a concurrent write protect
+ * from fork() via copy_page_range(), in this case always fail fast GUP.
+ */
+ if (gup_flags & FOLL_PIN) {
+ if (read_seqcount_retry(&current->mm->write_protect_seq, seq)) {
+ unpin_user_pages(pages, nr_pinned);
+ return 0;
+ }
+ }
+ return nr_pinned;
+}
+
+static int internal_get_user_pages_fast(unsigned long start,
+ unsigned long nr_pages,
unsigned int gup_flags,
struct page **pages)
{
- unsigned long addr, len, end;
- unsigned long flags;
- int nr_pinned = 0, ret = 0;
+ unsigned long len, end;
+ unsigned long nr_pinned;
+ int ret;
if (WARN_ON_ONCE(gup_flags & ~(FOLL_WRITE | FOLL_LONGTERM |
FOLL_FORCE | FOLL_PIN | FOLL_GET |
@@ -2697,54 +2582,33 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages,
might_lock_read(&current->mm->mmap_lock);
start = untagged_addr(start) & PAGE_MASK;
- addr = start;
- len = (unsigned long) nr_pages << PAGE_SHIFT;
- end = start + len;
-
- if (end <= start)
+ len = nr_pages << PAGE_SHIFT;
+ if (check_add_overflow(start, len, &end))
return 0;
if (unlikely(!access_ok((void __user *)start, len)))
return -EFAULT;
- /*
- * Disable interrupts. The nested form is used, in order to allow
- * full, general purpose use of this routine.
- *
- * With interrupts disabled, we block page table pages from being
- * freed from under us. See struct mmu_table_batch comments in
- * include/asm-generic/tlb.h for more details.
- *
- * We do not adopt an rcu_read_lock(.) here as we also want to
- * block IPIs that come from THPs splitting.
- */
- if (IS_ENABLED(CONFIG_HAVE_FAST_GUP) && gup_fast_permitted(start, end)) {
- unsigned long fast_flags = gup_flags;
+ nr_pinned = lockless_pages_from_mm(start, end, gup_flags, pages);
+ if (nr_pinned == nr_pages || gup_flags & FOLL_FAST_ONLY)
+ return nr_pinned;
- local_irq_save(flags);
- gup_pgd_range(addr, end, fast_flags, pages, &nr_pinned);
- local_irq_restore(flags);
- ret = nr_pinned;
- }
-
- if (nr_pinned < nr_pages && !(gup_flags & FOLL_FAST_ONLY)) {
- /* Try to get the remaining pages with get_user_pages */
- start += nr_pinned << PAGE_SHIFT;
- pages += nr_pinned;
-
- ret = __gup_longterm_unlocked(start, nr_pages - nr_pinned,
- gup_flags, pages);
-
- /* Have to be a bit careful with return values */
- if (nr_pinned > 0) {
- if (ret < 0)
- ret = nr_pinned;
- else
- ret += nr_pinned;
- }
+ /* Slow path: try to get the remaining pages with get_user_pages */
+ start += nr_pinned << PAGE_SHIFT;
+ pages += nr_pinned;
+ ret = __gup_longterm_unlocked(start, nr_pages - nr_pinned, gup_flags,
+ pages);
+ if (ret < 0) {
+ /*
+ * The caller has to unpin the pages we already pinned so
+ * returning -errno is not an option
+ */
+ if (nr_pinned)
+ return nr_pinned;
+ return ret;
}
-
- return ret;
+ return ret + nr_pinned;
}
+
/**
* get_user_pages_fast_only() - pin user pages in memory
* @start: starting user address
diff --git a/mm/gup_benchmark.c b/mm/gup_test.c
index 8b3e5b5cd8fa..e3cf78e5873e 100644
--- a/mm/gup_benchmark.c
+++ b/mm/gup_test.c
@@ -4,40 +4,34 @@
#include <linux/uaccess.h>
#include <linux/ktime.h>
#include <linux/debugfs.h>
-
-#define GUP_FAST_BENCHMARK _IOWR('g', 1, struct gup_benchmark)
-#define GUP_BENCHMARK _IOWR('g', 2, struct gup_benchmark)
-#define PIN_FAST_BENCHMARK _IOWR('g', 3, struct gup_benchmark)
-#define PIN_BENCHMARK _IOWR('g', 4, struct gup_benchmark)
-#define PIN_LONGTERM_BENCHMARK _IOWR('g', 5, struct gup_benchmark)
-
-struct gup_benchmark {
- __u64 get_delta_usec;
- __u64 put_delta_usec;
- __u64 addr;
- __u64 size;
- __u32 nr_pages_per_call;
- __u32 flags;
- __u64 expansion[10]; /* For future use */
-};
+#include "gup_test.h"
static void put_back_pages(unsigned int cmd, struct page **pages,
- unsigned long nr_pages)
+ unsigned long nr_pages, unsigned int gup_test_flags)
{
unsigned long i;
switch (cmd) {
case GUP_FAST_BENCHMARK:
- case GUP_BENCHMARK:
+ case GUP_BASIC_TEST:
for (i = 0; i < nr_pages; i++)
put_page(pages[i]);
break;
case PIN_FAST_BENCHMARK:
- case PIN_BENCHMARK:
+ case PIN_BASIC_TEST:
case PIN_LONGTERM_BENCHMARK:
unpin_user_pages(pages, nr_pages);
break;
+ case DUMP_USER_PAGES_TEST:
+ if (gup_test_flags & GUP_TEST_FLAG_DUMP_PAGES_USE_PIN) {
+ unpin_user_pages(pages, nr_pages);
+ } else {
+ for (i = 0; i < nr_pages; i++)
+ put_page(pages[i]);
+
+ }
+ break;
}
}
@@ -49,14 +43,14 @@ static void verify_dma_pinned(unsigned int cmd, struct page **pages,
switch (cmd) {
case PIN_FAST_BENCHMARK:
- case PIN_BENCHMARK:
+ case PIN_BASIC_TEST:
case PIN_LONGTERM_BENCHMARK:
for (i = 0; i < nr_pages; i++) {
page = pages[i];
if (WARN(!page_maybe_dma_pinned(page),
"pages[%lu] is NOT dma-pinned\n", i)) {
- dump_page(page, "gup_benchmark failure");
+ dump_page(page, "gup_test failure");
break;
}
}
@@ -64,8 +58,39 @@ static void verify_dma_pinned(unsigned int cmd, struct page **pages,
}
}
-static int __gup_benchmark_ioctl(unsigned int cmd,
- struct gup_benchmark *gup)
+static void dump_pages_test(struct gup_test *gup, struct page **pages,
+ unsigned long nr_pages)
+{
+ unsigned int index_to_dump;
+ unsigned int i;
+
+ /*
+ * Zero out any user-supplied page index that is out of range. Remember:
+ * .which_pages[] contains a 1-based set of page indices.
+ */
+ for (i = 0; i < GUP_TEST_MAX_PAGES_TO_DUMP; i++) {
+ if (gup->which_pages[i] > nr_pages) {
+ pr_warn("ZEROING due to out of range: .which_pages[%u]: %u\n",
+ i, gup->which_pages[i]);
+ gup->which_pages[i] = 0;
+ }
+ }
+
+ for (i = 0; i < GUP_TEST_MAX_PAGES_TO_DUMP; i++) {
+ index_to_dump = gup->which_pages[i];
+
+ if (index_to_dump) {
+ index_to_dump--; // Decode from 1-based, to 0-based
+ pr_info("---- page #%u, starting from user virt addr: 0x%llx\n",
+ index_to_dump, gup->addr);
+ dump_page(pages[index_to_dump],
+ "gup_test: dump_pages() test");
+ }
+ }
+}
+
+static int __gup_test_ioctl(unsigned int cmd,
+ struct gup_test *gup)
{
ktime_t start_time, end_time;
unsigned long i, nr_pages, addr, next;
@@ -109,7 +134,7 @@ static int __gup_benchmark_ioctl(unsigned int cmd,
nr = get_user_pages_fast(addr, nr, gup->flags,
pages + i);
break;
- case GUP_BENCHMARK:
+ case GUP_BASIC_TEST:
nr = get_user_pages(addr, nr, gup->flags, pages + i,
NULL);
break;
@@ -117,7 +142,7 @@ static int __gup_benchmark_ioctl(unsigned int cmd,
nr = pin_user_pages_fast(addr, nr, gup->flags,
pages + i);
break;
- case PIN_BENCHMARK:
+ case PIN_BASIC_TEST:
nr = pin_user_pages(addr, nr, gup->flags, pages + i,
NULL);
break;
@@ -126,6 +151,14 @@ static int __gup_benchmark_ioctl(unsigned int cmd,
gup->flags | FOLL_LONGTERM,
pages + i, NULL);
break;
+ case DUMP_USER_PAGES_TEST:
+ if (gup->flags & GUP_TEST_FLAG_DUMP_PAGES_USE_PIN)
+ nr = pin_user_pages(addr, nr, gup->flags,
+ pages + i, NULL);
+ else
+ nr = get_user_pages(addr, nr, gup->flags,
+ pages + i, NULL);
+ break;
default:
ret = -EINVAL;
goto unlock;
@@ -149,9 +182,12 @@ static int __gup_benchmark_ioctl(unsigned int cmd,
*/
verify_dma_pinned(cmd, pages, nr_pages);
+ if (cmd == DUMP_USER_PAGES_TEST)
+ dump_pages_test(gup, pages, nr_pages);
+
start_time = ktime_get();
- put_back_pages(cmd, pages, nr_pages);
+ put_back_pages(cmd, pages, nr_pages, gup->flags);
end_time = ktime_get();
gup->put_delta_usec = ktime_us_delta(end_time, start_time);
@@ -164,18 +200,19 @@ free_pages:
return ret;
}
-static long gup_benchmark_ioctl(struct file *filep, unsigned int cmd,
+static long gup_test_ioctl(struct file *filep, unsigned int cmd,
unsigned long arg)
{
- struct gup_benchmark gup;
+ struct gup_test gup;
int ret;
switch (cmd) {
case GUP_FAST_BENCHMARK:
- case GUP_BENCHMARK:
case PIN_FAST_BENCHMARK:
- case PIN_BENCHMARK:
case PIN_LONGTERM_BENCHMARK:
+ case GUP_BASIC_TEST:
+ case PIN_BASIC_TEST:
+ case DUMP_USER_PAGES_TEST:
break;
default:
return -EINVAL;
@@ -184,7 +221,7 @@ static long gup_benchmark_ioctl(struct file *filep, unsigned int cmd,
if (copy_from_user(&gup, (void __user *)arg, sizeof(gup)))
return -EFAULT;
- ret = __gup_benchmark_ioctl(cmd, &gup);
+ ret = __gup_test_ioctl(cmd, &gup);
if (ret)
return ret;
@@ -194,17 +231,17 @@ static long gup_benchmark_ioctl(struct file *filep, unsigned int cmd,
return 0;
}
-static const struct file_operations gup_benchmark_fops = {
+static const struct file_operations gup_test_fops = {
.open = nonseekable_open,
- .unlocked_ioctl = gup_benchmark_ioctl,
+ .unlocked_ioctl = gup_test_ioctl,
};
-static int gup_benchmark_init(void)
+static int __init gup_test_init(void)
{
- debugfs_create_file_unsafe("gup_benchmark", 0600, NULL, NULL,
- &gup_benchmark_fops);
+ debugfs_create_file_unsafe("gup_test", 0600, NULL, NULL,
+ &gup_test_fops);
return 0;
}
-late_initcall(gup_benchmark_init);
+late_initcall(gup_test_init);
diff --git a/mm/gup_test.h b/mm/gup_test.h
new file mode 100644
index 000000000000..90a6713d50eb
--- /dev/null
+++ b/mm/gup_test.h
@@ -0,0 +1,32 @@
+/* SPDX-License-Identifier: GPL-2.0-or-later */
+#ifndef __GUP_TEST_H
+#define __GUP_TEST_H
+
+#include <linux/types.h>
+
+#define GUP_FAST_BENCHMARK _IOWR('g', 1, struct gup_test)
+#define PIN_FAST_BENCHMARK _IOWR('g', 2, struct gup_test)
+#define PIN_LONGTERM_BENCHMARK _IOWR('g', 3, struct gup_test)
+#define GUP_BASIC_TEST _IOWR('g', 4, struct gup_test)
+#define PIN_BASIC_TEST _IOWR('g', 5, struct gup_test)
+#define DUMP_USER_PAGES_TEST _IOWR('g', 6, struct gup_test)
+
+#define GUP_TEST_MAX_PAGES_TO_DUMP 8
+
+#define GUP_TEST_FLAG_DUMP_PAGES_USE_PIN 0x1
+
+struct gup_test {
+ __u64 get_delta_usec;
+ __u64 put_delta_usec;
+ __u64 addr;
+ __u64 size;
+ __u32 nr_pages_per_call;
+ __u32 flags;
+ /*
+ * Each non-zero entry is the number of the page (1-based: first page is
+ * page 1, so that zero entries mean "do nothing") from the .addr base.
+ */
+ __u32 which_pages[GUP_TEST_MAX_PAGES_TO_DUMP];
+};
+
+#endif /* __GUP_TEST_H */
diff --git a/mm/highmem.c b/mm/highmem.c
index 1352a27951e3..c3a9ea7875ef 100644
--- a/mm/highmem.c
+++ b/mm/highmem.c
@@ -31,10 +31,6 @@
#include <asm/tlbflush.h>
#include <linux/vmalloc.h>
-#if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32)
-DEFINE_PER_CPU(int, __kmap_atomic_idx);
-#endif
-
/*
* Virtual_count is not a pure "count".
* 0 means that it is not mapped, and has not been mapped
@@ -108,9 +104,7 @@ static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color)
atomic_long_t _totalhigh_pages __read_mostly;
EXPORT_SYMBOL(_totalhigh_pages);
-EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx);
-
-unsigned int nr_free_highpages (void)
+unsigned int __nr_free_highpages (void)
{
struct zone *zone;
unsigned int pages = 0;
@@ -147,7 +141,7 @@ pte_t * pkmap_page_table;
do { spin_unlock(&kmap_lock); (void)(flags); } while (0)
#endif
-struct page *kmap_to_page(void *vaddr)
+struct page *__kmap_to_page(void *vaddr)
{
unsigned long addr = (unsigned long)vaddr;
@@ -158,7 +152,7 @@ struct page *kmap_to_page(void *vaddr)
return virt_to_page(addr);
}
-EXPORT_SYMBOL(kmap_to_page);
+EXPORT_SYMBOL(__kmap_to_page);
static void flush_all_zero_pkmaps(void)
{
@@ -200,10 +194,7 @@ static void flush_all_zero_pkmaps(void)
flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP));
}
-/**
- * kmap_flush_unused - flush all unused kmap mappings in order to remove stray mappings
- */
-void kmap_flush_unused(void)
+void __kmap_flush_unused(void)
{
lock_kmap();
flush_all_zero_pkmaps();
@@ -367,9 +358,312 @@ void kunmap_high(struct page *page)
if (need_wakeup)
wake_up(pkmap_map_wait);
}
-
EXPORT_SYMBOL(kunmap_high);
-#endif /* CONFIG_HIGHMEM */
+
+#ifdef CONFIG_TRANSPARENT_HUGEPAGE
+void zero_user_segments(struct page *page, unsigned start1, unsigned end1,
+ unsigned start2, unsigned end2)
+{
+ unsigned int i;
+
+ BUG_ON(end1 > page_size(page) || end2 > page_size(page));
+
+ for (i = 0; i < compound_nr(page); i++) {
+ void *kaddr = NULL;
+
+ if (start1 < PAGE_SIZE || start2 < PAGE_SIZE)
+ kaddr = kmap_atomic(page + i);
+
+ if (start1 >= PAGE_SIZE) {
+ start1 -= PAGE_SIZE;
+ end1 -= PAGE_SIZE;
+ } else {
+ unsigned this_end = min_t(unsigned, end1, PAGE_SIZE);
+
+ if (end1 > start1)
+ memset(kaddr + start1, 0, this_end - start1);
+ end1 -= this_end;
+ start1 = 0;
+ }
+
+ if (start2 >= PAGE_SIZE) {
+ start2 -= PAGE_SIZE;
+ end2 -= PAGE_SIZE;
+ } else {
+ unsigned this_end = min_t(unsigned, end2, PAGE_SIZE);
+
+ if (end2 > start2)
+ memset(kaddr + start2, 0, this_end - start2);
+ end2 -= this_end;
+ start2 = 0;
+ }
+
+ if (kaddr) {
+ kunmap_atomic(kaddr);
+ flush_dcache_page(page + i);
+ }
+
+ if (!end1 && !end2)
+ break;
+ }
+
+ BUG_ON((start1 | start2 | end1 | end2) != 0);
+}
+EXPORT_SYMBOL(zero_user_segments);
+#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
+#endif /* CONFIG_HIGHMEM */
+
+#ifdef CONFIG_KMAP_LOCAL
+
+#include <asm/kmap_size.h>
+
+/*
+ * With DEBUG_KMAP_LOCAL the stack depth is doubled and every second
+ * slot is unused which acts as a guard page
+ */
+#ifdef CONFIG_DEBUG_KMAP_LOCAL
+# define KM_INCR 2
+#else
+# define KM_INCR 1
+#endif
+
+static inline int kmap_local_idx_push(void)
+{
+ WARN_ON_ONCE(in_irq() && !irqs_disabled());
+ current->kmap_ctrl.idx += KM_INCR;
+ BUG_ON(current->kmap_ctrl.idx >= KM_MAX_IDX);
+ return current->kmap_ctrl.idx - 1;
+}
+
+static inline int kmap_local_idx(void)
+{
+ return current->kmap_ctrl.idx - 1;
+}
+
+static inline void kmap_local_idx_pop(void)
+{
+ current->kmap_ctrl.idx -= KM_INCR;
+ BUG_ON(current->kmap_ctrl.idx < 0);
+}
+
+#ifndef arch_kmap_local_post_map
+# define arch_kmap_local_post_map(vaddr, pteval) do { } while (0)
+#endif
+
+#ifndef arch_kmap_local_pre_unmap
+# define arch_kmap_local_pre_unmap(vaddr) do { } while (0)
+#endif
+
+#ifndef arch_kmap_local_post_unmap
+# define arch_kmap_local_post_unmap(vaddr) do { } while (0)
+#endif
+
+#ifndef arch_kmap_local_map_idx
+#define arch_kmap_local_map_idx(idx, pfn) kmap_local_calc_idx(idx)
+#endif
+
+#ifndef arch_kmap_local_unmap_idx
+#define arch_kmap_local_unmap_idx(idx, vaddr) kmap_local_calc_idx(idx)
+#endif
+
+#ifndef arch_kmap_local_high_get
+static inline void *arch_kmap_local_high_get(struct page *page)
+{
+ return NULL;
+}
+#endif
+
+/* Unmap a local mapping which was obtained by kmap_high_get() */
+static inline bool kmap_high_unmap_local(unsigned long vaddr)
+{
+#ifdef ARCH_NEEDS_KMAP_HIGH_GET
+ if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) {
+ kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)]));
+ return true;
+ }
+#endif
+ return false;
+}
+
+static inline int kmap_local_calc_idx(int idx)
+{
+ return idx + KM_MAX_IDX * smp_processor_id();
+}
+
+static pte_t *__kmap_pte;
+
+static pte_t *kmap_get_pte(void)
+{
+ if (!__kmap_pte)
+ __kmap_pte = virt_to_kpte(__fix_to_virt(FIX_KMAP_BEGIN));
+ return __kmap_pte;
+}
+
+void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot)
+{
+ pte_t pteval, *kmap_pte = kmap_get_pte();
+ unsigned long vaddr;
+ int idx;
+
+ /*
+ * Disable migration so resulting virtual address is stable
+ * accross preemption.
+ */
+ migrate_disable();
+ preempt_disable();
+ idx = arch_kmap_local_map_idx(kmap_local_idx_push(), pfn);
+ vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
+ BUG_ON(!pte_none(*(kmap_pte - idx)));
+ pteval = pfn_pte(pfn, prot);
+ set_pte_at(&init_mm, vaddr, kmap_pte - idx, pteval);
+ arch_kmap_local_post_map(vaddr, pteval);
+ current->kmap_ctrl.pteval[kmap_local_idx()] = pteval;
+ preempt_enable();
+
+ return (void *)vaddr;
+}
+EXPORT_SYMBOL_GPL(__kmap_local_pfn_prot);
+
+void *__kmap_local_page_prot(struct page *page, pgprot_t prot)
+{
+ void *kmap;
+
+ /*
+ * To broaden the usage of the actual kmap_local() machinery always map
+ * pages when debugging is enabled and the architecture has no problems
+ * with alias mappings.
+ */
+ if (!IS_ENABLED(CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP) && !PageHighMem(page))
+ return page_address(page);
+
+ /* Try kmap_high_get() if architecture has it enabled */
+ kmap = arch_kmap_local_high_get(page);
+ if (kmap)
+ return kmap;
+
+ return __kmap_local_pfn_prot(page_to_pfn(page), prot);
+}
+EXPORT_SYMBOL(__kmap_local_page_prot);
+
+void kunmap_local_indexed(void *vaddr)
+{
+ unsigned long addr = (unsigned long) vaddr & PAGE_MASK;
+ pte_t *kmap_pte = kmap_get_pte();
+ int idx;
+
+ if (addr < __fix_to_virt(FIX_KMAP_END) ||
+ addr > __fix_to_virt(FIX_KMAP_BEGIN)) {
+ if (IS_ENABLED(CONFIG_DEBUG_KMAP_LOCAL_FORCE_MAP)) {
+ /* This _should_ never happen! See above. */
+ WARN_ON_ONCE(1);
+ return;
+ }
+ /*
+ * Handle mappings which were obtained by kmap_high_get()
+ * first as the virtual address of such mappings is below
+ * PAGE_OFFSET. Warn for all other addresses which are in
+ * the user space part of the virtual address space.
+ */
+ if (!kmap_high_unmap_local(addr))
+ WARN_ON_ONCE(addr < PAGE_OFFSET);
+ return;
+ }
+
+ preempt_disable();
+ idx = arch_kmap_local_unmap_idx(kmap_local_idx(), addr);
+ WARN_ON_ONCE(addr != __fix_to_virt(FIX_KMAP_BEGIN + idx));
+
+ arch_kmap_local_pre_unmap(addr);
+ pte_clear(&init_mm, addr, kmap_pte - idx);
+ arch_kmap_local_post_unmap(addr);
+ current->kmap_ctrl.pteval[kmap_local_idx()] = __pte(0);
+ kmap_local_idx_pop();
+ preempt_enable();
+ migrate_enable();
+}
+EXPORT_SYMBOL(kunmap_local_indexed);
+
+/*
+ * Invoked before switch_to(). This is safe even when during or after
+ * clearing the maps an interrupt which needs a kmap_local happens because
+ * the task::kmap_ctrl.idx is not modified by the unmapping code so a
+ * nested kmap_local will use the next unused index and restore the index
+ * on unmap. The already cleared kmaps of the outgoing task are irrelevant
+ * because the interrupt context does not know about them. The same applies
+ * when scheduling back in for an interrupt which happens before the
+ * restore is complete.
+ */
+void __kmap_local_sched_out(void)
+{
+ struct task_struct *tsk = current;
+ pte_t *kmap_pte = kmap_get_pte();
+ int i;
+
+ /* Clear kmaps */
+ for (i = 0; i < tsk->kmap_ctrl.idx; i++) {
+ pte_t pteval = tsk->kmap_ctrl.pteval[i];
+ unsigned long addr;
+ int idx;
+
+ /* With debug all even slots are unmapped and act as guard */
+ if (IS_ENABLED(CONFIG_DEBUG_HIGHMEM) && !(i & 0x01)) {
+ WARN_ON_ONCE(!pte_none(pteval));
+ continue;
+ }
+ if (WARN_ON_ONCE(pte_none(pteval)))
+ continue;
+
+ /*
+ * This is a horrible hack for XTENSA to calculate the
+ * coloured PTE index. Uses the PFN encoded into the pteval
+ * and the map index calculation because the actual mapped
+ * virtual address is not stored in task::kmap_ctrl.
+ * For any sane architecture this is optimized out.
+ */
+ idx = arch_kmap_local_map_idx(i, pte_pfn(pteval));
+
+ addr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
+ arch_kmap_local_pre_unmap(addr);
+ pte_clear(&init_mm, addr, kmap_pte - idx);
+ arch_kmap_local_post_unmap(addr);
+ }
+}
+
+void __kmap_local_sched_in(void)
+{
+ struct task_struct *tsk = current;
+ pte_t *kmap_pte = kmap_get_pte();
+ int i;
+
+ /* Restore kmaps */
+ for (i = 0; i < tsk->kmap_ctrl.idx; i++) {
+ pte_t pteval = tsk->kmap_ctrl.pteval[i];
+ unsigned long addr;
+ int idx;
+
+ /* With debug all even slots are unmapped and act as guard */
+ if (IS_ENABLED(CONFIG_DEBUG_HIGHMEM) && !(i & 0x01)) {
+ WARN_ON_ONCE(!pte_none(pteval));
+ continue;
+ }
+ if (WARN_ON_ONCE(pte_none(pteval)))
+ continue;
+
+ /* See comment in __kmap_local_sched_out() */
+ idx = arch_kmap_local_map_idx(i, pte_pfn(pteval));
+ addr = __fix_to_virt(FIX_KMAP_BEGIN + idx);
+ set_pte_at(&init_mm, addr, kmap_pte - idx, pteval);
+ arch_kmap_local_post_map(addr, pteval);
+ }
+}
+
+void kmap_local_fork(struct task_struct *tsk)
+{
+ if (WARN_ON_ONCE(tsk->kmap_ctrl.idx))
+ memset(&tsk->kmap_ctrl, 0, sizeof(tsk->kmap_ctrl));
+}
+
+#endif
#if defined(HASHED_PAGE_VIRTUAL)
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index ec2bb93f7431..9237976abe72 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -163,12 +163,17 @@ static struct shrinker huge_zero_page_shrinker = {
static ssize_t enabled_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
+ const char *output;
+
if (test_bit(TRANSPARENT_HUGEPAGE_FLAG, &transparent_hugepage_flags))
- return sprintf(buf, "[always] madvise never\n");
- else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG, &transparent_hugepage_flags))
- return sprintf(buf, "always [madvise] never\n");
+ output = "[always] madvise never";
+ else if (test_bit(TRANSPARENT_HUGEPAGE_REQ_MADV_FLAG,
+ &transparent_hugepage_flags))
+ output = "always [madvise] never";
else
- return sprintf(buf, "always madvise [never]\n");
+ output = "always madvise [never]";
+
+ return sysfs_emit(buf, "%s\n", output);
}
static ssize_t enabled_store(struct kobject *kobj,
@@ -200,11 +205,11 @@ static struct kobj_attribute enabled_attr =
__ATTR(enabled, 0644, enabled_show, enabled_store);
ssize_t single_hugepage_flag_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf,
- enum transparent_hugepage_flag flag)
+ struct kobj_attribute *attr, char *buf,
+ enum transparent_hugepage_flag flag)
{
- return sprintf(buf, "%d\n",
- !!test_bit(flag, &transparent_hugepage_flags));
+ return sysfs_emit(buf, "%d\n",
+ !!test_bit(flag, &transparent_hugepage_flags));
}
ssize_t single_hugepage_flag_store(struct kobject *kobj,
@@ -232,15 +237,24 @@ ssize_t single_hugepage_flag_store(struct kobject *kobj,
static ssize_t defrag_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG, &transparent_hugepage_flags))
- return sprintf(buf, "[always] defer defer+madvise madvise never\n");
- if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG, &transparent_hugepage_flags))
- return sprintf(buf, "always [defer] defer+madvise madvise never\n");
- if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG, &transparent_hugepage_flags))
- return sprintf(buf, "always defer [defer+madvise] madvise never\n");
- if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG, &transparent_hugepage_flags))
- return sprintf(buf, "always defer defer+madvise [madvise] never\n");
- return sprintf(buf, "always defer defer+madvise madvise [never]\n");
+ const char *output;
+
+ if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_DIRECT_FLAG,
+ &transparent_hugepage_flags))
+ output = "[always] defer defer+madvise madvise never";
+ else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_FLAG,
+ &transparent_hugepage_flags))
+ output = "always [defer] defer+madvise madvise never";
+ else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_KSWAPD_OR_MADV_FLAG,
+ &transparent_hugepage_flags))
+ output = "always defer [defer+madvise] madvise never";
+ else if (test_bit(TRANSPARENT_HUGEPAGE_DEFRAG_REQ_MADV_FLAG,
+ &transparent_hugepage_flags))
+ output = "always defer defer+madvise [madvise] never";
+ else
+ output = "always defer defer+madvise madvise [never]";
+
+ return sysfs_emit(buf, "%s\n", output);
}
static ssize_t defrag_store(struct kobject *kobj,
@@ -281,10 +295,10 @@ static struct kobj_attribute defrag_attr =
__ATTR(defrag, 0644, defrag_show, defrag_store);
static ssize_t use_zero_page_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
+ struct kobj_attribute *attr, char *buf)
{
return single_hugepage_flag_show(kobj, attr, buf,
- TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
+ TRANSPARENT_HUGEPAGE_USE_ZERO_PAGE_FLAG);
}
static ssize_t use_zero_page_store(struct kobject *kobj,
struct kobj_attribute *attr, const char *buf, size_t count)
@@ -296,9 +310,9 @@ static struct kobj_attribute use_zero_page_attr =
__ATTR(use_zero_page, 0644, use_zero_page_show, use_zero_page_store);
static ssize_t hpage_pmd_size_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
+ struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%lu\n", HPAGE_PMD_SIZE);
+ return sysfs_emit(buf, "%lu\n", HPAGE_PMD_SIZE);
}
static struct kobj_attribute hpage_pmd_size_attr =
__ATTR_RO(hpage_pmd_size);
@@ -470,7 +484,7 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma)
#ifdef CONFIG_MEMCG
static inline struct deferred_split *get_deferred_split_queue(struct page *page)
{
- struct mem_cgroup *memcg = compound_head(page)->mem_cgroup;
+ struct mem_cgroup *memcg = page_memcg(compound_head(page));
struct pglist_data *pgdat = NODE_DATA(page_to_nid(page));
if (memcg)
@@ -2321,7 +2335,7 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma,
static void unmap_page(struct page *page)
{
- enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS |
+ enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK |
TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD;
bool unmap_success;
@@ -2345,6 +2359,27 @@ static void remap_page(struct page *page, unsigned int nr)
}
}
+static void lru_add_page_tail(struct page *head, struct page *tail,
+ struct lruvec *lruvec, struct list_head *list)
+{
+ VM_BUG_ON_PAGE(!PageHead(head), head);
+ VM_BUG_ON_PAGE(PageCompound(tail), head);
+ VM_BUG_ON_PAGE(PageLRU(tail), head);
+ lockdep_assert_held(&lruvec->lru_lock);
+
+ if (list) {
+ /* page reclaim is reclaiming a huge page */
+ VM_WARN_ON(PageLRU(head));
+ get_page(tail);
+ list_add_tail(&tail->lru, list);
+ } else {
+ /* head is still on lru (and we have it frozen) */
+ VM_WARN_ON(!PageLRU(head));
+ SetPageLRU(tail);
+ list_add_tail(&tail->lru, &head->lru);
+ }
+}
+
static void __split_huge_page_tail(struct page *head, int tail,
struct lruvec *lruvec, struct list_head *list)
{
@@ -2356,7 +2391,7 @@ static void __split_huge_page_tail(struct page *head, int tail,
* Clone page flags before unfreezing refcount.
*
* After successful get_page_unless_zero() might follow flags change,
- * for exmaple lock_page() which set PG_waiters.
+ * for example lock_page() which set PG_waiters.
*/
page_tail->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
page_tail->flags |= (head->flags &
@@ -2411,18 +2446,15 @@ static void __split_huge_page_tail(struct page *head, int tail,
}
static void __split_huge_page(struct page *page, struct list_head *list,
- pgoff_t end, unsigned long flags)
+ pgoff_t end)
{
struct page *head = compound_head(page);
- pg_data_t *pgdat = page_pgdat(head);
struct lruvec *lruvec;
struct address_space *swap_cache = NULL;
unsigned long offset = 0;
unsigned int nr = thp_nr_pages(head);
int i;
- lruvec = mem_cgroup_page_lruvec(head, pgdat);
-
/* complete memcg works before add pages to LRU */
mem_cgroup_split_huge_fixup(head);
@@ -2434,6 +2466,9 @@ static void __split_huge_page(struct page *page, struct list_head *list,
xa_lock(&swap_cache->i_pages);
}
+ /* lock lru list/PageCompound, ref freezed by page_ref_freeze */
+ lruvec = lock_page_lruvec(head);
+
for (i = nr - 1; i >= 1; i--) {
__split_huge_page_tail(head, i, lruvec, list);
/* Some pages can be beyond i_size: drop them from page cache */
@@ -2453,6 +2488,8 @@ static void __split_huge_page(struct page *page, struct list_head *list,
}
ClearPageCompound(head);
+ unlock_page_lruvec(lruvec);
+ /* Caller disabled irqs, so they are still disabled here */
split_page_owner(head, nr);
@@ -2470,8 +2507,7 @@ static void __split_huge_page(struct page *page, struct list_head *list,
page_ref_add(head, 2);
xa_unlock(&head->mapping->i_pages);
}
-
- spin_unlock_irqrestore(&pgdat->lru_lock, flags);
+ local_irq_enable();
remap_page(head, nr);
@@ -2617,12 +2653,10 @@ bool can_split_huge_page(struct page *page, int *pextra_pins)
int split_huge_page_to_list(struct page *page, struct list_head *list)
{
struct page *head = compound_head(page);
- struct pglist_data *pgdata = NODE_DATA(page_to_nid(head));
struct deferred_split *ds_queue = get_deferred_split_queue(head);
struct anon_vma *anon_vma = NULL;
struct address_space *mapping = NULL;
int count, mapcount, extra_pins, ret;
- unsigned long flags;
pgoff_t end;
VM_BUG_ON_PAGE(is_huge_zero_page(head), head);
@@ -2683,9 +2717,8 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
unmap_page(head);
VM_BUG_ON_PAGE(compound_mapcount(head), head);
- /* prevent PageLRU to go away from under us, and freeze lru stats */
- spin_lock_irqsave(&pgdata->lru_lock, flags);
-
+ /* block interrupt reentry in xa_lock and spinlock */
+ local_irq_disable();
if (mapping) {
XA_STATE(xas, &mapping->i_pages, page_index(head));
@@ -2710,12 +2743,12 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
spin_unlock(&ds_queue->split_queue_lock);
if (mapping) {
if (PageSwapBacked(head))
- __dec_node_page_state(head, NR_SHMEM_THPS);
+ __dec_lruvec_page_state(head, NR_SHMEM_THPS);
else
- __dec_node_page_state(head, NR_FILE_THPS);
+ __dec_lruvec_page_state(head, NR_FILE_THPS);
}
- __split_huge_page(page, list, end, flags);
+ __split_huge_page(page, list, end);
ret = 0;
} else {
if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) {
@@ -2729,7 +2762,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list)
spin_unlock(&ds_queue->split_queue_lock);
fail: if (mapping)
xa_unlock(&mapping->i_pages);
- spin_unlock_irqrestore(&pgdata->lru_lock, flags);
+ local_irq_enable();
remap_page(head, thp_nr_pages(head));
ret = -EBUSY;
}
@@ -2764,7 +2797,7 @@ void deferred_split_huge_page(struct page *page)
{
struct deferred_split *ds_queue = get_deferred_split_queue(page);
#ifdef CONFIG_MEMCG
- struct mem_cgroup *memcg = compound_head(page)->mem_cgroup;
+ struct mem_cgroup *memcg = page_memcg(compound_head(page));
#endif
unsigned long flags;
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 37f15c3c24dc..cbf32d2824fd 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -1216,6 +1216,7 @@ static void destroy_compound_gigantic_page(struct page *page,
}
set_compound_order(page, 0);
+ page[1].compound_nr = 0;
__ClearPageHead(page);
}
@@ -1943,13 +1944,14 @@ struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma,
* Increase the hugetlb pool such that it can accommodate a reservation
* of size 'delta'.
*/
-static int gather_surplus_pages(struct hstate *h, int delta)
+static int gather_surplus_pages(struct hstate *h, long delta)
__must_hold(&hugetlb_lock)
{
struct list_head surplus_list;
struct page *page, *tmp;
- int ret, i;
- int needed, allocated;
+ int ret;
+ long i;
+ long needed, allocated;
bool alloc_ok = true;
needed = (h->resv_huge_pages + delta) - h->free_huge_pages;
@@ -2013,8 +2015,7 @@ retry:
* This page is now managed by the hugetlb allocator and has
* no users -- drop the buddy allocator's reference.
*/
- put_page_testzero(page);
- VM_BUG_ON_PAGE(page_count(page), page);
+ VM_BUG_ON_PAGE(!put_page_testzero(page), page);
enqueue_huge_page(h, page);
}
free:
@@ -2759,7 +2760,7 @@ static ssize_t nr_hugepages_show_common(struct kobject *kobj,
else
nr_huge_pages = h->nr_huge_pages_node[nid];
- return sprintf(buf, "%lu\n", nr_huge_pages);
+ return sysfs_emit(buf, "%lu\n", nr_huge_pages);
}
static ssize_t __nr_hugepages_store_common(bool obey_mempolicy,
@@ -2832,7 +2833,8 @@ HSTATE_ATTR(nr_hugepages);
* huge page alloc/free.
*/
static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
+ struct kobj_attribute *attr,
+ char *buf)
{
return nr_hugepages_show_common(kobj, attr, buf);
}
@@ -2850,7 +2852,7 @@ static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
struct hstate *h = kobj_to_hstate(kobj, NULL);
- return sprintf(buf, "%lu\n", h->nr_overcommit_huge_pages);
+ return sysfs_emit(buf, "%lu\n", h->nr_overcommit_huge_pages);
}
static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
@@ -2888,7 +2890,7 @@ static ssize_t free_hugepages_show(struct kobject *kobj,
else
free_huge_pages = h->free_huge_pages_node[nid];
- return sprintf(buf, "%lu\n", free_huge_pages);
+ return sysfs_emit(buf, "%lu\n", free_huge_pages);
}
HSTATE_ATTR_RO(free_hugepages);
@@ -2896,7 +2898,7 @@ static ssize_t resv_hugepages_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
struct hstate *h = kobj_to_hstate(kobj, NULL);
- return sprintf(buf, "%lu\n", h->resv_huge_pages);
+ return sysfs_emit(buf, "%lu\n", h->resv_huge_pages);
}
HSTATE_ATTR_RO(resv_hugepages);
@@ -2913,7 +2915,7 @@ static ssize_t surplus_hugepages_show(struct kobject *kobj,
else
surplus_huge_pages = h->surplus_huge_pages_node[nid];
- return sprintf(buf, "%lu\n", surplus_huge_pages);
+ return sysfs_emit(buf, "%lu\n", surplus_huge_pages);
}
HSTATE_ATTR_RO(surplus_hugepages);
@@ -3197,8 +3199,6 @@ void __init hugetlb_add_hstate(unsigned int order)
h = &hstates[hugetlb_max_hstate++];
h->order = order;
h->mask = ~((1ULL << (order + PAGE_SHIFT)) - 1);
- h->nr_huge_pages = 0;
- h->free_huge_pages = 0;
for (i = 0; i < MAX_NUMNODES; ++i)
INIT_LIST_HEAD(&h->hugepage_freelists[i]);
INIT_LIST_HEAD(&h->hugepage_activelist);
@@ -3672,7 +3672,7 @@ const struct vm_operations_struct hugetlb_vm_ops = {
.fault = hugetlb_vm_op_fault,
.open = hugetlb_vm_op_open,
.close = hugetlb_vm_op_close,
- .split = hugetlb_vm_op_split,
+ .may_split = hugetlb_vm_op_split,
.pagesize = hugetlb_vm_op_pagesize,
};
@@ -5114,6 +5114,7 @@ int hugetlb_reserve_pages(struct inode *inode,
if (unlikely(add < 0)) {
hugetlb_acct_memory(h, -gbl_reserve);
+ ret = add;
goto out_put_pages;
} else if (unlikely(chg > add)) {
/*
diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c
index 1f87aec9ab5c..9182848dda3e 100644
--- a/mm/hugetlb_cgroup.c
+++ b/mm/hugetlb_cgroup.c
@@ -82,11 +82,8 @@ static inline bool hugetlb_cgroup_have_usage(struct hugetlb_cgroup *h_cg)
for (idx = 0; idx < hugetlb_max_hstate; idx++) {
if (page_counter_read(
- hugetlb_cgroup_counter_from_cgroup(h_cg, idx)) ||
- page_counter_read(hugetlb_cgroup_counter_from_cgroup_rsvd(
- h_cg, idx))) {
+ hugetlb_cgroup_counter_from_cgroup(h_cg, idx)))
return true;
- }
}
return false;
}
@@ -202,9 +199,10 @@ static void hugetlb_cgroup_css_offline(struct cgroup_subsys_state *css)
struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(css);
struct hstate *h;
struct page *page;
- int idx = 0;
+ int idx;
do {
+ idx = 0;
for_each_hstate(h) {
spin_lock(&hugetlb_lock);
list_for_each_entry(page, &h->hugepage_activelist, lru)
diff --git a/mm/init-mm.c b/mm/init-mm.c
index 3a613c85f9ed..153162669f80 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -31,6 +31,7 @@ struct mm_struct init_mm = {
.pgd = swapper_pg_dir,
.mm_users = ATOMIC_INIT(2),
.mm_count = ATOMIC_INIT(1),
+ .write_protect_seq = SEQCNT_ZERO(init_mm.write_protect_seq),
MMAP_LOCK_INITIALIZER(init_mm)
.page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock),
.arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
diff --git a/mm/internal.h b/mm/internal.h
index c43ccdddb0f6..25d2b2439f19 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -199,8 +199,13 @@ extern void post_alloc_hook(struct page *page, unsigned int order,
gfp_t gfp_flags);
extern int user_min_free_kbytes;
+extern void free_unref_page(struct page *page);
+extern void free_unref_page_list(struct list_head *list);
+
extern void zone_pcp_update(struct zone *zone);
extern void zone_pcp_reset(struct zone *zone);
+extern void zone_pcp_disable(struct zone *zone);
+extern void zone_pcp_enable(struct zone *zone);
#if defined CONFIG_COMPACTION || defined CONFIG_CMA
diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile
index 370d970e5ab5..9fe39a66388a 100644
--- a/mm/kasan/Makefile
+++ b/mm/kasan/Makefile
@@ -6,12 +6,15 @@ KCOV_INSTRUMENT := n
# Disable ftrace to avoid recursion.
CFLAGS_REMOVE_common.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_generic.o = $(CC_FLAGS_FTRACE)
-CFLAGS_REMOVE_generic_report.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_init.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_quarantine.o = $(CC_FLAGS_FTRACE)
CFLAGS_REMOVE_report.o = $(CC_FLAGS_FTRACE)
-CFLAGS_REMOVE_tags.o = $(CC_FLAGS_FTRACE)
-CFLAGS_REMOVE_tags_report.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_report_generic.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_report_hw_tags.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_report_sw_tags.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_shadow.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_hw_tags.o = $(CC_FLAGS_FTRACE)
+CFLAGS_REMOVE_sw_tags.o = $(CC_FLAGS_FTRACE)
# Function splitter causes unnecessary splits in __asan_load1/__asan_store1
# see: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=63533
@@ -22,13 +25,17 @@ CC_FLAGS_KASAN_RUNTIME += -DDISABLE_BRANCH_PROFILING
CFLAGS_common.o := $(CC_FLAGS_KASAN_RUNTIME)
CFLAGS_generic.o := $(CC_FLAGS_KASAN_RUNTIME)
-CFLAGS_generic_report.o := $(CC_FLAGS_KASAN_RUNTIME)
CFLAGS_init.o := $(CC_FLAGS_KASAN_RUNTIME)
CFLAGS_quarantine.o := $(CC_FLAGS_KASAN_RUNTIME)
CFLAGS_report.o := $(CC_FLAGS_KASAN_RUNTIME)
-CFLAGS_tags.o := $(CC_FLAGS_KASAN_RUNTIME)
-CFLAGS_tags_report.o := $(CC_FLAGS_KASAN_RUNTIME)
+CFLAGS_report_generic.o := $(CC_FLAGS_KASAN_RUNTIME)
+CFLAGS_report_hw_tags.o := $(CC_FLAGS_KASAN_RUNTIME)
+CFLAGS_report_sw_tags.o := $(CC_FLAGS_KASAN_RUNTIME)
+CFLAGS_shadow.o := $(CC_FLAGS_KASAN_RUNTIME)
+CFLAGS_hw_tags.o := $(CC_FLAGS_KASAN_RUNTIME)
+CFLAGS_sw_tags.o := $(CC_FLAGS_KASAN_RUNTIME)
-obj-$(CONFIG_KASAN) := common.o init.o report.o
-obj-$(CONFIG_KASAN_GENERIC) += generic.o generic_report.o quarantine.o
-obj-$(CONFIG_KASAN_SW_TAGS) += tags.o tags_report.o
+obj-$(CONFIG_KASAN) := common.o report.o
+obj-$(CONFIG_KASAN_GENERIC) += init.o generic.o report_generic.o shadow.o quarantine.o
+obj-$(CONFIG_KASAN_HW_TAGS) += hw_tags.o report_hw_tags.o
+obj-$(CONFIG_KASAN_SW_TAGS) += init.o report_sw_tags.o shadow.o sw_tags.o
diff --git a/mm/kasan/common.c b/mm/kasan/common.c
index 950fd372a07e..b25167664ead 100644
--- a/mm/kasan/common.c
+++ b/mm/kasan/common.c
@@ -1,24 +1,18 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * This file contains common generic and tag-based KASAN code.
+ * This file contains common KASAN code.
*
* Copyright (c) 2014 Samsung Electronics Co., Ltd.
* Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
*
* Some code borrowed from https://github.com/xairy/kasan-prototype by
* Andrey Konovalov <andreyknvl@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
*/
#include <linux/export.h>
#include <linux/init.h>
#include <linux/kasan.h>
#include <linux/kernel.h>
-#include <linux/kmemleak.h>
#include <linux/linkage.h>
#include <linux/memblock.h>
#include <linux/memory.h>
@@ -31,12 +25,8 @@
#include <linux/stacktrace.h>
#include <linux/string.h>
#include <linux/types.h>
-#include <linux/vmalloc.h>
#include <linux/bug.h>
-#include <asm/cacheflush.h>
-#include <asm/tlbflush.h>
-
#include "kasan.h"
#include "../slab.h"
@@ -56,6 +46,7 @@ void kasan_set_track(struct kasan_track *track, gfp_t flags)
track->stack = kasan_save_stack(flags);
}
+#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
void kasan_enable_current(void)
{
current->kasan_depth++;
@@ -65,106 +56,20 @@ void kasan_disable_current(void)
{
current->kasan_depth--;
}
+#endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
-bool __kasan_check_read(const volatile void *p, unsigned int size)
-{
- return check_memory_region((unsigned long)p, size, false, _RET_IP_);
-}
-EXPORT_SYMBOL(__kasan_check_read);
-
-bool __kasan_check_write(const volatile void *p, unsigned int size)
-{
- return check_memory_region((unsigned long)p, size, true, _RET_IP_);
-}
-EXPORT_SYMBOL(__kasan_check_write);
-
-#undef memset
-void *memset(void *addr, int c, size_t len)
-{
- if (!check_memory_region((unsigned long)addr, len, true, _RET_IP_))
- return NULL;
-
- return __memset(addr, c, len);
-}
-
-#ifdef __HAVE_ARCH_MEMMOVE
-#undef memmove
-void *memmove(void *dest, const void *src, size_t len)
-{
- if (!check_memory_region((unsigned long)src, len, false, _RET_IP_) ||
- !check_memory_region((unsigned long)dest, len, true, _RET_IP_))
- return NULL;
-
- return __memmove(dest, src, len);
-}
-#endif
-
-#undef memcpy
-void *memcpy(void *dest, const void *src, size_t len)
+void __kasan_unpoison_range(const void *address, size_t size)
{
- if (!check_memory_region((unsigned long)src, len, false, _RET_IP_) ||
- !check_memory_region((unsigned long)dest, len, true, _RET_IP_))
- return NULL;
-
- return __memcpy(dest, src, len);
-}
-
-/*
- * Poisons the shadow memory for 'size' bytes starting from 'addr'.
- * Memory addresses should be aligned to KASAN_SHADOW_SCALE_SIZE.
- */
-void kasan_poison_shadow(const void *address, size_t size, u8 value)
-{
- void *shadow_start, *shadow_end;
-
- /*
- * Perform shadow offset calculation based on untagged address, as
- * some of the callers (e.g. kasan_poison_object_data) pass tagged
- * addresses to this function.
- */
- address = reset_tag(address);
-
- shadow_start = kasan_mem_to_shadow(address);
- shadow_end = kasan_mem_to_shadow(address + size);
-
- __memset(shadow_start, value, shadow_end - shadow_start);
-}
-
-void kasan_unpoison_shadow(const void *address, size_t size)
-{
- u8 tag = get_tag(address);
-
- /*
- * Perform shadow offset calculation based on untagged address, as
- * some of the callers (e.g. kasan_unpoison_object_data) pass tagged
- * addresses to this function.
- */
- address = reset_tag(address);
-
- kasan_poison_shadow(address, size, tag);
-
- if (size & KASAN_SHADOW_MASK) {
- u8 *shadow = (u8 *)kasan_mem_to_shadow(address + size);
-
- if (IS_ENABLED(CONFIG_KASAN_SW_TAGS))
- *shadow = tag;
- else
- *shadow = size & KASAN_SHADOW_MASK;
- }
-}
-
-static void __kasan_unpoison_stack(struct task_struct *task, const void *sp)
-{
- void *base = task_stack_page(task);
- size_t size = sp - base;
-
- kasan_unpoison_shadow(base, size);
+ unpoison_range(address, size);
}
+#if CONFIG_KASAN_STACK
/* Unpoison the entire stack for a task. */
void kasan_unpoison_task_stack(struct task_struct *task)
{
- __kasan_unpoison_stack(task, task_stack_page(task) + THREAD_SIZE);
+ void *base = task_stack_page(task);
+
+ unpoison_range(base, THREAD_SIZE);
}
/* Unpoison the stack for the current task beyond a watermark sp value. */
@@ -177,10 +82,22 @@ asmlinkage void kasan_unpoison_task_stack_below(const void *watermark)
*/
void *base = (void *)((unsigned long)watermark & ~(THREAD_SIZE - 1));
- kasan_unpoison_shadow(base, watermark - base);
+ unpoison_range(base, watermark - base);
+}
+#endif /* CONFIG_KASAN_STACK */
+
+/*
+ * Only allow cache merging when stack collection is disabled and no metadata
+ * is present.
+ */
+slab_flags_t __kasan_never_merge(void)
+{
+ if (kasan_stack_collection_enabled())
+ return SLAB_KASAN;
+ return 0;
}
-void kasan_alloc_pages(struct page *page, unsigned int order)
+void __kasan_alloc_pages(struct page *page, unsigned int order)
{
u8 tag;
unsigned long i;
@@ -191,13 +108,13 @@ void kasan_alloc_pages(struct page *page, unsigned int order)
tag = random_tag();
for (i = 0; i < (1 << order); i++)
page_kasan_tag_set(page + i, tag);
- kasan_unpoison_shadow(page_address(page), PAGE_SIZE << order);
+ unpoison_range(page_address(page), PAGE_SIZE << order);
}
-void kasan_free_pages(struct page *page, unsigned int order)
+void __kasan_free_pages(struct page *page, unsigned int order)
{
if (likely(!PageHighMem(page)))
- kasan_poison_shadow(page_address(page),
+ poison_range(page_address(page),
PAGE_SIZE << order,
KASAN_FREE_PAGE);
}
@@ -208,9 +125,6 @@ void kasan_free_pages(struct page *page, unsigned int order)
*/
static inline unsigned int optimal_redzone(unsigned int object_size)
{
- if (IS_ENABLED(CONFIG_KASAN_SW_TAGS))
- return 0;
-
return
object_size <= 64 - 16 ? 16 :
object_size <= 128 - 32 ? 32 :
@@ -221,88 +135,129 @@ static inline unsigned int optimal_redzone(unsigned int object_size)
object_size <= (1 << 16) - 1024 ? 1024 : 2048;
}
-void kasan_cache_create(struct kmem_cache *cache, unsigned int *size,
- slab_flags_t *flags)
+void __kasan_cache_create(struct kmem_cache *cache, unsigned int *size,
+ slab_flags_t *flags)
{
- unsigned int orig_size = *size;
- unsigned int redzone_size;
- int redzone_adjust;
+ unsigned int ok_size;
+ unsigned int optimal_size;
- /* Add alloc meta. */
- cache->kasan_info.alloc_meta_offset = *size;
- *size += sizeof(struct kasan_alloc_meta);
+ /*
+ * SLAB_KASAN is used to mark caches as ones that are sanitized by
+ * KASAN. Currently this flag is used in two places:
+ * 1. In slab_ksize() when calculating the size of the accessible
+ * memory within the object.
+ * 2. In slab_common.c to prevent merging of sanitized caches.
+ */
+ *flags |= SLAB_KASAN;
- /* Add free meta. */
- if (IS_ENABLED(CONFIG_KASAN_GENERIC) &&
- (cache->flags & SLAB_TYPESAFE_BY_RCU || cache->ctor ||
- cache->object_size < sizeof(struct kasan_free_meta))) {
- cache->kasan_info.free_meta_offset = *size;
- *size += sizeof(struct kasan_free_meta);
- }
+ if (!kasan_stack_collection_enabled())
+ return;
- redzone_size = optimal_redzone(cache->object_size);
- redzone_adjust = redzone_size - (*size - cache->object_size);
- if (redzone_adjust > 0)
- *size += redzone_adjust;
+ ok_size = *size;
- *size = min_t(unsigned int, KMALLOC_MAX_SIZE,
- max(*size, cache->object_size + redzone_size));
+ /* Add alloc meta into redzone. */
+ cache->kasan_info.alloc_meta_offset = *size;
+ *size += sizeof(struct kasan_alloc_meta);
/*
- * If the metadata doesn't fit, don't enable KASAN at all.
+ * If alloc meta doesn't fit, don't add it.
+ * This can only happen with SLAB, as it has KMALLOC_MAX_SIZE equal
+ * to KMALLOC_MAX_CACHE_SIZE and doesn't fall back to page_alloc for
+ * larger sizes.
*/
- if (*size <= cache->kasan_info.alloc_meta_offset ||
- *size <= cache->kasan_info.free_meta_offset) {
+ if (*size > KMALLOC_MAX_SIZE) {
cache->kasan_info.alloc_meta_offset = 0;
- cache->kasan_info.free_meta_offset = 0;
- *size = orig_size;
+ *size = ok_size;
+ /* Continue, since free meta might still fit. */
+ }
+
+ /* Only the generic mode uses free meta or flexible redzones. */
+ if (!IS_ENABLED(CONFIG_KASAN_GENERIC)) {
+ cache->kasan_info.free_meta_offset = KASAN_NO_FREE_META;
return;
}
- *flags |= SLAB_KASAN;
+ /*
+ * Add free meta into redzone when it's not possible to store
+ * it in the object. This is the case when:
+ * 1. Object is SLAB_TYPESAFE_BY_RCU, which means that it can
+ * be touched after it was freed, or
+ * 2. Object has a constructor, which means it's expected to
+ * retain its content until the next allocation, or
+ * 3. Object is too small.
+ * Otherwise cache->kasan_info.free_meta_offset = 0 is implied.
+ */
+ if ((cache->flags & SLAB_TYPESAFE_BY_RCU) || cache->ctor ||
+ cache->object_size < sizeof(struct kasan_free_meta)) {
+ ok_size = *size;
+
+ cache->kasan_info.free_meta_offset = *size;
+ *size += sizeof(struct kasan_free_meta);
+
+ /* If free meta doesn't fit, don't add it. */
+ if (*size > KMALLOC_MAX_SIZE) {
+ cache->kasan_info.free_meta_offset = KASAN_NO_FREE_META;
+ *size = ok_size;
+ }
+ }
+
+ /* Calculate size with optimal redzone. */
+ optimal_size = cache->object_size + optimal_redzone(cache->object_size);
+ /* Limit it with KMALLOC_MAX_SIZE (relevant for SLAB only). */
+ if (optimal_size > KMALLOC_MAX_SIZE)
+ optimal_size = KMALLOC_MAX_SIZE;
+ /* Use optimal size if the size with added metas is not large enough. */
+ if (*size < optimal_size)
+ *size = optimal_size;
}
-size_t kasan_metadata_size(struct kmem_cache *cache)
+size_t __kasan_metadata_size(struct kmem_cache *cache)
{
+ if (!kasan_stack_collection_enabled())
+ return 0;
return (cache->kasan_info.alloc_meta_offset ?
sizeof(struct kasan_alloc_meta) : 0) +
(cache->kasan_info.free_meta_offset ?
sizeof(struct kasan_free_meta) : 0);
}
-struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache,
- const void *object)
+struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache,
+ const void *object)
{
- return (void *)object + cache->kasan_info.alloc_meta_offset;
+ if (!cache->kasan_info.alloc_meta_offset)
+ return NULL;
+ return kasan_reset_tag(object) + cache->kasan_info.alloc_meta_offset;
}
-struct kasan_free_meta *get_free_info(struct kmem_cache *cache,
- const void *object)
+#ifdef CONFIG_KASAN_GENERIC
+struct kasan_free_meta *kasan_get_free_meta(struct kmem_cache *cache,
+ const void *object)
{
BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32);
- return (void *)object + cache->kasan_info.free_meta_offset;
+ if (cache->kasan_info.free_meta_offset == KASAN_NO_FREE_META)
+ return NULL;
+ return kasan_reset_tag(object) + cache->kasan_info.free_meta_offset;
}
+#endif
-void kasan_poison_slab(struct page *page)
+void __kasan_poison_slab(struct page *page)
{
unsigned long i;
for (i = 0; i < compound_nr(page); i++)
page_kasan_tag_reset(page + i);
- kasan_poison_shadow(page_address(page), page_size(page),
- KASAN_KMALLOC_REDZONE);
+ poison_range(page_address(page), page_size(page),
+ KASAN_KMALLOC_REDZONE);
}
-void kasan_unpoison_object_data(struct kmem_cache *cache, void *object)
+void __kasan_unpoison_object_data(struct kmem_cache *cache, void *object)
{
- kasan_unpoison_shadow(object, cache->object_size);
+ unpoison_range(object, cache->object_size);
}
-void kasan_poison_object_data(struct kmem_cache *cache, void *object)
+void __kasan_poison_object_data(struct kmem_cache *cache, void *object)
{
- kasan_poison_shadow(object,
- round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE),
- KASAN_KMALLOC_REDZONE);
+ poison_range(object, cache->object_size, KASAN_KMALLOC_REDZONE);
}
/*
@@ -322,6 +277,9 @@ void kasan_poison_object_data(struct kmem_cache *cache, void *object)
static u8 assign_tag(struct kmem_cache *cache, const void *object,
bool init, bool keep_tag)
{
+ if (IS_ENABLED(CONFIG_KASAN_GENERIC))
+ return 0xff;
+
/*
* 1. When an object is kmalloc()'ed, two hooks are called:
* kasan_slab_alloc() and kasan_kmalloc(). We assign the
@@ -351,50 +309,32 @@ static u8 assign_tag(struct kmem_cache *cache, const void *object,
#endif
}
-void * __must_check kasan_init_slab_obj(struct kmem_cache *cache,
+void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache,
const void *object)
{
- struct kasan_alloc_meta *alloc_info;
+ struct kasan_alloc_meta *alloc_meta;
- if (!(cache->flags & SLAB_KASAN))
- return (void *)object;
-
- alloc_info = get_alloc_info(cache, object);
- __memset(alloc_info, 0, sizeof(*alloc_info));
+ if (kasan_stack_collection_enabled()) {
+ alloc_meta = kasan_get_alloc_meta(cache, object);
+ if (alloc_meta)
+ __memset(alloc_meta, 0, sizeof(*alloc_meta));
+ }
- if (IS_ENABLED(CONFIG_KASAN_SW_TAGS))
- object = set_tag(object,
- assign_tag(cache, object, true, false));
+ /* Tag is ignored in set_tag() without CONFIG_KASAN_SW/HW_TAGS */
+ object = set_tag(object, assign_tag(cache, object, true, false));
return (void *)object;
}
-static inline bool shadow_invalid(u8 tag, s8 shadow_byte)
-{
- if (IS_ENABLED(CONFIG_KASAN_GENERIC))
- return shadow_byte < 0 ||
- shadow_byte >= KASAN_SHADOW_SCALE_SIZE;
-
- /* else CONFIG_KASAN_SW_TAGS: */
- if ((u8)shadow_byte == KASAN_TAG_INVALID)
- return true;
- if ((tag != KASAN_TAG_KERNEL) && (tag != (u8)shadow_byte))
- return true;
-
- return false;
-}
-
-static bool __kasan_slab_free(struct kmem_cache *cache, void *object,
+static bool ____kasan_slab_free(struct kmem_cache *cache, void *object,
unsigned long ip, bool quarantine)
{
- s8 shadow_byte;
u8 tag;
void *tagged_object;
- unsigned long rounded_up_size;
tag = get_tag(object);
tagged_object = object;
- object = reset_tag(object);
+ object = kasan_reset_tag(object);
if (unlikely(nearest_obj(cache, virt_to_head_page(object), object) !=
object)) {
@@ -406,37 +346,67 @@ static bool __kasan_slab_free(struct kmem_cache *cache, void *object,
if (unlikely(cache->flags & SLAB_TYPESAFE_BY_RCU))
return false;
- shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(object));
- if (shadow_invalid(tag, shadow_byte)) {
+ if (check_invalid_free(tagged_object)) {
kasan_report_invalid_free(tagged_object, ip);
return true;
}
- rounded_up_size = round_up(cache->object_size, KASAN_SHADOW_SCALE_SIZE);
- kasan_poison_shadow(object, rounded_up_size, KASAN_KMALLOC_FREE);
+ poison_range(object, cache->object_size, KASAN_KMALLOC_FREE);
- if ((IS_ENABLED(CONFIG_KASAN_GENERIC) && !quarantine) ||
- unlikely(!(cache->flags & SLAB_KASAN)))
+ if (!kasan_stack_collection_enabled())
+ return false;
+
+ if ((IS_ENABLED(CONFIG_KASAN_GENERIC) && !quarantine))
return false;
kasan_set_free_info(cache, object, tag);
- quarantine_put(get_free_info(cache, object), cache);
+ return quarantine_put(cache, object);
+}
- return IS_ENABLED(CONFIG_KASAN_GENERIC);
+bool __kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip)
+{
+ return ____kasan_slab_free(cache, object, ip, true);
}
-bool kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip)
+void __kasan_slab_free_mempool(void *ptr, unsigned long ip)
{
- return __kasan_slab_free(cache, object, ip, true);
+ struct page *page;
+
+ page = virt_to_head_page(ptr);
+
+ /*
+ * Even though this function is only called for kmem_cache_alloc and
+ * kmalloc backed mempool allocations, those allocations can still be
+ * !PageSlab() when the size provided to kmalloc is larger than
+ * KMALLOC_MAX_SIZE, and kmalloc falls back onto page_alloc.
+ */
+ if (unlikely(!PageSlab(page))) {
+ if (ptr != page_address(page)) {
+ kasan_report_invalid_free(ptr, ip);
+ return;
+ }
+ poison_range(ptr, page_size(page), KASAN_FREE_PAGE);
+ } else {
+ ____kasan_slab_free(page->slab_cache, ptr, ip, false);
+ }
+}
+
+static void set_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags)
+{
+ struct kasan_alloc_meta *alloc_meta;
+
+ alloc_meta = kasan_get_alloc_meta(cache, object);
+ if (alloc_meta)
+ kasan_set_track(&alloc_meta->alloc_track, flags);
}
-static void *__kasan_kmalloc(struct kmem_cache *cache, const void *object,
+static void *____kasan_kmalloc(struct kmem_cache *cache, const void *object,
size_t size, gfp_t flags, bool keep_tag)
{
unsigned long redzone_start;
unsigned long redzone_end;
- u8 tag = 0xff;
+ u8 tag;
if (gfpflags_allow_blocking(flags))
quarantine_reduce();
@@ -445,38 +415,36 @@ static void *__kasan_kmalloc(struct kmem_cache *cache, const void *object,
return NULL;
redzone_start = round_up((unsigned long)(object + size),
- KASAN_SHADOW_SCALE_SIZE);
+ KASAN_GRANULE_SIZE);
redzone_end = round_up((unsigned long)object + cache->object_size,
- KASAN_SHADOW_SCALE_SIZE);
-
- if (IS_ENABLED(CONFIG_KASAN_SW_TAGS))
- tag = assign_tag(cache, object, false, keep_tag);
+ KASAN_GRANULE_SIZE);
+ tag = assign_tag(cache, object, false, keep_tag);
- /* Tag is ignored in set_tag without CONFIG_KASAN_SW_TAGS */
- kasan_unpoison_shadow(set_tag(object, tag), size);
- kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start,
- KASAN_KMALLOC_REDZONE);
+ /* Tag is ignored in set_tag without CONFIG_KASAN_SW/HW_TAGS */
+ unpoison_range(set_tag(object, tag), size);
+ poison_range((void *)redzone_start, redzone_end - redzone_start,
+ KASAN_KMALLOC_REDZONE);
- if (cache->flags & SLAB_KASAN)
- kasan_set_track(&get_alloc_info(cache, object)->alloc_track, flags);
+ if (kasan_stack_collection_enabled())
+ set_alloc_info(cache, (void *)object, flags);
return set_tag(object, tag);
}
-void * __must_check kasan_slab_alloc(struct kmem_cache *cache, void *object,
- gfp_t flags)
+void * __must_check __kasan_slab_alloc(struct kmem_cache *cache,
+ void *object, gfp_t flags)
{
- return __kasan_kmalloc(cache, object, cache->object_size, flags, false);
+ return ____kasan_kmalloc(cache, object, cache->object_size, flags, false);
}
-void * __must_check kasan_kmalloc(struct kmem_cache *cache, const void *object,
- size_t size, gfp_t flags)
+void * __must_check __kasan_kmalloc(struct kmem_cache *cache, const void *object,
+ size_t size, gfp_t flags)
{
- return __kasan_kmalloc(cache, object, size, flags, true);
+ return ____kasan_kmalloc(cache, object, size, flags, true);
}
-EXPORT_SYMBOL(kasan_kmalloc);
+EXPORT_SYMBOL(__kasan_kmalloc);
-void * __must_check kasan_kmalloc_large(const void *ptr, size_t size,
+void * __must_check __kasan_kmalloc_large(const void *ptr, size_t size,
gfp_t flags)
{
struct page *page;
@@ -491,17 +459,17 @@ void * __must_check kasan_kmalloc_large(const void *ptr, size_t size,
page = virt_to_page(ptr);
redzone_start = round_up((unsigned long)(ptr + size),
- KASAN_SHADOW_SCALE_SIZE);
+ KASAN_GRANULE_SIZE);
redzone_end = (unsigned long)ptr + page_size(page);
- kasan_unpoison_shadow(ptr, size);
- kasan_poison_shadow((void *)redzone_start, redzone_end - redzone_start,
- KASAN_PAGE_REDZONE);
+ unpoison_range(ptr, size);
+ poison_range((void *)redzone_start, redzone_end - redzone_start,
+ KASAN_PAGE_REDZONE);
return (void *)ptr;
}
-void * __must_check kasan_krealloc(const void *object, size_t size, gfp_t flags)
+void * __must_check __kasan_krealloc(const void *object, size_t size, gfp_t flags)
{
struct page *page;
@@ -511,421 +479,15 @@ void * __must_check kasan_krealloc(const void *object, size_t size, gfp_t flags)
page = virt_to_head_page(object);
if (unlikely(!PageSlab(page)))
- return kasan_kmalloc_large(object, size, flags);
+ return __kasan_kmalloc_large(object, size, flags);
else
- return __kasan_kmalloc(page->slab_cache, object, size,
+ return ____kasan_kmalloc(page->slab_cache, object, size,
flags, true);
}
-void kasan_poison_kfree(void *ptr, unsigned long ip)
-{
- struct page *page;
-
- page = virt_to_head_page(ptr);
-
- if (unlikely(!PageSlab(page))) {
- if (ptr != page_address(page)) {
- kasan_report_invalid_free(ptr, ip);
- return;
- }
- kasan_poison_shadow(ptr, page_size(page), KASAN_FREE_PAGE);
- } else {
- __kasan_slab_free(page->slab_cache, ptr, ip, false);
- }
-}
-
-void kasan_kfree_large(void *ptr, unsigned long ip)
+void __kasan_kfree_large(void *ptr, unsigned long ip)
{
if (ptr != page_address(virt_to_head_page(ptr)))
kasan_report_invalid_free(ptr, ip);
- /* The object will be poisoned by page_alloc. */
-}
-
-#ifndef CONFIG_KASAN_VMALLOC
-int kasan_module_alloc(void *addr, size_t size)
-{
- void *ret;
- size_t scaled_size;
- size_t shadow_size;
- unsigned long shadow_start;
-
- shadow_start = (unsigned long)kasan_mem_to_shadow(addr);
- scaled_size = (size + KASAN_SHADOW_MASK) >> KASAN_SHADOW_SCALE_SHIFT;
- shadow_size = round_up(scaled_size, PAGE_SIZE);
-
- if (WARN_ON(!PAGE_ALIGNED(shadow_start)))
- return -EINVAL;
-
- ret = __vmalloc_node_range(shadow_size, 1, shadow_start,
- shadow_start + shadow_size,
- GFP_KERNEL,
- PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE,
- __builtin_return_address(0));
-
- if (ret) {
- __memset(ret, KASAN_SHADOW_INIT, shadow_size);
- find_vm_area(addr)->flags |= VM_KASAN;
- kmemleak_ignore(ret);
- return 0;
- }
-
- return -ENOMEM;
-}
-
-void kasan_free_shadow(const struct vm_struct *vm)
-{
- if (vm->flags & VM_KASAN)
- vfree(kasan_mem_to_shadow(vm->addr));
-}
-#endif
-
-#ifdef CONFIG_MEMORY_HOTPLUG
-static bool shadow_mapped(unsigned long addr)
-{
- pgd_t *pgd = pgd_offset_k(addr);
- p4d_t *p4d;
- pud_t *pud;
- pmd_t *pmd;
- pte_t *pte;
-
- if (pgd_none(*pgd))
- return false;
- p4d = p4d_offset(pgd, addr);
- if (p4d_none(*p4d))
- return false;
- pud = pud_offset(p4d, addr);
- if (pud_none(*pud))
- return false;
-
- /*
- * We can't use pud_large() or pud_huge(), the first one is
- * arch-specific, the last one depends on HUGETLB_PAGE. So let's abuse
- * pud_bad(), if pud is bad then it's bad because it's huge.
- */
- if (pud_bad(*pud))
- return true;
- pmd = pmd_offset(pud, addr);
- if (pmd_none(*pmd))
- return false;
-
- if (pmd_bad(*pmd))
- return true;
- pte = pte_offset_kernel(pmd, addr);
- return !pte_none(*pte);
-}
-
-static int __meminit kasan_mem_notifier(struct notifier_block *nb,
- unsigned long action, void *data)
-{
- struct memory_notify *mem_data = data;
- unsigned long nr_shadow_pages, start_kaddr, shadow_start;
- unsigned long shadow_end, shadow_size;
-
- nr_shadow_pages = mem_data->nr_pages >> KASAN_SHADOW_SCALE_SHIFT;
- start_kaddr = (unsigned long)pfn_to_kaddr(mem_data->start_pfn);
- shadow_start = (unsigned long)kasan_mem_to_shadow((void *)start_kaddr);
- shadow_size = nr_shadow_pages << PAGE_SHIFT;
- shadow_end = shadow_start + shadow_size;
-
- if (WARN_ON(mem_data->nr_pages % KASAN_SHADOW_SCALE_SIZE) ||
- WARN_ON(start_kaddr % (KASAN_SHADOW_SCALE_SIZE << PAGE_SHIFT)))
- return NOTIFY_BAD;
-
- switch (action) {
- case MEM_GOING_ONLINE: {
- void *ret;
-
- /*
- * If shadow is mapped already than it must have been mapped
- * during the boot. This could happen if we onlining previously
- * offlined memory.
- */
- if (shadow_mapped(shadow_start))
- return NOTIFY_OK;
-
- ret = __vmalloc_node_range(shadow_size, PAGE_SIZE, shadow_start,
- shadow_end, GFP_KERNEL,
- PAGE_KERNEL, VM_NO_GUARD,
- pfn_to_nid(mem_data->start_pfn),
- __builtin_return_address(0));
- if (!ret)
- return NOTIFY_BAD;
-
- kmemleak_ignore(ret);
- return NOTIFY_OK;
- }
- case MEM_CANCEL_ONLINE:
- case MEM_OFFLINE: {
- struct vm_struct *vm;
-
- /*
- * shadow_start was either mapped during boot by kasan_init()
- * or during memory online by __vmalloc_node_range().
- * In the latter case we can use vfree() to free shadow.
- * Non-NULL result of the find_vm_area() will tell us if
- * that was the second case.
- *
- * Currently it's not possible to free shadow mapped
- * during boot by kasan_init(). It's because the code
- * to do that hasn't been written yet. So we'll just
- * leak the memory.
- */
- vm = find_vm_area((void *)shadow_start);
- if (vm)
- vfree((void *)shadow_start);
- }
- }
-
- return NOTIFY_OK;
-}
-
-static int __init kasan_memhotplug_init(void)
-{
- hotplug_memory_notifier(kasan_mem_notifier, 0);
-
- return 0;
-}
-
-core_initcall(kasan_memhotplug_init);
-#endif
-
-#ifdef CONFIG_KASAN_VMALLOC
-static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr,
- void *unused)
-{
- unsigned long page;
- pte_t pte;
-
- if (likely(!pte_none(*ptep)))
- return 0;
-
- page = __get_free_page(GFP_KERNEL);
- if (!page)
- return -ENOMEM;
-
- memset((void *)page, KASAN_VMALLOC_INVALID, PAGE_SIZE);
- pte = pfn_pte(PFN_DOWN(__pa(page)), PAGE_KERNEL);
-
- spin_lock(&init_mm.page_table_lock);
- if (likely(pte_none(*ptep))) {
- set_pte_at(&init_mm, addr, ptep, pte);
- page = 0;
- }
- spin_unlock(&init_mm.page_table_lock);
- if (page)
- free_page(page);
- return 0;
-}
-
-int kasan_populate_vmalloc(unsigned long addr, unsigned long size)
-{
- unsigned long shadow_start, shadow_end;
- int ret;
-
- if (!is_vmalloc_or_module_addr((void *)addr))
- return 0;
-
- shadow_start = (unsigned long)kasan_mem_to_shadow((void *)addr);
- shadow_start = ALIGN_DOWN(shadow_start, PAGE_SIZE);
- shadow_end = (unsigned long)kasan_mem_to_shadow((void *)addr + size);
- shadow_end = ALIGN(shadow_end, PAGE_SIZE);
-
- ret = apply_to_page_range(&init_mm, shadow_start,
- shadow_end - shadow_start,
- kasan_populate_vmalloc_pte, NULL);
- if (ret)
- return ret;
-
- flush_cache_vmap(shadow_start, shadow_end);
-
- /*
- * We need to be careful about inter-cpu effects here. Consider:
- *
- * CPU#0 CPU#1
- * WRITE_ONCE(p, vmalloc(100)); while (x = READ_ONCE(p)) ;
- * p[99] = 1;
- *
- * With compiler instrumentation, that ends up looking like this:
- *
- * CPU#0 CPU#1
- * // vmalloc() allocates memory
- * // let a = area->addr
- * // we reach kasan_populate_vmalloc
- * // and call kasan_unpoison_shadow:
- * STORE shadow(a), unpoison_val
- * ...
- * STORE shadow(a+99), unpoison_val x = LOAD p
- * // rest of vmalloc process <data dependency>
- * STORE p, a LOAD shadow(x+99)
- *
- * If there is no barrier between the end of unpoisioning the shadow
- * and the store of the result to p, the stores could be committed
- * in a different order by CPU#0, and CPU#1 could erroneously observe
- * poison in the shadow.
- *
- * We need some sort of barrier between the stores.
- *
- * In the vmalloc() case, this is provided by a smp_wmb() in
- * clear_vm_uninitialized_flag(). In the per-cpu allocator and in
- * get_vm_area() and friends, the caller gets shadow allocated but
- * doesn't have any pages mapped into the virtual address space that
- * has been reserved. Mapping those pages in will involve taking and
- * releasing a page-table lock, which will provide the barrier.
- */
-
- return 0;
-}
-
-/*
- * Poison the shadow for a vmalloc region. Called as part of the
- * freeing process at the time the region is freed.
- */
-void kasan_poison_vmalloc(const void *start, unsigned long size)
-{
- if (!is_vmalloc_or_module_addr(start))
- return;
-
- size = round_up(size, KASAN_SHADOW_SCALE_SIZE);
- kasan_poison_shadow(start, size, KASAN_VMALLOC_INVALID);
-}
-
-void kasan_unpoison_vmalloc(const void *start, unsigned long size)
-{
- if (!is_vmalloc_or_module_addr(start))
- return;
-
- kasan_unpoison_shadow(start, size);
+ /* The object will be poisoned by kasan_free_pages(). */
}
-
-static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr,
- void *unused)
-{
- unsigned long page;
-
- page = (unsigned long)__va(pte_pfn(*ptep) << PAGE_SHIFT);
-
- spin_lock(&init_mm.page_table_lock);
-
- if (likely(!pte_none(*ptep))) {
- pte_clear(&init_mm, addr, ptep);
- free_page(page);
- }
- spin_unlock(&init_mm.page_table_lock);
-
- return 0;
-}
-
-/*
- * Release the backing for the vmalloc region [start, end), which
- * lies within the free region [free_region_start, free_region_end).
- *
- * This can be run lazily, long after the region was freed. It runs
- * under vmap_area_lock, so it's not safe to interact with the vmalloc/vmap
- * infrastructure.
- *
- * How does this work?
- * -------------------
- *
- * We have a region that is page aligned, labelled as A.
- * That might not map onto the shadow in a way that is page-aligned:
- *
- * start end
- * v v
- * |????????|????????|AAAAAAAA|AA....AA|AAAAAAAA|????????| < vmalloc
- * -------- -------- -------- -------- --------
- * | | | | |
- * | | | /-------/ |
- * \-------\|/------/ |/---------------/
- * ||| ||
- * |??AAAAAA|AAAAAAAA|AA??????| < shadow
- * (1) (2) (3)
- *
- * First we align the start upwards and the end downwards, so that the
- * shadow of the region aligns with shadow page boundaries. In the
- * example, this gives us the shadow page (2). This is the shadow entirely
- * covered by this allocation.
- *
- * Then we have the tricky bits. We want to know if we can free the
- * partially covered shadow pages - (1) and (3) in the example. For this,
- * we are given the start and end of the free region that contains this
- * allocation. Extending our previous example, we could have:
- *
- * free_region_start free_region_end
- * | start end |
- * v v v v
- * |FFFFFFFF|FFFFFFFF|AAAAAAAA|AA....AA|AAAAAAAA|FFFFFFFF| < vmalloc
- * -------- -------- -------- -------- --------
- * | | | | |
- * | | | /-------/ |
- * \-------\|/------/ |/---------------/
- * ||| ||
- * |FFAAAAAA|AAAAAAAA|AAF?????| < shadow
- * (1) (2) (3)
- *
- * Once again, we align the start of the free region up, and the end of
- * the free region down so that the shadow is page aligned. So we can free
- * page (1) - we know no allocation currently uses anything in that page,
- * because all of it is in the vmalloc free region. But we cannot free
- * page (3), because we can't be sure that the rest of it is unused.
- *
- * We only consider pages that contain part of the original region for
- * freeing: we don't try to free other pages from the free region or we'd
- * end up trying to free huge chunks of virtual address space.
- *
- * Concurrency
- * -----------
- *
- * How do we know that we're not freeing a page that is simultaneously
- * being used for a fresh allocation in kasan_populate_vmalloc(_pte)?
- *
- * We _can_ have kasan_release_vmalloc and kasan_populate_vmalloc running
- * at the same time. While we run under free_vmap_area_lock, the population
- * code does not.
- *
- * free_vmap_area_lock instead operates to ensure that the larger range
- * [free_region_start, free_region_end) is safe: because __alloc_vmap_area and
- * the per-cpu region-finding algorithm both run under free_vmap_area_lock,
- * no space identified as free will become used while we are running. This
- * means that so long as we are careful with alignment and only free shadow
- * pages entirely covered by the free region, we will not run in to any
- * trouble - any simultaneous allocations will be for disjoint regions.
- */
-void kasan_release_vmalloc(unsigned long start, unsigned long end,
- unsigned long free_region_start,
- unsigned long free_region_end)
-{
- void *shadow_start, *shadow_end;
- unsigned long region_start, region_end;
- unsigned long size;
-
- region_start = ALIGN(start, PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE);
- region_end = ALIGN_DOWN(end, PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE);
-
- free_region_start = ALIGN(free_region_start,
- PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE);
-
- if (start != region_start &&
- free_region_start < region_start)
- region_start -= PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE;
-
- free_region_end = ALIGN_DOWN(free_region_end,
- PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE);
-
- if (end != region_end &&
- free_region_end > region_end)
- region_end += PAGE_SIZE * KASAN_SHADOW_SCALE_SIZE;
-
- shadow_start = kasan_mem_to_shadow((void *)region_start);
- shadow_end = kasan_mem_to_shadow((void *)region_end);
-
- if (shadow_end > shadow_start) {
- size = shadow_end - shadow_start;
- apply_to_existing_page_range(&init_mm,
- (unsigned long)shadow_start,
- size, kasan_depopulate_vmalloc_pte,
- NULL);
- flush_tlb_kernel_range((unsigned long)shadow_start,
- (unsigned long)shadow_end);
- }
-}
-#endif
diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c
index 248264b9cb76..1dd5a0f99372 100644
--- a/mm/kasan/generic.c
+++ b/mm/kasan/generic.c
@@ -7,15 +7,8 @@
*
* Some code borrowed from https://github.com/xairy/kasan-prototype by
* Andrey Konovalov <andreyknvl@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
*/
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-
#include <linux/export.h>
#include <linux/interrupt.h>
#include <linux/init.h>
@@ -51,7 +44,7 @@ static __always_inline bool memory_is_poisoned_1(unsigned long addr)
s8 shadow_value = *(s8 *)kasan_mem_to_shadow((void *)addr);
if (unlikely(shadow_value)) {
- s8 last_accessible_byte = addr & KASAN_SHADOW_MASK;
+ s8 last_accessible_byte = addr & KASAN_GRANULE_MASK;
return unlikely(last_accessible_byte >= shadow_value);
}
@@ -67,7 +60,7 @@ static __always_inline bool memory_is_poisoned_2_4_8(unsigned long addr,
* Access crosses 8(shadow size)-byte boundary. Such access maps
* into 2 shadow bytes, so we need to check them both.
*/
- if (unlikely(((addr + size - 1) & KASAN_SHADOW_MASK) < size - 1))
+ if (unlikely(((addr + size - 1) & KASAN_GRANULE_MASK) < size - 1))
return *shadow_addr || memory_is_poisoned_1(addr + size - 1);
return memory_is_poisoned_1(addr + size - 1);
@@ -78,7 +71,7 @@ static __always_inline bool memory_is_poisoned_16(unsigned long addr)
u16 *shadow_addr = (u16 *)kasan_mem_to_shadow((void *)addr);
/* Unaligned 16-bytes access maps into 3 shadow bytes. */
- if (unlikely(!IS_ALIGNED(addr, KASAN_SHADOW_SCALE_SIZE)))
+ if (unlikely(!IS_ALIGNED(addr, KASAN_GRANULE_SIZE)))
return *shadow_addr || memory_is_poisoned_1(addr + 15);
return *shadow_addr;
@@ -139,7 +132,7 @@ static __always_inline bool memory_is_poisoned_n(unsigned long addr,
s8 *last_shadow = (s8 *)kasan_mem_to_shadow((void *)last_byte);
if (unlikely(ret != (unsigned long)last_shadow ||
- ((long)(last_byte & KASAN_SHADOW_MASK) >= *last_shadow)))
+ ((long)(last_byte & KASAN_GRANULE_MASK) >= *last_shadow)))
return true;
}
return false;
@@ -192,6 +185,13 @@ bool check_memory_region(unsigned long addr, size_t size, bool write,
return check_memory_region_inline(addr, size, write, ret_ip);
}
+bool check_invalid_free(void *addr)
+{
+ s8 shadow_byte = READ_ONCE(*(s8 *)kasan_mem_to_shadow(addr));
+
+ return shadow_byte < 0 || shadow_byte >= KASAN_GRANULE_SIZE;
+}
+
void kasan_cache_shrink(struct kmem_cache *cache)
{
quarantine_remove_cache(cache);
@@ -205,13 +205,13 @@ void kasan_cache_shutdown(struct kmem_cache *cache)
static void register_global(struct kasan_global *global)
{
- size_t aligned_size = round_up(global->size, KASAN_SHADOW_SCALE_SIZE);
+ size_t aligned_size = round_up(global->size, KASAN_GRANULE_SIZE);
- kasan_unpoison_shadow(global->beg, global->size);
+ unpoison_range(global->beg, global->size);
- kasan_poison_shadow(global->beg + aligned_size,
- global->size_with_redzone - aligned_size,
- KASAN_GLOBAL_REDZONE);
+ poison_range(global->beg + aligned_size,
+ global->size_with_redzone - aligned_size,
+ KASAN_GLOBAL_REDZONE);
}
void __asan_register_globals(struct kasan_global *globals, size_t size)
@@ -279,10 +279,10 @@ EXPORT_SYMBOL(__asan_handle_no_return);
/* Emitted by compiler to poison alloca()ed objects. */
void __asan_alloca_poison(unsigned long addr, size_t size)
{
- size_t rounded_up_size = round_up(size, KASAN_SHADOW_SCALE_SIZE);
+ size_t rounded_up_size = round_up(size, KASAN_GRANULE_SIZE);
size_t padding_size = round_up(size, KASAN_ALLOCA_REDZONE_SIZE) -
rounded_up_size;
- size_t rounded_down_size = round_down(size, KASAN_SHADOW_SCALE_SIZE);
+ size_t rounded_down_size = round_down(size, KASAN_GRANULE_SIZE);
const void *left_redzone = (const void *)(addr -
KASAN_ALLOCA_REDZONE_SIZE);
@@ -290,13 +290,12 @@ void __asan_alloca_poison(unsigned long addr, size_t size)
WARN_ON(!IS_ALIGNED(addr, KASAN_ALLOCA_REDZONE_SIZE));
- kasan_unpoison_shadow((const void *)(addr + rounded_down_size),
- size - rounded_down_size);
- kasan_poison_shadow(left_redzone, KASAN_ALLOCA_REDZONE_SIZE,
- KASAN_ALLOCA_LEFT);
- kasan_poison_shadow(right_redzone,
- padding_size + KASAN_ALLOCA_REDZONE_SIZE,
- KASAN_ALLOCA_RIGHT);
+ unpoison_range((const void *)(addr + rounded_down_size),
+ size - rounded_down_size);
+ poison_range(left_redzone, KASAN_ALLOCA_REDZONE_SIZE,
+ KASAN_ALLOCA_LEFT);
+ poison_range(right_redzone, padding_size + KASAN_ALLOCA_REDZONE_SIZE,
+ KASAN_ALLOCA_RIGHT);
}
EXPORT_SYMBOL(__asan_alloca_poison);
@@ -306,7 +305,7 @@ void __asan_allocas_unpoison(const void *stack_top, const void *stack_bottom)
if (unlikely(!stack_top || stack_top > stack_bottom))
return;
- kasan_unpoison_shadow(stack_top, stack_bottom - stack_top);
+ unpoison_range(stack_top, stack_bottom - stack_top);
}
EXPORT_SYMBOL(__asan_allocas_unpoison);
@@ -329,7 +328,7 @@ void kasan_record_aux_stack(void *addr)
{
struct page *page = kasan_addr_to_page(addr);
struct kmem_cache *cache;
- struct kasan_alloc_meta *alloc_info;
+ struct kasan_alloc_meta *alloc_meta;
void *object;
if (!(page && PageSlab(page)))
@@ -337,13 +336,10 @@ void kasan_record_aux_stack(void *addr)
cache = page->slab_cache;
object = nearest_obj(cache, page, addr);
- alloc_info = get_alloc_info(cache, object);
+ alloc_meta = kasan_get_alloc_meta(cache, object);
- /*
- * record the last two call_rcu() call stacks.
- */
- alloc_info->aux_stack[1] = alloc_info->aux_stack[0];
- alloc_info->aux_stack[0] = kasan_save_stack(GFP_NOWAIT);
+ alloc_meta->aux_stack[1] = alloc_meta->aux_stack[0];
+ alloc_meta->aux_stack[0] = kasan_save_stack(GFP_NOWAIT);
}
void kasan_set_free_info(struct kmem_cache *cache,
@@ -351,12 +347,12 @@ void kasan_set_free_info(struct kmem_cache *cache,
{
struct kasan_free_meta *free_meta;
- free_meta = get_free_info(cache, object);
- kasan_set_track(&free_meta->free_track, GFP_NOWAIT);
+ free_meta = kasan_get_free_meta(cache, object);
+ if (!free_meta)
+ return;
- /*
- * the object was freed and has free track set
- */
+ kasan_set_track(&free_meta->free_track, GFP_NOWAIT);
+ /* The object was freed and has free track set. */
*(u8 *)kasan_mem_to_shadow(object) = KASAN_KMALLOC_FREETRACK;
}
@@ -365,5 +361,6 @@ struct kasan_track *kasan_get_free_track(struct kmem_cache *cache,
{
if (*(u8 *)kasan_mem_to_shadow(object) != KASAN_KMALLOC_FREETRACK)
return NULL;
- return &get_free_info(cache, object)->free_track;
+ /* Free meta must be present with KASAN_KMALLOC_FREETRACK. */
+ return &kasan_get_free_meta(cache, object)->free_track;
}
diff --git a/mm/kasan/generic_report.c b/mm/kasan/generic_report.c
deleted file mode 100644
index a38c7a9e192a..000000000000
--- a/mm/kasan/generic_report.c
+++ /dev/null
@@ -1,165 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * This file contains generic KASAN specific error reporting code.
- *
- * Copyright (c) 2014 Samsung Electronics Co., Ltd.
- * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
- *
- * Some code borrowed from https://github.com/xairy/kasan-prototype by
- * Andrey Konovalov <andreyknvl@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- */
-
-#include <linux/bitops.h>
-#include <linux/ftrace.h>
-#include <linux/init.h>
-#include <linux/kernel.h>
-#include <linux/mm.h>
-#include <linux/printk.h>
-#include <linux/sched.h>
-#include <linux/slab.h>
-#include <linux/stackdepot.h>
-#include <linux/stacktrace.h>
-#include <linux/string.h>
-#include <linux/types.h>
-#include <linux/kasan.h>
-#include <linux/module.h>
-
-#include <asm/sections.h>
-
-#include "kasan.h"
-#include "../slab.h"
-
-void *find_first_bad_addr(void *addr, size_t size)
-{
- void *p = addr;
-
- while (p < addr + size && !(*(u8 *)kasan_mem_to_shadow(p)))
- p += KASAN_SHADOW_SCALE_SIZE;
- return p;
-}
-
-static const char *get_shadow_bug_type(struct kasan_access_info *info)
-{
- const char *bug_type = "unknown-crash";
- u8 *shadow_addr;
-
- shadow_addr = (u8 *)kasan_mem_to_shadow(info->first_bad_addr);
-
- /*
- * If shadow byte value is in [0, KASAN_SHADOW_SCALE_SIZE) we can look
- * at the next shadow byte to determine the type of the bad access.
- */
- if (*shadow_addr > 0 && *shadow_addr <= KASAN_SHADOW_SCALE_SIZE - 1)
- shadow_addr++;
-
- switch (*shadow_addr) {
- case 0 ... KASAN_SHADOW_SCALE_SIZE - 1:
- /*
- * In theory it's still possible to see these shadow values
- * due to a data race in the kernel code.
- */
- bug_type = "out-of-bounds";
- break;
- case KASAN_PAGE_REDZONE:
- case KASAN_KMALLOC_REDZONE:
- bug_type = "slab-out-of-bounds";
- break;
- case KASAN_GLOBAL_REDZONE:
- bug_type = "global-out-of-bounds";
- break;
- case KASAN_STACK_LEFT:
- case KASAN_STACK_MID:
- case KASAN_STACK_RIGHT:
- case KASAN_STACK_PARTIAL:
- bug_type = "stack-out-of-bounds";
- break;
- case KASAN_FREE_PAGE:
- case KASAN_KMALLOC_FREE:
- case KASAN_KMALLOC_FREETRACK:
- bug_type = "use-after-free";
- break;
- case KASAN_ALLOCA_LEFT:
- case KASAN_ALLOCA_RIGHT:
- bug_type = "alloca-out-of-bounds";
- break;
- case KASAN_VMALLOC_INVALID:
- bug_type = "vmalloc-out-of-bounds";
- break;
- }
-
- return bug_type;
-}
-
-static const char *get_wild_bug_type(struct kasan_access_info *info)
-{
- const char *bug_type = "unknown-crash";
-
- if ((unsigned long)info->access_addr < PAGE_SIZE)
- bug_type = "null-ptr-deref";
- else if ((unsigned long)info->access_addr < TASK_SIZE)
- bug_type = "user-memory-access";
- else
- bug_type = "wild-memory-access";
-
- return bug_type;
-}
-
-const char *get_bug_type(struct kasan_access_info *info)
-{
- /*
- * If access_size is a negative number, then it has reason to be
- * defined as out-of-bounds bug type.
- *
- * Casting negative numbers to size_t would indeed turn up as
- * a large size_t and its value will be larger than ULONG_MAX/2,
- * so that this can qualify as out-of-bounds.
- */
- if (info->access_addr + info->access_size < info->access_addr)
- return "out-of-bounds";
-
- if (addr_has_shadow(info->access_addr))
- return get_shadow_bug_type(info);
- return get_wild_bug_type(info);
-}
-
-#define DEFINE_ASAN_REPORT_LOAD(size) \
-void __asan_report_load##size##_noabort(unsigned long addr) \
-{ \
- kasan_report(addr, size, false, _RET_IP_); \
-} \
-EXPORT_SYMBOL(__asan_report_load##size##_noabort)
-
-#define DEFINE_ASAN_REPORT_STORE(size) \
-void __asan_report_store##size##_noabort(unsigned long addr) \
-{ \
- kasan_report(addr, size, true, _RET_IP_); \
-} \
-EXPORT_SYMBOL(__asan_report_store##size##_noabort)
-
-DEFINE_ASAN_REPORT_LOAD(1);
-DEFINE_ASAN_REPORT_LOAD(2);
-DEFINE_ASAN_REPORT_LOAD(4);
-DEFINE_ASAN_REPORT_LOAD(8);
-DEFINE_ASAN_REPORT_LOAD(16);
-DEFINE_ASAN_REPORT_STORE(1);
-DEFINE_ASAN_REPORT_STORE(2);
-DEFINE_ASAN_REPORT_STORE(4);
-DEFINE_ASAN_REPORT_STORE(8);
-DEFINE_ASAN_REPORT_STORE(16);
-
-void __asan_report_load_n_noabort(unsigned long addr, size_t size)
-{
- kasan_report(addr, size, false, _RET_IP_);
-}
-EXPORT_SYMBOL(__asan_report_load_n_noabort);
-
-void __asan_report_store_n_noabort(unsigned long addr, size_t size)
-{
- kasan_report(addr, size, true, _RET_IP_);
-}
-EXPORT_SYMBOL(__asan_report_store_n_noabort);
diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c
new file mode 100644
index 000000000000..55bd6f09c70f
--- /dev/null
+++ b/mm/kasan/hw_tags.c
@@ -0,0 +1,204 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This file contains core hardware tag-based KASAN code.
+ *
+ * Copyright (c) 2020 Google, Inc.
+ * Author: Andrey Konovalov <andreyknvl@google.com>
+ */
+
+#define pr_fmt(fmt) "kasan: " fmt
+
+#include <linux/init.h>
+#include <linux/kasan.h>
+#include <linux/kernel.h>
+#include <linux/memory.h>
+#include <linux/mm.h>
+#include <linux/static_key.h>
+#include <linux/string.h>
+#include <linux/types.h>
+
+#include "kasan.h"
+
+enum kasan_arg_mode {
+ KASAN_ARG_MODE_DEFAULT,
+ KASAN_ARG_MODE_OFF,
+ KASAN_ARG_MODE_PROD,
+ KASAN_ARG_MODE_FULL,
+};
+
+enum kasan_arg_stacktrace {
+ KASAN_ARG_STACKTRACE_DEFAULT,
+ KASAN_ARG_STACKTRACE_OFF,
+ KASAN_ARG_STACKTRACE_ON,
+};
+
+enum kasan_arg_fault {
+ KASAN_ARG_FAULT_DEFAULT,
+ KASAN_ARG_FAULT_REPORT,
+ KASAN_ARG_FAULT_PANIC,
+};
+
+static enum kasan_arg_mode kasan_arg_mode __ro_after_init;
+static enum kasan_arg_stacktrace kasan_arg_stacktrace __ro_after_init;
+static enum kasan_arg_fault kasan_arg_fault __ro_after_init;
+
+/* Whether KASAN is enabled at all. */
+DEFINE_STATIC_KEY_FALSE(kasan_flag_enabled);
+EXPORT_SYMBOL(kasan_flag_enabled);
+
+/* Whether to collect alloc/free stack traces. */
+DEFINE_STATIC_KEY_FALSE(kasan_flag_stacktrace);
+
+/* Whether panic or disable tag checking on fault. */
+bool kasan_flag_panic __ro_after_init;
+
+/* kasan.mode=off/prod/full */
+static int __init early_kasan_mode(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ if (!strcmp(arg, "off"))
+ kasan_arg_mode = KASAN_ARG_MODE_OFF;
+ else if (!strcmp(arg, "prod"))
+ kasan_arg_mode = KASAN_ARG_MODE_PROD;
+ else if (!strcmp(arg, "full"))
+ kasan_arg_mode = KASAN_ARG_MODE_FULL;
+ else
+ return -EINVAL;
+
+ return 0;
+}
+early_param("kasan.mode", early_kasan_mode);
+
+/* kasan.stack=off/on */
+static int __init early_kasan_flag_stacktrace(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ if (!strcmp(arg, "off"))
+ kasan_arg_stacktrace = KASAN_ARG_STACKTRACE_OFF;
+ else if (!strcmp(arg, "on"))
+ kasan_arg_stacktrace = KASAN_ARG_STACKTRACE_ON;
+ else
+ return -EINVAL;
+
+ return 0;
+}
+early_param("kasan.stacktrace", early_kasan_flag_stacktrace);
+
+/* kasan.fault=report/panic */
+static int __init early_kasan_fault(char *arg)
+{
+ if (!arg)
+ return -EINVAL;
+
+ if (!strcmp(arg, "report"))
+ kasan_arg_fault = KASAN_ARG_FAULT_REPORT;
+ else if (!strcmp(arg, "panic"))
+ kasan_arg_fault = KASAN_ARG_FAULT_PANIC;
+ else
+ return -EINVAL;
+
+ return 0;
+}
+early_param("kasan.fault", early_kasan_fault);
+
+/* kasan_init_hw_tags_cpu() is called for each CPU. */
+void kasan_init_hw_tags_cpu(void)
+{
+ /*
+ * There's no need to check that the hardware is MTE-capable here,
+ * as this function is only called for MTE-capable hardware.
+ */
+
+ /* If KASAN is disabled, do nothing. */
+ if (kasan_arg_mode == KASAN_ARG_MODE_OFF)
+ return;
+
+ hw_init_tags(KASAN_TAG_MAX);
+ hw_enable_tagging();
+}
+
+/* kasan_init_hw_tags() is called once on boot CPU. */
+void __init kasan_init_hw_tags(void)
+{
+ /* If hardware doesn't support MTE, do nothing. */
+ if (!system_supports_mte())
+ return;
+
+ /* Choose KASAN mode if kasan boot parameter is not provided. */
+ if (kasan_arg_mode == KASAN_ARG_MODE_DEFAULT) {
+ if (IS_ENABLED(CONFIG_DEBUG_KERNEL))
+ kasan_arg_mode = KASAN_ARG_MODE_FULL;
+ else
+ kasan_arg_mode = KASAN_ARG_MODE_PROD;
+ }
+
+ /* Preset parameter values based on the mode. */
+ switch (kasan_arg_mode) {
+ case KASAN_ARG_MODE_DEFAULT:
+ /* Shouldn't happen as per the check above. */
+ WARN_ON(1);
+ return;
+ case KASAN_ARG_MODE_OFF:
+ /* If KASAN is disabled, do nothing. */
+ return;
+ case KASAN_ARG_MODE_PROD:
+ static_branch_enable(&kasan_flag_enabled);
+ break;
+ case KASAN_ARG_MODE_FULL:
+ static_branch_enable(&kasan_flag_enabled);
+ static_branch_enable(&kasan_flag_stacktrace);
+ break;
+ }
+
+ /* Now, optionally override the presets. */
+
+ switch (kasan_arg_stacktrace) {
+ case KASAN_ARG_STACKTRACE_DEFAULT:
+ break;
+ case KASAN_ARG_STACKTRACE_OFF:
+ static_branch_disable(&kasan_flag_stacktrace);
+ break;
+ case KASAN_ARG_STACKTRACE_ON:
+ static_branch_enable(&kasan_flag_stacktrace);
+ break;
+ }
+
+ switch (kasan_arg_fault) {
+ case KASAN_ARG_FAULT_DEFAULT:
+ break;
+ case KASAN_ARG_FAULT_REPORT:
+ kasan_flag_panic = false;
+ break;
+ case KASAN_ARG_FAULT_PANIC:
+ kasan_flag_panic = true;
+ break;
+ }
+
+ pr_info("KernelAddressSanitizer initialized\n");
+}
+
+void kasan_set_free_info(struct kmem_cache *cache,
+ void *object, u8 tag)
+{
+ struct kasan_alloc_meta *alloc_meta;
+
+ alloc_meta = kasan_get_alloc_meta(cache, object);
+ if (alloc_meta)
+ kasan_set_track(&alloc_meta->free_track[0], GFP_NOWAIT);
+}
+
+struct kasan_track *kasan_get_free_track(struct kmem_cache *cache,
+ void *object, u8 tag)
+{
+ struct kasan_alloc_meta *alloc_meta;
+
+ alloc_meta = kasan_get_alloc_meta(cache, object);
+ if (!alloc_meta)
+ return NULL;
+
+ return &alloc_meta->free_track[0];
+}
diff --git a/mm/kasan/init.c b/mm/kasan/init.c
index fe6be0be1f76..bc0ad208b3a7 100644
--- a/mm/kasan/init.c
+++ b/mm/kasan/init.c
@@ -1,14 +1,9 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * This file contains some kasan initialization code.
+ * This file contains KASAN shadow initialization code.
*
* Copyright (c) 2015 Samsung Electronics Co., Ltd.
* Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
*/
#include <linux/memblock.h>
@@ -446,9 +441,8 @@ void kasan_remove_zero_shadow(void *start, unsigned long size)
addr = (unsigned long)kasan_mem_to_shadow(start);
end = addr + (size >> KASAN_SHADOW_SCALE_SHIFT);
- if (WARN_ON((unsigned long)start %
- (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE)) ||
- WARN_ON(size % (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE)))
+ if (WARN_ON((unsigned long)start % KASAN_MEMORY_PER_SHADOW_PAGE) ||
+ WARN_ON(size % KASAN_MEMORY_PER_SHADOW_PAGE))
return;
for (; addr < end; addr = next) {
@@ -481,9 +475,8 @@ int kasan_add_zero_shadow(void *start, unsigned long size)
shadow_start = kasan_mem_to_shadow(start);
shadow_end = shadow_start + (size >> KASAN_SHADOW_SCALE_SHIFT);
- if (WARN_ON((unsigned long)start %
- (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE)) ||
- WARN_ON(size % (KASAN_SHADOW_SCALE_SIZE * PAGE_SIZE)))
+ if (WARN_ON((unsigned long)start % KASAN_MEMORY_PER_SHADOW_PAGE) ||
+ WARN_ON(size % KASAN_MEMORY_PER_SHADOW_PAGE))
return -EINVAL;
ret = kasan_populate_early_shadow(shadow_start, shadow_end);
diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h
index ac499456740f..cc4d9e1d49b1 100644
--- a/mm/kasan/kasan.h
+++ b/mm/kasan/kasan.h
@@ -5,8 +5,32 @@
#include <linux/kasan.h>
#include <linux/stackdepot.h>
-#define KASAN_SHADOW_SCALE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT)
-#define KASAN_SHADOW_MASK (KASAN_SHADOW_SCALE_SIZE - 1)
+#ifdef CONFIG_KASAN_HW_TAGS
+#include <linux/static_key.h>
+DECLARE_STATIC_KEY_FALSE(kasan_flag_stacktrace);
+static inline bool kasan_stack_collection_enabled(void)
+{
+ return static_branch_unlikely(&kasan_flag_stacktrace);
+}
+#else
+static inline bool kasan_stack_collection_enabled(void)
+{
+ return true;
+}
+#endif
+
+extern bool kasan_flag_panic __ro_after_init;
+
+#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
+#define KASAN_GRANULE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT)
+#else
+#include <asm/mte-kasan.h>
+#define KASAN_GRANULE_SIZE MTE_GRANULE_SIZE
+#endif
+
+#define KASAN_GRANULE_MASK (KASAN_GRANULE_SIZE - 1)
+
+#define KASAN_MEMORY_PER_SHADOW_PAGE (KASAN_GRANULE_SIZE << PAGE_SHIFT)
#define KASAN_TAG_KERNEL 0xFF /* native kernel pointers tag */
#define KASAN_TAG_INVALID 0xFE /* inaccessible memory tag */
@@ -56,6 +80,13 @@
#define KASAN_ABI_VERSION 1
#endif
+/* Metadata layout customization. */
+#define META_BYTES_PER_BLOCK 1
+#define META_BLOCKS_PER_ROW 16
+#define META_BYTES_PER_ROW (META_BLOCKS_PER_ROW * META_BYTES_PER_BLOCK)
+#define META_MEM_BYTES_PER_ROW (META_BYTES_PER_ROW * KASAN_GRANULE_SIZE)
+#define META_ROWS_AROUND_ADDR 2
+
struct kasan_access_info {
const void *access_addr;
const void *first_bad_addr;
@@ -124,20 +155,33 @@ struct kasan_alloc_meta {
struct qlist_node {
struct qlist_node *next;
};
+
+/*
+ * Generic mode either stores free meta in the object itself or in the redzone
+ * after the object. In the former case free meta offset is 0, in the latter
+ * case it has some sane value smaller than INT_MAX. Use INT_MAX as free meta
+ * offset when free meta isn't present.
+ */
+#define KASAN_NO_FREE_META INT_MAX
+
struct kasan_free_meta {
+#ifdef CONFIG_KASAN_GENERIC
/* This field is used while the object is in the quarantine.
* Otherwise it might be used for the allocator freelist.
*/
struct qlist_node quarantine_link;
-#ifdef CONFIG_KASAN_GENERIC
struct kasan_track free_track;
#endif
};
-struct kasan_alloc_meta *get_alloc_info(struct kmem_cache *cache,
- const void *object);
-struct kasan_free_meta *get_free_info(struct kmem_cache *cache,
- const void *object);
+struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache,
+ const void *object);
+#ifdef CONFIG_KASAN_GENERIC
+struct kasan_free_meta *kasan_get_free_meta(struct kmem_cache *cache,
+ const void *object);
+#endif
+
+#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
static inline const void *kasan_shadow_to_mem(const void *shadow_addr)
{
@@ -145,13 +189,11 @@ static inline const void *kasan_shadow_to_mem(const void *shadow_addr)
<< KASAN_SHADOW_SCALE_SHIFT);
}
-static inline bool addr_has_shadow(const void *addr)
+static inline bool addr_has_metadata(const void *addr)
{
return (addr >= kasan_shadow_to_mem((void *)KASAN_SHADOW_START));
}
-void kasan_poison_shadow(const void *address, size_t size, u8 value);
-
/**
* check_memory_region - Check memory region, and report if invalid access.
* @addr: the accessed address
@@ -163,8 +205,30 @@ void kasan_poison_shadow(const void *address, size_t size, u8 value);
bool check_memory_region(unsigned long addr, size_t size, bool write,
unsigned long ret_ip);
+#else /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
+
+static inline bool addr_has_metadata(const void *addr)
+{
+ return true;
+}
+
+#endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */
+
+#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS)
+void print_tags(u8 addr_tag, const void *addr);
+#else
+static inline void print_tags(u8 addr_tag, const void *addr) { }
+#endif
+
void *find_first_bad_addr(void *addr, size_t size);
const char *get_bug_type(struct kasan_access_info *info);
+void metadata_fetch_row(char *buffer, void *row);
+
+#if defined(CONFIG_KASAN_GENERIC) && CONFIG_KASAN_STACK
+void print_address_stack_frame(const void *addr);
+#else
+static inline void print_address_stack_frame(const void *addr) { }
+#endif
bool kasan_report(unsigned long addr, size_t size,
bool is_write, unsigned long ip);
@@ -180,49 +244,92 @@ struct kasan_track *kasan_get_free_track(struct kmem_cache *cache,
#if defined(CONFIG_KASAN_GENERIC) && \
(defined(CONFIG_SLAB) || defined(CONFIG_SLUB))
-void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache);
+bool quarantine_put(struct kmem_cache *cache, void *object);
void quarantine_reduce(void);
void quarantine_remove_cache(struct kmem_cache *cache);
#else
-static inline void quarantine_put(struct kasan_free_meta *info,
- struct kmem_cache *cache) { }
+static inline bool quarantine_put(struct kmem_cache *cache, void *object) { return false; }
static inline void quarantine_reduce(void) { }
static inline void quarantine_remove_cache(struct kmem_cache *cache) { }
#endif
-#ifdef CONFIG_KASAN_SW_TAGS
+#ifndef arch_kasan_set_tag
+static inline const void *arch_kasan_set_tag(const void *addr, u8 tag)
+{
+ return addr;
+}
+#endif
+#ifndef arch_kasan_get_tag
+#define arch_kasan_get_tag(addr) 0
+#endif
-void print_tags(u8 addr_tag, const void *addr);
+#define set_tag(addr, tag) ((void *)arch_kasan_set_tag((addr), (tag)))
+#define get_tag(addr) arch_kasan_get_tag(addr)
-u8 random_tag(void);
+#ifdef CONFIG_KASAN_HW_TAGS
+
+#ifndef arch_enable_tagging
+#define arch_enable_tagging()
+#endif
+#ifndef arch_init_tags
+#define arch_init_tags(max_tag)
+#endif
+#ifndef arch_get_random_tag
+#define arch_get_random_tag() (0xFF)
+#endif
+#ifndef arch_get_mem_tag
+#define arch_get_mem_tag(addr) (0xFF)
+#endif
+#ifndef arch_set_mem_tag_range
+#define arch_set_mem_tag_range(addr, size, tag) ((void *)(addr))
+#endif
+
+#define hw_enable_tagging() arch_enable_tagging()
+#define hw_init_tags(max_tag) arch_init_tags(max_tag)
+#define hw_get_random_tag() arch_get_random_tag()
+#define hw_get_mem_tag(addr) arch_get_mem_tag(addr)
+#define hw_set_mem_tag_range(addr, size, tag) arch_set_mem_tag_range((addr), (size), (tag))
+#endif /* CONFIG_KASAN_HW_TAGS */
+
+#ifdef CONFIG_KASAN_SW_TAGS
+u8 random_tag(void);
+#elif defined(CONFIG_KASAN_HW_TAGS)
+static inline u8 random_tag(void) { return hw_get_random_tag(); }
#else
+static inline u8 random_tag(void) { return 0; }
+#endif
-static inline void print_tags(u8 addr_tag, const void *addr) { }
+#ifdef CONFIG_KASAN_HW_TAGS
-static inline u8 random_tag(void)
+static inline void poison_range(const void *address, size_t size, u8 value)
{
- return 0;
+ hw_set_mem_tag_range(kasan_reset_tag(address),
+ round_up(size, KASAN_GRANULE_SIZE), value);
}
-#endif
+static inline void unpoison_range(const void *address, size_t size)
+{
+ hw_set_mem_tag_range(kasan_reset_tag(address),
+ round_up(size, KASAN_GRANULE_SIZE), get_tag(address));
+}
-#ifndef arch_kasan_set_tag
-static inline const void *arch_kasan_set_tag(const void *addr, u8 tag)
+static inline bool check_invalid_free(void *addr)
{
- return addr;
+ u8 ptr_tag = get_tag(addr);
+ u8 mem_tag = hw_get_mem_tag(addr);
+
+ return (mem_tag == KASAN_TAG_INVALID) ||
+ (ptr_tag != KASAN_TAG_KERNEL && ptr_tag != mem_tag);
}
-#endif
-#ifndef arch_kasan_reset_tag
-#define arch_kasan_reset_tag(addr) ((void *)(addr))
-#endif
-#ifndef arch_kasan_get_tag
-#define arch_kasan_get_tag(addr) 0
-#endif
-#define set_tag(addr, tag) ((void *)arch_kasan_set_tag((addr), (tag)))
-#define reset_tag(addr) ((void *)arch_kasan_reset_tag(addr))
-#define get_tag(addr) arch_kasan_get_tag(addr)
+#else /* CONFIG_KASAN_HW_TAGS */
+
+void poison_range(const void *address, size_t size, u8 value);
+void unpoison_range(const void *address, size_t size);
+bool check_invalid_free(void *addr);
+
+#endif /* CONFIG_KASAN_HW_TAGS */
/*
* Exported functions for interfaces called from assembly or from generated
diff --git a/mm/kasan/quarantine.c b/mm/kasan/quarantine.c
index 4c5375810449..55783125a767 100644
--- a/mm/kasan/quarantine.c
+++ b/mm/kasan/quarantine.c
@@ -6,16 +6,6 @@
* Copyright (C) 2016 Google, Inc.
*
* Based on code by Dmitry Chernenkov.
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public License
- * version 2 as published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful, but
- * WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- * General Public License for more details.
- *
*/
#include <linux/gfp.h>
@@ -29,6 +19,7 @@
#include <linux/srcu.h>
#include <linux/string.h>
#include <linux/types.h>
+#include <linux/cpuhotplug.h>
#include "../slab.h"
#include "kasan.h"
@@ -43,6 +34,7 @@ struct qlist_head {
struct qlist_node *head;
struct qlist_node *tail;
size_t bytes;
+ bool offline;
};
#define QLIST_INIT { NULL, NULL, 0 }
@@ -145,7 +137,12 @@ static void qlink_free(struct qlist_node *qlink, struct kmem_cache *cache)
if (IS_ENABLED(CONFIG_SLAB))
local_irq_save(flags);
+ /*
+ * As the object now gets freed from the quaratine, assume that its
+ * free track is no longer valid.
+ */
*(u8 *)kasan_mem_to_shadow(object) = KASAN_KMALLOC_FREE;
+
___cache_free(cache, object, _THIS_IP_);
if (IS_ENABLED(CONFIG_SLAB))
@@ -171,11 +168,19 @@ static void qlist_free_all(struct qlist_head *q, struct kmem_cache *cache)
qlist_init(q);
}
-void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache)
+bool quarantine_put(struct kmem_cache *cache, void *object)
{
unsigned long flags;
struct qlist_head *q;
struct qlist_head temp = QLIST_INIT;
+ struct kasan_free_meta *meta = kasan_get_free_meta(cache, object);
+
+ /*
+ * If there's no metadata for this object, don't put it into
+ * quarantine.
+ */
+ if (!meta)
+ return false;
/*
* Note: irq must be disabled until after we move the batch to the
@@ -188,7 +193,11 @@ void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache)
local_irq_save(flags);
q = this_cpu_ptr(&cpu_quarantine);
- qlist_put(q, &info->quarantine_link, cache->size);
+ if (q->offline) {
+ local_irq_restore(flags);
+ return false;
+ }
+ qlist_put(q, &meta->quarantine_link, cache->size);
if (unlikely(q->bytes > QUARANTINE_PERCPU_SIZE)) {
qlist_move_all(q, &temp);
@@ -209,6 +218,8 @@ void quarantine_put(struct kasan_free_meta *info, struct kmem_cache *cache)
}
local_irq_restore(flags);
+
+ return true;
}
void quarantine_reduce(void)
@@ -328,3 +339,36 @@ void quarantine_remove_cache(struct kmem_cache *cache)
synchronize_srcu(&remove_cache_srcu);
}
+
+static int kasan_cpu_online(unsigned int cpu)
+{
+ this_cpu_ptr(&cpu_quarantine)->offline = false;
+ return 0;
+}
+
+static int kasan_cpu_offline(unsigned int cpu)
+{
+ struct qlist_head *q;
+
+ q = this_cpu_ptr(&cpu_quarantine);
+ /* Ensure the ordering between the writing to q->offline and
+ * qlist_free_all. Otherwise, cpu_quarantine may be corrupted
+ * by interrupt.
+ */
+ WRITE_ONCE(q->offline, true);
+ barrier();
+ qlist_free_all(q, NULL);
+ return 0;
+}
+
+static int __init kasan_cpu_quarantine_init(void)
+{
+ int ret = 0;
+
+ ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "mm/kasan:online",
+ kasan_cpu_online, kasan_cpu_offline);
+ if (ret < 0)
+ pr_err("kasan cpu quarantine register failed [%d]\n", ret);
+ return ret;
+}
+late_initcall(kasan_cpu_quarantine_init);
diff --git a/mm/kasan/report.c b/mm/kasan/report.c
index 00a53f1355ae..c0fb21797550 100644
--- a/mm/kasan/report.c
+++ b/mm/kasan/report.c
@@ -1,17 +1,12 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * This file contains common generic and tag-based KASAN error reporting code.
+ * This file contains common KASAN error reporting code.
*
* Copyright (c) 2014 Samsung Electronics Co., Ltd.
* Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
*
* Some code borrowed from https://github.com/xairy/kasan-prototype by
* Andrey Konovalov <andreyknvl@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
*/
#include <linux/bitops.h>
@@ -38,12 +33,6 @@
#include "kasan.h"
#include "../slab.h"
-/* Shadow layout customization. */
-#define SHADOW_BYTES_PER_BLOCK 1
-#define SHADOW_BLOCKS_PER_ROW 16
-#define SHADOW_BYTES_PER_ROW (SHADOW_BLOCKS_PER_ROW * SHADOW_BYTES_PER_BLOCK)
-#define SHADOW_ROWS_AROUND_ADDR 2
-
static unsigned long kasan_flags;
#define KASAN_BIT_REPORTED 0
@@ -73,9 +62,14 @@ static void print_error_description(struct kasan_access_info *info)
{
pr_err("BUG: KASAN: %s in %pS\n",
get_bug_type(info), (void *)info->ip);
- pr_err("%s of size %zu at addr %px by task %s/%d\n",
- info->is_write ? "Write" : "Read", info->access_size,
- info->access_addr, current->comm, task_pid_nr(current));
+ if (info->access_size)
+ pr_err("%s of size %zu at addr %px by task %s/%d\n",
+ info->is_write ? "Write" : "Read", info->access_size,
+ info->access_addr, current->comm, task_pid_nr(current));
+ else
+ pr_err("%s at addr %px by task %s/%d\n",
+ info->is_write ? "Write" : "Read",
+ info->access_addr, current->comm, task_pid_nr(current));
}
static DEFINE_SPINLOCK(report_lock);
@@ -105,6 +99,10 @@ static void end_report(unsigned long *flags)
panic_on_warn = 0;
panic("panic_on_warn set ...\n");
}
+#ifdef CONFIG_KASAN_HW_TAGS
+ if (kasan_flag_panic)
+ panic("kasan.fault=panic set ...\n");
+#endif
kasan_enable_current();
}
@@ -167,36 +165,45 @@ static void describe_object_addr(struct kmem_cache *cache, void *object,
(void *)(object_addr + cache->object_size));
}
-static void describe_object(struct kmem_cache *cache, void *object,
- const void *addr, u8 tag)
+static void describe_object_stacks(struct kmem_cache *cache, void *object,
+ const void *addr, u8 tag)
{
- struct kasan_alloc_meta *alloc_info = get_alloc_info(cache, object);
+ struct kasan_alloc_meta *alloc_meta;
+ struct kasan_track *free_track;
- if (cache->flags & SLAB_KASAN) {
- struct kasan_track *free_track;
+ alloc_meta = kasan_get_alloc_meta(cache, object);
+ if (alloc_meta) {
+ print_track(&alloc_meta->alloc_track, "Allocated");
+ pr_err("\n");
+ }
- print_track(&alloc_info->alloc_track, "Allocated");
+ free_track = kasan_get_free_track(cache, object, tag);
+ if (free_track) {
+ print_track(free_track, "Freed");
pr_err("\n");
- free_track = kasan_get_free_track(cache, object, tag);
- if (free_track) {
- print_track(free_track, "Freed");
- pr_err("\n");
- }
+ }
#ifdef CONFIG_KASAN_GENERIC
- if (alloc_info->aux_stack[0]) {
- pr_err("Last call_rcu():\n");
- print_stack(alloc_info->aux_stack[0]);
- pr_err("\n");
- }
- if (alloc_info->aux_stack[1]) {
- pr_err("Second to last call_rcu():\n");
- print_stack(alloc_info->aux_stack[1]);
- pr_err("\n");
- }
-#endif
+ if (!alloc_meta)
+ return;
+ if (alloc_meta->aux_stack[0]) {
+ pr_err("Last potentially related work creation:\n");
+ print_stack(alloc_meta->aux_stack[0]);
+ pr_err("\n");
}
+ if (alloc_meta->aux_stack[1]) {
+ pr_err("Second to last potentially related work creation:\n");
+ print_stack(alloc_meta->aux_stack[1]);
+ pr_err("\n");
+ }
+#endif
+}
+static void describe_object(struct kmem_cache *cache, void *object,
+ const void *addr, u8 tag)
+{
+ if (kasan_stack_collection_enabled())
+ describe_object_stacks(cache, object, addr, tag);
describe_object_addr(cache, object, addr);
}
@@ -216,168 +223,6 @@ static inline bool init_task_stack_addr(const void *addr)
sizeof(init_thread_union.stack));
}
-static bool __must_check tokenize_frame_descr(const char **frame_descr,
- char *token, size_t max_tok_len,
- unsigned long *value)
-{
- const char *sep = strchr(*frame_descr, ' ');
-
- if (sep == NULL)
- sep = *frame_descr + strlen(*frame_descr);
-
- if (token != NULL) {
- const size_t tok_len = sep - *frame_descr;
-
- if (tok_len + 1 > max_tok_len) {
- pr_err("KASAN internal error: frame description too long: %s\n",
- *frame_descr);
- return false;
- }
-
- /* Copy token (+ 1 byte for '\0'). */
- strlcpy(token, *frame_descr, tok_len + 1);
- }
-
- /* Advance frame_descr past separator. */
- *frame_descr = sep + 1;
-
- if (value != NULL && kstrtoul(token, 10, value)) {
- pr_err("KASAN internal error: not a valid number: %s\n", token);
- return false;
- }
-
- return true;
-}
-
-static void print_decoded_frame_descr(const char *frame_descr)
-{
- /*
- * We need to parse the following string:
- * "n alloc_1 alloc_2 ... alloc_n"
- * where alloc_i looks like
- * "offset size len name"
- * or "offset size len name:line".
- */
-
- char token[64];
- unsigned long num_objects;
-
- if (!tokenize_frame_descr(&frame_descr, token, sizeof(token),
- &num_objects))
- return;
-
- pr_err("\n");
- pr_err("this frame has %lu %s:\n", num_objects,
- num_objects == 1 ? "object" : "objects");
-
- while (num_objects--) {
- unsigned long offset;
- unsigned long size;
-
- /* access offset */
- if (!tokenize_frame_descr(&frame_descr, token, sizeof(token),
- &offset))
- return;
- /* access size */
- if (!tokenize_frame_descr(&frame_descr, token, sizeof(token),
- &size))
- return;
- /* name length (unused) */
- if (!tokenize_frame_descr(&frame_descr, NULL, 0, NULL))
- return;
- /* object name */
- if (!tokenize_frame_descr(&frame_descr, token, sizeof(token),
- NULL))
- return;
-
- /* Strip line number; without filename it's not very helpful. */
- strreplace(token, ':', '\0');
-
- /* Finally, print object information. */
- pr_err(" [%lu, %lu) '%s'", offset, offset + size, token);
- }
-}
-
-static bool __must_check get_address_stack_frame_info(const void *addr,
- unsigned long *offset,
- const char **frame_descr,
- const void **frame_pc)
-{
- unsigned long aligned_addr;
- unsigned long mem_ptr;
- const u8 *shadow_bottom;
- const u8 *shadow_ptr;
- const unsigned long *frame;
-
- BUILD_BUG_ON(IS_ENABLED(CONFIG_STACK_GROWSUP));
-
- /*
- * NOTE: We currently only support printing frame information for
- * accesses to the task's own stack.
- */
- if (!object_is_on_stack(addr))
- return false;
-
- aligned_addr = round_down((unsigned long)addr, sizeof(long));
- mem_ptr = round_down(aligned_addr, KASAN_SHADOW_SCALE_SIZE);
- shadow_ptr = kasan_mem_to_shadow((void *)aligned_addr);
- shadow_bottom = kasan_mem_to_shadow(end_of_stack(current));
-
- while (shadow_ptr >= shadow_bottom && *shadow_ptr != KASAN_STACK_LEFT) {
- shadow_ptr--;
- mem_ptr -= KASAN_SHADOW_SCALE_SIZE;
- }
-
- while (shadow_ptr >= shadow_bottom && *shadow_ptr == KASAN_STACK_LEFT) {
- shadow_ptr--;
- mem_ptr -= KASAN_SHADOW_SCALE_SIZE;
- }
-
- if (shadow_ptr < shadow_bottom)
- return false;
-
- frame = (const unsigned long *)(mem_ptr + KASAN_SHADOW_SCALE_SIZE);
- if (frame[0] != KASAN_CURRENT_STACK_FRAME_MAGIC) {
- pr_err("KASAN internal error: frame info validation failed; invalid marker: %lu\n",
- frame[0]);
- return false;
- }
-
- *offset = (unsigned long)addr - (unsigned long)frame;
- *frame_descr = (const char *)frame[1];
- *frame_pc = (void *)frame[2];
-
- return true;
-}
-
-static void print_address_stack_frame(const void *addr)
-{
- unsigned long offset;
- const char *frame_descr;
- const void *frame_pc;
-
- if (IS_ENABLED(CONFIG_KASAN_SW_TAGS))
- return;
-
- if (!get_address_stack_frame_info(addr, &offset, &frame_descr,
- &frame_pc))
- return;
-
- /*
- * get_address_stack_frame_info only returns true if the given addr is
- * on the current task's stack.
- */
- pr_err("\n");
- pr_err("addr %px is located in stack of task %s/%d at offset %lu in frame:\n",
- addr, current->comm, task_pid_nr(current), offset);
- pr_err(" %pS\n", frame_pc);
-
- if (!frame_descr)
- return;
-
- print_decoded_frame_descr(frame_descr);
-}
-
static void print_address_description(void *addr, u8 tag)
{
struct page *page = kasan_addr_to_page(addr);
@@ -405,62 +250,68 @@ static void print_address_description(void *addr, u8 tag)
print_address_stack_frame(addr);
}
-static bool row_is_guilty(const void *row, const void *guilty)
+static bool meta_row_is_guilty(const void *row, const void *addr)
{
- return (row <= guilty) && (guilty < row + SHADOW_BYTES_PER_ROW);
+ return (row <= addr) && (addr < row + META_MEM_BYTES_PER_ROW);
}
-static int shadow_pointer_offset(const void *row, const void *shadow)
+static int meta_pointer_offset(const void *row, const void *addr)
{
- /* The length of ">ff00ff00ff00ff00: " is
- * 3 + (BITS_PER_LONG/8)*2 chars.
+ /*
+ * Memory state around the buggy address:
+ * ff00ff00ff00ff00: 00 00 00 05 fe fe fe fe fe fe fe fe fe fe fe fe
+ * ...
+ *
+ * The length of ">ff00ff00ff00ff00: " is
+ * 3 + (BITS_PER_LONG / 8) * 2 chars.
+ * The length of each granule metadata is 2 bytes
+ * plus 1 byte for space.
*/
- return 3 + (BITS_PER_LONG/8)*2 + (shadow - row)*2 +
- (shadow - row) / SHADOW_BYTES_PER_BLOCK + 1;
+ return 3 + (BITS_PER_LONG / 8) * 2 +
+ (addr - row) / KASAN_GRANULE_SIZE * 3 + 1;
}
-static void print_shadow_for_address(const void *addr)
+static void print_memory_metadata(const void *addr)
{
int i;
- const void *shadow = kasan_mem_to_shadow(addr);
- const void *shadow_row;
+ void *row;
- shadow_row = (void *)round_down((unsigned long)shadow,
- SHADOW_BYTES_PER_ROW)
- - SHADOW_ROWS_AROUND_ADDR * SHADOW_BYTES_PER_ROW;
+ row = (void *)round_down((unsigned long)addr, META_MEM_BYTES_PER_ROW)
+ - META_ROWS_AROUND_ADDR * META_MEM_BYTES_PER_ROW;
pr_err("Memory state around the buggy address:\n");
- for (i = -SHADOW_ROWS_AROUND_ADDR; i <= SHADOW_ROWS_AROUND_ADDR; i++) {
- const void *kaddr = kasan_shadow_to_mem(shadow_row);
- char buffer[4 + (BITS_PER_LONG/8)*2];
- char shadow_buf[SHADOW_BYTES_PER_ROW];
+ for (i = -META_ROWS_AROUND_ADDR; i <= META_ROWS_AROUND_ADDR; i++) {
+ char buffer[4 + (BITS_PER_LONG / 8) * 2];
+ char metadata[META_BYTES_PER_ROW];
snprintf(buffer, sizeof(buffer),
- (i == 0) ? ">%px: " : " %px: ", kaddr);
+ (i == 0) ? ">%px: " : " %px: ", row);
+
/*
* We should not pass a shadow pointer to generic
* function, because generic functions may try to
* access kasan mapping for the passed address.
*/
- memcpy(shadow_buf, shadow_row, SHADOW_BYTES_PER_ROW);
+ metadata_fetch_row(&metadata[0], row);
+
print_hex_dump(KERN_ERR, buffer,
- DUMP_PREFIX_NONE, SHADOW_BYTES_PER_ROW, 1,
- shadow_buf, SHADOW_BYTES_PER_ROW, 0);
+ DUMP_PREFIX_NONE, META_BYTES_PER_ROW, 1,
+ metadata, META_BYTES_PER_ROW, 0);
- if (row_is_guilty(shadow_row, shadow))
- pr_err("%*c\n",
- shadow_pointer_offset(shadow_row, shadow),
- '^');
+ if (meta_row_is_guilty(row, addr))
+ pr_err("%*c\n", meta_pointer_offset(row, addr), '^');
- shadow_row += SHADOW_BYTES_PER_ROW;
+ row += META_MEM_BYTES_PER_ROW;
}
}
static bool report_enabled(void)
{
+#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
if (current->kasan_depth)
return false;
+#endif
if (test_bit(KASAN_BIT_MULTI_SHOT, &kasan_flags))
return true;
return !test_and_set_bit(KASAN_BIT_REPORTED, &kasan_flags);
@@ -490,7 +341,7 @@ void kasan_report_invalid_free(void *object, unsigned long ip)
unsigned long flags;
u8 tag = get_tag(object);
- object = reset_tag(object);
+ object = kasan_reset_tag(object);
#if IS_ENABLED(CONFIG_KUNIT)
if (current->kunit_test)
@@ -503,7 +354,7 @@ void kasan_report_invalid_free(void *object, unsigned long ip)
pr_err("\n");
print_address_description(object, tag);
pr_err("\n");
- print_shadow_for_address(object);
+ print_memory_metadata(object);
end_report(&flags);
}
@@ -523,10 +374,10 @@ static void __kasan_report(unsigned long addr, size_t size, bool is_write,
disable_trace_on_warning();
tagged_addr = (void *)addr;
- untagged_addr = reset_tag(tagged_addr);
+ untagged_addr = kasan_reset_tag(tagged_addr);
info.access_addr = tagged_addr;
- if (addr_has_shadow(untagged_addr))
+ if (addr_has_metadata(untagged_addr))
info.first_bad_addr = find_first_bad_addr(tagged_addr, size);
else
info.first_bad_addr = untagged_addr;
@@ -537,14 +388,14 @@ static void __kasan_report(unsigned long addr, size_t size, bool is_write,
start_report(&flags);
print_error_description(&info);
- if (addr_has_shadow(untagged_addr))
+ if (addr_has_metadata(untagged_addr))
print_tags(get_tag(tagged_addr), info.first_bad_addr);
pr_err("\n");
- if (addr_has_shadow(untagged_addr)) {
+ if (addr_has_metadata(untagged_addr)) {
print_address_description(untagged_addr, get_tag(tagged_addr));
pr_err("\n");
- print_shadow_for_address(info.first_bad_addr);
+ print_memory_metadata(info.first_bad_addr);
} else {
dump_stack();
}
@@ -604,6 +455,6 @@ void kasan_non_canonical_hook(unsigned long addr)
else
bug_type = "maybe wild-memory-access";
pr_alert("KASAN: %s in range [0x%016lx-0x%016lx]\n", bug_type,
- orig_addr, orig_addr + KASAN_SHADOW_MASK);
+ orig_addr, orig_addr + KASAN_GRANULE_SIZE - 1);
}
#endif
diff --git a/mm/kasan/report_generic.c b/mm/kasan/report_generic.c
new file mode 100644
index 000000000000..8a9c889872da
--- /dev/null
+++ b/mm/kasan/report_generic.c
@@ -0,0 +1,327 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This file contains generic KASAN specific error reporting code.
+ *
+ * Copyright (c) 2014 Samsung Electronics Co., Ltd.
+ * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
+ *
+ * Some code borrowed from https://github.com/xairy/kasan-prototype by
+ * Andrey Konovalov <andreyknvl@gmail.com>
+ */
+
+#include <linux/bitops.h>
+#include <linux/ftrace.h>
+#include <linux/init.h>
+#include <linux/kernel.h>
+#include <linux/mm.h>
+#include <linux/printk.h>
+#include <linux/sched.h>
+#include <linux/sched/task_stack.h>
+#include <linux/slab.h>
+#include <linux/stackdepot.h>
+#include <linux/stacktrace.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/kasan.h>
+#include <linux/module.h>
+
+#include <asm/sections.h>
+
+#include "kasan.h"
+#include "../slab.h"
+
+void *find_first_bad_addr(void *addr, size_t size)
+{
+ void *p = addr;
+
+ while (p < addr + size && !(*(u8 *)kasan_mem_to_shadow(p)))
+ p += KASAN_GRANULE_SIZE;
+ return p;
+}
+
+static const char *get_shadow_bug_type(struct kasan_access_info *info)
+{
+ const char *bug_type = "unknown-crash";
+ u8 *shadow_addr;
+
+ shadow_addr = (u8 *)kasan_mem_to_shadow(info->first_bad_addr);
+
+ /*
+ * If shadow byte value is in [0, KASAN_GRANULE_SIZE) we can look
+ * at the next shadow byte to determine the type of the bad access.
+ */
+ if (*shadow_addr > 0 && *shadow_addr <= KASAN_GRANULE_SIZE - 1)
+ shadow_addr++;
+
+ switch (*shadow_addr) {
+ case 0 ... KASAN_GRANULE_SIZE - 1:
+ /*
+ * In theory it's still possible to see these shadow values
+ * due to a data race in the kernel code.
+ */
+ bug_type = "out-of-bounds";
+ break;
+ case KASAN_PAGE_REDZONE:
+ case KASAN_KMALLOC_REDZONE:
+ bug_type = "slab-out-of-bounds";
+ break;
+ case KASAN_GLOBAL_REDZONE:
+ bug_type = "global-out-of-bounds";
+ break;
+ case KASAN_STACK_LEFT:
+ case KASAN_STACK_MID:
+ case KASAN_STACK_RIGHT:
+ case KASAN_STACK_PARTIAL:
+ bug_type = "stack-out-of-bounds";
+ break;
+ case KASAN_FREE_PAGE:
+ case KASAN_KMALLOC_FREE:
+ case KASAN_KMALLOC_FREETRACK:
+ bug_type = "use-after-free";
+ break;
+ case KASAN_ALLOCA_LEFT:
+ case KASAN_ALLOCA_RIGHT:
+ bug_type = "alloca-out-of-bounds";
+ break;
+ case KASAN_VMALLOC_INVALID:
+ bug_type = "vmalloc-out-of-bounds";
+ break;
+ }
+
+ return bug_type;
+}
+
+static const char *get_wild_bug_type(struct kasan_access_info *info)
+{
+ const char *bug_type = "unknown-crash";
+
+ if ((unsigned long)info->access_addr < PAGE_SIZE)
+ bug_type = "null-ptr-deref";
+ else if ((unsigned long)info->access_addr < TASK_SIZE)
+ bug_type = "user-memory-access";
+ else
+ bug_type = "wild-memory-access";
+
+ return bug_type;
+}
+
+const char *get_bug_type(struct kasan_access_info *info)
+{
+ /*
+ * If access_size is a negative number, then it has reason to be
+ * defined as out-of-bounds bug type.
+ *
+ * Casting negative numbers to size_t would indeed turn up as
+ * a large size_t and its value will be larger than ULONG_MAX/2,
+ * so that this can qualify as out-of-bounds.
+ */
+ if (info->access_addr + info->access_size < info->access_addr)
+ return "out-of-bounds";
+
+ if (addr_has_metadata(info->access_addr))
+ return get_shadow_bug_type(info);
+ return get_wild_bug_type(info);
+}
+
+void metadata_fetch_row(char *buffer, void *row)
+{
+ memcpy(buffer, kasan_mem_to_shadow(row), META_BYTES_PER_ROW);
+}
+
+#if CONFIG_KASAN_STACK
+static bool __must_check tokenize_frame_descr(const char **frame_descr,
+ char *token, size_t max_tok_len,
+ unsigned long *value)
+{
+ const char *sep = strchr(*frame_descr, ' ');
+
+ if (sep == NULL)
+ sep = *frame_descr + strlen(*frame_descr);
+
+ if (token != NULL) {
+ const size_t tok_len = sep - *frame_descr;
+
+ if (tok_len + 1 > max_tok_len) {
+ pr_err("KASAN internal error: frame description too long: %s\n",
+ *frame_descr);
+ return false;
+ }
+
+ /* Copy token (+ 1 byte for '\0'). */
+ strlcpy(token, *frame_descr, tok_len + 1);
+ }
+
+ /* Advance frame_descr past separator. */
+ *frame_descr = sep + 1;
+
+ if (value != NULL && kstrtoul(token, 10, value)) {
+ pr_err("KASAN internal error: not a valid number: %s\n", token);
+ return false;
+ }
+
+ return true;
+}
+
+static void print_decoded_frame_descr(const char *frame_descr)
+{
+ /*
+ * We need to parse the following string:
+ * "n alloc_1 alloc_2 ... alloc_n"
+ * where alloc_i looks like
+ * "offset size len name"
+ * or "offset size len name:line".
+ */
+
+ char token[64];
+ unsigned long num_objects;
+
+ if (!tokenize_frame_descr(&frame_descr, token, sizeof(token),
+ &num_objects))
+ return;
+
+ pr_err("\n");
+ pr_err("this frame has %lu %s:\n", num_objects,
+ num_objects == 1 ? "object" : "objects");
+
+ while (num_objects--) {
+ unsigned long offset;
+ unsigned long size;
+
+ /* access offset */
+ if (!tokenize_frame_descr(&frame_descr, token, sizeof(token),
+ &offset))
+ return;
+ /* access size */
+ if (!tokenize_frame_descr(&frame_descr, token, sizeof(token),
+ &size))
+ return;
+ /* name length (unused) */
+ if (!tokenize_frame_descr(&frame_descr, NULL, 0, NULL))
+ return;
+ /* object name */
+ if (!tokenize_frame_descr(&frame_descr, token, sizeof(token),
+ NULL))
+ return;
+
+ /* Strip line number; without filename it's not very helpful. */
+ strreplace(token, ':', '\0');
+
+ /* Finally, print object information. */
+ pr_err(" [%lu, %lu) '%s'", offset, offset + size, token);
+ }
+}
+
+static bool __must_check get_address_stack_frame_info(const void *addr,
+ unsigned long *offset,
+ const char **frame_descr,
+ const void **frame_pc)
+{
+ unsigned long aligned_addr;
+ unsigned long mem_ptr;
+ const u8 *shadow_bottom;
+ const u8 *shadow_ptr;
+ const unsigned long *frame;
+
+ BUILD_BUG_ON(IS_ENABLED(CONFIG_STACK_GROWSUP));
+
+ /*
+ * NOTE: We currently only support printing frame information for
+ * accesses to the task's own stack.
+ */
+ if (!object_is_on_stack(addr))
+ return false;
+
+ aligned_addr = round_down((unsigned long)addr, sizeof(long));
+ mem_ptr = round_down(aligned_addr, KASAN_GRANULE_SIZE);
+ shadow_ptr = kasan_mem_to_shadow((void *)aligned_addr);
+ shadow_bottom = kasan_mem_to_shadow(end_of_stack(current));
+
+ while (shadow_ptr >= shadow_bottom && *shadow_ptr != KASAN_STACK_LEFT) {
+ shadow_ptr--;
+ mem_ptr -= KASAN_GRANULE_SIZE;
+ }
+
+ while (shadow_ptr >= shadow_bottom && *shadow_ptr == KASAN_STACK_LEFT) {
+ shadow_ptr--;
+ mem_ptr -= KASAN_GRANULE_SIZE;
+ }
+
+ if (shadow_ptr < shadow_bottom)
+ return false;
+
+ frame = (const unsigned long *)(mem_ptr + KASAN_GRANULE_SIZE);
+ if (frame[0] != KASAN_CURRENT_STACK_FRAME_MAGIC) {
+ pr_err("KASAN internal error: frame info validation failed; invalid marker: %lu\n",
+ frame[0]);
+ return false;
+ }
+
+ *offset = (unsigned long)addr - (unsigned long)frame;
+ *frame_descr = (const char *)frame[1];
+ *frame_pc = (void *)frame[2];
+
+ return true;
+}
+
+void print_address_stack_frame(const void *addr)
+{
+ unsigned long offset;
+ const char *frame_descr;
+ const void *frame_pc;
+
+ if (!get_address_stack_frame_info(addr, &offset, &frame_descr,
+ &frame_pc))
+ return;
+
+ /*
+ * get_address_stack_frame_info only returns true if the given addr is
+ * on the current task's stack.
+ */
+ pr_err("\n");
+ pr_err("addr %px is located in stack of task %s/%d at offset %lu in frame:\n",
+ addr, current->comm, task_pid_nr(current), offset);
+ pr_err(" %pS\n", frame_pc);
+
+ if (!frame_descr)
+ return;
+
+ print_decoded_frame_descr(frame_descr);
+}
+#endif /* CONFIG_KASAN_STACK */
+
+#define DEFINE_ASAN_REPORT_LOAD(size) \
+void __asan_report_load##size##_noabort(unsigned long addr) \
+{ \
+ kasan_report(addr, size, false, _RET_IP_); \
+} \
+EXPORT_SYMBOL(__asan_report_load##size##_noabort)
+
+#define DEFINE_ASAN_REPORT_STORE(size) \
+void __asan_report_store##size##_noabort(unsigned long addr) \
+{ \
+ kasan_report(addr, size, true, _RET_IP_); \
+} \
+EXPORT_SYMBOL(__asan_report_store##size##_noabort)
+
+DEFINE_ASAN_REPORT_LOAD(1);
+DEFINE_ASAN_REPORT_LOAD(2);
+DEFINE_ASAN_REPORT_LOAD(4);
+DEFINE_ASAN_REPORT_LOAD(8);
+DEFINE_ASAN_REPORT_LOAD(16);
+DEFINE_ASAN_REPORT_STORE(1);
+DEFINE_ASAN_REPORT_STORE(2);
+DEFINE_ASAN_REPORT_STORE(4);
+DEFINE_ASAN_REPORT_STORE(8);
+DEFINE_ASAN_REPORT_STORE(16);
+
+void __asan_report_load_n_noabort(unsigned long addr, size_t size)
+{
+ kasan_report(addr, size, false, _RET_IP_);
+}
+EXPORT_SYMBOL(__asan_report_load_n_noabort);
+
+void __asan_report_store_n_noabort(unsigned long addr, size_t size)
+{
+ kasan_report(addr, size, true, _RET_IP_);
+}
+EXPORT_SYMBOL(__asan_report_store_n_noabort);
diff --git a/mm/kasan/report_hw_tags.c b/mm/kasan/report_hw_tags.c
new file mode 100644
index 000000000000..57114f0e14d1
--- /dev/null
+++ b/mm/kasan/report_hw_tags.c
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This file contains hardware tag-based KASAN specific error reporting code.
+ *
+ * Copyright (c) 2020 Google, Inc.
+ * Author: Andrey Konovalov <andreyknvl@google.com>
+ */
+
+#include <linux/kasan.h>
+#include <linux/kernel.h>
+#include <linux/memory.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/types.h>
+
+#include "kasan.h"
+
+const char *get_bug_type(struct kasan_access_info *info)
+{
+ return "invalid-access";
+}
+
+void *find_first_bad_addr(void *addr, size_t size)
+{
+ return kasan_reset_tag(addr);
+}
+
+void metadata_fetch_row(char *buffer, void *row)
+{
+ int i;
+
+ for (i = 0; i < META_BYTES_PER_ROW; i++)
+ buffer[i] = hw_get_mem_tag(row + i * KASAN_GRANULE_SIZE);
+}
+
+void print_tags(u8 addr_tag, const void *addr)
+{
+ u8 memory_tag = hw_get_mem_tag((void *)addr);
+
+ pr_err("Pointer tag: [%02x], memory tag: [%02x]\n",
+ addr_tag, memory_tag);
+}
diff --git a/mm/kasan/tags_report.c b/mm/kasan/report_sw_tags.c
index bee43717d6f0..1b026793ad57 100644
--- a/mm/kasan/tags_report.c
+++ b/mm/kasan/report_sw_tags.c
@@ -1,17 +1,12 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * This file contains tag-based KASAN specific error reporting code.
+ * This file contains software tag-based KASAN specific error reporting code.
*
* Copyright (c) 2014 Samsung Electronics Co., Ltd.
* Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
*
* Some code borrowed from https://github.com/xairy/kasan-prototype by
* Andrey Konovalov <andreyknvl@gmail.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
*/
#include <linux/bitops.h>
@@ -46,16 +41,19 @@ const char *get_bug_type(struct kasan_access_info *info)
int i;
tag = get_tag(info->access_addr);
- addr = reset_tag(info->access_addr);
+ addr = kasan_reset_tag(info->access_addr);
page = kasan_addr_to_page(addr);
if (page && PageSlab(page)) {
cache = page->slab_cache;
object = nearest_obj(cache, page, (void *)addr);
- alloc_meta = get_alloc_info(cache, object);
+ alloc_meta = kasan_get_alloc_meta(cache, object);
- for (i = 0; i < KASAN_NR_FREE_STACKS; i++)
- if (alloc_meta->free_pointer_tag[i] == tag)
- return "use-after-free";
+ if (alloc_meta) {
+ for (i = 0; i < KASAN_NR_FREE_STACKS; i++) {
+ if (alloc_meta->free_pointer_tag[i] == tag)
+ return "use-after-free";
+ }
+ }
return "out-of-bounds";
}
@@ -77,14 +75,19 @@ const char *get_bug_type(struct kasan_access_info *info)
void *find_first_bad_addr(void *addr, size_t size)
{
u8 tag = get_tag(addr);
- void *p = reset_tag(addr);
+ void *p = kasan_reset_tag(addr);
void *end = p + size;
while (p < end && tag == *(u8 *)kasan_mem_to_shadow(p))
- p += KASAN_SHADOW_SCALE_SIZE;
+ p += KASAN_GRANULE_SIZE;
return p;
}
+void metadata_fetch_row(char *buffer, void *row)
+{
+ memcpy(buffer, kasan_mem_to_shadow(row), META_BYTES_PER_ROW);
+}
+
void print_tags(u8 addr_tag, const void *addr)
{
u8 *shadow = (u8 *)kasan_mem_to_shadow(addr);
diff --git a/mm/kasan/shadow.c b/mm/kasan/shadow.c
new file mode 100644
index 000000000000..7c2c08c55f32
--- /dev/null
+++ b/mm/kasan/shadow.c
@@ -0,0 +1,504 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * This file contains KASAN runtime code that manages shadow memory for
+ * generic and software tag-based KASAN modes.
+ *
+ * Copyright (c) 2014 Samsung Electronics Co., Ltd.
+ * Author: Andrey Ryabinin <ryabinin.a.a@gmail.com>
+ *
+ * Some code borrowed from https://github.com/xairy/kasan-prototype by
+ * Andrey Konovalov <andreyknvl@gmail.com>
+ */
+
+#include <linux/init.h>
+#include <linux/kasan.h>
+#include <linux/kernel.h>
+#include <linux/kmemleak.h>
+#include <linux/memory.h>
+#include <linux/mm.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/vmalloc.h>
+
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+
+#include "kasan.h"
+
+bool __kasan_check_read(const volatile void *p, unsigned int size)
+{
+ return check_memory_region((unsigned long)p, size, false, _RET_IP_);
+}
+EXPORT_SYMBOL(__kasan_check_read);
+
+bool __kasan_check_write(const volatile void *p, unsigned int size)
+{
+ return check_memory_region((unsigned long)p, size, true, _RET_IP_);
+}
+EXPORT_SYMBOL(__kasan_check_write);
+
+#undef memset
+void *memset(void *addr, int c, size_t len)
+{
+ if (!check_memory_region((unsigned long)addr, len, true, _RET_IP_))
+ return NULL;
+
+ return __memset(addr, c, len);
+}
+
+#ifdef __HAVE_ARCH_MEMMOVE
+#undef memmove
+void *memmove(void *dest, const void *src, size_t len)
+{
+ if (!check_memory_region((unsigned long)src, len, false, _RET_IP_) ||
+ !check_memory_region((unsigned long)dest, len, true, _RET_IP_))
+ return NULL;
+
+ return __memmove(dest, src, len);
+}
+#endif
+
+#undef memcpy
+void *memcpy(void *dest, const void *src, size_t len)
+{
+ if (!check_memory_region((unsigned long)src, len, false, _RET_IP_) ||
+ !check_memory_region((unsigned long)dest, len, true, _RET_IP_))
+ return NULL;
+
+ return __memcpy(dest, src, len);
+}
+
+/*
+ * Poisons the shadow memory for 'size' bytes starting from 'addr'.
+ * Memory addresses should be aligned to KASAN_GRANULE_SIZE.
+ */
+void poison_range(const void *address, size_t size, u8 value)
+{
+ void *shadow_start, *shadow_end;
+
+ /*
+ * Perform shadow offset calculation based on untagged address, as
+ * some of the callers (e.g. kasan_poison_object_data) pass tagged
+ * addresses to this function.
+ */
+ address = kasan_reset_tag(address);
+ size = round_up(size, KASAN_GRANULE_SIZE);
+
+ shadow_start = kasan_mem_to_shadow(address);
+ shadow_end = kasan_mem_to_shadow(address + size);
+
+ __memset(shadow_start, value, shadow_end - shadow_start);
+}
+
+void unpoison_range(const void *address, size_t size)
+{
+ u8 tag = get_tag(address);
+
+ /*
+ * Perform shadow offset calculation based on untagged address, as
+ * some of the callers (e.g. kasan_unpoison_object_data) pass tagged
+ * addresses to this function.
+ */
+ address = kasan_reset_tag(address);
+
+ poison_range(address, size, tag);
+
+ if (size & KASAN_GRANULE_MASK) {
+ u8 *shadow = (u8 *)kasan_mem_to_shadow(address + size);
+
+ if (IS_ENABLED(CONFIG_KASAN_SW_TAGS))
+ *shadow = tag;
+ else /* CONFIG_KASAN_GENERIC */
+ *shadow = size & KASAN_GRANULE_MASK;
+ }
+}
+
+#ifdef CONFIG_MEMORY_HOTPLUG
+static bool shadow_mapped(unsigned long addr)
+{
+ pgd_t *pgd = pgd_offset_k(addr);
+ p4d_t *p4d;
+ pud_t *pud;
+ pmd_t *pmd;
+ pte_t *pte;
+
+ if (pgd_none(*pgd))
+ return false;
+ p4d = p4d_offset(pgd, addr);
+ if (p4d_none(*p4d))
+ return false;
+ pud = pud_offset(p4d, addr);
+ if (pud_none(*pud))
+ return false;
+
+ /*
+ * We can't use pud_large() or pud_huge(), the first one is
+ * arch-specific, the last one depends on HUGETLB_PAGE. So let's abuse
+ * pud_bad(), if pud is bad then it's bad because it's huge.
+ */
+ if (pud_bad(*pud))
+ return true;
+ pmd = pmd_offset(pud, addr);
+ if (pmd_none(*pmd))
+ return false;
+
+ if (pmd_bad(*pmd))
+ return true;
+ pte = pte_offset_kernel(pmd, addr);
+ return !pte_none(*pte);
+}
+
+static int __meminit kasan_mem_notifier(struct notifier_block *nb,
+ unsigned long action, void *data)
+{
+ struct memory_notify *mem_data = data;
+ unsigned long nr_shadow_pages, start_kaddr, shadow_start;
+ unsigned long shadow_end, shadow_size;
+
+ nr_shadow_pages = mem_data->nr_pages >> KASAN_SHADOW_SCALE_SHIFT;
+ start_kaddr = (unsigned long)pfn_to_kaddr(mem_data->start_pfn);
+ shadow_start = (unsigned long)kasan_mem_to_shadow((void *)start_kaddr);
+ shadow_size = nr_shadow_pages << PAGE_SHIFT;
+ shadow_end = shadow_start + shadow_size;
+
+ if (WARN_ON(mem_data->nr_pages % KASAN_GRANULE_SIZE) ||
+ WARN_ON(start_kaddr % KASAN_MEMORY_PER_SHADOW_PAGE))
+ return NOTIFY_BAD;
+
+ switch (action) {
+ case MEM_GOING_ONLINE: {
+ void *ret;
+
+ /*
+ * If shadow is mapped already than it must have been mapped
+ * during the boot. This could happen if we onlining previously
+ * offlined memory.
+ */
+ if (shadow_mapped(shadow_start))
+ return NOTIFY_OK;
+
+ ret = __vmalloc_node_range(shadow_size, PAGE_SIZE, shadow_start,
+ shadow_end, GFP_KERNEL,
+ PAGE_KERNEL, VM_NO_GUARD,
+ pfn_to_nid(mem_data->start_pfn),
+ __builtin_return_address(0));
+ if (!ret)
+ return NOTIFY_BAD;
+
+ kmemleak_ignore(ret);
+ return NOTIFY_OK;
+ }
+ case MEM_CANCEL_ONLINE:
+ case MEM_OFFLINE: {
+ struct vm_struct *vm;
+
+ /*
+ * shadow_start was either mapped during boot by kasan_init()
+ * or during memory online by __vmalloc_node_range().
+ * In the latter case we can use vfree() to free shadow.
+ * Non-NULL result of the find_vm_area() will tell us if
+ * that was the second case.
+ *
+ * Currently it's not possible to free shadow mapped
+ * during boot by kasan_init(). It's because the code
+ * to do that hasn't been written yet. So we'll just
+ * leak the memory.
+ */
+ vm = find_vm_area((void *)shadow_start);
+ if (vm)
+ vfree((void *)shadow_start);
+ }
+ }
+
+ return NOTIFY_OK;
+}
+
+static int __init kasan_memhotplug_init(void)
+{
+ hotplug_memory_notifier(kasan_mem_notifier, 0);
+
+ return 0;
+}
+
+core_initcall(kasan_memhotplug_init);
+#endif
+
+#ifdef CONFIG_KASAN_VMALLOC
+
+static int kasan_populate_vmalloc_pte(pte_t *ptep, unsigned long addr,
+ void *unused)
+{
+ unsigned long page;
+ pte_t pte;
+
+ if (likely(!pte_none(*ptep)))
+ return 0;
+
+ page = __get_free_page(GFP_KERNEL);
+ if (!page)
+ return -ENOMEM;
+
+ memset((void *)page, KASAN_VMALLOC_INVALID, PAGE_SIZE);
+ pte = pfn_pte(PFN_DOWN(__pa(page)), PAGE_KERNEL);
+
+ spin_lock(&init_mm.page_table_lock);
+ if (likely(pte_none(*ptep))) {
+ set_pte_at(&init_mm, addr, ptep, pte);
+ page = 0;
+ }
+ spin_unlock(&init_mm.page_table_lock);
+ if (page)
+ free_page(page);
+ return 0;
+}
+
+int kasan_populate_vmalloc(unsigned long addr, unsigned long size)
+{
+ unsigned long shadow_start, shadow_end;
+ int ret;
+
+ if (!is_vmalloc_or_module_addr((void *)addr))
+ return 0;
+
+ shadow_start = (unsigned long)kasan_mem_to_shadow((void *)addr);
+ shadow_start = ALIGN_DOWN(shadow_start, PAGE_SIZE);
+ shadow_end = (unsigned long)kasan_mem_to_shadow((void *)addr + size);
+ shadow_end = ALIGN(shadow_end, PAGE_SIZE);
+
+ ret = apply_to_page_range(&init_mm, shadow_start,
+ shadow_end - shadow_start,
+ kasan_populate_vmalloc_pte, NULL);
+ if (ret)
+ return ret;
+
+ flush_cache_vmap(shadow_start, shadow_end);
+
+ /*
+ * We need to be careful about inter-cpu effects here. Consider:
+ *
+ * CPU#0 CPU#1
+ * WRITE_ONCE(p, vmalloc(100)); while (x = READ_ONCE(p)) ;
+ * p[99] = 1;
+ *
+ * With compiler instrumentation, that ends up looking like this:
+ *
+ * CPU#0 CPU#1
+ * // vmalloc() allocates memory
+ * // let a = area->addr
+ * // we reach kasan_populate_vmalloc
+ * // and call unpoison_range:
+ * STORE shadow(a), unpoison_val
+ * ...
+ * STORE shadow(a+99), unpoison_val x = LOAD p
+ * // rest of vmalloc process <data dependency>
+ * STORE p, a LOAD shadow(x+99)
+ *
+ * If there is no barrier between the end of unpoisioning the shadow
+ * and the store of the result to p, the stores could be committed
+ * in a different order by CPU#0, and CPU#1 could erroneously observe
+ * poison in the shadow.
+ *
+ * We need some sort of barrier between the stores.
+ *
+ * In the vmalloc() case, this is provided by a smp_wmb() in
+ * clear_vm_uninitialized_flag(). In the per-cpu allocator and in
+ * get_vm_area() and friends, the caller gets shadow allocated but
+ * doesn't have any pages mapped into the virtual address space that
+ * has been reserved. Mapping those pages in will involve taking and
+ * releasing a page-table lock, which will provide the barrier.
+ */
+
+ return 0;
+}
+
+/*
+ * Poison the shadow for a vmalloc region. Called as part of the
+ * freeing process at the time the region is freed.
+ */
+void kasan_poison_vmalloc(const void *start, unsigned long size)
+{
+ if (!is_vmalloc_or_module_addr(start))
+ return;
+
+ size = round_up(size, KASAN_GRANULE_SIZE);
+ poison_range(start, size, KASAN_VMALLOC_INVALID);
+}
+
+void kasan_unpoison_vmalloc(const void *start, unsigned long size)
+{
+ if (!is_vmalloc_or_module_addr(start))
+ return;
+
+ unpoison_range(start, size);
+}
+
+static int kasan_depopulate_vmalloc_pte(pte_t *ptep, unsigned long addr,
+ void *unused)
+{
+ unsigned long page;
+
+ page = (unsigned long)__va(pte_pfn(*ptep) << PAGE_SHIFT);
+
+ spin_lock(&init_mm.page_table_lock);
+
+ if (likely(!pte_none(*ptep))) {
+ pte_clear(&init_mm, addr, ptep);
+ free_page(page);
+ }
+ spin_unlock(&init_mm.page_table_lock);
+
+ return 0;
+}
+
+/*
+ * Release the backing for the vmalloc region [start, end), which
+ * lies within the free region [free_region_start, free_region_end).
+ *
+ * This can be run lazily, long after the region was freed. It runs
+ * under vmap_area_lock, so it's not safe to interact with the vmalloc/vmap
+ * infrastructure.
+ *
+ * How does this work?
+ * -------------------
+ *
+ * We have a region that is page aligned, labelled as A.
+ * That might not map onto the shadow in a way that is page-aligned:
+ *
+ * start end
+ * v v
+ * |????????|????????|AAAAAAAA|AA....AA|AAAAAAAA|????????| < vmalloc
+ * -------- -------- -------- -------- --------
+ * | | | | |
+ * | | | /-------/ |
+ * \-------\|/------/ |/---------------/
+ * ||| ||
+ * |??AAAAAA|AAAAAAAA|AA??????| < shadow
+ * (1) (2) (3)
+ *
+ * First we align the start upwards and the end downwards, so that the
+ * shadow of the region aligns with shadow page boundaries. In the
+ * example, this gives us the shadow page (2). This is the shadow entirely
+ * covered by this allocation.
+ *
+ * Then we have the tricky bits. We want to know if we can free the
+ * partially covered shadow pages - (1) and (3) in the example. For this,
+ * we are given the start and end of the free region that contains this
+ * allocation. Extending our previous example, we could have:
+ *
+ * free_region_start free_region_end
+ * | start end |
+ * v v v v
+ * |FFFFFFFF|FFFFFFFF|AAAAAAAA|AA....AA|AAAAAAAA|FFFFFFFF| < vmalloc
+ * -------- -------- -------- -------- --------
+ * | | | | |
+ * | | | /-------/ |
+ * \-------\|/------/ |/---------------/
+ * ||| ||
+ * |FFAAAAAA|AAAAAAAA|AAF?????| < shadow
+ * (1) (2) (3)
+ *
+ * Once again, we align the start of the free region up, and the end of
+ * the free region down so that the shadow is page aligned. So we can free
+ * page (1) - we know no allocation currently uses anything in that page,
+ * because all of it is in the vmalloc free region. But we cannot free
+ * page (3), because we can't be sure that the rest of it is unused.
+ *
+ * We only consider pages that contain part of the original region for
+ * freeing: we don't try to free other pages from the free region or we'd
+ * end up trying to free huge chunks of virtual address space.
+ *
+ * Concurrency
+ * -----------
+ *
+ * How do we know that we're not freeing a page that is simultaneously
+ * being used for a fresh allocation in kasan_populate_vmalloc(_pte)?
+ *
+ * We _can_ have kasan_release_vmalloc and kasan_populate_vmalloc running
+ * at the same time. While we run under free_vmap_area_lock, the population
+ * code does not.
+ *
+ * free_vmap_area_lock instead operates to ensure that the larger range
+ * [free_region_start, free_region_end) is safe: because __alloc_vmap_area and
+ * the per-cpu region-finding algorithm both run under free_vmap_area_lock,
+ * no space identified as free will become used while we are running. This
+ * means that so long as we are careful with alignment and only free shadow
+ * pages entirely covered by the free region, we will not run in to any
+ * trouble - any simultaneous allocations will be for disjoint regions.
+ */
+void kasan_release_vmalloc(unsigned long start, unsigned long end,
+ unsigned long free_region_start,
+ unsigned long free_region_end)
+{
+ void *shadow_start, *shadow_end;
+ unsigned long region_start, region_end;
+ unsigned long size;
+
+ region_start = ALIGN(start, KASAN_MEMORY_PER_SHADOW_PAGE);
+ region_end = ALIGN_DOWN(end, KASAN_MEMORY_PER_SHADOW_PAGE);
+
+ free_region_start = ALIGN(free_region_start, KASAN_MEMORY_PER_SHADOW_PAGE);
+
+ if (start != region_start &&
+ free_region_start < region_start)
+ region_start -= KASAN_MEMORY_PER_SHADOW_PAGE;
+
+ free_region_end = ALIGN_DOWN(free_region_end, KASAN_MEMORY_PER_SHADOW_PAGE);
+
+ if (end != region_end &&
+ free_region_end > region_end)
+ region_end += KASAN_MEMORY_PER_SHADOW_PAGE;
+
+ shadow_start = kasan_mem_to_shadow((void *)region_start);
+ shadow_end = kasan_mem_to_shadow((void *)region_end);
+
+ if (shadow_end > shadow_start) {
+ size = shadow_end - shadow_start;
+ apply_to_existing_page_range(&init_mm,
+ (unsigned long)shadow_start,
+ size, kasan_depopulate_vmalloc_pte,
+ NULL);
+ flush_tlb_kernel_range((unsigned long)shadow_start,
+ (unsigned long)shadow_end);
+ }
+}
+
+#else /* CONFIG_KASAN_VMALLOC */
+
+int kasan_module_alloc(void *addr, size_t size)
+{
+ void *ret;
+ size_t scaled_size;
+ size_t shadow_size;
+ unsigned long shadow_start;
+
+ shadow_start = (unsigned long)kasan_mem_to_shadow(addr);
+ scaled_size = (size + KASAN_GRANULE_SIZE - 1) >>
+ KASAN_SHADOW_SCALE_SHIFT;
+ shadow_size = round_up(scaled_size, PAGE_SIZE);
+
+ if (WARN_ON(!PAGE_ALIGNED(shadow_start)))
+ return -EINVAL;
+
+ ret = __vmalloc_node_range(shadow_size, 1, shadow_start,
+ shadow_start + shadow_size,
+ GFP_KERNEL,
+ PAGE_KERNEL, VM_NO_GUARD, NUMA_NO_NODE,
+ __builtin_return_address(0));
+
+ if (ret) {
+ __memset(ret, KASAN_SHADOW_INIT, shadow_size);
+ find_vm_area(addr)->flags |= VM_KASAN;
+ kmemleak_ignore(ret);
+ return 0;
+ }
+
+ return -ENOMEM;
+}
+
+void kasan_free_shadow(const struct vm_struct *vm)
+{
+ if (vm->flags & VM_KASAN)
+ vfree(kasan_mem_to_shadow(vm->addr));
+}
+
+#endif
diff --git a/mm/kasan/tags.c b/mm/kasan/sw_tags.c
index e02a36a51f42..5dcd830805b2 100644
--- a/mm/kasan/tags.c
+++ b/mm/kasan/sw_tags.c
@@ -1,17 +1,12 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * This file contains core tag-based KASAN code.
+ * This file contains core software tag-based KASAN code.
*
* Copyright (c) 2018 Google, Inc.
* Author: Andrey Konovalov <andreyknvl@google.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
*/
-#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#define pr_fmt(fmt) "kasan: " fmt
#include <linux/export.h>
#include <linux/interrupt.h>
@@ -40,12 +35,14 @@
static DEFINE_PER_CPU(u32, prng_state);
-void kasan_init_tags(void)
+void __init kasan_init_sw_tags(void)
{
int cpu;
for_each_possible_cpu(cpu)
per_cpu(prng_state, cpu) = (u32)get_cycles();
+
+ pr_info("KernelAddressSanitizer initialized\n");
}
/*
@@ -70,11 +67,6 @@ u8 random_tag(void)
return (u8)(state % (KASAN_TAG_MAX + 1));
}
-void *kasan_reset_tag(const void *addr)
-{
- return reset_tag(addr);
-}
-
bool check_memory_region(unsigned long addr, size_t size, bool write,
unsigned long ret_ip)
{
@@ -110,7 +102,7 @@ bool check_memory_region(unsigned long addr, size_t size, bool write,
if (tag == KASAN_TAG_KERNEL)
return true;
- untagged_addr = reset_tag((const void *)addr);
+ untagged_addr = kasan_reset_tag((const void *)addr);
if (unlikely(untagged_addr <
kasan_shadow_to_mem((void *)KASAN_SHADOW_START))) {
return !kasan_report(addr, size, write, ret_ip);
@@ -126,6 +118,15 @@ bool check_memory_region(unsigned long addr, size_t size, bool write,
return true;
}
+bool check_invalid_free(void *addr)
+{
+ u8 tag = get_tag(addr);
+ u8 shadow_byte = READ_ONCE(*(u8 *)kasan_mem_to_shadow(kasan_reset_tag(addr)));
+
+ return (shadow_byte == KASAN_TAG_INVALID) ||
+ (tag != KASAN_TAG_KERNEL && tag != shadow_byte);
+}
+
#define DEFINE_HWASAN_LOAD_STORE(size) \
void __hwasan_load##size##_noabort(unsigned long addr) \
{ \
@@ -158,7 +159,7 @@ EXPORT_SYMBOL(__hwasan_storeN_noabort);
void __hwasan_tag_memory(unsigned long addr, u8 tag, unsigned long size)
{
- kasan_poison_shadow((void *)addr, size, tag);
+ poison_range((void *)addr, size, tag);
}
EXPORT_SYMBOL(__hwasan_tag_memory);
@@ -168,7 +169,9 @@ void kasan_set_free_info(struct kmem_cache *cache,
struct kasan_alloc_meta *alloc_meta;
u8 idx = 0;
- alloc_meta = get_alloc_info(cache, object);
+ alloc_meta = kasan_get_alloc_meta(cache, object);
+ if (!alloc_meta)
+ return;
#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
idx = alloc_meta->free_track_idx;
@@ -185,7 +188,9 @@ struct kasan_track *kasan_get_free_track(struct kmem_cache *cache,
struct kasan_alloc_meta *alloc_meta;
int i = 0;
- alloc_meta = get_alloc_info(cache, object);
+ alloc_meta = kasan_get_alloc_meta(cache, object);
+ if (!alloc_meta)
+ return NULL;
#ifdef CONFIG_KASAN_SW_TAGS_IDENTIFY
for (i = 0; i < KASAN_NR_FREE_STACKS; i++) {
diff --git a/mm/khugepaged.c b/mm/khugepaged.c
index 4e3dff13eb70..67ab391a5373 100644
--- a/mm/khugepaged.c
+++ b/mm/khugepaged.c
@@ -90,6 +90,8 @@ static struct kmem_cache *mm_slot_cache __read_mostly;
* @hash: hash collision list
* @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head
* @mm: the mm that this information is valid for
+ * @nr_pte_mapped_thp: number of pte mapped THP
+ * @pte_mapped_thp: address array corresponding pte mapped THP
*/
struct mm_slot {
struct hlist_node hash;
@@ -124,18 +126,18 @@ static ssize_t scan_sleep_millisecs_show(struct kobject *kobj,
struct kobj_attribute *attr,
char *buf)
{
- return sprintf(buf, "%u\n", khugepaged_scan_sleep_millisecs);
+ return sysfs_emit(buf, "%u\n", khugepaged_scan_sleep_millisecs);
}
static ssize_t scan_sleep_millisecs_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
- unsigned long msecs;
+ unsigned int msecs;
int err;
- err = kstrtoul(buf, 10, &msecs);
- if (err || msecs > UINT_MAX)
+ err = kstrtouint(buf, 10, &msecs);
+ if (err)
return -EINVAL;
khugepaged_scan_sleep_millisecs = msecs;
@@ -152,18 +154,18 @@ static ssize_t alloc_sleep_millisecs_show(struct kobject *kobj,
struct kobj_attribute *attr,
char *buf)
{
- return sprintf(buf, "%u\n", khugepaged_alloc_sleep_millisecs);
+ return sysfs_emit(buf, "%u\n", khugepaged_alloc_sleep_millisecs);
}
static ssize_t alloc_sleep_millisecs_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
- unsigned long msecs;
+ unsigned int msecs;
int err;
- err = kstrtoul(buf, 10, &msecs);
- if (err || msecs > UINT_MAX)
+ err = kstrtouint(buf, 10, &msecs);
+ if (err)
return -EINVAL;
khugepaged_alloc_sleep_millisecs = msecs;
@@ -180,17 +182,17 @@ static ssize_t pages_to_scan_show(struct kobject *kobj,
struct kobj_attribute *attr,
char *buf)
{
- return sprintf(buf, "%u\n", khugepaged_pages_to_scan);
+ return sysfs_emit(buf, "%u\n", khugepaged_pages_to_scan);
}
static ssize_t pages_to_scan_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
+ unsigned int pages;
int err;
- unsigned long pages;
- err = kstrtoul(buf, 10, &pages);
- if (err || !pages || pages > UINT_MAX)
+ err = kstrtouint(buf, 10, &pages);
+ if (err || !pages)
return -EINVAL;
khugepaged_pages_to_scan = pages;
@@ -205,7 +207,7 @@ static ssize_t pages_collapsed_show(struct kobject *kobj,
struct kobj_attribute *attr,
char *buf)
{
- return sprintf(buf, "%u\n", khugepaged_pages_collapsed);
+ return sysfs_emit(buf, "%u\n", khugepaged_pages_collapsed);
}
static struct kobj_attribute pages_collapsed_attr =
__ATTR_RO(pages_collapsed);
@@ -214,7 +216,7 @@ static ssize_t full_scans_show(struct kobject *kobj,
struct kobj_attribute *attr,
char *buf)
{
- return sprintf(buf, "%u\n", khugepaged_full_scans);
+ return sysfs_emit(buf, "%u\n", khugepaged_full_scans);
}
static struct kobj_attribute full_scans_attr =
__ATTR_RO(full_scans);
@@ -223,7 +225,7 @@ static ssize_t khugepaged_defrag_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
return single_hugepage_flag_show(kobj, attr, buf,
- TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
+ TRANSPARENT_HUGEPAGE_DEFRAG_KHUGEPAGED_FLAG);
}
static ssize_t khugepaged_defrag_store(struct kobject *kobj,
struct kobj_attribute *attr,
@@ -248,7 +250,7 @@ static ssize_t khugepaged_max_ptes_none_show(struct kobject *kobj,
struct kobj_attribute *attr,
char *buf)
{
- return sprintf(buf, "%u\n", khugepaged_max_ptes_none);
+ return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_none);
}
static ssize_t khugepaged_max_ptes_none_store(struct kobject *kobj,
struct kobj_attribute *attr,
@@ -273,7 +275,7 @@ static ssize_t khugepaged_max_ptes_swap_show(struct kobject *kobj,
struct kobj_attribute *attr,
char *buf)
{
- return sprintf(buf, "%u\n", khugepaged_max_ptes_swap);
+ return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_swap);
}
static ssize_t khugepaged_max_ptes_swap_store(struct kobject *kobj,
@@ -297,10 +299,10 @@ static struct kobj_attribute khugepaged_max_ptes_swap_attr =
khugepaged_max_ptes_swap_store);
static ssize_t khugepaged_max_ptes_shared_show(struct kobject *kobj,
- struct kobj_attribute *attr,
- char *buf)
+ struct kobj_attribute *attr,
+ char *buf)
{
- return sprintf(buf, "%u\n", khugepaged_max_ptes_shared);
+ return sysfs_emit(buf, "%u\n", khugepaged_max_ptes_shared);
}
static ssize_t khugepaged_max_ptes_shared_store(struct kobject *kobj,
@@ -1273,7 +1275,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
* PTEs are armed with uffd write protection.
* Here we can also mark the new huge pmd as
* write protected if any of the small ones is
- * marked but that could bring uknown
+ * marked but that could bring unknown
* userfault messages that falls outside of
* the registered range. So, just be simple.
*/
@@ -1414,7 +1416,11 @@ static int khugepaged_add_pte_mapped_thp(struct mm_struct *mm,
}
/**
- * Try to collapse a pte-mapped THP for mm at address haddr.
+ * collapse_pte_mapped_thp - Try to collapse a pte-mapped THP for mm at
+ * address haddr.
+ *
+ * @mm: process address space where collapse happens
+ * @addr: THP collapse address
*
* This function checks whether all the PTEs in the PMD are pointing to the
* right THP. If so, retract the page table so the THP can refault in with
@@ -1605,6 +1611,12 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff)
/**
* collapse_file - collapse filemap/tmpfs/shmem pages into huge one.
*
+ * @mm: process address space where collapse happens
+ * @file: file that collapse on
+ * @start: collapse start address
+ * @hpage: new allocated huge page for collapse
+ * @node: appointed node the new huge page allocate from
+ *
* Basic scheme is simple, details are more complex:
* - allocate and lock a new huge page;
* - scan page cache replacing old pages with the new one
@@ -1845,9 +1857,9 @@ out_unlock:
}
if (is_shmem)
- __inc_node_page_state(new_page, NR_SHMEM_THPS);
+ __inc_lruvec_page_state(new_page, NR_SHMEM_THPS);
else {
- __inc_node_page_state(new_page, NR_FILE_THPS);
+ __inc_lruvec_page_state(new_page, NR_FILE_THPS);
filemap_nr_thps_inc(mapping);
}
diff --git a/mm/ksm.c b/mm/ksm.c
index 0960750bb316..9694ee2c71de 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2833,18 +2833,18 @@ static void wait_while_offlining(void)
static ssize_t sleep_millisecs_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%u\n", ksm_thread_sleep_millisecs);
+ return sysfs_emit(buf, "%u\n", ksm_thread_sleep_millisecs);
}
static ssize_t sleep_millisecs_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
- unsigned long msecs;
+ unsigned int msecs;
int err;
- err = kstrtoul(buf, 10, &msecs);
- if (err || msecs > UINT_MAX)
+ err = kstrtouint(buf, 10, &msecs);
+ if (err)
return -EINVAL;
ksm_thread_sleep_millisecs = msecs;
@@ -2857,18 +2857,18 @@ KSM_ATTR(sleep_millisecs);
static ssize_t pages_to_scan_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%u\n", ksm_thread_pages_to_scan);
+ return sysfs_emit(buf, "%u\n", ksm_thread_pages_to_scan);
}
static ssize_t pages_to_scan_store(struct kobject *kobj,
struct kobj_attribute *attr,
const char *buf, size_t count)
{
+ unsigned int nr_pages;
int err;
- unsigned long nr_pages;
- err = kstrtoul(buf, 10, &nr_pages);
- if (err || nr_pages > UINT_MAX)
+ err = kstrtouint(buf, 10, &nr_pages);
+ if (err)
return -EINVAL;
ksm_thread_pages_to_scan = nr_pages;
@@ -2880,17 +2880,17 @@ KSM_ATTR(pages_to_scan);
static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr,
char *buf)
{
- return sprintf(buf, "%lu\n", ksm_run);
+ return sysfs_emit(buf, "%lu\n", ksm_run);
}
static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr,
const char *buf, size_t count)
{
+ unsigned int flags;
int err;
- unsigned long flags;
- err = kstrtoul(buf, 10, &flags);
- if (err || flags > UINT_MAX)
+ err = kstrtouint(buf, 10, &flags);
+ if (err)
return -EINVAL;
if (flags > KSM_RUN_UNMERGE)
return -EINVAL;
@@ -2927,9 +2927,9 @@ KSM_ATTR(run);
#ifdef CONFIG_NUMA
static ssize_t merge_across_nodes_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
+ struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%u\n", ksm_merge_across_nodes);
+ return sysfs_emit(buf, "%u\n", ksm_merge_across_nodes);
}
static ssize_t merge_across_nodes_store(struct kobject *kobj,
@@ -2984,9 +2984,9 @@ KSM_ATTR(merge_across_nodes);
#endif
static ssize_t use_zero_pages_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
+ struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%u\n", ksm_use_zero_pages);
+ return sysfs_emit(buf, "%u\n", ksm_use_zero_pages);
}
static ssize_t use_zero_pages_store(struct kobject *kobj,
struct kobj_attribute *attr,
@@ -3008,7 +3008,7 @@ KSM_ATTR(use_zero_pages);
static ssize_t max_page_sharing_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%u\n", ksm_max_page_sharing);
+ return sysfs_emit(buf, "%u\n", ksm_max_page_sharing);
}
static ssize_t max_page_sharing_store(struct kobject *kobj,
@@ -3049,21 +3049,21 @@ KSM_ATTR(max_page_sharing);
static ssize_t pages_shared_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%lu\n", ksm_pages_shared);
+ return sysfs_emit(buf, "%lu\n", ksm_pages_shared);
}
KSM_ATTR_RO(pages_shared);
static ssize_t pages_sharing_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%lu\n", ksm_pages_sharing);
+ return sysfs_emit(buf, "%lu\n", ksm_pages_sharing);
}
KSM_ATTR_RO(pages_sharing);
static ssize_t pages_unshared_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%lu\n", ksm_pages_unshared);
+ return sysfs_emit(buf, "%lu\n", ksm_pages_unshared);
}
KSM_ATTR_RO(pages_unshared);
@@ -3080,21 +3080,21 @@ static ssize_t pages_volatile_show(struct kobject *kobj,
*/
if (ksm_pages_volatile < 0)
ksm_pages_volatile = 0;
- return sprintf(buf, "%ld\n", ksm_pages_volatile);
+ return sysfs_emit(buf, "%ld\n", ksm_pages_volatile);
}
KSM_ATTR_RO(pages_volatile);
static ssize_t stable_node_dups_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%lu\n", ksm_stable_node_dups);
+ return sysfs_emit(buf, "%lu\n", ksm_stable_node_dups);
}
KSM_ATTR_RO(stable_node_dups);
static ssize_t stable_node_chains_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%lu\n", ksm_stable_node_chains);
+ return sysfs_emit(buf, "%lu\n", ksm_stable_node_chains);
}
KSM_ATTR_RO(stable_node_chains);
@@ -3103,7 +3103,7 @@ stable_node_chains_prune_millisecs_show(struct kobject *kobj,
struct kobj_attribute *attr,
char *buf)
{
- return sprintf(buf, "%u\n", ksm_stable_node_chains_prune_millisecs);
+ return sysfs_emit(buf, "%u\n", ksm_stable_node_chains_prune_millisecs);
}
static ssize_t
@@ -3127,7 +3127,7 @@ KSM_ATTR(stable_node_chains_prune_millisecs);
static ssize_t full_scans_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%lu\n", ksm_scan.seqnr);
+ return sysfs_emit(buf, "%lu\n", ksm_scan.seqnr);
}
KSM_ATTR_RO(full_scans);
diff --git a/mm/list_lru.c b/mm/list_lru.c
index 5aa6e44bc2ae..fe230081690b 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -534,7 +534,6 @@ static void memcg_drain_list_lru_node(struct list_lru *lru, int nid,
struct list_lru_node *nlru = &lru->node[nid];
int dst_idx = dst_memcg->kmemcg_id;
struct list_lru_one *src, *dst;
- bool set;
/*
* Since list_lru_{add,del} may be called under an IRQ-safe lock,
@@ -546,11 +545,12 @@ static void memcg_drain_list_lru_node(struct list_lru *lru, int nid,
dst = list_lru_from_memcg_idx(nlru, dst_idx);
list_splice_init(&src->list, &dst->list);
- set = (!dst->nr_items && src->nr_items);
- dst->nr_items += src->nr_items;
- if (set)
+
+ if (src->nr_items) {
+ dst->nr_items += src->nr_items;
memcg_set_shrinker_bit(dst_memcg, nid, lru_shrinker_id(lru));
- src->nr_items = 0;
+ src->nr_items = 0;
+ }
spin_unlock_irq(&nlru->lock);
}
diff --git a/mm/madvise.c b/mm/madvise.c
index a8d8d48a57fe..6a660858784b 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -877,7 +877,6 @@ static long madvise_remove(struct vm_area_struct *vma,
static int madvise_inject_error(int behavior,
unsigned long start, unsigned long end)
{
- struct zone *zone;
unsigned long size;
if (!capable(CAP_SYS_ADMIN))
@@ -908,24 +907,13 @@ static int madvise_inject_error(int behavior,
} else {
pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
pfn, start);
- /*
- * Drop the page reference taken by get_user_pages_fast(). In
- * the absence of MF_COUNT_INCREASED the memory_failure()
- * routine is responsible for pinning the page to prevent it
- * from being released back to the page allocator.
- */
- put_page(page);
- ret = memory_failure(pfn, 0);
+ ret = memory_failure(pfn, MF_COUNT_INCREASED);
}
if (ret)
return ret;
}
- /* Ensure that all poisoned pages are removed from per-cpu lists */
- for_each_populated_zone(zone)
- drain_all_pages(zone);
-
return 0;
}
#endif
@@ -1204,8 +1192,7 @@ SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec,
goto put_pid;
}
- if (task->mm != current->mm &&
- !process_madvise_behavior_valid(behavior)) {
+ if (!process_madvise_behavior_valid(behavior)) {
ret = -EINVAL;
goto release_task;
}
diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c
index 2c7d03675903..b59054ef2e10 100644
--- a/mm/mapping_dirty_helpers.c
+++ b/mm/mapping_dirty_helpers.c
@@ -23,7 +23,8 @@ struct wp_walk {
/**
* wp_pte - Write-protect a pte
* @pte: Pointer to the pte
- * @addr: The virtual page address
+ * @addr: The start of protecting virtual address
+ * @end: The end of protecting virtual address
* @walk: pagetable walk callback argument
*
* The function write-protects a pte and records the range in
@@ -74,7 +75,8 @@ struct clean_walk {
* clean_record_pte - Clean a pte and record its address space offset in a
* bitmap
* @pte: Pointer to the pte
- * @addr: The virtual page address
+ * @addr: The start of virtual address to be clean
+ * @end: The end of virtual address to be clean
* @walk: pagetable walk callback argument
*
* The function cleans a pte and records the range in
diff --git a/mm/memblock.c b/mm/memblock.c
index b68ee86788af..d24bcfa88d2f 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -871,7 +871,7 @@ int __init_memblock memblock_physmem_add(phys_addr_t base, phys_addr_t size)
* @base: base address of the region
* @size: size of the region
* @set: set or clear the flag
- * @flag: the flag to udpate
+ * @flag: the flag to update
*
* This function isolates region [@base, @base + @size), and sets/clears flag
*
@@ -1419,6 +1419,9 @@ phys_addr_t __init memblock_phys_alloc_range(phys_addr_t size,
phys_addr_t start,
phys_addr_t end)
{
+ memblock_dbg("%s: %llu bytes align=0x%llx from=%pa max_addr=%pa %pS\n",
+ __func__, (u64)size, (u64)align, &start, &end,
+ (void *)_RET_IP_);
return memblock_alloc_range_nid(size, align, start, end, NUMA_NO_NODE,
false);
}
@@ -1926,6 +1929,85 @@ static int __init early_memblock(char *p)
}
early_param("memblock", early_memblock);
+static void __init free_memmap(unsigned long start_pfn, unsigned long end_pfn)
+{
+ struct page *start_pg, *end_pg;
+ phys_addr_t pg, pgend;
+
+ /*
+ * Convert start_pfn/end_pfn to a struct page pointer.
+ */
+ start_pg = pfn_to_page(start_pfn - 1) + 1;
+ end_pg = pfn_to_page(end_pfn - 1) + 1;
+
+ /*
+ * Convert to physical addresses, and round start upwards and end
+ * downwards.
+ */
+ pg = PAGE_ALIGN(__pa(start_pg));
+ pgend = __pa(end_pg) & PAGE_MASK;
+
+ /*
+ * If there are free pages between these, free the section of the
+ * memmap array.
+ */
+ if (pg < pgend)
+ memblock_free(pg, pgend - pg);
+}
+
+/*
+ * The mem_map array can get very big. Free the unused area of the memory map.
+ */
+static void __init free_unused_memmap(void)
+{
+ unsigned long start, end, prev_end = 0;
+ int i;
+
+ if (!IS_ENABLED(CONFIG_HAVE_ARCH_PFN_VALID) ||
+ IS_ENABLED(CONFIG_SPARSEMEM_VMEMMAP))
+ return;
+
+ /*
+ * This relies on each bank being in address order.
+ * The banks are sorted previously in bootmem_init().
+ */
+ for_each_mem_pfn_range(i, MAX_NUMNODES, &start, &end, NULL) {
+#ifdef CONFIG_SPARSEMEM
+ /*
+ * Take care not to free memmap entries that don't exist
+ * due to SPARSEMEM sections which aren't present.
+ */
+ start = min(start, ALIGN(prev_end, PAGES_PER_SECTION));
+#else
+ /*
+ * Align down here since the VM subsystem insists that the
+ * memmap entries are valid from the bank start aligned to
+ * MAX_ORDER_NR_PAGES.
+ */
+ start = round_down(start, MAX_ORDER_NR_PAGES);
+#endif
+
+ /*
+ * If we had a previous bank, and there is a space
+ * between the current bank and the previous, free it.
+ */
+ if (prev_end && prev_end < start)
+ free_memmap(prev_end, start);
+
+ /*
+ * Align up here since the VM subsystem insists that the
+ * memmap entries are valid from the bank end aligned to
+ * MAX_ORDER_NR_PAGES.
+ */
+ prev_end = ALIGN(end, MAX_ORDER_NR_PAGES);
+ }
+
+#ifdef CONFIG_SPARSEMEM
+ if (!IS_ALIGNED(prev_end, PAGES_PER_SECTION))
+ free_memmap(prev_end, ALIGN(prev_end, PAGES_PER_SECTION));
+#endif
+}
+
static void __init __free_pages_memory(unsigned long start, unsigned long end)
{
int order;
@@ -2012,6 +2094,7 @@ unsigned long __init memblock_free_all(void)
{
unsigned long pages;
+ free_unused_memmap();
reset_all_zones_managed_pages();
pages = free_low_memory_core_early();
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 29459a6ce1c7..605f671203ef 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -20,6 +20,9 @@
* Lockless page tracking & accounting
* Unified hierarchy configuration model
* Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
+ *
+ * Per memcg lru locking
+ * Copyright (C) 2020 Alibaba, Inc, Alex Shi
*/
#include <linux/page_counter.h>
@@ -533,7 +536,7 @@ struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
{
struct mem_cgroup *memcg;
- memcg = page->mem_cgroup;
+ memcg = page_memcg(page);
if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
memcg = root_mem_cgroup;
@@ -560,16 +563,7 @@ ino_t page_cgroup_ino(struct page *page)
unsigned long ino = 0;
rcu_read_lock();
- memcg = page->mem_cgroup;
-
- /*
- * The lowest bit set means that memcg isn't a valid
- * memcg pointer, but a obj_cgroups pointer.
- * In this case the page is shared and doesn't belong
- * to any specific memory cgroup.
- */
- if ((unsigned long) memcg & 0x1UL)
- memcg = NULL;
+ memcg = page_memcg_check(page);
while (memcg && !(memcg->css.flags & CSS_ONLINE))
memcg = parent_mem_cgroup(memcg);
@@ -623,14 +617,9 @@ static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
if (mz->usage_in_excess < mz_node->usage_in_excess) {
p = &(*p)->rb_left;
rightmost = false;
- }
-
- /*
- * We can't avoid mem cgroups that are over their soft
- * limit by the same amount
- */
- else if (mz->usage_in_excess >= mz_node->usage_in_excess)
+ } else {
p = &(*p)->rb_right;
+ }
}
if (rightmost)
@@ -858,7 +847,26 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
__mod_memcg_lruvec_state(lruvec, idx, val);
}
-void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val)
+void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx,
+ int val)
+{
+ struct page *head = compound_head(page); /* rmap on tail pages */
+ struct mem_cgroup *memcg = page_memcg(head);
+ pg_data_t *pgdat = page_pgdat(page);
+ struct lruvec *lruvec;
+
+ /* Untracked pages have no memcg, no lruvec. Update only the node */
+ if (!memcg) {
+ __mod_node_page_state(pgdat, idx, val);
+ return;
+ }
+
+ lruvec = mem_cgroup_lruvec(memcg, pgdat);
+ __mod_lruvec_state(lruvec, idx, val);
+}
+EXPORT_SYMBOL(__mod_lruvec_page_state);
+
+void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val)
{
pg_data_t *pgdat = page_pgdat(virt_to_page(p));
struct mem_cgroup *memcg;
@@ -882,17 +890,6 @@ void __mod_lruvec_slab_state(void *p, enum node_stat_item idx, int val)
rcu_read_unlock();
}
-void mod_memcg_obj_state(void *p, int idx, int val)
-{
- struct mem_cgroup *memcg;
-
- rcu_read_lock();
- memcg = mem_cgroup_from_obj(p);
- if (memcg)
- mod_memcg_state(memcg, idx, val);
- rcu_read_unlock();
-}
-
/**
* __count_memcg_events - account VM events in a cgroup
* @memcg: the memory cgroup
@@ -1055,7 +1052,7 @@ EXPORT_SYMBOL(get_mem_cgroup_from_mm);
*/
struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
{
- struct mem_cgroup *memcg = page->mem_cgroup;
+ struct mem_cgroup *memcg = page_memcg(page);
if (mem_cgroup_disabled())
return NULL;
@@ -1157,12 +1154,6 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
if (prev && !reclaim)
pos = prev;
- if (!root->use_hierarchy && root != root_mem_cgroup) {
- if (prev)
- goto out;
- return root;
- }
-
rcu_read_lock();
if (reclaim) {
@@ -1242,7 +1233,6 @@ struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
out_unlock:
rcu_read_unlock();
-out:
if (prev && prev != root)
css_put(&prev->css);
@@ -1335,43 +1325,74 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
return ret;
}
+#ifdef CONFIG_DEBUG_VM
+void lruvec_memcg_debug(struct lruvec *lruvec, struct page *page)
+{
+ struct mem_cgroup *memcg;
+
+ if (mem_cgroup_disabled())
+ return;
+
+ memcg = page_memcg(page);
+
+ if (!memcg)
+ VM_BUG_ON_PAGE(lruvec_memcg(lruvec) != root_mem_cgroup, page);
+ else
+ VM_BUG_ON_PAGE(lruvec_memcg(lruvec) != memcg, page);
+}
+#endif
+
/**
- * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
+ * lock_page_lruvec - lock and return lruvec for a given page.
* @page: the page
- * @pgdat: pgdat of the page
*
- * This function relies on page->mem_cgroup being stable - see the
- * access rules in commit_charge().
+ * This series functions should be used in either conditions:
+ * PageLRU is cleared or unset
+ * or page->_refcount is zero
+ * or page is locked.
*/
-struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat)
+struct lruvec *lock_page_lruvec(struct page *page)
{
- struct mem_cgroup_per_node *mz;
- struct mem_cgroup *memcg;
struct lruvec *lruvec;
+ struct pglist_data *pgdat = page_pgdat(page);
- if (mem_cgroup_disabled()) {
- lruvec = &pgdat->__lruvec;
- goto out;
- }
+ rcu_read_lock();
+ lruvec = mem_cgroup_page_lruvec(page, pgdat);
+ spin_lock(&lruvec->lru_lock);
+ rcu_read_unlock();
- memcg = page->mem_cgroup;
- /*
- * Swapcache readahead pages are added to the LRU - and
- * possibly migrated - before they are charged.
- */
- if (!memcg)
- memcg = root_mem_cgroup;
+ lruvec_memcg_debug(lruvec, page);
+
+ return lruvec;
+}
+
+struct lruvec *lock_page_lruvec_irq(struct page *page)
+{
+ struct lruvec *lruvec;
+ struct pglist_data *pgdat = page_pgdat(page);
+
+ rcu_read_lock();
+ lruvec = mem_cgroup_page_lruvec(page, pgdat);
+ spin_lock_irq(&lruvec->lru_lock);
+ rcu_read_unlock();
+
+ lruvec_memcg_debug(lruvec, page);
+
+ return lruvec;
+}
+
+struct lruvec *lock_page_lruvec_irqsave(struct page *page, unsigned long *flags)
+{
+ struct lruvec *lruvec;
+ struct pglist_data *pgdat = page_pgdat(page);
+
+ rcu_read_lock();
+ lruvec = mem_cgroup_page_lruvec(page, pgdat);
+ spin_lock_irqsave(&lruvec->lru_lock, *flags);
+ rcu_read_unlock();
+
+ lruvec_memcg_debug(lruvec, page);
- mz = mem_cgroup_page_nodeinfo(memcg, page);
- lruvec = &mz->lruvec;
-out:
- /*
- * Since a node can be onlined after the mem_cgroup was created,
- * we have to be prepared to initialize lruvec->zone here;
- * and if offlined then reonlined, we need to reinitialize it.
- */
- if (unlikely(lruvec->pgdat != pgdat))
- lruvec->pgdat = pgdat;
return lruvec;
}
@@ -1499,6 +1520,7 @@ static struct memory_stat memory_stats[] = {
{ "anon", PAGE_SIZE, NR_ANON_MAPPED },
{ "file", PAGE_SIZE, NR_FILE_PAGES },
{ "kernel_stack", 1024, NR_KERNEL_STACK_KB },
+ { "pagetables", PAGE_SIZE, NR_PAGETABLE },
{ "percpu", 1, MEMCG_PERCPU_B },
{ "sock", PAGE_SIZE, MEMCG_SOCK },
{ "shmem", PAGE_SIZE, NR_SHMEM },
@@ -1512,6 +1534,8 @@ static struct memory_stat memory_stats[] = {
* constant(e.g. powerpc).
*/
{ "anon_thp", 0, NR_ANON_THPS },
+ { "file_thp", 0, NR_FILE_THPS },
+ { "shmem_thp", 0, NR_SHMEM_THPS },
#endif
{ "inactive_anon", PAGE_SIZE, NR_INACTIVE_ANON },
{ "active_anon", PAGE_SIZE, NR_ACTIVE_ANON },
@@ -1542,7 +1566,9 @@ static int __init memory_stats_init(void)
for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- if (memory_stats[i].idx == NR_ANON_THPS)
+ if (memory_stats[i].idx == NR_ANON_THPS ||
+ memory_stats[i].idx == NR_FILE_THPS ||
+ memory_stats[i].idx == NR_SHMEM_THPS)
memory_stats[i].ratio = HPAGE_PMD_SIZE;
#endif
VM_BUG_ON(!memory_stats[i].ratio);
@@ -2114,7 +2140,7 @@ void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
}
/**
- * lock_page_memcg - lock a page->mem_cgroup binding
+ * lock_page_memcg - lock a page and memcg binding
* @page: the page
*
* This function protects unlocked LRU pages from being moved to
@@ -2146,15 +2172,21 @@ struct mem_cgroup *lock_page_memcg(struct page *page)
if (mem_cgroup_disabled())
return NULL;
again:
- memcg = head->mem_cgroup;
+ memcg = page_memcg(head);
if (unlikely(!memcg))
return NULL;
+#ifdef CONFIG_PROVE_LOCKING
+ local_irq_save(flags);
+ might_lock(&memcg->move_lock);
+ local_irq_restore(flags);
+#endif
+
if (atomic_read(&memcg->moving_account) <= 0)
return memcg;
spin_lock_irqsave(&memcg->move_lock, flags);
- if (memcg != head->mem_cgroup) {
+ if (memcg != page_memcg(head)) {
spin_unlock_irqrestore(&memcg->move_lock, flags);
goto again;
}
@@ -2192,14 +2224,14 @@ void __unlock_page_memcg(struct mem_cgroup *memcg)
}
/**
- * unlock_page_memcg - unlock a page->mem_cgroup binding
+ * unlock_page_memcg - unlock a page and memcg binding
* @page: the page
*/
void unlock_page_memcg(struct page *page)
{
struct page *head = compound_head(page);
- __unlock_page_memcg(head->mem_cgroup);
+ __unlock_page_memcg(page_memcg(head));
}
EXPORT_SYMBOL(unlock_page_memcg);
@@ -2889,16 +2921,16 @@ static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
static void commit_charge(struct page *page, struct mem_cgroup *memcg)
{
- VM_BUG_ON_PAGE(page->mem_cgroup, page);
+ VM_BUG_ON_PAGE(page_memcg(page), page);
/*
- * Any of the following ensures page->mem_cgroup stability:
+ * Any of the following ensures page's memcg stability:
*
* - the page lock
* - LRU isolation
* - lock_page_memcg()
* - exclusive reference
*/
- page->mem_cgroup = memcg;
+ page->memcg_data = (unsigned long)memcg;
}
#ifdef CONFIG_MEMCG_KMEM
@@ -2913,8 +2945,7 @@ int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
if (!vec)
return -ENOMEM;
- if (cmpxchg(&page->obj_cgroups, NULL,
- (struct obj_cgroup **) ((unsigned long)vec | 0x1UL)))
+ if (!set_page_objcgs(page, vec))
kfree(vec);
else
kmemleak_not_leak(vec);
@@ -2925,6 +2956,12 @@ int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
/*
* Returns a pointer to the memory cgroup to which the kernel object is charged.
*
+ * A passed kernel object can be a slab object or a generic kernel page, so
+ * different mechanisms for getting the memory cgroup pointer should be used.
+ * In certain cases (e.g. kernel stacks or large kmallocs with SLUB) the caller
+ * can not know for sure how the kernel object is implemented.
+ * mem_cgroup_from_obj() can be safely used in such cases.
+ *
* The caller must ensure the memcg lifetime, e.g. by taking rcu_read_lock(),
* cgroup_mutex, etc.
*/
@@ -2938,35 +2975,30 @@ struct mem_cgroup *mem_cgroup_from_obj(void *p)
page = virt_to_head_page(p);
/*
- * If page->mem_cgroup is set, it's either a simple mem_cgroup pointer
- * or a pointer to obj_cgroup vector. In the latter case the lowest
- * bit of the pointer is set.
- * The page->mem_cgroup pointer can be asynchronously changed
- * from NULL to (obj_cgroup_vec | 0x1UL), but can't be changed
- * from a valid memcg pointer to objcg vector or back.
- */
- if (!page->mem_cgroup)
- return NULL;
-
- /*
* Slab objects are accounted individually, not per-page.
* Memcg membership data for each individual object is saved in
* the page->obj_cgroups.
*/
- if (page_has_obj_cgroups(page)) {
+ if (page_objcgs_check(page)) {
struct obj_cgroup *objcg;
unsigned int off;
off = obj_to_index(page->slab_cache, page, p);
- objcg = page_obj_cgroups(page)[off];
+ objcg = page_objcgs(page)[off];
if (objcg)
return obj_cgroup_memcg(objcg);
return NULL;
}
- /* All other pages use page->mem_cgroup */
- return page->mem_cgroup;
+ /*
+ * page_memcg_check() is used here, because page_has_obj_cgroups()
+ * check above could fail because the object cgroups vector wasn't set
+ * at that moment, but it can be set concurrently.
+ * page_memcg_check(page) will guarantee that a proper memory
+ * cgroup pointer or NULL will be returned.
+ */
+ return page_memcg_check(page);
}
__always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
@@ -2987,6 +3019,7 @@ __always_inline struct obj_cgroup *get_obj_cgroup_from_current(void)
objcg = rcu_dereference(memcg->objcg);
if (objcg && obj_cgroup_tryget(objcg))
break;
+ objcg = NULL;
}
rcu_read_unlock();
@@ -3104,8 +3137,8 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
if (memcg && !mem_cgroup_is_root(memcg)) {
ret = __memcg_kmem_charge(memcg, gfp, 1 << order);
if (!ret) {
- page->mem_cgroup = memcg;
- __SetPageKmemcg(page);
+ page->memcg_data = (unsigned long)memcg |
+ MEMCG_DATA_KMEM;
return 0;
}
css_put(&memcg->css);
@@ -3120,7 +3153,7 @@ int __memcg_kmem_charge_page(struct page *page, gfp_t gfp, int order)
*/
void __memcg_kmem_uncharge_page(struct page *page, int order)
{
- struct mem_cgroup *memcg = page->mem_cgroup;
+ struct mem_cgroup *memcg = page_memcg(page);
unsigned int nr_pages = 1 << order;
if (!memcg)
@@ -3128,12 +3161,8 @@ void __memcg_kmem_uncharge_page(struct page *page, int order)
VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
__memcg_kmem_uncharge(memcg, nr_pages);
- page->mem_cgroup = NULL;
+ page->memcg_data = 0;
css_put(&memcg->css);
-
- /* slab pages do not have PageKmemcg flag set */
- if (PageKmemcg(page))
- __ClearPageKmemcg(page);
}
static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
@@ -3246,8 +3275,10 @@ int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size)
* independently later.
*/
rcu_read_lock();
+retry:
memcg = obj_cgroup_memcg(objcg);
- css_get(&memcg->css);
+ if (unlikely(!css_tryget(&memcg->css)))
+ goto retry;
rcu_read_unlock();
nr_pages = size >> PAGE_SHIFT;
@@ -3272,14 +3303,12 @@ void obj_cgroup_uncharge(struct obj_cgroup *objcg, size_t size)
#endif /* CONFIG_MEMCG_KMEM */
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-
/*
- * Because tail pages are not marked as "used", set it. We're under
- * pgdat->lru_lock and migration entries setup in all page mappings.
+ * Because page_memcg(head) is not set on compound tails, set it now.
*/
void mem_cgroup_split_huge_fixup(struct page *head)
{
- struct mem_cgroup *memcg = head->mem_cgroup;
+ struct mem_cgroup *memcg = page_memcg(head);
int i;
if (mem_cgroup_disabled())
@@ -3287,7 +3316,7 @@ void mem_cgroup_split_huge_fixup(struct page *head)
for (i = 1; i < HPAGE_PMD_NR; i++) {
css_get(&memcg->css);
- head[i].mem_cgroup = memcg;
+ head[i].memcg_data = (unsigned long)memcg;
}
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
@@ -3470,22 +3499,6 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
}
/*
- * Test whether @memcg has children, dead or alive. Note that this
- * function doesn't care whether @memcg has use_hierarchy enabled and
- * returns %true if there are child csses according to the cgroup
- * hierarchy. Testing use_hierarchy is the caller's responsibility.
- */
-static inline bool memcg_has_children(struct mem_cgroup *memcg)
-{
- bool ret;
-
- rcu_read_lock();
- ret = css_next_child(NULL, &memcg->css);
- rcu_read_unlock();
- return ret;
-}
-
-/*
* Reclaims as many pages from the given memcg as possible.
*
* Caller is responsible for holding css reference for memcg.
@@ -3533,37 +3546,20 @@ static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
struct cftype *cft)
{
- return mem_cgroup_from_css(css)->use_hierarchy;
+ return 1;
}
static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
struct cftype *cft, u64 val)
{
- int retval = 0;
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent);
-
- if (memcg->use_hierarchy == val)
+ if (val == 1)
return 0;
- /*
- * If parent's use_hierarchy is set, we can't make any modifications
- * in the child subtrees. If it is unset, then the change can
- * occur, provided the current cgroup has no children.
- *
- * For the root cgroup, parent_mem is NULL, we allow value to be
- * set if there are no children.
- */
- if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
- (val == 1 || val == 0)) {
- if (!memcg_has_children(memcg))
- memcg->use_hierarchy = val;
- else
- retval = -EBUSY;
- } else
- retval = -EINVAL;
+ pr_warn_once("Non-hierarchical mode is deprecated. "
+ "Please report your usecase to linux-mm@kvack.org if you "
+ "depend on this functionality.\n");
- return retval;
+ return -EINVAL;
}
static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
@@ -3712,12 +3708,6 @@ static int memcg_online_kmem(struct mem_cgroup *memcg)
static_branch_enable(&memcg_kmem_enabled_key);
- /*
- * A memory cgroup is considered kmem-online as soon as it gets
- * kmemcg_id. Setting the id after enabling static branching will
- * guarantee no one starts accounting before all call sites are
- * patched.
- */
memcg->kmemcg_id = memcg_id;
memcg->kmem_state = KMEM_ONLINE;
@@ -3757,8 +3747,6 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg)
child = mem_cgroup_from_css(css);
BUG_ON(child->kmemcg_id != kmemcg_id);
child->kmemcg_id = parent->kmemcg_id;
- if (!memcg->use_hierarchy)
- break;
}
rcu_read_unlock();
@@ -4669,7 +4657,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
void mem_cgroup_track_foreign_dirty_slowpath(struct page *page,
struct bdi_writeback *wb)
{
- struct mem_cgroup *memcg = page->mem_cgroup;
+ struct mem_cgroup *memcg = page_memcg(page);
struct memcg_cgwb_frn *frn;
u64 now = get_jiffies_64();
u64 oldest_at = now;
@@ -5349,38 +5337,22 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
if (parent) {
memcg->swappiness = mem_cgroup_swappiness(parent);
memcg->oom_kill_disable = parent->oom_kill_disable;
- }
- if (!parent) {
- page_counter_init(&memcg->memory, NULL);
- page_counter_init(&memcg->swap, NULL);
- page_counter_init(&memcg->kmem, NULL);
- page_counter_init(&memcg->tcpmem, NULL);
- } else if (parent->use_hierarchy) {
- memcg->use_hierarchy = true;
+
page_counter_init(&memcg->memory, &parent->memory);
page_counter_init(&memcg->swap, &parent->swap);
page_counter_init(&memcg->kmem, &parent->kmem);
page_counter_init(&memcg->tcpmem, &parent->tcpmem);
} else {
- page_counter_init(&memcg->memory, &root_mem_cgroup->memory);
- page_counter_init(&memcg->swap, &root_mem_cgroup->swap);
- page_counter_init(&memcg->kmem, &root_mem_cgroup->kmem);
- page_counter_init(&memcg->tcpmem, &root_mem_cgroup->tcpmem);
- /*
- * Deeper hierachy with use_hierarchy == false doesn't make
- * much sense so let cgroup subsystem know about this
- * unfortunate state in our controller.
- */
- if (parent != root_mem_cgroup)
- memory_cgrp_subsys.broken_hierarchy = true;
- }
+ page_counter_init(&memcg->memory, NULL);
+ page_counter_init(&memcg->swap, NULL);
+ page_counter_init(&memcg->kmem, NULL);
+ page_counter_init(&memcg->tcpmem, NULL);
- /* The following stuff does not apply to the root */
- if (!parent) {
root_mem_cgroup = memcg;
return &memcg->css;
}
+ /* The following stuff does not apply to the root */
error = memcg_online_kmem(memcg);
if (error)
goto fail;
@@ -5646,14 +5618,14 @@ static int mem_cgroup_move_account(struct page *page,
/*
* Prevent mem_cgroup_migrate() from looking at
- * page->mem_cgroup of its source page while we change it.
+ * page's memory cgroup of its source page while we change it.
*/
ret = -EBUSY;
if (!trylock_page(page))
goto out;
ret = -EINVAL;
- if (page->mem_cgroup != from)
+ if (page_memcg(page) != from)
goto out_unlock;
pgdat = page_pgdat(page);
@@ -5708,13 +5680,13 @@ static int mem_cgroup_move_account(struct page *page,
/*
* All state has been migrated, let's switch to the new memcg.
*
- * It is safe to change page->mem_cgroup here because the page
+ * It is safe to change page's memcg here because the page
* is referenced, charged, isolated, and locked: we can't race
* with (un)charging, migration, LRU putback, or anything else
- * that would rely on a stable page->mem_cgroup.
+ * that would rely on a stable page's memory cgroup.
*
* Note that lock_page_memcg is a memcg lock, not a page lock,
- * to save space. As soon as we switch page->mem_cgroup to a
+ * to save space. As soon as we switch page's memory cgroup to a
* new memcg that isn't locked, the above state can change
* concurrently again. Make sure we're truly done with it.
*/
@@ -5723,7 +5695,7 @@ static int mem_cgroup_move_account(struct page *page,
css_get(&to->css);
css_put(&from->css);
- page->mem_cgroup = to;
+ page->memcg_data = (unsigned long)to;
__unlock_page_memcg(from);
@@ -5789,7 +5761,7 @@ static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
* mem_cgroup_move_account() checks the page is valid or
* not under LRU exclusion.
*/
- if (page->mem_cgroup == mc.from) {
+ if (page_memcg(page) == mc.from) {
ret = MC_TARGET_PAGE;
if (is_device_private_page(page))
ret = MC_TARGET_DEVICE;
@@ -5833,7 +5805,7 @@ static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
VM_BUG_ON_PAGE(!page || !PageHead(page), page);
if (!(mc.flags & MOVE_ANON))
return ret;
- if (page->mem_cgroup == mc.from) {
+ if (page_memcg(page) == mc.from) {
ret = MC_TARGET_PAGE;
if (target) {
get_page(page);
@@ -6217,24 +6189,6 @@ static void mem_cgroup_move_task(void)
}
#endif
-/*
- * Cgroup retains root cgroups across [un]mount cycles making it necessary
- * to verify whether we're attached to the default hierarchy on each mount
- * attempt.
- */
-static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
-{
- /*
- * use_hierarchy is forced on the default hierarchy. cgroup core
- * guarantees that @root doesn't have any children, so turning it
- * on for the root memcg is enough.
- */
- if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
- root_mem_cgroup->use_hierarchy = true;
- else
- root_mem_cgroup->use_hierarchy = false;
-}
-
static int seq_puts_memcg_tunable(struct seq_file *m, unsigned long value)
{
if (value == PAGE_COUNTER_MAX)
@@ -6572,7 +6526,6 @@ struct cgroup_subsys memory_cgrp_subsys = {
.can_attach = mem_cgroup_can_attach,
.cancel_attach = mem_cgroup_cancel_attach,
.post_attach = mem_cgroup_move_task,
- .bind = mem_cgroup_bind,
.dfl_cftypes = memory_files,
.legacy_cftypes = mem_cgroup_legacy_files,
.early_init = 0,
@@ -6779,12 +6732,12 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask)
/*
* Every swap fault against a single page tries to charge the
* page, bail as early as possible. shmem_unuse() encounters
- * already charged pages, too. page->mem_cgroup is protected
- * by the page lock, which serializes swap cache removal, which
- * in turn serializes uncharging.
+ * already charged pages, too. page and memcg binding is
+ * protected by the page lock, which serializes swap cache
+ * removal, which in turn serializes uncharging.
*/
VM_BUG_ON_PAGE(!PageLocked(page), page);
- if (compound_head(page)->mem_cgroup)
+ if (page_memcg(compound_head(page)))
goto out;
id = lookup_swap_cgroup_id(ent);
@@ -6868,21 +6821,21 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
VM_BUG_ON_PAGE(PageLRU(page), page);
- if (!page->mem_cgroup)
+ if (!page_memcg(page))
return;
/*
* Nobody should be changing or seriously looking at
- * page->mem_cgroup at this point, we have fully
+ * page_memcg(page) at this point, we have fully
* exclusive access to the page.
*/
- if (ug->memcg != page->mem_cgroup) {
+ if (ug->memcg != page_memcg(page)) {
if (ug->memcg) {
uncharge_batch(ug);
uncharge_gather_clear(ug);
}
- ug->memcg = page->mem_cgroup;
+ ug->memcg = page_memcg(page);
/* pairs with css_put in uncharge_batch */
css_get(&ug->memcg->css);
@@ -6891,15 +6844,13 @@ static void uncharge_page(struct page *page, struct uncharge_gather *ug)
nr_pages = compound_nr(page);
ug->nr_pages += nr_pages;
- if (!PageKmemcg(page)) {
- ug->pgpgout++;
- } else {
+ if (PageMemcgKmem(page))
ug->nr_kmem += nr_pages;
- __ClearPageKmemcg(page);
- }
+ else
+ ug->pgpgout++;
ug->dummy_page = page;
- page->mem_cgroup = NULL;
+ page->memcg_data = 0;
css_put(&ug->memcg->css);
}
@@ -6942,7 +6893,7 @@ void mem_cgroup_uncharge(struct page *page)
return;
/* Don't touch page->lru of any random page, pre-check: */
- if (!page->mem_cgroup)
+ if (!page_memcg(page))
return;
uncharge_gather_clear(&ug);
@@ -6992,11 +6943,11 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
return;
/* Page cache replacement: new page already charged? */
- if (newpage->mem_cgroup)
+ if (page_memcg(newpage))
return;
- /* Swapcache readahead pages can get replaced before being charged */
- memcg = oldpage->mem_cgroup;
+ memcg = page_memcg(oldpage);
+ VM_WARN_ON_ONCE_PAGE(!memcg, oldpage);
if (!memcg)
return;
@@ -7188,12 +7139,15 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
VM_BUG_ON_PAGE(PageLRU(page), page);
VM_BUG_ON_PAGE(page_count(page), page);
+ if (mem_cgroup_disabled())
+ return;
+
if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
return;
- memcg = page->mem_cgroup;
+ memcg = page_memcg(page);
- /* Readahead page, never charged */
+ VM_WARN_ON_ONCE_PAGE(!memcg, page);
if (!memcg)
return;
@@ -7212,7 +7166,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry)
VM_BUG_ON_PAGE(oldid, page);
mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries);
- page->mem_cgroup = NULL;
+ page->memcg_data = 0;
if (!mem_cgroup_is_root(memcg))
page_counter_uncharge(&memcg->memory, nr_entries);
@@ -7252,12 +7206,15 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry)
struct mem_cgroup *memcg;
unsigned short oldid;
+ if (mem_cgroup_disabled())
+ return 0;
+
if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
return 0;
- memcg = page->mem_cgroup;
+ memcg = page_memcg(page);
- /* Readahead page, never charged */
+ VM_WARN_ON_ONCE_PAGE(!memcg, page);
if (!memcg)
return 0;
@@ -7336,7 +7293,7 @@ bool mem_cgroup_swap_full(struct page *page)
if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
return false;
- memcg = page->mem_cgroup;
+ memcg = page_memcg(page);
if (!memcg)
return false;
@@ -7354,9 +7311,9 @@ bool mem_cgroup_swap_full(struct page *page)
static int __init setup_swap_account(char *s)
{
if (!strcmp(s, "1"))
- cgroup_memory_noswap = 0;
+ cgroup_memory_noswap = false;
else if (!strcmp(s, "0"))
- cgroup_memory_noswap = 1;
+ cgroup_memory_noswap = true;
return 1;
}
__setup("swapaccount=", setup_swap_account);
diff --git a/mm/memory-failure.c b/mm/memory-failure.c
index 5d880d4eb9a2..5a38e9eade94 100644
--- a/mm/memory-failure.c
+++ b/mm/memory-failure.c
@@ -263,8 +263,8 @@ static int kill_proc(struct to_kill *tk, unsigned long pfn, int flags)
}
/*
- * When a unknown page type is encountered drain as many buffers as possible
- * in the hope to turn the page into a LRU or free page, which we can handle.
+ * Unknown page type encountered. Try to check whether it can turn PageLRU by
+ * lru_add_drain_all, or a free page by reclaiming slabs when possible.
*/
void shake_page(struct page *p, int access)
{
@@ -273,9 +273,6 @@ void shake_page(struct page *p, int access)
if (!PageSlab(p)) {
lru_add_drain_all();
- if (PageLRU(p))
- return;
- drain_all_pages(page_zone(p));
if (PageLRU(p) || is_free_buddy_page(p))
return;
}
@@ -809,7 +806,7 @@ static int me_swapcache_clean(struct page *p, unsigned long pfn)
*/
static int me_huge_page(struct page *p, unsigned long pfn)
{
- int res = 0;
+ int res;
struct page *hpage = compound_head(p);
struct address_space *mapping;
@@ -820,6 +817,7 @@ static int me_huge_page(struct page *p, unsigned long pfn)
if (mapping) {
res = truncate_error_page(hpage, pfn, mapping);
} else {
+ res = MF_FAILED;
unlock_page(hpage);
/*
* migration entry prevents later access on error anonymous
@@ -828,8 +826,10 @@ static int me_huge_page(struct page *p, unsigned long pfn)
*/
if (PageAnon(hpage))
put_page(hpage);
- dissolve_free_huge_page(p);
- res = MF_RECOVERED;
+ if (!dissolve_free_huge_page(p) && take_page_off_buddy(p)) {
+ page_ref_inc(p);
+ res = MF_RECOVERED;
+ }
lock_page(hpage);
}
@@ -946,13 +946,13 @@ static int page_action(struct page_state *ps, struct page *p,
}
/**
- * get_hwpoison_page() - Get refcount for memory error handling:
+ * __get_hwpoison_page() - Get refcount for memory error handling:
* @page: raw error page (hit by memory error)
*
* Return: return 0 if failed to grab the refcount, otherwise true (some
* non-zero value.)
*/
-static int get_hwpoison_page(struct page *page)
+static int __get_hwpoison_page(struct page *page)
{
struct page *head = compound_head(page);
@@ -983,13 +983,80 @@ static int get_hwpoison_page(struct page *page)
}
/*
+ * Safely get reference count of an arbitrary page.
+ *
+ * Returns 0 for a free page, 1 for an in-use page,
+ * -EIO for a page-type we cannot handle and -EBUSY if we raced with an
+ * allocation.
+ * We only incremented refcount in case the page was already in-use and it
+ * is a known type we can handle.
+ */
+static int get_any_page(struct page *p, unsigned long flags)
+{
+ int ret = 0, pass = 0;
+ bool count_increased = false;
+
+ if (flags & MF_COUNT_INCREASED)
+ count_increased = true;
+
+try_again:
+ if (!count_increased && !__get_hwpoison_page(p)) {
+ if (page_count(p)) {
+ /* We raced with an allocation, retry. */
+ if (pass++ < 3)
+ goto try_again;
+ ret = -EBUSY;
+ } else if (!PageHuge(p) && !is_free_buddy_page(p)) {
+ /* We raced with put_page, retry. */
+ if (pass++ < 3)
+ goto try_again;
+ ret = -EIO;
+ }
+ } else {
+ if (PageHuge(p) || PageLRU(p) || __PageMovable(p)) {
+ ret = 1;
+ } else {
+ /*
+ * A page we cannot handle. Check whether we can turn
+ * it into something we can handle.
+ */
+ if (pass++ < 3) {
+ put_page(p);
+ shake_page(p, 1);
+ count_increased = false;
+ goto try_again;
+ }
+ put_page(p);
+ ret = -EIO;
+ }
+ }
+
+ return ret;
+}
+
+static int get_hwpoison_page(struct page *p, unsigned long flags,
+ enum mf_flags ctxt)
+{
+ int ret;
+
+ zone_pcp_disable(page_zone(p));
+ if (ctxt == MF_SOFT_OFFLINE)
+ ret = get_any_page(p, flags);
+ else
+ ret = __get_hwpoison_page(p);
+ zone_pcp_enable(page_zone(p));
+
+ return ret;
+}
+
+/*
* Do all that is necessary to remove user space mappings. Unmap
* the pages and send SIGBUS to the processes if the data was dirty.
*/
static bool hwpoison_user_mappings(struct page *p, unsigned long pfn,
int flags, struct page **hpagep)
{
- enum ttu_flags ttu = TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
+ enum ttu_flags ttu = TTU_IGNORE_MLOCK;
struct address_space *mapping;
LIST_HEAD(tokill);
bool unmap_success = true;
@@ -1162,7 +1229,7 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags)
num_poisoned_pages_inc();
- if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) {
+ if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p, flags, 0)) {
/*
* Check "filter hit" and "race with other subpage."
*/
@@ -1176,9 +1243,13 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags)
}
}
unlock_page(head);
- dissolve_free_huge_page(p);
- action_result(pfn, MF_MSG_FREE_HUGE, MF_DELAYED);
- return 0;
+ res = MF_FAILED;
+ if (!dissolve_free_huge_page(p) && take_page_off_buddy(p)) {
+ page_ref_inc(p);
+ res = MF_RECOVERED;
+ }
+ action_result(pfn, MF_MSG_FREE_HUGE, res);
+ return res == MF_RECOVERED ? 0 : -EBUSY;
}
lock_page(head);
@@ -1231,6 +1302,12 @@ static int memory_failure_dev_pagemap(unsigned long pfn, int flags,
loff_t start;
dax_entry_t cookie;
+ if (flags & MF_COUNT_INCREASED)
+ /*
+ * Drop the extra refcount in case we come from madvise().
+ */
+ put_page(page);
+
/*
* Prevent the inode from being freed while we are interrogating
* the address_space, typically this would be handled by
@@ -1319,6 +1396,7 @@ int memory_failure(unsigned long pfn, int flags)
struct dev_pagemap *pgmap;
int res;
unsigned long page_flags;
+ bool retry = true;
if (!sysctl_memory_failure_recovery)
panic("Memory failure on page %lx", pfn);
@@ -1336,6 +1414,7 @@ int memory_failure(unsigned long pfn, int flags)
return -ENXIO;
}
+try_again:
if (PageHuge(p))
return memory_failure_hugetlb(pfn, flags);
if (TestSetPageHWPoison(p)) {
@@ -1358,10 +1437,23 @@ int memory_failure(unsigned long pfn, int flags)
* In fact it's dangerous to directly bump up page count from 0,
* that may make page_ref_freeze()/page_ref_unfreeze() mismatch.
*/
- if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) {
+ if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p, flags, 0)) {
if (is_free_buddy_page(p)) {
- action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
- return 0;
+ if (take_page_off_buddy(p)) {
+ page_ref_inc(p);
+ res = MF_RECOVERED;
+ } else {
+ /* We lost the race, try again */
+ if (retry) {
+ ClearPageHWPoison(p);
+ num_poisoned_pages_dec();
+ retry = false;
+ goto try_again;
+ }
+ res = MF_FAILED;
+ }
+ action_result(pfn, MF_MSG_BUDDY, res);
+ return res == MF_RECOVERED ? 0 : -EBUSY;
} else {
action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED);
return -EBUSY;
@@ -1385,14 +1477,6 @@ int memory_failure(unsigned long pfn, int flags)
* walked by the page reclaim code, however that's not a big loss.
*/
shake_page(p, 0);
- /* shake_page could have turned it free. */
- if (!PageLRU(p) && is_free_buddy_page(p)) {
- if (flags & MF_COUNT_INCREASED)
- action_result(pfn, MF_MSG_BUDDY, MF_DELAYED);
- else
- action_result(pfn, MF_MSG_BUDDY_2ND, MF_DELAYED);
- return 0;
- }
lock_page(p);
@@ -1596,6 +1680,7 @@ int unpoison_memory(unsigned long pfn)
struct page *page;
struct page *p;
int freeit = 0;
+ unsigned long flags = 0;
static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
DEFAULT_RATELIMIT_BURST);
@@ -1640,7 +1725,7 @@ int unpoison_memory(unsigned long pfn)
return 0;
}
- if (!get_hwpoison_page(p)) {
+ if (!get_hwpoison_page(p, flags, 0)) {
if (TestClearPageHWPoison(p))
num_poisoned_pages_dec();
unpoison_pr_info("Unpoison: Software-unpoisoned free page %#lx\n",
@@ -1671,75 +1756,6 @@ int unpoison_memory(unsigned long pfn)
}
EXPORT_SYMBOL(unpoison_memory);
-/*
- * Safely get reference count of an arbitrary page.
- * Returns 0 for a free page, -EIO for a zero refcount page
- * that is not free, and 1 for any other page type.
- * For 1 the page is returned with increased page count, otherwise not.
- */
-static int __get_any_page(struct page *p, unsigned long pfn, int flags)
-{
- int ret;
-
- if (flags & MF_COUNT_INCREASED)
- return 1;
-
- /*
- * When the target page is a free hugepage, just remove it
- * from free hugepage list.
- */
- if (!get_hwpoison_page(p)) {
- if (PageHuge(p)) {
- pr_info("%s: %#lx free huge page\n", __func__, pfn);
- ret = 0;
- } else if (is_free_buddy_page(p)) {
- pr_info("%s: %#lx free buddy page\n", __func__, pfn);
- ret = 0;
- } else if (page_count(p)) {
- /* raced with allocation */
- ret = -EBUSY;
- } else {
- pr_info("%s: %#lx: unknown zero refcount page type %lx\n",
- __func__, pfn, p->flags);
- ret = -EIO;
- }
- } else {
- /* Not a free page */
- ret = 1;
- }
- return ret;
-}
-
-static int get_any_page(struct page *page, unsigned long pfn, int flags)
-{
- int ret = __get_any_page(page, pfn, flags);
-
- if (ret == -EBUSY)
- ret = __get_any_page(page, pfn, flags);
-
- if (ret == 1 && !PageHuge(page) &&
- !PageLRU(page) && !__PageMovable(page)) {
- /*
- * Try to free it.
- */
- put_page(page);
- shake_page(page, 1);
-
- /*
- * Did it turn free?
- */
- ret = __get_any_page(page, pfn, 0);
- if (ret == 1 && !PageLRU(page)) {
- /* Drop page reference which is from __get_any_page() */
- put_page(page);
- pr_info("soft_offline: %#lx: unknown non LRU page type %lx (%pGp)\n",
- pfn, page->flags, &page->flags);
- return -EIO;
- }
- }
- return ret;
-}
-
static bool isolate_page(struct page *page, struct list_head *pagelist)
{
bool isolated = false;
@@ -1839,11 +1855,11 @@ static int __soft_offline_page(struct page *page)
pr_info("soft offline: %#lx: %s migration failed %d, type %lx (%pGp)\n",
pfn, msg_page[huge], ret, page->flags, &page->flags);
if (ret > 0)
- ret = -EIO;
+ ret = -EBUSY;
}
} else {
- pr_info("soft offline: %#lx: %s isolation failed: %d, page count %d, type %lx (%pGp)\n",
- pfn, msg_page[huge], ret, page_count(page), page->flags, &page->flags);
+ pr_info("soft offline: %#lx: %s isolation failed, page count %d, type %lx (%pGp)\n",
+ pfn, msg_page[huge], page_count(page), page->flags, &page->flags);
ret = -EBUSY;
}
return ret;
@@ -1905,7 +1921,7 @@ int soft_offline_page(unsigned long pfn, int flags)
return -EIO;
if (PageHWPoison(page)) {
- pr_info("soft offline: %#lx page already poisoned\n", pfn);
+ pr_info("%s: %#lx page already poisoned\n", __func__, pfn);
if (flags & MF_COUNT_INCREASED)
put_page(page);
return 0;
@@ -1913,16 +1929,20 @@ int soft_offline_page(unsigned long pfn, int flags)
retry:
get_online_mems();
- ret = get_any_page(page, pfn, flags);
+ ret = get_hwpoison_page(page, flags, MF_SOFT_OFFLINE);
put_online_mems();
- if (ret > 0)
+ if (ret > 0) {
ret = soft_offline_in_use_page(page);
- else if (ret == 0)
+ } else if (ret == 0) {
if (soft_offline_free_page(page) && try_again) {
try_again = false;
goto retry;
}
+ } else if (ret == -EIO) {
+ pr_info("%s: %#lx: unknown page type: %lx (%pGP)\n",
+ __func__, pfn, page->flags, &page->flags);
+ }
return ret;
}
diff --git a/mm/memory.c b/mm/memory.c
index c48f8df6e502..7d608765932b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1171,6 +1171,15 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE,
0, src_vma, src_mm, addr, end);
mmu_notifier_invalidate_range_start(&range);
+ /*
+ * Disabling preemption is not needed for the write side, as
+ * the read side doesn't spin, but goes to the mmap_lock.
+ *
+ * Use the raw variant of the seqcount_t write API to avoid
+ * lockdep complaining about preemptibility.
+ */
+ mmap_assert_write_locked(src_mm);
+ raw_write_seqcount_begin(&src_mm->write_protect_seq);
}
ret = 0;
@@ -1187,8 +1196,10 @@ copy_page_range(struct vm_area_struct *dst_vma, struct vm_area_struct *src_vma)
}
} while (dst_pgd++, src_pgd++, addr = next, addr != end);
- if (is_cow)
+ if (is_cow) {
+ raw_write_seqcount_end(&src_mm->write_protect_seq);
mmu_notifier_invalidate_range_end(&range);
+ }
return ret;
}
@@ -4696,9 +4707,9 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
}
#endif /* __PAGETABLE_PMD_FOLDED */
-static int __follow_pte_pmd(struct mm_struct *mm, unsigned long address,
- struct mmu_notifier_range *range,
- pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
+int follow_pte(struct mm_struct *mm, unsigned long address,
+ struct mmu_notifier_range *range, pte_t **ptepp, pmd_t **pmdpp,
+ spinlock_t **ptlp)
{
pgd_t *pgd;
p4d_t *p4d;
@@ -4763,32 +4774,6 @@ out:
return -EINVAL;
}
-static inline int follow_pte(struct mm_struct *mm, unsigned long address,
- pte_t **ptepp, spinlock_t **ptlp)
-{
- int res;
-
- /* (void) is needed to make gcc happy */
- (void) __cond_lock(*ptlp,
- !(res = __follow_pte_pmd(mm, address, NULL,
- ptepp, NULL, ptlp)));
- return res;
-}
-
-int follow_pte_pmd(struct mm_struct *mm, unsigned long address,
- struct mmu_notifier_range *range,
- pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp)
-{
- int res;
-
- /* (void) is needed to make gcc happy */
- (void) __cond_lock(*ptlp,
- !(res = __follow_pte_pmd(mm, address, range,
- ptepp, pmdpp, ptlp)));
- return res;
-}
-EXPORT_SYMBOL(follow_pte_pmd);
-
/**
* follow_pfn - look up PFN at a user virtual address
* @vma: memory mapping
@@ -4809,7 +4794,7 @@ int follow_pfn(struct vm_area_struct *vma, unsigned long address,
if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
return ret;
- ret = follow_pte(vma->vm_mm, address, &ptep, &ptl);
+ ret = follow_pte(vma->vm_mm, address, NULL, &ptep, NULL, &ptl);
if (ret)
return ret;
*pfn = pte_pfn(*ptep);
@@ -4830,7 +4815,7 @@ int follow_phys(struct vm_area_struct *vma,
if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)))
goto out;
- if (follow_pte(vma->vm_mm, address, &ptep, &ptl))
+ if (follow_pte(vma->vm_mm, address, NULL, &ptep, NULL, &ptl))
goto out;
pte = *ptep;
@@ -4874,11 +4859,10 @@ EXPORT_SYMBOL_GPL(generic_access_phys);
#endif
/*
- * Access another process' address space as given in mm. If non-NULL, use the
- * given task for page fault accounting.
+ * Access another process' address space as given in mm.
*/
-int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long addr, void *buf, int len, unsigned int gup_flags)
+int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf,
+ int len, unsigned int gup_flags)
{
struct vm_area_struct *vma;
void *old_buf = buf;
@@ -4955,7 +4939,7 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
int access_remote_vm(struct mm_struct *mm, unsigned long addr,
void *buf, int len, unsigned int gup_flags)
{
- return __access_remote_vm(NULL, mm, addr, buf, len, gup_flags);
+ return __access_remote_vm(mm, addr, buf, len, gup_flags);
}
/*
@@ -4973,7 +4957,7 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr,
if (!mm)
return 0;
- ret = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags);
+ ret = __access_remote_vm(mm, addr, buf, len, gup_flags);
mmput(mm);
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 63b2e46b6555..af41fb990820 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -596,8 +596,7 @@ void generic_online_page(struct page *page, unsigned int order)
* so we should map it first. This is better than introducing a special
* case in page freeing fast path.
*/
- if (debug_pagealloc_enabled_static())
- kernel_map_pages(page, 1 << order, 1);
+ debug_pagealloc_map_pages(page, 1 << order);
__free_pages_core(page, order);
totalram_pages_add(1UL << order);
#ifdef CONFIG_HIGHMEM
@@ -1304,7 +1303,7 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn)
if (WARN_ON(PageLRU(page)))
isolate_lru_page(page);
if (page_mapped(page))
- try_to_unmap(page, TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS);
+ try_to_unmap(page, TTU_IGNORE_MLOCK);
continue;
}
@@ -1492,13 +1491,19 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages)
}
node = zone_to_nid(zone);
+ /*
+ * Disable pcplists so that page isolation cannot race with freeing
+ * in a way that pages from isolated pageblock are left on pcplists.
+ */
+ zone_pcp_disable(zone);
+
/* set above range as isolated */
ret = start_isolate_page_range(start_pfn, end_pfn,
MIGRATE_MOVABLE,
MEMORY_OFFLINE | REPORT_FAILURE);
if (ret) {
reason = "failure to isolate range";
- goto failed_removal;
+ goto failed_removal_pcplists_disabled;
}
arg.start_pfn = start_pfn;
@@ -1550,26 +1555,13 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages)
goto failed_removal_isolated;
}
- /*
- * per-cpu pages are drained in start_isolate_page_range, but if
- * there are still pages that are not free, make sure that we
- * drain again, because when we isolated range we might
- * have raced with another thread that was adding pages to pcp
- * list.
- *
- * Forward progress should be still guaranteed because
- * pages on the pcp list can only belong to MOVABLE_ZONE
- * because has_unmovable_pages explicitly checks for
- * PageBuddy on freed pages on other zones.
- */
ret = test_pages_isolated(start_pfn, end_pfn, MEMORY_OFFLINE);
- if (ret)
- drain_all_pages(zone);
+
} while (ret);
/* Mark all sections offline and remove free pages from the buddy. */
__offline_isolated_pages(start_pfn, end_pfn);
- pr_info("Offlined Pages %ld\n", nr_pages);
+ pr_debug("Offlined Pages %ld\n", nr_pages);
/*
* The memory sections are marked offline, and the pageblock flags
@@ -1580,6 +1572,8 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages)
zone->nr_isolate_pageblock -= nr_pages / pageblock_nr_pages;
spin_unlock_irqrestore(&zone->lock, flags);
+ zone_pcp_enable(zone);
+
/* removal success */
adjust_managed_page_count(pfn_to_page(start_pfn), -nr_pages);
zone->present_pages -= nr_pages;
@@ -1612,6 +1606,8 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages)
failed_removal_isolated:
undo_isolate_page_range(start_pfn, end_pfn, MIGRATE_MOVABLE);
memory_notify(MEM_CANCEL_OFFLINE, &arg);
+failed_removal_pcplists_disabled:
+ zone_pcp_enable(zone);
failed_removal:
pr_debug("memory offlining [mem %#010llx-%#010llx] failed due to %s\n",
(unsigned long long) start_pfn << PAGE_SHIFT,
@@ -1788,39 +1784,112 @@ int remove_memory(int nid, u64 start, u64 size)
}
EXPORT_SYMBOL_GPL(remove_memory);
+static int try_offline_memory_block(struct memory_block *mem, void *arg)
+{
+ uint8_t online_type = MMOP_ONLINE_KERNEL;
+ uint8_t **online_types = arg;
+ struct page *page;
+ int rc;
+
+ /*
+ * Sense the online_type via the zone of the memory block. Offlining
+ * with multiple zones within one memory block will be rejected
+ * by offlining code ... so we don't care about that.
+ */
+ page = pfn_to_online_page(section_nr_to_pfn(mem->start_section_nr));
+ if (page && zone_idx(page_zone(page)) == ZONE_MOVABLE)
+ online_type = MMOP_ONLINE_MOVABLE;
+
+ rc = device_offline(&mem->dev);
+ /*
+ * Default is MMOP_OFFLINE - change it only if offlining succeeded,
+ * so try_reonline_memory_block() can do the right thing.
+ */
+ if (!rc)
+ **online_types = online_type;
+
+ (*online_types)++;
+ /* Ignore if already offline. */
+ return rc < 0 ? rc : 0;
+}
+
+static int try_reonline_memory_block(struct memory_block *mem, void *arg)
+{
+ uint8_t **online_types = arg;
+ int rc;
+
+ if (**online_types != MMOP_OFFLINE) {
+ mem->online_type = **online_types;
+ rc = device_online(&mem->dev);
+ if (rc < 0)
+ pr_warn("%s: Failed to re-online memory: %d",
+ __func__, rc);
+ }
+
+ /* Continue processing all remaining memory blocks. */
+ (*online_types)++;
+ return 0;
+}
+
/*
- * Try to offline and remove a memory block. Might take a long time to
- * finish in case memory is still in use. Primarily useful for memory devices
- * that logically unplugged all memory (so it's no longer in use) and want to
- * offline + remove the memory block.
+ * Try to offline and remove memory. Might take a long time to finish in case
+ * memory is still in use. Primarily useful for memory devices that logically
+ * unplugged all memory (so it's no longer in use) and want to offline + remove
+ * that memory.
*/
int offline_and_remove_memory(int nid, u64 start, u64 size)
{
- struct memory_block *mem;
- int rc = -EINVAL;
+ const unsigned long mb_count = size / memory_block_size_bytes();
+ uint8_t *online_types, *tmp;
+ int rc;
if (!IS_ALIGNED(start, memory_block_size_bytes()) ||
- size != memory_block_size_bytes())
- return rc;
+ !IS_ALIGNED(size, memory_block_size_bytes()) || !size)
+ return -EINVAL;
+
+ /*
+ * We'll remember the old online type of each memory block, so we can
+ * try to revert whatever we did when offlining one memory block fails
+ * after offlining some others succeeded.
+ */
+ online_types = kmalloc_array(mb_count, sizeof(*online_types),
+ GFP_KERNEL);
+ if (!online_types)
+ return -ENOMEM;
+ /*
+ * Initialize all states to MMOP_OFFLINE, so when we abort processing in
+ * try_offline_memory_block(), we'll skip all unprocessed blocks in
+ * try_reonline_memory_block().
+ */
+ memset(online_types, MMOP_OFFLINE, mb_count);
lock_device_hotplug();
- mem = find_memory_block(__pfn_to_section(PFN_DOWN(start)));
- if (mem)
- rc = device_offline(&mem->dev);
- /* Ignore if the device is already offline. */
- if (rc > 0)
- rc = 0;
+
+ tmp = online_types;
+ rc = walk_memory_blocks(start, size, &tmp, try_offline_memory_block);
/*
- * In case we succeeded to offline the memory block, remove it.
+ * In case we succeeded to offline all memory, remove it.
* This cannot fail as it cannot get onlined in the meantime.
*/
if (!rc) {
rc = try_remove_memory(nid, start, size);
- WARN_ON_ONCE(rc);
+ if (rc)
+ pr_err("%s: Failed to remove memory: %d", __func__, rc);
+ }
+
+ /*
+ * Rollback what we did. While memory onlining might theoretically fail
+ * (nacked by a notifier), it barely ever happens.
+ */
+ if (rc) {
+ tmp = online_types;
+ walk_memory_blocks(start, size, &tmp,
+ try_reonline_memory_block);
}
unlock_device_hotplug();
+ kfree(online_types);
return rc;
}
EXPORT_SYMBOL_GPL(offline_and_remove_memory);
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 3ca4898f3f24..8cf96bd21341 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -1114,9 +1114,7 @@ int do_migrate_pages(struct mm_struct *mm, const nodemask_t *from,
int err;
nodemask_t tmp;
- err = migrate_prep();
- if (err)
- return err;
+ migrate_prep();
mmap_read_lock(mm);
@@ -1315,9 +1313,7 @@ static long do_mbind(unsigned long start, unsigned long len,
if (flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL)) {
- err = migrate_prep();
- if (err)
- goto mpol_out;
+ migrate_prep();
}
{
NODEMASK_SCRATCH(scratch);
diff --git a/mm/mempool.c b/mm/mempool.c
index f473cdddaff0..624ed51b060f 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -104,7 +104,7 @@ static inline void poison_element(mempool_t *pool, void *element)
static __always_inline void kasan_poison_element(mempool_t *pool, void *element)
{
if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
- kasan_poison_kfree(element, _RET_IP_);
+ kasan_slab_free_mempool(element, _RET_IP_);
else if (pool->alloc == mempool_alloc_pages)
kasan_free_pages(element, (unsigned long)pool->pool_data);
}
@@ -112,7 +112,7 @@ static __always_inline void kasan_poison_element(mempool_t *pool, void *element)
static void kasan_unpoison_element(mempool_t *pool, void *element)
{
if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc)
- kasan_unpoison_slab(element);
+ kasan_unpoison_range(element, __ksize(element));
else if (pool->alloc == mempool_alloc_pages)
kasan_alloc_pages(element, (unsigned long)pool->pool_data);
}
diff --git a/mm/migrate.c b/mm/migrate.c
index 5795cb82e27c..ee5e612b4cd8 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -62,7 +62,7 @@
* to be migrated using isolate_lru_page(). If scheduling work on other CPUs is
* undesirable, use migrate_prep_local()
*/
-int migrate_prep(void)
+void migrate_prep(void)
{
/*
* Clear the LRU lists so pages can be isolated.
@@ -71,16 +71,12 @@ int migrate_prep(void)
* pages that may be busy.
*/
lru_add_drain_all();
-
- return 0;
}
/* Do the necessary work of migrate_prep but not if it involves other CPUs */
-int migrate_prep_local(void)
+void migrate_prep_local(void)
{
lru_add_drain();
-
- return 0;
}
int isolate_movable_page(struct page *page, isolate_mode_t mode)
@@ -1106,7 +1102,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
* and treated as swapcache but it has no rmap yet.
* Calling try_to_unmap() against a page->mapping==NULL page will
* trigger a BUG. So handle it here.
- * 2. An orphaned page (see truncate_complete_page) might have
+ * 2. An orphaned page (see truncate_cleanup_page) might have
* fs-private metadata. The page can be picked up due to memory
* offlining. Everywhere else except page reclaim, the page is
* invisible to the vm, so the page can not be migrated. So try to
@@ -1122,8 +1118,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage,
/* Establish migration ptes */
VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma,
page);
- try_to_unmap(page,
- TTU_MIGRATION|TTU_IGNORE_MLOCK|TTU_IGNORE_ACCESS);
+ try_to_unmap(page, TTU_MIGRATION|TTU_IGNORE_MLOCK);
page_was_mapped = 1;
}
@@ -1169,13 +1164,14 @@ static int unmap_and_move(new_page_t get_new_page,
free_page_t put_new_page,
unsigned long private, struct page *page,
int force, enum migrate_mode mode,
- enum migrate_reason reason)
+ enum migrate_reason reason,
+ struct list_head *ret)
{
int rc = MIGRATEPAGE_SUCCESS;
struct page *newpage = NULL;
if (!thp_migration_supported() && PageTransHuge(page))
- return -ENOMEM;
+ return -ENOSYS;
if (page_count(page) == 1) {
/* page was freed from under us. So we are done. */
@@ -1206,7 +1202,14 @@ out:
* migrated will have kept its references and be restored.
*/
list_del(&page->lru);
+ }
+ /*
+ * If migration is successful, releases reference grabbed during
+ * isolation. Otherwise, restore the page to right list unless
+ * we want to retry.
+ */
+ if (rc == MIGRATEPAGE_SUCCESS) {
/*
* Compaction can migrate also non-LRU pages which are
* not accounted to NR_ISOLATED_*. They can be recognized
@@ -1215,35 +1218,16 @@ out:
if (likely(!__PageMovable(page)))
mod_node_page_state(page_pgdat(page), NR_ISOLATED_ANON +
page_is_file_lru(page), -thp_nr_pages(page));
- }
- /*
- * If migration is successful, releases reference grabbed during
- * isolation. Otherwise, restore the page to right list unless
- * we want to retry.
- */
- if (rc == MIGRATEPAGE_SUCCESS) {
if (reason != MR_MEMORY_FAILURE)
/*
* We release the page in page_handle_poison.
*/
put_page(page);
} else {
- if (rc != -EAGAIN) {
- if (likely(!__PageMovable(page))) {
- putback_lru_page(page);
- goto put_new;
- }
+ if (rc != -EAGAIN)
+ list_add_tail(&page->lru, ret);
- lock_page(page);
- if (PageMovable(page))
- putback_movable_page(page);
- else
- __ClearPageIsolated(page);
- unlock_page(page);
- put_page(page);
- }
-put_new:
if (put_new_page)
put_new_page(newpage, private);
else
@@ -1274,7 +1258,8 @@ put_new:
static int unmap_and_move_huge_page(new_page_t get_new_page,
free_page_t put_new_page, unsigned long private,
struct page *hpage, int force,
- enum migrate_mode mode, int reason)
+ enum migrate_mode mode, int reason,
+ struct list_head *ret)
{
int rc = -EAGAIN;
int page_was_mapped = 0;
@@ -1290,7 +1275,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
* kicking migration.
*/
if (!hugepage_migration_supported(page_hstate(hpage))) {
- putback_active_hugepage(hpage);
+ list_move_tail(&hpage->lru, ret);
return -ENOSYS;
}
@@ -1329,8 +1314,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page,
if (page_mapped(hpage)) {
bool mapping_locked = false;
- enum ttu_flags ttu = TTU_MIGRATION|TTU_IGNORE_MLOCK|
- TTU_IGNORE_ACCESS;
+ enum ttu_flags ttu = TTU_MIGRATION|TTU_IGNORE_MLOCK;
if (!PageAnon(hpage)) {
/*
@@ -1376,8 +1360,10 @@ put_anon:
out_unlock:
unlock_page(hpage);
out:
- if (rc != -EAGAIN)
+ if (rc == MIGRATEPAGE_SUCCESS)
putback_active_hugepage(hpage);
+ else if (rc != -EAGAIN && rc != MIGRATEPAGE_SUCCESS)
+ list_move_tail(&hpage->lru, ret);
/*
* If migration was not successful and there's a freeing callback, use
@@ -1392,6 +1378,20 @@ out:
return rc;
}
+static inline int try_split_thp(struct page *page, struct page **page2,
+ struct list_head *from)
+{
+ int rc = 0;
+
+ lock_page(page);
+ rc = split_huge_page_to_list(page, from);
+ unlock_page(page);
+ if (!rc)
+ list_safe_reset_next(page, *page2, lru);
+
+ return rc;
+}
+
/*
* migrate_pages - migrate the pages specified in a list, to the free pages
* supplied as the target for the page migration
@@ -1408,8 +1408,8 @@ out:
*
* The function returns after 10 attempts or if no pages are movable any more
* because the list has become empty or no retryable pages exist any more.
- * The caller should call putback_movable_pages() to return pages to the LRU
- * or free list only if ret != 0.
+ * It is caller's responsibility to call putback_movable_pages() to return pages
+ * to the LRU or free list only if ret != 0.
*
* Returns the number of pages that were not migrated, or an error code.
*/
@@ -1430,6 +1430,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
struct page *page2;
int swapwrite = current->flags & PF_SWAPWRITE;
int rc, nr_subpages;
+ LIST_HEAD(ret_pages);
if (!swapwrite)
current->flags |= PF_SWAPWRITE;
@@ -1452,31 +1453,56 @@ retry:
if (PageHuge(page))
rc = unmap_and_move_huge_page(get_new_page,
put_new_page, private, page,
- pass > 2, mode, reason);
+ pass > 2, mode, reason,
+ &ret_pages);
else
rc = unmap_and_move(get_new_page, put_new_page,
private, page, pass > 2, mode,
- reason);
-
+ reason, &ret_pages);
+ /*
+ * The rules are:
+ * Success: non hugetlb page will be freed, hugetlb
+ * page will be put back
+ * -EAGAIN: stay on the from list
+ * -ENOMEM: stay on the from list
+ * Other errno: put on ret_pages list then splice to
+ * from list
+ */
switch(rc) {
+ /*
+ * THP migration might be unsupported or the
+ * allocation could've failed so we should
+ * retry on the same page with the THP split
+ * to base pages.
+ *
+ * Head page is retried immediately and tail
+ * pages are added to the tail of the list so
+ * we encounter them after the rest of the list
+ * is processed.
+ */
+ case -ENOSYS:
+ /* THP migration is unsupported */
+ if (is_thp) {
+ if (!try_split_thp(page, &page2, from)) {
+ nr_thp_split++;
+ goto retry;
+ }
+
+ nr_thp_failed++;
+ nr_failed += nr_subpages;
+ break;
+ }
+
+ /* Hugetlb migration is unsupported */
+ nr_failed++;
+ break;
case -ENOMEM:
/*
- * THP migration might be unsupported or the
- * allocation could've failed so we should
- * retry on the same page with the THP split
- * to base pages.
- *
- * Head page is retried immediately and tail
- * pages are added to the tail of the list so
- * we encounter them after the rest of the list
- * is processed.
+ * When memory is low, don't bother to try to migrate
+ * other pages, just exit.
*/
if (is_thp) {
- lock_page(page);
- rc = split_huge_page_to_list(page, from);
- unlock_page(page);
- if (!rc) {
- list_safe_reset_next(page, page2, lru);
+ if (!try_split_thp(page, &page2, from)) {
nr_thp_split++;
goto retry;
}
@@ -1504,7 +1530,7 @@ retry:
break;
default:
/*
- * Permanent failure (-EBUSY, -ENOSYS, etc.):
+ * Permanent failure (-EBUSY, etc.):
* unlike -EAGAIN case, the failed page is
* removed from migration page list and not
* retried in the next outer loop.
@@ -1523,6 +1549,12 @@ retry:
nr_thp_failed += thp_retry;
rc = nr_failed;
out:
+ /*
+ * Put the permanent failure page back to migration list, they
+ * will be put back to the right list by the caller.
+ */
+ list_splice(&ret_pages, from);
+
count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
count_vm_events(PGMIGRATE_FAIL, nr_failed);
count_vm_events(THP_MIGRATION_SUCCESS, nr_thp_succeeded);
@@ -1698,7 +1730,7 @@ static int move_pages_and_store_status(struct mm_struct *mm, int node,
* Positive err means the number of failed
* pages to migrate. Since we are going to
* abort and return the number of non-migrated
- * pages, so need to incude the rest of the
+ * pages, so need to include the rest of the
* nr_pages that have not been attempted as
* well.
*/
@@ -2065,6 +2097,17 @@ bool pmd_trans_migrating(pmd_t pmd)
return PageLocked(page);
}
+static inline bool is_shared_exec_page(struct vm_area_struct *vma,
+ struct page *page)
+{
+ if (page_mapcount(page) != 1 &&
+ (page_is_file_lru(page) || vma_is_shmem(vma)) &&
+ (vma->vm_flags & VM_EXEC))
+ return true;
+
+ return false;
+}
+
/*
* Attempt to migrate a misplaced page to the specified destination
* node. Caller is expected to have an elevated reference count on
@@ -2082,8 +2125,7 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
* Don't migrate file pages that are mapped in multiple processes
* with execute permissions as they are probably shared libraries.
*/
- if (page_mapcount(page) != 1 && page_is_file_lru(page) &&
- (vma->vm_flags & VM_EXEC))
+ if (is_shared_exec_page(vma, page))
goto out;
/*
@@ -2138,6 +2180,9 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
int page_lru = page_is_file_lru(page);
unsigned long start = address & HPAGE_PMD_MASK;
+ if (is_shared_exec_page(vma, page))
+ goto out;
+
new_page = alloc_pages_node(node,
(GFP_TRANSHUGE_LIGHT | __GFP_THISNODE),
HPAGE_PMD_ORDER);
@@ -2249,6 +2294,7 @@ out_fail:
out_unlock:
unlock_page(page);
+out:
put_page(page);
return 0;
}
@@ -2548,7 +2594,7 @@ static bool migrate_vma_check_page(struct page *page)
* will bump the page reference count. Sadly there is no way to
* differentiate a regular pin from migration wait. Hence to
* avoid 2 racing thread trying to migrate back to CPU to enter
- * infinite loop (one stoping migration because the other is
+ * infinite loop (one stopping migration because the other is
* waiting on pte migration entry). We always return true here.
*
* FIXME proper solution is to rework migration_entry_wait() so
@@ -2688,7 +2734,7 @@ static void migrate_vma_prepare(struct migrate_vma *migrate)
*/
static void migrate_vma_unmap(struct migrate_vma *migrate)
{
- int flags = TTU_MIGRATION | TTU_IGNORE_MLOCK | TTU_IGNORE_ACCESS;
+ int flags = TTU_MIGRATION | TTU_IGNORE_MLOCK;
const unsigned long npages = migrate->npages;
const unsigned long start = migrate->start;
unsigned long addr, i, restore = 0;
@@ -2848,8 +2894,7 @@ EXPORT_SYMBOL(migrate_vma_setup);
static void migrate_vma_insert_page(struct migrate_vma *migrate,
unsigned long addr,
struct page *page,
- unsigned long *src,
- unsigned long *dst)
+ unsigned long *src)
{
struct vm_area_struct *vma = migrate->vma;
struct mm_struct *mm = vma->vm_mm;
@@ -3003,16 +3048,14 @@ void migrate_vma_pages(struct migrate_vma *migrate)
if (!notified) {
notified = true;
- mmu_notifier_range_init(&range,
- MMU_NOTIFY_CLEAR, 0,
- NULL,
- migrate->vma->vm_mm,
- addr, migrate->end);
+ mmu_notifier_range_init_migrate(&range, 0,
+ migrate->vma, migrate->vma->vm_mm,
+ addr, migrate->end,
+ migrate->pgmap_owner);
mmu_notifier_invalidate_range_start(&range);
}
migrate_vma_insert_page(migrate, addr, newpage,
- &migrate->src[i],
- &migrate->dst[i]);
+ &migrate->src[i]);
continue;
}
diff --git a/mm/mlock.c b/mm/mlock.c
index 884b1216da6a..55b3b3672977 100644
--- a/mm/mlock.c
+++ b/mm/mlock.c
@@ -106,26 +106,6 @@ void mlock_vma_page(struct page *page)
}
/*
- * Isolate a page from LRU with optional get_page() pin.
- * Assumes lru_lock already held and page already pinned.
- */
-static bool __munlock_isolate_lru_page(struct page *page, bool getpage)
-{
- if (PageLRU(page)) {
- struct lruvec *lruvec;
-
- lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
- if (getpage)
- get_page(page);
- ClearPageLRU(page);
- del_page_from_lru_list(page, lruvec, page_lru(page));
- return true;
- }
-
- return false;
-}
-
-/*
* Finish munlock after successful page isolation
*
* Page must be locked. This is a wrapper for try_to_munlock()
@@ -187,40 +167,24 @@ static void __munlock_isolation_failed(struct page *page)
unsigned int munlock_vma_page(struct page *page)
{
int nr_pages;
- pg_data_t *pgdat = page_pgdat(page);
/* For try_to_munlock() and to serialize with page migration */
BUG_ON(!PageLocked(page));
-
VM_BUG_ON_PAGE(PageTail(page), page);
- /*
- * Serialize with any parallel __split_huge_page_refcount() which
- * might otherwise copy PageMlocked to part of the tail pages before
- * we clear it in the head page. It also stabilizes thp_nr_pages().
- */
- spin_lock_irq(&pgdat->lru_lock);
-
if (!TestClearPageMlocked(page)) {
/* Potentially, PTE-mapped THP: do not skip the rest PTEs */
- nr_pages = 1;
- goto unlock_out;
+ return 0;
}
nr_pages = thp_nr_pages(page);
- __mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
+ mod_zone_page_state(page_zone(page), NR_MLOCK, -nr_pages);
- if (__munlock_isolate_lru_page(page, true)) {
- spin_unlock_irq(&pgdat->lru_lock);
+ if (!isolate_lru_page(page))
__munlock_isolated_page(page);
- goto out;
- }
- __munlock_isolation_failed(page);
-
-unlock_out:
- spin_unlock_irq(&pgdat->lru_lock);
+ else
+ __munlock_isolation_failed(page);
-out:
return nr_pages - 1;
}
@@ -298,12 +262,12 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
int nr = pagevec_count(pvec);
int delta_munlocked = -nr;
struct pagevec pvec_putback;
+ struct lruvec *lruvec = NULL;
int pgrescued = 0;
pagevec_init(&pvec_putback);
/* Phase 1: page isolation */
- spin_lock_irq(&zone->zone_pgdat->lru_lock);
for (i = 0; i < nr; i++) {
struct page *page = pvec->pages[i];
@@ -312,9 +276,12 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
* We already have pin from follow_page_mask()
* so we can spare the get_page() here.
*/
- if (__munlock_isolate_lru_page(page, false))
+ if (TestClearPageLRU(page)) {
+ lruvec = relock_page_lruvec_irq(page, lruvec);
+ del_page_from_lru_list(page, lruvec,
+ page_lru(page));
continue;
- else
+ } else
__munlock_isolation_failed(page);
} else {
delta_munlocked++;
@@ -329,8 +296,12 @@ static void __munlock_pagevec(struct pagevec *pvec, struct zone *zone)
pagevec_add(&pvec_putback, pvec->pages[i]);
pvec->pages[i] = NULL;
}
- __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
- spin_unlock_irq(&zone->zone_pgdat->lru_lock);
+ if (lruvec) {
+ __mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
+ unlock_page_lruvec_irq(lruvec);
+ } else if (delta_munlocked) {
+ mod_zone_page_state(zone, NR_MLOCK, delta_munlocked);
+ }
/* Now we can release pins of pages that we are not munlocking */
pagevec_release(&pvec_putback);
diff --git a/mm/mm_init.c b/mm/mm_init.c
index b06a30fbedff..8e02e865cc65 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -173,6 +173,7 @@ static int __meminit mm_compute_batch_notifier(struct notifier_block *self,
case MEM_ONLINE:
case MEM_OFFLINE:
mm_compute_batch(sysctl_overcommit_memory);
+ break;
default:
break;
}
diff --git a/mm/mmap.c b/mm/mmap.c
index d91ecb00d38c..dc7206032387 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -1808,6 +1808,17 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
if (error)
goto unmap_and_free_vma;
+ /* Can addr have changed??
+ *
+ * Answer: Yes, several device drivers can do it in their
+ * f_op->mmap method. -DaveM
+ * Bug: If addr is changed, prev, rb_link, rb_parent should
+ * be updated for vma_link()
+ */
+ WARN_ON_ONCE(addr != vma->vm_start);
+
+ addr = vma->vm_start;
+
/* If vm_flags changed after call_mmap(), we should try merge vma again
* as we may succeed this time.
*/
@@ -1822,25 +1833,12 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
fput(vma->vm_file);
vm_area_free(vma);
vma = merge;
- /* Update vm_flags and possible addr to pick up the change. We don't
- * warn here if addr changed as the vma is not linked by vma_link().
- */
- addr = vma->vm_start;
+ /* Update vm_flags to pick up the change. */
vm_flags = vma->vm_flags;
goto unmap_writable;
}
}
- /* Can addr have changed??
- *
- * Answer: Yes, several device drivers can do it in their
- * f_op->mmap method. -DaveM
- * Bug: If addr is changed, prev, rb_link, rb_parent should
- * be updated for vma_link()
- */
- WARN_ON_ONCE(addr != vma->vm_start);
-
- addr = vma->vm_start;
vm_flags = vma->vm_flags;
} else if (vm_flags & VM_SHARED) {
error = shmem_zero_setup(vma);
@@ -1899,8 +1897,8 @@ out:
return addr;
unmap_and_free_vma:
+ fput(vma->vm_file);
vma->vm_file = NULL;
- fput(file);
/* Undo any partial mapping done by a device driver. */
unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end);
@@ -2733,8 +2731,8 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma,
struct vm_area_struct *new;
int err;
- if (vma->vm_ops && vma->vm_ops->split) {
- err = vma->vm_ops->split(vma, addr);
+ if (vma->vm_ops && vma->vm_ops->may_split) {
+ err = vma->vm_ops->may_split(vma, addr);
if (err)
return err;
}
@@ -3407,10 +3405,14 @@ static const char *special_mapping_name(struct vm_area_struct *vma)
return ((struct vm_special_mapping *)vma->vm_private_data)->name;
}
-static int special_mapping_mremap(struct vm_area_struct *new_vma)
+static int special_mapping_mremap(struct vm_area_struct *new_vma,
+ unsigned long flags)
{
struct vm_special_mapping *sm = new_vma->vm_private_data;
+ if (flags & MREMAP_DONTUNMAP)
+ return -EINVAL;
+
if (WARN_ON_ONCE(current->mm != new_vma->vm_mm))
return -EFAULT;
@@ -3420,6 +3422,17 @@ static int special_mapping_mremap(struct vm_area_struct *new_vma)
return 0;
}
+static int special_mapping_split(struct vm_area_struct *vma, unsigned long addr)
+{
+ /*
+ * Forbid splitting special mappings - kernel has expectations over
+ * the number of pages in mapping. Together with VM_DONTEXPAND
+ * the size of vma should stay the same over the special mapping's
+ * lifetime.
+ */
+ return -EINVAL;
+}
+
static const struct vm_operations_struct special_mapping_vmops = {
.close = special_mapping_close,
.fault = special_mapping_fault,
@@ -3427,6 +3440,7 @@ static const struct vm_operations_struct special_mapping_vmops = {
.name = special_mapping_name,
/* vDSO code relies that VVAR can't be accessed remotely */
.access = NULL,
+ .may_split = special_mapping_split,
};
static const struct vm_operations_struct legacy_special_mapping_vmops = {
diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c
new file mode 100644
index 000000000000..dcdde4f722a4
--- /dev/null
+++ b/mm/mmap_lock.c
@@ -0,0 +1,230 @@
+// SPDX-License-Identifier: GPL-2.0
+#define CREATE_TRACE_POINTS
+#include <trace/events/mmap_lock.h>
+
+#include <linux/mm.h>
+#include <linux/cgroup.h>
+#include <linux/memcontrol.h>
+#include <linux/mmap_lock.h>
+#include <linux/mutex.h>
+#include <linux/percpu.h>
+#include <linux/rcupdate.h>
+#include <linux/smp.h>
+#include <linux/trace_events.h>
+
+EXPORT_TRACEPOINT_SYMBOL(mmap_lock_start_locking);
+EXPORT_TRACEPOINT_SYMBOL(mmap_lock_acquire_returned);
+EXPORT_TRACEPOINT_SYMBOL(mmap_lock_released);
+
+#ifdef CONFIG_MEMCG
+
+/*
+ * Our various events all share the same buffer (because we don't want or need
+ * to allocate a set of buffers *per event type*), so we need to protect against
+ * concurrent _reg() and _unreg() calls, and count how many _reg() calls have
+ * been made.
+ */
+static DEFINE_MUTEX(reg_lock);
+static int reg_refcount; /* Protected by reg_lock. */
+
+/*
+ * Size of the buffer for memcg path names. Ignoring stack trace support,
+ * trace_events_hist.c uses MAX_FILTER_STR_VAL for this, so we also use it.
+ */
+#define MEMCG_PATH_BUF_SIZE MAX_FILTER_STR_VAL
+
+/*
+ * How many contexts our trace events might be called in: normal, softirq, irq,
+ * and NMI.
+ */
+#define CONTEXT_COUNT 4
+
+static DEFINE_PER_CPU(char __rcu *, memcg_path_buf);
+static char **tmp_bufs;
+static DEFINE_PER_CPU(int, memcg_path_buf_idx);
+
+/* Called with reg_lock held. */
+static void free_memcg_path_bufs(void)
+{
+ int cpu;
+ char **old = tmp_bufs;
+
+ for_each_possible_cpu(cpu) {
+ *(old++) = rcu_dereference_protected(
+ per_cpu(memcg_path_buf, cpu),
+ lockdep_is_held(&reg_lock));
+ rcu_assign_pointer(per_cpu(memcg_path_buf, cpu), NULL);
+ }
+
+ /* Wait for inflight memcg_path_buf users to finish. */
+ synchronize_rcu();
+
+ old = tmp_bufs;
+ for_each_possible_cpu(cpu) {
+ kfree(*(old++));
+ }
+
+ kfree(tmp_bufs);
+ tmp_bufs = NULL;
+}
+
+int trace_mmap_lock_reg(void)
+{
+ int cpu;
+ char *new;
+
+ mutex_lock(&reg_lock);
+
+ /* If the refcount is going 0->1, proceed with allocating buffers. */
+ if (reg_refcount++)
+ goto out;
+
+ tmp_bufs = kmalloc_array(num_possible_cpus(), sizeof(*tmp_bufs),
+ GFP_KERNEL);
+ if (tmp_bufs == NULL)
+ goto out_fail;
+
+ for_each_possible_cpu(cpu) {
+ new = kmalloc(MEMCG_PATH_BUF_SIZE * CONTEXT_COUNT, GFP_KERNEL);
+ if (new == NULL)
+ goto out_fail_free;
+ rcu_assign_pointer(per_cpu(memcg_path_buf, cpu), new);
+ /* Don't need to wait for inflights, they'd have gotten NULL. */
+ }
+
+out:
+ mutex_unlock(&reg_lock);
+ return 0;
+
+out_fail_free:
+ free_memcg_path_bufs();
+out_fail:
+ /* Since we failed, undo the earlier ref increment. */
+ --reg_refcount;
+
+ mutex_unlock(&reg_lock);
+ return -ENOMEM;
+}
+
+void trace_mmap_lock_unreg(void)
+{
+ mutex_lock(&reg_lock);
+
+ /* If the refcount is going 1->0, proceed with freeing buffers. */
+ if (--reg_refcount)
+ goto out;
+
+ free_memcg_path_bufs();
+
+out:
+ mutex_unlock(&reg_lock);
+}
+
+static inline char *get_memcg_path_buf(void)
+{
+ char *buf;
+ int idx;
+
+ rcu_read_lock();
+ buf = rcu_dereference(*this_cpu_ptr(&memcg_path_buf));
+ if (buf == NULL) {
+ rcu_read_unlock();
+ return NULL;
+ }
+ idx = this_cpu_add_return(memcg_path_buf_idx, MEMCG_PATH_BUF_SIZE) -
+ MEMCG_PATH_BUF_SIZE;
+ return &buf[idx];
+}
+
+static inline void put_memcg_path_buf(void)
+{
+ this_cpu_sub(memcg_path_buf_idx, MEMCG_PATH_BUF_SIZE);
+ rcu_read_unlock();
+}
+
+/*
+ * Write the given mm_struct's memcg path to a percpu buffer, and return a
+ * pointer to it. If the path cannot be determined, or no buffer was available
+ * (because the trace event is being unregistered), NULL is returned.
+ *
+ * Note: buffers are allocated per-cpu to avoid locking, so preemption must be
+ * disabled by the caller before calling us, and re-enabled only after the
+ * caller is done with the pointer.
+ *
+ * The caller must call put_memcg_path_buf() once the buffer is no longer
+ * needed. This must be done while preemption is still disabled.
+ */
+static const char *get_mm_memcg_path(struct mm_struct *mm)
+{
+ char *buf = NULL;
+ struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm);
+
+ if (memcg == NULL)
+ goto out;
+ if (unlikely(memcg->css.cgroup == NULL))
+ goto out_put;
+
+ buf = get_memcg_path_buf();
+ if (buf == NULL)
+ goto out_put;
+
+ cgroup_path(memcg->css.cgroup, buf, MEMCG_PATH_BUF_SIZE);
+
+out_put:
+ css_put(&memcg->css);
+out:
+ return buf;
+}
+
+#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \
+ do { \
+ const char *memcg_path; \
+ preempt_disable(); \
+ memcg_path = get_mm_memcg_path(mm); \
+ trace_mmap_lock_##type(mm, \
+ memcg_path != NULL ? memcg_path : "", \
+ ##__VA_ARGS__); \
+ if (likely(memcg_path != NULL)) \
+ put_memcg_path_buf(); \
+ preempt_enable(); \
+ } while (0)
+
+#else /* !CONFIG_MEMCG */
+
+int trace_mmap_lock_reg(void)
+{
+ return 0;
+}
+
+void trace_mmap_lock_unreg(void)
+{
+}
+
+#define TRACE_MMAP_LOCK_EVENT(type, mm, ...) \
+ trace_mmap_lock_##type(mm, "", ##__VA_ARGS__)
+
+#endif /* CONFIG_MEMCG */
+
+/*
+ * Trace calls must be in a separate file, as otherwise there's a circular
+ * dependency between linux/mmap_lock.h and trace/events/mmap_lock.h.
+ */
+
+void __mmap_lock_do_trace_start_locking(struct mm_struct *mm, bool write)
+{
+ TRACE_MMAP_LOCK_EVENT(start_locking, mm, write);
+}
+EXPORT_SYMBOL(__mmap_lock_do_trace_start_locking);
+
+void __mmap_lock_do_trace_acquire_returned(struct mm_struct *mm, bool write,
+ bool success)
+{
+ TRACE_MMAP_LOCK_EVENT(acquire_returned, mm, write, success);
+}
+EXPORT_SYMBOL(__mmap_lock_do_trace_acquire_returned);
+
+void __mmap_lock_do_trace_released(struct mm_struct *mm, bool write)
+{
+ TRACE_MMAP_LOCK_EVENT(released, mm, write);
+}
+EXPORT_SYMBOL(__mmap_lock_do_trace_released);
diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c
index 5654dd19addc..61ee40ed804e 100644
--- a/mm/mmu_notifier.c
+++ b/mm/mmu_notifier.c
@@ -612,13 +612,6 @@ int __mmu_notifier_register(struct mmu_notifier *subscription,
mmap_assert_write_locked(mm);
BUG_ON(atomic_read(&mm->mm_users) <= 0);
- if (IS_ENABLED(CONFIG_LOCKDEP)) {
- fs_reclaim_acquire(GFP_KERNEL);
- lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
- lock_map_release(&__mmu_notifier_invalidate_range_start_map);
- fs_reclaim_release(GFP_KERNEL);
- }
-
if (!mm->notifier_subscriptions) {
/*
* kmalloc cannot be called under mm_take_all_locks(), but we
diff --git a/mm/mmzone.c b/mm/mmzone.c
index 4686fdc23bb9..eb89d6e018e2 100644
--- a/mm/mmzone.c
+++ b/mm/mmzone.c
@@ -72,25 +72,12 @@ struct zoneref *__next_zones_zonelist(struct zoneref *z,
return z;
}
-#ifdef CONFIG_ARCH_HAS_HOLES_MEMORYMODEL
-bool memmap_valid_within(unsigned long pfn,
- struct page *page, struct zone *zone)
-{
- if (page_to_pfn(page) != pfn)
- return false;
-
- if (page_zone(page) != zone)
- return false;
-
- return true;
-}
-#endif /* CONFIG_ARCH_HAS_HOLES_MEMORYMODEL */
-
void lruvec_init(struct lruvec *lruvec)
{
enum lru_list lru;
memset(lruvec, 0, sizeof(struct lruvec));
+ spin_lock_init(&lruvec->lru_lock);
for_each_lru(lru)
INIT_LIST_HEAD(&lruvec->lists[lru]);
diff --git a/mm/mprotect.c b/mm/mprotect.c
index 56c02beb6041..ab709023e9aa 100644
--- a/mm/mprotect.c
+++ b/mm/mprotect.c
@@ -616,9 +616,16 @@ static int do_mprotect_pkey(unsigned long start, size_t len,
tmp = vma->vm_end;
if (tmp > end)
tmp = end;
+
+ if (vma->vm_ops && vma->vm_ops->mprotect)
+ error = vma->vm_ops->mprotect(vma, nstart, tmp, newflags);
+ if (error)
+ goto out;
+
error = mprotect_fixup(vma, &prev, nstart, tmp, newflags);
if (error)
goto out;
+
nstart = tmp;
if (nstart < prev->vm_end)
diff --git a/mm/mremap.c b/mm/mremap.c
index 138abbae4f75..c5590afe7165 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -30,12 +30,11 @@
#include "internal.h"
-static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
+static pud_t *get_old_pud(struct mm_struct *mm, unsigned long addr)
{
pgd_t *pgd;
p4d_t *p4d;
pud_t *pud;
- pmd_t *pmd;
pgd = pgd_offset(mm, addr);
if (pgd_none_or_clear_bad(pgd))
@@ -49,6 +48,18 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
if (pud_none_or_clear_bad(pud))
return NULL;
+ return pud;
+}
+
+static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
+{
+ pud_t *pud;
+ pmd_t *pmd;
+
+ pud = get_old_pud(mm, addr);
+ if (!pud)
+ return NULL;
+
pmd = pmd_offset(pud, addr);
if (pmd_none(*pmd))
return NULL;
@@ -56,19 +67,27 @@ static pmd_t *get_old_pmd(struct mm_struct *mm, unsigned long addr)
return pmd;
}
-static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
+static pud_t *alloc_new_pud(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long addr)
{
pgd_t *pgd;
p4d_t *p4d;
- pud_t *pud;
- pmd_t *pmd;
pgd = pgd_offset(mm, addr);
p4d = p4d_alloc(mm, pgd, addr);
if (!p4d)
return NULL;
- pud = pud_alloc(mm, p4d, addr);
+
+ return pud_alloc(mm, p4d, addr);
+}
+
+static pmd_t *alloc_new_pmd(struct mm_struct *mm, struct vm_area_struct *vma,
+ unsigned long addr)
+{
+ pud_t *pud;
+ pmd_t *pmd;
+
+ pud = alloc_new_pud(mm, vma, addr);
if (!pud)
return NULL;
@@ -249,14 +268,148 @@ static bool move_normal_pmd(struct vm_area_struct *vma, unsigned long old_addr,
return true;
}
+#else
+static inline bool move_normal_pmd(struct vm_area_struct *vma,
+ unsigned long old_addr, unsigned long new_addr, pmd_t *old_pmd,
+ pmd_t *new_pmd)
+{
+ return false;
+}
#endif
+#ifdef CONFIG_HAVE_MOVE_PUD
+static bool move_normal_pud(struct vm_area_struct *vma, unsigned long old_addr,
+ unsigned long new_addr, pud_t *old_pud, pud_t *new_pud)
+{
+ spinlock_t *old_ptl, *new_ptl;
+ struct mm_struct *mm = vma->vm_mm;
+ pud_t pud;
+
+ /*
+ * The destination pud shouldn't be established, free_pgtables()
+ * should have released it.
+ */
+ if (WARN_ON_ONCE(!pud_none(*new_pud)))
+ return false;
+
+ /*
+ * We don't have to worry about the ordering of src and dst
+ * ptlocks because exclusive mmap_lock prevents deadlock.
+ */
+ old_ptl = pud_lock(vma->vm_mm, old_pud);
+ new_ptl = pud_lockptr(mm, new_pud);
+ if (new_ptl != old_ptl)
+ spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
+
+ /* Clear the pud */
+ pud = *old_pud;
+ pud_clear(old_pud);
+
+ VM_BUG_ON(!pud_none(*new_pud));
+
+ /* Set the new pud */
+ set_pud_at(mm, new_addr, new_pud, pud);
+ flush_tlb_range(vma, old_addr, old_addr + PUD_SIZE);
+ if (new_ptl != old_ptl)
+ spin_unlock(new_ptl);
+ spin_unlock(old_ptl);
+
+ return true;
+}
+#else
+static inline bool move_normal_pud(struct vm_area_struct *vma,
+ unsigned long old_addr, unsigned long new_addr, pud_t *old_pud,
+ pud_t *new_pud)
+{
+ return false;
+}
+#endif
+
+enum pgt_entry {
+ NORMAL_PMD,
+ HPAGE_PMD,
+ NORMAL_PUD,
+};
+
+/*
+ * Returns an extent of the corresponding size for the pgt_entry specified if
+ * valid. Else returns a smaller extent bounded by the end of the source and
+ * destination pgt_entry.
+ */
+static unsigned long get_extent(enum pgt_entry entry, unsigned long old_addr,
+ unsigned long old_end, unsigned long new_addr)
+{
+ unsigned long next, extent, mask, size;
+
+ switch (entry) {
+ case HPAGE_PMD:
+ case NORMAL_PMD:
+ mask = PMD_MASK;
+ size = PMD_SIZE;
+ break;
+ case NORMAL_PUD:
+ mask = PUD_MASK;
+ size = PUD_SIZE;
+ break;
+ default:
+ BUILD_BUG();
+ break;
+ }
+
+ next = (old_addr + size) & mask;
+ /* even if next overflowed, extent below will be ok */
+ extent = (next > old_end) ? old_end - old_addr : next - old_addr;
+ next = (new_addr + size) & mask;
+ if (extent > next - new_addr)
+ extent = next - new_addr;
+ return extent;
+}
+
+/*
+ * Attempts to speedup the move by moving entry at the level corresponding to
+ * pgt_entry. Returns true if the move was successful, else false.
+ */
+static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vma,
+ unsigned long old_addr, unsigned long new_addr,
+ void *old_entry, void *new_entry, bool need_rmap_locks)
+{
+ bool moved = false;
+
+ /* See comment in move_ptes() */
+ if (need_rmap_locks)
+ take_rmap_locks(vma);
+
+ switch (entry) {
+ case NORMAL_PMD:
+ moved = move_normal_pmd(vma, old_addr, new_addr, old_entry,
+ new_entry);
+ break;
+ case NORMAL_PUD:
+ moved = move_normal_pud(vma, old_addr, new_addr, old_entry,
+ new_entry);
+ break;
+ case HPAGE_PMD:
+ moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
+ move_huge_pmd(vma, old_addr, new_addr, old_entry,
+ new_entry);
+ break;
+ default:
+ WARN_ON_ONCE(1);
+ break;
+ }
+
+ if (need_rmap_locks)
+ drop_rmap_locks(vma);
+
+ return moved;
+}
+
unsigned long move_page_tables(struct vm_area_struct *vma,
unsigned long old_addr, struct vm_area_struct *new_vma,
unsigned long new_addr, unsigned long len,
bool need_rmap_locks)
{
- unsigned long extent, next, old_end;
+ unsigned long extent, old_end;
struct mmu_notifier_range range;
pmd_t *old_pmd, *new_pmd;
@@ -269,53 +422,50 @@ unsigned long move_page_tables(struct vm_area_struct *vma,
for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
cond_resched();
- next = (old_addr + PMD_SIZE) & PMD_MASK;
- /* even if next overflowed, extent below will be ok */
- extent = next - old_addr;
- if (extent > old_end - old_addr)
- extent = old_end - old_addr;
- next = (new_addr + PMD_SIZE) & PMD_MASK;
- if (extent > next - new_addr)
- extent = next - new_addr;
+ /*
+ * If extent is PUD-sized try to speed up the move by moving at the
+ * PUD level if possible.
+ */
+ extent = get_extent(NORMAL_PUD, old_addr, old_end, new_addr);
+ if (IS_ENABLED(CONFIG_HAVE_MOVE_PUD) && extent == PUD_SIZE) {
+ pud_t *old_pud, *new_pud;
+
+ old_pud = get_old_pud(vma->vm_mm, old_addr);
+ if (!old_pud)
+ continue;
+ new_pud = alloc_new_pud(vma->vm_mm, vma, new_addr);
+ if (!new_pud)
+ break;
+ if (move_pgt_entry(NORMAL_PUD, vma, old_addr, new_addr,
+ old_pud, new_pud, need_rmap_locks))
+ continue;
+ }
+
+ extent = get_extent(NORMAL_PMD, old_addr, old_end, new_addr);
old_pmd = get_old_pmd(vma->vm_mm, old_addr);
if (!old_pmd)
continue;
new_pmd = alloc_new_pmd(vma->vm_mm, vma, new_addr);
if (!new_pmd)
break;
- if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd) || pmd_devmap(*old_pmd)) {
- if (extent == HPAGE_PMD_SIZE) {
- bool moved;
- /* See comment in move_ptes() */
- if (need_rmap_locks)
- take_rmap_locks(vma);
- moved = move_huge_pmd(vma, old_addr, new_addr,
- old_pmd, new_pmd);
- if (need_rmap_locks)
- drop_rmap_locks(vma);
- if (moved)
- continue;
- }
+ if (is_swap_pmd(*old_pmd) || pmd_trans_huge(*old_pmd) ||
+ pmd_devmap(*old_pmd)) {
+ if (extent == HPAGE_PMD_SIZE &&
+ move_pgt_entry(HPAGE_PMD, vma, old_addr, new_addr,
+ old_pmd, new_pmd, need_rmap_locks))
+ continue;
split_huge_pmd(vma, old_pmd, old_addr);
if (pmd_trans_unstable(old_pmd))
continue;
- } else if (extent == PMD_SIZE) {
-#ifdef CONFIG_HAVE_MOVE_PMD
+ } else if (IS_ENABLED(CONFIG_HAVE_MOVE_PMD) &&
+ extent == PMD_SIZE) {
/*
* If the extent is PMD-sized, try to speed the move by
* moving at the PMD level if possible.
*/
- bool moved;
-
- if (need_rmap_locks)
- take_rmap_locks(vma);
- moved = move_normal_pmd(vma, old_addr, new_addr,
- old_pmd, new_pmd);
- if (need_rmap_locks)
- drop_rmap_locks(vma);
- if (moved)
+ if (move_pgt_entry(NORMAL_PMD, vma, old_addr, new_addr,
+ old_pmd, new_pmd, need_rmap_locks))
continue;
-#endif
}
if (pte_alloc(new_vma->vm_mm, new_pmd))
@@ -343,7 +493,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
unsigned long excess = 0;
unsigned long hiwater_vm;
int split = 0;
- int err;
+ int err = 0;
bool need_rmap_locks;
/*
@@ -353,6 +503,15 @@ static unsigned long move_vma(struct vm_area_struct *vma,
if (mm->map_count >= sysctl_max_map_count - 3)
return -ENOMEM;
+ if (vma->vm_ops && vma->vm_ops->may_split) {
+ if (vma->vm_start != old_addr)
+ err = vma->vm_ops->may_split(vma, old_addr);
+ if (!err && vma->vm_end != old_addr + old_len)
+ err = vma->vm_ops->may_split(vma, old_addr + old_len);
+ if (err)
+ return err;
+ }
+
/*
* Advise KSM to break any KSM pages in the area to be moved:
* it would be confusing if they were to turn up at the new
@@ -365,18 +524,26 @@ static unsigned long move_vma(struct vm_area_struct *vma,
if (err)
return err;
+ if (unlikely(flags & MREMAP_DONTUNMAP && vm_flags & VM_ACCOUNT)) {
+ if (security_vm_enough_memory_mm(mm, new_len >> PAGE_SHIFT))
+ return -ENOMEM;
+ }
+
new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
new_vma = copy_vma(&vma, new_addr, new_len, new_pgoff,
&need_rmap_locks);
- if (!new_vma)
+ if (!new_vma) {
+ if (unlikely(flags & MREMAP_DONTUNMAP && vm_flags & VM_ACCOUNT))
+ vm_unacct_memory(new_len >> PAGE_SHIFT);
return -ENOMEM;
+ }
moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, old_len,
need_rmap_locks);
if (moved_len < old_len) {
err = -ENOMEM;
} else if (vma->vm_ops && vma->vm_ops->mremap) {
- err = vma->vm_ops->mremap(new_vma);
+ err = vma->vm_ops->mremap(new_vma, flags);
}
if (unlikely(err)) {
@@ -398,7 +565,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
}
/* Conceal VM_ACCOUNT so old reservation is not undone */
- if (vm_flags & VM_ACCOUNT) {
+ if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) {
vma->vm_flags &= ~VM_ACCOUNT;
excess = vma->vm_end - vma->vm_start - old_len;
if (old_addr > vma->vm_start &&
@@ -423,34 +590,17 @@ static unsigned long move_vma(struct vm_area_struct *vma,
untrack_pfn_moved(vma);
if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) {
- if (vm_flags & VM_ACCOUNT) {
- /* Always put back VM_ACCOUNT since we won't unmap */
- vma->vm_flags |= VM_ACCOUNT;
-
- vm_acct_memory(new_len >> PAGE_SHIFT);
- }
-
- /*
- * VMAs can actually be merged back together in copy_vma
- * calling merge_vma. This can happen with anonymous vmas
- * which have not yet been faulted, so if we were to consider
- * this VMA split we'll end up adding VM_ACCOUNT on the
- * next VMA, which is completely unrelated if this VMA
- * was re-merged.
- */
- if (split && new_vma == vma)
- split = 0;
-
/* We always clear VM_LOCKED[ONFAULT] on the old vma */
vma->vm_flags &= VM_LOCKED_CLEAR_MASK;
/* Because we won't unmap we don't need to touch locked_vm */
- goto out;
+ return new_addr;
}
if (do_munmap(mm, old_addr, old_len, uf_unmap) < 0) {
/* OOM: unable to split vma, just get accounts right */
- vm_unacct_memory(excess >> PAGE_SHIFT);
+ if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP))
+ vm_acct_memory(new_len >> PAGE_SHIFT);
excess = 0;
}
@@ -458,7 +608,7 @@ static unsigned long move_vma(struct vm_area_struct *vma,
mm->locked_vm += new_len >> PAGE_SHIFT;
*locked = true;
}
-out:
+
mm->hiwater_vm = hiwater_vm;
/* Restore VM_ACCOUNT if one or two pieces of vma left */
diff --git a/mm/nommu.c b/mm/nommu.c
index 0faf39b32cdb..870fea12823e 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -1675,8 +1675,8 @@ void filemap_map_pages(struct vm_fault *vmf,
}
EXPORT_SYMBOL(filemap_map_pages);
-int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
- unsigned long addr, void *buf, int len, unsigned int gup_flags)
+int __access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf,
+ int len, unsigned int gup_flags)
{
struct vm_area_struct *vma;
int write = gup_flags & FOLL_WRITE;
@@ -1722,7 +1722,7 @@ int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm,
int access_remote_vm(struct mm_struct *mm, unsigned long addr,
void *buf, int len, unsigned int gup_flags)
{
- return __access_remote_vm(NULL, mm, addr, buf, len, gup_flags);
+ return __access_remote_vm(mm, addr, buf, len, gup_flags);
}
/*
@@ -1741,7 +1741,7 @@ int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, in
if (!mm)
return 0;
- len = __access_remote_vm(tsk, mm, addr, buf, len, gup_flags);
+ len = __access_remote_vm(mm, addr, buf, len, gup_flags);
mmput(mm);
return len;
diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index 8b84661a6410..04b19b7b5435 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -170,11 +170,13 @@ static bool oom_unkillable_task(struct task_struct *p)
return false;
}
-/*
- * Print out unreclaimble slabs info when unreclaimable slabs amount is greater
- * than all user memory (LRU pages)
- */
-static bool is_dump_unreclaim_slabs(void)
+/**
+ * Check whether unreclaimable slab amount is greater than
+ * all user memory(LRU pages).
+ * dump_unreclaimable_slab() could help in the case that
+ * oom due to too much unreclaimable slab used by kernel.
+*/
+static bool should_dump_unreclaim_slab(void)
{
unsigned long nr_lru;
@@ -463,7 +465,7 @@ static void dump_header(struct oom_control *oc, struct task_struct *p)
mem_cgroup_print_oom_meminfo(oc->memcg);
else {
show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask);
- if (is_dump_unreclaim_slabs())
+ if (should_dump_unreclaim_slab())
dump_unreclaimable_slab();
}
if (sysctl_oom_dump_tasks)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index eaa227a479e4..7a2c89b21115 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -57,6 +57,7 @@
#include <trace/events/oom.h>
#include <linux/prefetch.h>
#include <linux/mm_inline.h>
+#include <linux/mmu_notifier.h>
#include <linux/migrate.h>
#include <linux/hugetlb.h>
#include <linux/sched/rt.h>
@@ -70,6 +71,7 @@
#include <linux/psi.h>
#include <linux/padata.h>
#include <linux/khugepaged.h>
+#include <linux/buffer_head.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
@@ -165,53 +167,26 @@ unsigned long totalcma_pages __read_mostly;
int percpu_pagelist_fraction;
gfp_t gfp_allowed_mask __read_mostly = GFP_BOOT_MASK;
-#ifdef CONFIG_INIT_ON_ALLOC_DEFAULT_ON
-DEFINE_STATIC_KEY_TRUE(init_on_alloc);
-#else
DEFINE_STATIC_KEY_FALSE(init_on_alloc);
-#endif
EXPORT_SYMBOL(init_on_alloc);
-#ifdef CONFIG_INIT_ON_FREE_DEFAULT_ON
-DEFINE_STATIC_KEY_TRUE(init_on_free);
-#else
DEFINE_STATIC_KEY_FALSE(init_on_free);
-#endif
EXPORT_SYMBOL(init_on_free);
+static bool _init_on_alloc_enabled_early __read_mostly
+ = IS_ENABLED(CONFIG_INIT_ON_ALLOC_DEFAULT_ON);
static int __init early_init_on_alloc(char *buf)
{
- int ret;
- bool bool_result;
- ret = kstrtobool(buf, &bool_result);
- if (ret)
- return ret;
- if (bool_result && page_poisoning_enabled())
- pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_alloc\n");
- if (bool_result)
- static_branch_enable(&init_on_alloc);
- else
- static_branch_disable(&init_on_alloc);
- return 0;
+ return kstrtobool(buf, &_init_on_alloc_enabled_early);
}
early_param("init_on_alloc", early_init_on_alloc);
+static bool _init_on_free_enabled_early __read_mostly
+ = IS_ENABLED(CONFIG_INIT_ON_FREE_DEFAULT_ON);
static int __init early_init_on_free(char *buf)
{
- int ret;
- bool bool_result;
-
- ret = kstrtobool(buf, &bool_result);
- if (ret)
- return ret;
- if (bool_result && page_poisoning_enabled())
- pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, will take precedence over init_on_free\n");
- if (bool_result)
- static_branch_enable(&init_on_free);
- else
- static_branch_disable(&init_on_free);
- return 0;
+ return kstrtobool(buf, &_init_on_free_enabled_early);
}
early_param("init_on_free", early_init_on_free);
@@ -495,14 +470,6 @@ static inline int pfn_to_bitidx(struct page *page, unsigned long pfn)
return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS;
}
-/**
- * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
- * @page: The page within the block of interest
- * @pfn: The target page frame number
- * @mask: mask of bits that the caller is interested in
- *
- * Return: pageblock_bits flags
- */
static __always_inline
unsigned long __get_pfnblock_flags_mask(struct page *page,
unsigned long pfn,
@@ -521,6 +488,14 @@ unsigned long __get_pfnblock_flags_mask(struct page *page,
return (word >> bitidx) & mask;
}
+/**
+ * get_pfnblock_flags_mask - Return the requested group of flags for the pageblock_nr_pages block of pages
+ * @page: The page within the block of interest
+ * @pfn: The target page frame number
+ * @mask: mask of bits that the caller is interested in
+ *
+ * Return: pageblock_bits flags
+ */
unsigned long get_pfnblock_flags_mask(struct page *page, unsigned long pfn,
unsigned long mask)
{
@@ -728,19 +703,6 @@ static int __init early_debug_pagealloc(char *buf)
}
early_param("debug_pagealloc", early_debug_pagealloc);
-void init_debug_pagealloc(void)
-{
- if (!debug_pagealloc_enabled())
- return;
-
- static_branch_enable(&_debug_pagealloc_enabled);
-
- if (!debug_guardpage_minorder())
- return;
-
- static_branch_enable(&_debug_guardpage_enabled);
-}
-
static int __init debug_guardpage_minorder_setup(char *buf)
{
unsigned long res;
@@ -792,6 +754,53 @@ static inline void clear_page_guard(struct zone *zone, struct page *page,
unsigned int order, int migratetype) {}
#endif
+/*
+ * Enable static keys related to various memory debugging and hardening options.
+ * Some override others, and depend on early params that are evaluated in the
+ * order of appearance. So we need to first gather the full picture of what was
+ * enabled, and then make decisions.
+ */
+void init_mem_debugging_and_hardening(void)
+{
+ if (_init_on_alloc_enabled_early) {
+ if (page_poisoning_enabled())
+ pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
+ "will take precedence over init_on_alloc\n");
+ else
+ static_branch_enable(&init_on_alloc);
+ }
+ if (_init_on_free_enabled_early) {
+ if (page_poisoning_enabled())
+ pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, "
+ "will take precedence over init_on_free\n");
+ else
+ static_branch_enable(&init_on_free);
+ }
+
+#ifdef CONFIG_PAGE_POISONING
+ /*
+ * Page poisoning is debug page alloc for some arches. If
+ * either of those options are enabled, enable poisoning.
+ */
+ if (page_poisoning_enabled() ||
+ (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) &&
+ debug_pagealloc_enabled()))
+ static_branch_enable(&_page_poisoning_enabled);
+#endif
+
+#ifdef CONFIG_DEBUG_PAGEALLOC
+ if (!debug_pagealloc_enabled())
+ return;
+
+ static_branch_enable(&_debug_pagealloc_enabled);
+
+ if (!debug_guardpage_minorder())
+ return;
+
+ static_branch_enable(&_debug_guardpage_enabled);
+#endif
+}
+
static inline void set_buddy_order(struct page *page, unsigned int order)
{
set_page_private(page, order);
@@ -994,7 +1003,7 @@ static inline void __free_one_page(struct page *page,
struct page *buddy;
bool to_tail;
- max_order = min_t(unsigned int, MAX_ORDER, pageblock_order + 1);
+ max_order = min_t(unsigned int, MAX_ORDER - 1, pageblock_order);
VM_BUG_ON(!zone_is_initialized(zone));
VM_BUG_ON_PAGE(page->flags & PAGE_FLAGS_CHECK_AT_PREP, page);
@@ -1007,7 +1016,7 @@ static inline void __free_one_page(struct page *page,
VM_BUG_ON_PAGE(bad_range(zone, page), page);
continue_merging:
- while (order < max_order - 1) {
+ while (order < max_order) {
if (compaction_capture(capc, page, order, migratetype)) {
__mod_zone_freepage_state(zone, -(1 << order),
migratetype);
@@ -1033,7 +1042,7 @@ continue_merging:
pfn = combined_pfn;
order++;
}
- if (max_order < MAX_ORDER) {
+ if (order < MAX_ORDER - 1) {
/* If we are here, it means order is >= pageblock_order.
* We want to prevent merge between freepages on isolate
* pageblock and normal pageblock. Without this, pageblock
@@ -1054,7 +1063,7 @@ continue_merging:
is_migrate_isolate(buddy_mt)))
goto done_merging;
}
- max_order++;
+ max_order = order + 1;
goto continue_merging;
}
@@ -1092,7 +1101,7 @@ static inline bool page_expected_state(struct page *page,
if (unlikely((unsigned long)page->mapping |
page_ref_count(page) |
#ifdef CONFIG_MEMCG
- (unsigned long)page->mem_cgroup |
+ (unsigned long)page_memcg(page) |
#endif
(page->flags & check_flags)))
return false;
@@ -1117,7 +1126,7 @@ static const char *page_bad_reason(struct page *page, unsigned long flags)
bad_reason = "PAGE_FLAGS_CHECK_AT_FREE flag(s) set";
}
#ifdef CONFIG_MEMCG
- if (unlikely(page->mem_cgroup))
+ if (unlikely(page_memcg(page)))
bad_reason = "page still charged to cgroup";
#endif
return bad_reason;
@@ -1195,8 +1204,10 @@ static void kernel_init_free_pages(struct page *page, int numpages)
/* s390's use of memset() could override KASAN redzones. */
kasan_disable_current();
- for (i = 0; i < numpages; i++)
+ for (i = 0; i < numpages; i++) {
+ page_kasan_tag_reset(page + i);
clear_highpage(page + i);
+ }
kasan_enable_current();
}
@@ -1214,7 +1225,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
* Do not let hwpoison pages hit pcplists/buddy
* Untie memcg state and reset page's owner
*/
- if (memcg_kmem_enabled() && PageKmemcg(page))
+ if (memcg_kmem_enabled() && PageMemcgKmem(page))
__memcg_kmem_uncharge_page(page, order);
reset_page_owner(page, order);
return false;
@@ -1244,7 +1255,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
}
if (PageMappingFlags(page))
page->mapping = NULL;
- if (memcg_kmem_enabled() && PageKmemcg(page))
+ if (memcg_kmem_enabled() && PageMemcgKmem(page))
__memcg_kmem_uncharge_page(page, order);
if (check_free)
bad += check_free_page(page);
@@ -1264,7 +1275,8 @@ static __always_inline bool free_pages_prepare(struct page *page,
if (want_init_on_free())
kernel_init_free_pages(page, 1 << order);
- kernel_poison_pages(page, 1 << order, 0);
+ kernel_poison_pages(page, 1 << order);
+
/*
* arch_free_page() can make the page's contents inaccessible. s390
* does this. So nothing which can access the page's contents should
@@ -1272,8 +1284,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
*/
arch_free_page(page, order);
- if (debug_pagealloc_enabled_static())
- kernel_map_pages(page, 1 << order, 0);
+ debug_pagealloc_unmap_pages(page, 1 << order);
kasan_free_nondeferred_pages(page, order);
@@ -1344,7 +1355,7 @@ static void free_pcppages_bulk(struct zone *zone, int count,
{
int migratetype = 0;
int batch_free = 0;
- int prefetch_nr = 0;
+ int prefetch_nr = READ_ONCE(pcp->batch);
bool isolated_pageblocks;
struct page *page, *tmp;
LIST_HEAD(head);
@@ -1395,8 +1406,10 @@ static void free_pcppages_bulk(struct zone *zone, int count,
* avoid excessive prefetching due to large count, only
* prefetch buddy for the first pcp->batch nr of pages.
*/
- if (prefetch_nr++ < pcp->batch)
+ if (prefetch_nr) {
prefetch_buddy(page);
+ prefetch_nr--;
+ }
} while (--count && --batch_free && !list_empty(list));
}
@@ -1558,14 +1571,23 @@ void __free_pages_core(struct page *page, unsigned int order)
#ifdef CONFIG_NEED_MULTIPLE_NODES
-static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;
+/*
+ * During memory init memblocks map pfns to nids. The search is expensive and
+ * this caches recent lookups. The implementation of __early_pfn_to_nid
+ * treats start/end as pfns.
+ */
+struct mminit_pfnnid_cache {
+ unsigned long last_start;
+ unsigned long last_end;
+ int last_nid;
+};
-#ifndef CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID
+static struct mminit_pfnnid_cache early_pfnnid_cache __meminitdata;
/*
* Required by SPARSEMEM. Given a PFN, return what node the PFN is on.
*/
-int __meminit __early_pfn_to_nid(unsigned long pfn,
+static int __meminit __early_pfn_to_nid(unsigned long pfn,
struct mminit_pfnnid_cache *state)
{
unsigned long start_pfn, end_pfn;
@@ -1583,7 +1605,6 @@ int __meminit __early_pfn_to_nid(unsigned long pfn,
return nid;
}
-#endif /* CONFIG_HAVE_ARCH_EARLY_PFN_TO_NID */
int __meminit early_pfn_to_nid(unsigned long pfn)
{
@@ -2103,6 +2124,8 @@ void __init page_alloc_init_late(void)
files_maxfiles_init();
#endif
+ buffer_init();
+
/* Discard memblock private memory */
memblock_discard();
@@ -2207,12 +2230,6 @@ static inline int check_new_page(struct page *page)
return 1;
}
-static inline bool free_pages_prezeroed(void)
-{
- return (IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) &&
- page_poisoning_enabled()) || want_init_on_free();
-}
-
#ifdef CONFIG_DEBUG_VM
/*
* With DEBUG_VM enabled, order-0 pages are checked for expected state when
@@ -2270,11 +2287,13 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
set_page_refcounted(page);
arch_alloc_page(page, order);
- if (debug_pagealloc_enabled_static())
- kernel_map_pages(page, 1 << order, 1);
+ debug_pagealloc_map_pages(page, 1 << order);
kasan_alloc_pages(page, order);
- kernel_poison_pages(page, 1 << order, 1);
+ kernel_unpoison_pages(page, 1 << order);
set_page_owner(page, order, gfp_flags);
+
+ if (!want_init_on_free() && want_init_on_alloc(gfp_flags))
+ kernel_init_free_pages(page, 1 << order);
}
static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
@@ -2282,9 +2301,6 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags
{
post_alloc_hook(page, order, gfp_flags);
- if (!free_pages_prezeroed() && want_init_on_alloc(gfp_flags))
- kernel_init_free_pages(page, 1 << order);
-
if (order && (gfp_flags & __GFP_COMP))
prep_compound_page(page, order);
@@ -2470,12 +2486,12 @@ static bool can_steal_fallback(unsigned int order, int start_mt)
return false;
}
-static inline void boost_watermark(struct zone *zone)
+static inline bool boost_watermark(struct zone *zone)
{
unsigned long max_boost;
if (!watermark_boost_factor)
- return;
+ return false;
/*
* Don't bother in zones that are unlikely to produce results.
* On small machines, including kdump capture kernels running
@@ -2483,7 +2499,7 @@ static inline void boost_watermark(struct zone *zone)
* memory situation immediately.
*/
if ((pageblock_nr_pages * 4) > zone_managed_pages(zone))
- return;
+ return false;
max_boost = mult_frac(zone->_watermark[WMARK_HIGH],
watermark_boost_factor, 10000);
@@ -2497,12 +2513,14 @@ static inline void boost_watermark(struct zone *zone)
* boosted watermark resulting in a hang.
*/
if (!max_boost)
- return;
+ return false;
max_boost = max(pageblock_nr_pages, max_boost);
zone->watermark_boost = min(zone->watermark_boost + pageblock_nr_pages,
max_boost);
+
+ return true;
}
/*
@@ -2540,8 +2558,7 @@ static void steal_suitable_fallback(struct zone *zone, struct page *page,
* likelihood of future fallbacks. Wake kswapd now as the node
* may be balanced overall and kswapd will not wake naturally.
*/
- boost_watermark(zone);
- if (alloc_flags & ALLOC_KSWAPD)
+ if (boost_watermark(zone) && (alloc_flags & ALLOC_KSWAPD))
set_bit(ZONE_BOOSTED_WATERMARK, &zone->flags);
/* We are not allowed to try stealing from the whole block */
@@ -3017,13 +3034,16 @@ static void drain_local_pages_wq(struct work_struct *work)
}
/*
- * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
- *
- * When zone parameter is non-NULL, spill just the single zone's pages.
+ * The implementation of drain_all_pages(), exposing an extra parameter to
+ * drain on all cpus.
*
- * Note that this can be extremely slow as the draining happens in a workqueue.
+ * drain_all_pages() is optimized to only execute on cpus where pcplists are
+ * not empty. The check for non-emptiness can however race with a free to
+ * pcplist that has not yet increased the pcp->count from 0 to 1. Callers
+ * that need the guarantee that every CPU has drained can disable the
+ * optimizing racy check.
*/
-void drain_all_pages(struct zone *zone)
+static void __drain_all_pages(struct zone *zone, bool force_all_cpus)
{
int cpu;
@@ -3062,7 +3082,13 @@ void drain_all_pages(struct zone *zone)
struct zone *z;
bool has_pcps = false;
- if (zone) {
+ if (force_all_cpus) {
+ /*
+ * The pcp.count check is racy, some callers need a
+ * guarantee that no cpu is missed.
+ */
+ has_pcps = true;
+ } else if (zone) {
pcp = per_cpu_ptr(zone->pageset, cpu);
if (pcp->pcp.count)
has_pcps = true;
@@ -3095,6 +3121,18 @@ void drain_all_pages(struct zone *zone)
mutex_unlock(&pcpu_drain_mutex);
}
+/*
+ * Spill all the per-cpu pages from all CPUs back into the buddy allocator.
+ *
+ * When zone parameter is non-NULL, spill just the single zone's pages.
+ *
+ * Note that this can be extremely slow as the draining happens in a workqueue.
+ */
+void drain_all_pages(struct zone *zone)
+{
+ __drain_all_pages(zone, false);
+}
+
#ifdef CONFIG_HIBERNATION
/*
@@ -3190,10 +3228,8 @@ static void free_unref_page_commit(struct page *page, unsigned long pfn)
pcp = &this_cpu_ptr(zone->pageset)->pcp;
list_add(&page->lru, &pcp->lists[migratetype]);
pcp->count++;
- if (pcp->count >= pcp->high) {
- unsigned long batch = READ_ONCE(pcp->batch);
- free_pcppages_bulk(zone, batch, pcp);
- }
+ if (pcp->count >= READ_ONCE(pcp->high))
+ free_pcppages_bulk(zone, READ_ONCE(pcp->batch), pcp);
}
/*
@@ -3378,7 +3414,7 @@ static struct page *__rmqueue_pcplist(struct zone *zone, int migratetype,
do {
if (list_empty(list)) {
pcp->count += rmqueue_bulk(zone, 0,
- pcp->batch, list,
+ READ_ONCE(pcp->batch), list,
migratetype, alloc_flags);
if (unlikely(list_empty(list)))
return NULL;
@@ -4264,10 +4300,8 @@ should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_fla
static struct lockdep_map __fs_reclaim_map =
STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map);
-static bool __need_fs_reclaim(gfp_t gfp_mask)
+static bool __need_reclaim(gfp_t gfp_mask)
{
- gfp_mask = current_gfp_context(gfp_mask);
-
/* no reclaim without waiting on it */
if (!(gfp_mask & __GFP_DIRECT_RECLAIM))
return false;
@@ -4276,10 +4310,6 @@ static bool __need_fs_reclaim(gfp_t gfp_mask)
if (current->flags & PF_MEMALLOC)
return false;
- /* We're only interested __GFP_FS allocations for now */
- if (!(gfp_mask & __GFP_FS))
- return false;
-
if (gfp_mask & __GFP_NOLOCKDEP)
return false;
@@ -4298,15 +4328,29 @@ void __fs_reclaim_release(void)
void fs_reclaim_acquire(gfp_t gfp_mask)
{
- if (__need_fs_reclaim(gfp_mask))
- __fs_reclaim_acquire();
+ gfp_mask = current_gfp_context(gfp_mask);
+
+ if (__need_reclaim(gfp_mask)) {
+ if (gfp_mask & __GFP_FS)
+ __fs_reclaim_acquire();
+
+#ifdef CONFIG_MMU_NOTIFIER
+ lock_map_acquire(&__mmu_notifier_invalidate_range_start_map);
+ lock_map_release(&__mmu_notifier_invalidate_range_start_map);
+#endif
+
+ }
}
EXPORT_SYMBOL_GPL(fs_reclaim_acquire);
void fs_reclaim_release(gfp_t gfp_mask)
{
- if (__need_fs_reclaim(gfp_mask))
- __fs_reclaim_release();
+ gfp_mask = current_gfp_context(gfp_mask);
+
+ if (__need_reclaim(gfp_mask)) {
+ if (gfp_mask & __GFP_FS)
+ __fs_reclaim_release();
+ }
}
EXPORT_SYMBOL_GPL(fs_reclaim_release);
#endif
@@ -5007,6 +5051,26 @@ static inline void free_the_page(struct page *page, unsigned int order)
__free_pages_ok(page, order, FPI_NONE);
}
+/**
+ * __free_pages - Free pages allocated with alloc_pages().
+ * @page: The page pointer returned from alloc_pages().
+ * @order: The order of the allocation.
+ *
+ * This function can free multi-page allocations that are not compound
+ * pages. It does not check that the @order passed in matches that of
+ * the allocation, so it is easy to leak memory. Freeing more memory
+ * than was allocated will probably emit a warning.
+ *
+ * If the last reference to this page is speculative, it will be released
+ * by put_page() which only frees the first page of a non-compound
+ * allocation. To prevent the remaining pages from being leaked, we free
+ * the subsequent pages here. If you want to use the page's reference
+ * count to decide when to free the allocation, you should allocate a
+ * compound page, and use put_page() instead of __free_pages().
+ *
+ * Context: May be called in interrupt context or while holding a normal
+ * spinlock, but not in NMI context or while holding a raw spinlock.
+ */
void __free_pages(struct page *page, unsigned int order)
{
if (put_page_testzero(page))
@@ -5465,7 +5529,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
global_node_page_state_pages(NR_SLAB_UNRECLAIMABLE_B),
global_node_page_state(NR_FILE_MAPPED),
global_node_page_state(NR_SHMEM),
- global_zone_page_state(NR_PAGETABLE),
+ global_node_page_state(NR_PAGETABLE),
global_zone_page_state(NR_BOUNCE),
global_zone_page_state(NR_FREE_PAGES),
free_pcp,
@@ -5497,6 +5561,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
#ifdef CONFIG_SHADOW_CALL_STACK
" shadow_call_stack:%lukB"
#endif
+ " pagetables:%lukB"
" all_unreclaimable? %s"
"\n",
pgdat->node_id,
@@ -5522,6 +5587,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
#ifdef CONFIG_SHADOW_CALL_STACK
node_page_state(pgdat, NR_KERNEL_SCS_KB),
#endif
+ K(node_page_state(pgdat, NR_PAGETABLE)),
pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ?
"yes" : "no");
}
@@ -5553,7 +5619,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
" present:%lukB"
" managed:%lukB"
" mlocked:%lukB"
- " pagetables:%lukB"
" bounce:%lukB"
" free_pcp:%lukB"
" local_pcp:%ukB"
@@ -5574,7 +5639,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask)
K(zone->present_pages),
K(zone_managed_pages(zone)),
K(zone_page_state(zone, NR_MLOCK)),
- K(zone_page_state(zone, NR_PAGETABLE)),
K(zone_page_state(zone, NR_BOUNCE)),
K(free_pcp),
K(this_cpu_read(zone->pageset->pcp.count)),
@@ -5904,7 +5968,10 @@ static void build_zonelists(pg_data_t *pgdat)
* not check if the processor is online before following the pageset pointer.
* Other parts of the kernel may not check if the zone is available.
*/
-static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch);
+static void pageset_init(struct per_cpu_pageset *p);
+/* These effectively disable the pcplists in the boot pageset completely */
+#define BOOT_PAGESET_HIGH 0
+#define BOOT_PAGESET_BATCH 1
static DEFINE_PER_CPU(struct per_cpu_pageset, boot_pageset);
static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats);
@@ -5972,7 +6039,7 @@ build_all_zonelists_init(void)
* (a chicken-egg dilemma).
*/
for_each_possible_cpu(cpu)
- setup_pageset(&per_cpu(boot_pageset, cpu), 0);
+ pageset_init(&per_cpu(boot_pageset, cpu));
mminit_verify_zonelist();
cpuset_init_current_mems_allowed();
@@ -6255,13 +6322,16 @@ static int zone_batchsize(struct zone *zone)
}
/*
- * pcp->high and pcp->batch values are related and dependent on one another:
- * ->batch must never be higher then ->high.
- * The following function updates them in a safe manner without read side
- * locking.
+ * pcp->high and pcp->batch values are related and generally batch is lower
+ * than high. They are also related to pcp->count such that count is lower
+ * than high, and as soon as it reaches high, the pcplist is flushed.
*
- * Any new users of pcp->batch and pcp->high should ensure they can cope with
- * those fields changing asynchronously (acording to the above rule).
+ * However, guaranteeing these relations at all times would require e.g. write
+ * barriers here but also careful usage of read barriers at the read side, and
+ * thus be prone to error and bad for performance. Thus the update only prevents
+ * store tearing. Any new users of pcp->batch and pcp->high should ensure they
+ * can cope with those fields changing asynchronously, and fully trust only the
+ * pcp->count field on the local CPU with interrupts disabled.
*
* mutex_is_locked(&pcp_batch_high_lock) required when calling this function
* outside of boot time (or some other assurance that no concurrent updaters
@@ -6270,21 +6340,8 @@ static int zone_batchsize(struct zone *zone)
static void pageset_update(struct per_cpu_pages *pcp, unsigned long high,
unsigned long batch)
{
- /* start with a fail safe value for batch */
- pcp->batch = 1;
- smp_wmb();
-
- /* Update high, then batch, in order */
- pcp->high = high;
- smp_wmb();
-
- pcp->batch = batch;
-}
-
-/* a companion to pageset_set_high() */
-static void pageset_set_batch(struct per_cpu_pageset *p, unsigned long batch)
-{
- pageset_update(&p->pcp, 6 * batch, max(1UL, 1 * batch));
+ WRITE_ONCE(pcp->batch, batch);
+ WRITE_ONCE(pcp->high, high);
}
static void pageset_init(struct per_cpu_pageset *p)
@@ -6297,53 +6354,70 @@ static void pageset_init(struct per_cpu_pageset *p)
pcp = &p->pcp;
for (migratetype = 0; migratetype < MIGRATE_PCPTYPES; migratetype++)
INIT_LIST_HEAD(&pcp->lists[migratetype]);
+
+ /*
+ * Set batch and high values safe for a boot pageset. A true percpu
+ * pageset's initialization will update them subsequently. Here we don't
+ * need to be as careful as pageset_update() as nobody can access the
+ * pageset yet.
+ */
+ pcp->high = BOOT_PAGESET_HIGH;
+ pcp->batch = BOOT_PAGESET_BATCH;
}
-static void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
+static void __zone_set_pageset_high_and_batch(struct zone *zone, unsigned long high,
+ unsigned long batch)
{
- pageset_init(p);
- pageset_set_batch(p, batch);
+ struct per_cpu_pageset *p;
+ int cpu;
+
+ for_each_possible_cpu(cpu) {
+ p = per_cpu_ptr(zone->pageset, cpu);
+ pageset_update(&p->pcp, high, batch);
+ }
}
/*
- * pageset_set_high() sets the high water mark for hot per_cpu_pagelist
- * to the value high for the pageset p.
+ * Calculate and set new high and batch values for all per-cpu pagesets of a
+ * zone, based on the zone's size and the percpu_pagelist_fraction sysctl.
*/
-static void pageset_set_high(struct per_cpu_pageset *p,
- unsigned long high)
+static void zone_set_pageset_high_and_batch(struct zone *zone)
{
- unsigned long batch = max(1UL, high / 4);
- if ((high / 4) > (PAGE_SHIFT * 8))
- batch = PAGE_SHIFT * 8;
+ unsigned long new_high, new_batch;
- pageset_update(&p->pcp, high, batch);
-}
+ if (percpu_pagelist_fraction) {
+ new_high = zone_managed_pages(zone) / percpu_pagelist_fraction;
+ new_batch = max(1UL, new_high / 4);
+ if ((new_high / 4) > (PAGE_SHIFT * 8))
+ new_batch = PAGE_SHIFT * 8;
+ } else {
+ new_batch = zone_batchsize(zone);
+ new_high = 6 * new_batch;
+ new_batch = max(1UL, 1 * new_batch);
+ }
-static void pageset_set_high_and_batch(struct zone *zone,
- struct per_cpu_pageset *pcp)
-{
- if (percpu_pagelist_fraction)
- pageset_set_high(pcp,
- (zone_managed_pages(zone) /
- percpu_pagelist_fraction));
- else
- pageset_set_batch(pcp, zone_batchsize(zone));
-}
+ if (zone->pageset_high == new_high &&
+ zone->pageset_batch == new_batch)
+ return;
-static void __meminit zone_pageset_init(struct zone *zone, int cpu)
-{
- struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
+ zone->pageset_high = new_high;
+ zone->pageset_batch = new_batch;
- pageset_init(pcp);
- pageset_set_high_and_batch(zone, pcp);
+ __zone_set_pageset_high_and_batch(zone, new_high, new_batch);
}
void __meminit setup_zone_pageset(struct zone *zone)
{
+ struct per_cpu_pageset *p;
int cpu;
+
zone->pageset = alloc_percpu(struct per_cpu_pageset);
- for_each_possible_cpu(cpu)
- zone_pageset_init(zone, cpu);
+ for_each_possible_cpu(cpu) {
+ p = per_cpu_ptr(zone->pageset, cpu);
+ pageset_init(p);
+ }
+
+ zone_set_pageset_high_and_batch(zone);
}
/*
@@ -6386,6 +6460,8 @@ static __meminit void zone_pcp_init(struct zone *zone)
* offset of a (static) per cpu variable into the per cpu area.
*/
zone->pageset = &boot_pageset;
+ zone->pageset_high = BOOT_PAGESET_HIGH;
+ zone->pageset_batch = BOOT_PAGESET_BATCH;
if (populated_zone(zone))
printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%u\n",
@@ -6796,7 +6872,6 @@ static void __meminit pgdat_init_internals(struct pglist_data *pgdat)
init_waitqueue_head(&pgdat->pfmemalloc_wait);
pgdat_page_ext_init(pgdat);
- spin_lock_init(&pgdat->lru_lock);
lruvec_init(&pgdat->__lruvec);
}
@@ -7598,6 +7673,11 @@ unsigned long free_reserved_area(void *start, void *end, int poison, const char
* alias for the memset().
*/
direct_map_addr = page_address(page);
+ /*
+ * Perform a kasan-unchecked memset() since this memory
+ * has not been initialized.
+ */
+ direct_map_addr = kasan_reset_tag(direct_map_addr);
if ((unsigned int)poison <= 0xFF)
memset(direct_map_addr, poison, PAGE_SIZE);
@@ -7791,31 +7871,24 @@ static void calculate_totalreserve_pages(void)
static void setup_per_zone_lowmem_reserve(void)
{
struct pglist_data *pgdat;
- enum zone_type j, idx;
+ enum zone_type i, j;
for_each_online_pgdat(pgdat) {
- for (j = 0; j < MAX_NR_ZONES; j++) {
- struct zone *zone = pgdat->node_zones + j;
- unsigned long managed_pages = zone_managed_pages(zone);
-
- zone->lowmem_reserve[j] = 0;
-
- idx = j;
- while (idx) {
- struct zone *lower_zone;
-
- idx--;
- lower_zone = pgdat->node_zones + idx;
-
- if (!sysctl_lowmem_reserve_ratio[idx] ||
- !zone_managed_pages(lower_zone)) {
- lower_zone->lowmem_reserve[j] = 0;
- continue;
+ for (i = 0; i < MAX_NR_ZONES - 1; i++) {
+ struct zone *zone = &pgdat->node_zones[i];
+ int ratio = sysctl_lowmem_reserve_ratio[i];
+ bool clear = !ratio || !zone_managed_pages(zone);
+ unsigned long managed_pages = 0;
+
+ for (j = i + 1; j < MAX_NR_ZONES; j++) {
+ if (clear) {
+ zone->lowmem_reserve[j] = 0;
} else {
- lower_zone->lowmem_reserve[j] =
- managed_pages / sysctl_lowmem_reserve_ratio[idx];
+ struct zone *upper_zone = &pgdat->node_zones[j];
+
+ managed_pages += zone_managed_pages(upper_zone);
+ zone->lowmem_reserve[j] = managed_pages / ratio;
}
- managed_pages += zone_managed_pages(lower_zone);
}
}
}
@@ -8077,15 +8150,6 @@ int lowmem_reserve_ratio_sysctl_handler(struct ctl_table *table, int write,
return 0;
}
-static void __zone_pcp_update(struct zone *zone)
-{
- unsigned int cpu;
-
- for_each_possible_cpu(cpu)
- pageset_set_high_and_batch(zone,
- per_cpu_ptr(zone->pageset, cpu));
-}
-
/*
* percpu_pagelist_fraction - changes the pcp->high for each zone on each
* cpu. It is the fraction of total pages in each zone that a hot per cpu
@@ -8118,7 +8182,7 @@ int percpu_pagelist_fraction_sysctl_handler(struct ctl_table *table, int write,
goto out;
for_each_populated_zone(zone)
- __zone_pcp_update(zone);
+ zone_set_pageset_high_and_batch(zone);
out:
mutex_unlock(&pcp_batch_high_lock);
return ret;
@@ -8517,6 +8581,8 @@ int alloc_contig_range(unsigned long start, unsigned long end,
if (ret)
return ret;
+ drain_all_pages(cc.zone);
+
/*
* In case of -EBUSY, we'd like to know which page causes problem.
* So, just fall through. test_pages_isolated() has a tracepoint
@@ -8725,7 +8791,28 @@ EXPORT_SYMBOL(free_contig_range);
void __meminit zone_pcp_update(struct zone *zone)
{
mutex_lock(&pcp_batch_high_lock);
- __zone_pcp_update(zone);
+ zone_set_pageset_high_and_batch(zone);
+ mutex_unlock(&pcp_batch_high_lock);
+}
+
+/*
+ * Effectively disable pcplists for the zone by setting the high limit to 0
+ * and draining all cpus. A concurrent page freeing on another CPU that's about
+ * to put the page on pcplist will either finish before the drain and the page
+ * will be drained, or observe the new high limit and skip the pcplist.
+ *
+ * Must be paired with a call to zone_pcp_enable().
+ */
+void zone_pcp_disable(struct zone *zone)
+{
+ mutex_lock(&pcp_batch_high_lock);
+ __zone_set_pageset_high_and_batch(zone, 0, 1);
+ __drain_all_pages(zone, true);
+}
+
+void zone_pcp_enable(struct zone *zone)
+{
+ __zone_set_pageset_high_and_batch(zone, zone->pageset_high, zone->pageset_batch);
mutex_unlock(&pcp_batch_high_lock);
}
diff --git a/mm/page_counter.c b/mm/page_counter.c
index b24a60b28bb0..c6860f51b6c6 100644
--- a/mm/page_counter.c
+++ b/mm/page_counter.c
@@ -183,14 +183,14 @@ int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages)
* the limit, so if it sees the old limit, we see the
* modified counter and retry.
*/
- usage = atomic_long_read(&counter->usage);
+ usage = page_counter_read(counter);
if (usage > nr_pages)
return -EBUSY;
old = xchg(&counter->max, nr_pages);
- if (atomic_long_read(&counter->usage) <= usage)
+ if (page_counter_read(counter) <= usage)
return 0;
counter->max = old;
diff --git a/mm/page_ext.c b/mm/page_ext.c
index a3616f7a0e9e..df6f74aac8e1 100644
--- a/mm/page_ext.c
+++ b/mm/page_ext.c
@@ -34,7 +34,7 @@
*
* The need callback is used to decide whether extended memory allocation is
* needed or not. Sometimes users want to deactivate some features in this
- * boot and extra memory would be unneccessary. In this case, to avoid
+ * boot and extra memory would be unnecessary. In this case, to avoid
* allocating huge chunk of memory, each clients represent their need of
* extra memory through the need callback. If one of the need callbacks
* returns true, it means that someone needs extra memory so that
@@ -99,12 +99,19 @@ static void __init invoke_init_callbacks(void)
}
}
+#ifndef CONFIG_SPARSEMEM
+void __init page_ext_init_flatmem_late(void)
+{
+ invoke_init_callbacks();
+}
+#endif
+
static inline struct page_ext *get_entry(void *base, unsigned long index)
{
return base + page_ext_size * index;
}
-#if !defined(CONFIG_SPARSEMEM)
+#ifndef CONFIG_SPARSEMEM
void __meminit pgdat_page_ext_init(struct pglist_data *pgdat)
@@ -177,7 +184,6 @@ void __init page_ext_init_flatmem(void)
goto fail;
}
pr_info("allocated %ld bytes of page_ext\n", total_usage);
- invoke_init_callbacks();
return;
fail:
diff --git a/mm/page_idle.c b/mm/page_idle.c
index 057c61df12db..64e5344a992c 100644
--- a/mm/page_idle.c
+++ b/mm/page_idle.c
@@ -32,19 +32,15 @@
static struct page *page_idle_get_page(unsigned long pfn)
{
struct page *page = pfn_to_online_page(pfn);
- pg_data_t *pgdat;
if (!page || !PageLRU(page) ||
!get_page_unless_zero(page))
return NULL;
- pgdat = page_pgdat(page);
- spin_lock_irq(&pgdat->lru_lock);
if (unlikely(!PageLRU(page))) {
put_page(page);
page = NULL;
}
- spin_unlock_irq(&pgdat->lru_lock);
return page;
}
diff --git a/mm/page_io.c b/mm/page_io.c
index 433df1263349..9bca17ecc4df 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -291,12 +291,14 @@ static inline void count_swpout_vm_event(struct page *page)
static void bio_associate_blkg_from_page(struct bio *bio, struct page *page)
{
struct cgroup_subsys_state *css;
+ struct mem_cgroup *memcg;
- if (!page->mem_cgroup)
+ memcg = page_memcg(page);
+ if (!memcg)
return;
rcu_read_lock();
- css = cgroup_e_css(page->mem_cgroup->css.cgroup, &io_cgrp_subsys);
+ css = cgroup_e_css(memcg->css.cgroup, &io_cgrp_subsys);
bio_associate_blkg_from_css(bio, css);
rcu_read_unlock();
}
diff --git a/mm/page_isolation.c b/mm/page_isolation.c
index abbf42214485..bddf788f45bf 100644
--- a/mm/page_isolation.c
+++ b/mm/page_isolation.c
@@ -49,7 +49,6 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_
__mod_zone_freepage_state(zone, -nr_pages, mt);
spin_unlock_irqrestore(&zone->lock, flags);
- drain_all_pages(zone);
return 0;
}
@@ -89,7 +88,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
*/
if (PageBuddy(page)) {
order = buddy_order(page);
- if (order >= pageblock_order) {
+ if (order >= pageblock_order && order < MAX_ORDER - 1) {
pfn = page_to_pfn(page);
buddy_pfn = __find_buddy_pfn(pfn, order);
buddy = page + (buddy_pfn - pfn);
@@ -172,11 +171,12 @@ __first_valid_page(unsigned long pfn, unsigned long nr_pages)
*
* Please note that there is no strong synchronization with the page allocator
* either. Pages might be freed while their page blocks are marked ISOLATED.
- * In some cases pages might still end up on pcp lists and that would allow
+ * A call to drain_all_pages() after isolation can flush most of them. However
+ * in some cases pages might still end up on pcp lists and that would allow
* for their allocation even when they are in fact isolated already. Depending
- * on how strong of a guarantee the caller needs drain_all_pages might be needed
- * (e.g. __offline_pages will need to call it after check for isolated range for
- * a next retry).
+ * on how strong of a guarantee the caller needs, zone_pcp_disable/enable()
+ * might be used to flush and disable pcplist before isolation and enable after
+ * unisolation.
*
* Return: 0 on success and -EBUSY if any part of range cannot be isolated.
*/
diff --git a/mm/page_owner.c b/mm/page_owner.c
index b735a8eafcdb..af464bb7fbe7 100644
--- a/mm/page_owner.c
+++ b/mm/page_owner.c
@@ -10,6 +10,7 @@
#include <linux/migrate.h>
#include <linux/stackdepot.h>
#include <linux/seq_file.h>
+#include <linux/sched/clock.h>
#include "internal.h"
@@ -25,6 +26,8 @@ struct page_owner {
gfp_t gfp_mask;
depot_stack_handle_t handle;
depot_stack_handle_t free_handle;
+ u64 ts_nsec;
+ pid_t pid;
};
static bool page_owner_enabled = false;
@@ -172,6 +175,8 @@ static inline void __set_page_owner_handle(struct page *page,
page_owner->order = order;
page_owner->gfp_mask = gfp_mask;
page_owner->last_migrate_reason = -1;
+ page_owner->pid = current->pid;
+ page_owner->ts_nsec = local_clock();
__set_bit(PAGE_EXT_OWNER, &page_ext->flags);
__set_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags);
@@ -236,6 +241,8 @@ void __copy_page_owner(struct page *oldpage, struct page *newpage)
new_page_owner->last_migrate_reason =
old_page_owner->last_migrate_reason;
new_page_owner->handle = old_page_owner->handle;
+ new_page_owner->pid = old_page_owner->pid;
+ new_page_owner->ts_nsec = old_page_owner->ts_nsec;
/*
* We don't clear the bit on the oldpage as it's going to be freed
@@ -349,9 +356,10 @@ print_page_owner(char __user *buf, size_t count, unsigned long pfn,
return -ENOMEM;
ret = snprintf(kbuf, count,
- "Page allocated via order %u, mask %#x(%pGg)\n",
+ "Page allocated via order %u, mask %#x(%pGg), pid %d, ts %llu ns\n",
page_owner->order, page_owner->gfp_mask,
- &page_owner->gfp_mask);
+ &page_owner->gfp_mask, page_owner->pid,
+ page_owner->ts_nsec);
if (ret >= count)
goto err;
@@ -427,8 +435,9 @@ void __dump_page_owner(struct page *page)
else
pr_alert("page_owner tracks the page as freed\n");
- pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg)\n",
- page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask);
+ pr_alert("page last allocated via order %u, migratetype %s, gfp_mask %#x(%pGg), pid %d, ts %llu\n",
+ page_owner->order, migratetype_names[mt], gfp_mask, &gfp_mask,
+ page_owner->pid, page_owner->ts_nsec);
handle = READ_ONCE(page_owner->handle);
if (!handle) {
diff --git a/mm/page_poison.c b/mm/page_poison.c
index ae0482cded87..65cdf844c8ad 100644
--- a/mm/page_poison.c
+++ b/mm/page_poison.c
@@ -8,57 +8,29 @@
#include <linux/ratelimit.h>
#include <linux/kasan.h>
-static DEFINE_STATIC_KEY_FALSE_RO(want_page_poisoning);
+bool _page_poisoning_enabled_early;
+EXPORT_SYMBOL(_page_poisoning_enabled_early);
+DEFINE_STATIC_KEY_FALSE(_page_poisoning_enabled);
+EXPORT_SYMBOL(_page_poisoning_enabled);
static int __init early_page_poison_param(char *buf)
{
- int ret;
- bool tmp;
-
- ret = strtobool(buf, &tmp);
- if (ret)
- return ret;
-
- if (tmp)
- static_branch_enable(&want_page_poisoning);
- else
- static_branch_disable(&want_page_poisoning);
-
- return 0;
+ return kstrtobool(buf, &_page_poisoning_enabled_early);
}
early_param("page_poison", early_page_poison_param);
-/**
- * page_poisoning_enabled - check if page poisoning is enabled
- *
- * Return true if page poisoning is enabled, or false if not.
- */
-bool page_poisoning_enabled(void)
-{
- /*
- * Assumes that debug_pagealloc_enabled is set before
- * memblock_free_all.
- * Page poisoning is debug page alloc for some arches. If
- * either of those options are enabled, enable poisoning.
- */
- return (static_branch_unlikely(&want_page_poisoning) ||
- (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) &&
- debug_pagealloc_enabled()));
-}
-EXPORT_SYMBOL_GPL(page_poisoning_enabled);
-
static void poison_page(struct page *page)
{
void *addr = kmap_atomic(page);
/* KASAN still think the page is in-use, so skip it. */
kasan_disable_current();
- memset(addr, PAGE_POISON, PAGE_SIZE);
+ memset(kasan_reset_tag(addr), PAGE_POISON, PAGE_SIZE);
kasan_enable_current();
kunmap_atomic(addr);
}
-static void poison_pages(struct page *page, int n)
+void __kernel_poison_pages(struct page *page, int n)
{
int i;
@@ -79,9 +51,6 @@ static void check_poison_mem(unsigned char *mem, size_t bytes)
unsigned char *start;
unsigned char *end;
- if (IS_ENABLED(CONFIG_PAGE_POISONING_NO_SANITY))
- return;
-
start = memchr_inv(mem, PAGE_POISON, bytes);
if (!start)
return;
@@ -117,7 +86,7 @@ static void unpoison_page(struct page *page)
kunmap_atomic(addr);
}
-static void unpoison_pages(struct page *page, int n)
+void __kernel_unpoison_pages(struct page *page, int n)
{
int i;
@@ -125,17 +94,6 @@ static void unpoison_pages(struct page *page, int n)
unpoison_page(page + i);
}
-void kernel_poison_pages(struct page *page, int numpages, int enable)
-{
- if (!page_poisoning_enabled())
- return;
-
- if (enable)
- unpoison_pages(page, numpages);
- else
- poison_pages(page, numpages);
-}
-
#ifndef CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC
void __kernel_map_pages(struct page *page, int numpages, int enable)
{
diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c
index 5e77b269c330..86e3a3688d59 100644
--- a/mm/page_vma_mapped.c
+++ b/mm/page_vma_mapped.c
@@ -66,18 +66,19 @@ static inline bool pfn_is_match(struct page *page, unsigned long pfn)
/**
* check_pte - check if @pvmw->page is mapped at the @pvmw->pte
+ * @pvmw: page_vma_mapped_walk struct, includes a pair pte and page for checking
*
* page_vma_mapped_walk() found a place where @pvmw->page is *potentially*
* mapped. check_pte() has to validate this.
*
- * @pvmw->pte may point to empty PTE, swap PTE or PTE pointing to arbitrary
- * page.
+ * pvmw->pte may point to empty PTE, swap PTE or PTE pointing to
+ * arbitrary page.
*
* If PVMW_MIGRATION flag is set, returns true if @pvmw->pte contains migration
* entry that points to @pvmw->page or any subpage in case of THP.
*
- * If PVMW_MIGRATION flag is not set, returns true if @pvmw->pte points to
- * @pvmw->page or any subpage in case of THP.
+ * If PVMW_MIGRATION flag is not set, returns true if pvmw->pte points to
+ * pvmw->page or any subpage in case of THP.
*
* Otherwise, return false.
*
diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c
index 702250f148e7..4bcc11958089 100644
--- a/mm/process_vm_access.c
+++ b/mm/process_vm_access.c
@@ -260,7 +260,7 @@ static ssize_t process_vm_rw(pid_t pid,
struct iovec iovstack_l[UIO_FASTIOV];
struct iovec iovstack_r[UIO_FASTIOV];
struct iovec *iov_l = iovstack_l;
- struct iovec *iov_r = iovstack_r;
+ struct iovec *iov_r;
struct iov_iter iter;
ssize_t rc;
int dir = vm_write ? WRITE : READ;
diff --git a/mm/ptdump.c b/mm/ptdump.c
index ba88ec43ff21..4354c1422d57 100644
--- a/mm/ptdump.c
+++ b/mm/ptdump.c
@@ -4,7 +4,7 @@
#include <linux/ptdump.h>
#include <linux/kasan.h>
-#ifdef CONFIG_KASAN
+#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
/*
* This is an optimization for KASAN=y case. Since all kasan page tables
* eventually point to the kasan_early_shadow_page we could call note_page()
@@ -31,7 +31,8 @@ static int ptdump_pgd_entry(pgd_t *pgd, unsigned long addr,
struct ptdump_state *st = walk->private;
pgd_t val = READ_ONCE(*pgd);
-#if CONFIG_PGTABLE_LEVELS > 4 && defined(CONFIG_KASAN)
+#if CONFIG_PGTABLE_LEVELS > 4 && \
+ (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS))
if (pgd_page(val) == virt_to_page(lm_alias(kasan_early_shadow_p4d)))
return note_kasan_page_table(walk, addr);
#endif
@@ -51,7 +52,8 @@ static int ptdump_p4d_entry(p4d_t *p4d, unsigned long addr,
struct ptdump_state *st = walk->private;
p4d_t val = READ_ONCE(*p4d);
-#if CONFIG_PGTABLE_LEVELS > 3 && defined(CONFIG_KASAN)
+#if CONFIG_PGTABLE_LEVELS > 3 && \
+ (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS))
if (p4d_page(val) == virt_to_page(lm_alias(kasan_early_shadow_pud)))
return note_kasan_page_table(walk, addr);
#endif
@@ -71,7 +73,8 @@ static int ptdump_pud_entry(pud_t *pud, unsigned long addr,
struct ptdump_state *st = walk->private;
pud_t val = READ_ONCE(*pud);
-#if CONFIG_PGTABLE_LEVELS > 2 && defined(CONFIG_KASAN)
+#if CONFIG_PGTABLE_LEVELS > 2 && \
+ (defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS))
if (pud_page(val) == virt_to_page(lm_alias(kasan_early_shadow_pmd)))
return note_kasan_page_table(walk, addr);
#endif
@@ -91,7 +94,7 @@ static int ptdump_pmd_entry(pmd_t *pmd, unsigned long addr,
struct ptdump_state *st = walk->private;
pmd_t val = READ_ONCE(*pmd);
-#if defined(CONFIG_KASAN)
+#if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS)
if (pmd_page(val) == virt_to_page(lm_alias(kasan_early_shadow_pte)))
return note_kasan_page_table(walk, addr);
#endif
diff --git a/mm/rmap.c b/mm/rmap.c
index 31b29321adfe..08c56aaf72eb 100644
--- a/mm/rmap.c
+++ b/mm/rmap.c
@@ -28,12 +28,12 @@
* hugetlb_fault_mutex (hugetlbfs specific page fault mutex)
* anon_vma->rwsem
* mm->page_table_lock or pte_lock
- * pgdat->lru_lock (in mark_page_accessed, isolate_lru_page)
* swap_lock (in swap_duplicate, swap_info_get)
* mmlist_lock (in mmput, drain_mmlist and others)
* mapping->private_lock (in __set_page_dirty_buffers)
- * mem_cgroup_{begin,end}_page_stat (memcg->move_lock)
+ * lock_page_memcg move_lock (in __set_page_dirty_buffers)
* i_pages lock (widely used)
+ * lruvec->lru_lock (in lock_page_lruvec_irq)
* inode->i_lock (in set_page_dirty's __mark_inode_dirty)
* bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
* sb_lock (within inode_lock in fs/fs-writeback.c)
@@ -1054,8 +1054,14 @@ static void __page_set_anon_rmap(struct page *page,
if (!exclusive)
anon_vma = anon_vma->root;
+ /*
+ * page_idle does a lockless/optimistic rmap scan on page->mapping.
+ * Make sure the compiler doesn't split the stores of anon_vma and
+ * the PAGE_MAPPING_ANON type identifier, otherwise the rmap code
+ * could mistake the mapping for a struct address_space and crash.
+ */
anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON;
- page->mapping = (struct address_space *) anon_vma;
+ WRITE_ONCE(page->mapping, (struct address_space *) anon_vma);
page->index = linear_page_index(vma, address);
}
@@ -1533,15 +1539,6 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma,
goto discard;
}
- if (!(flags & TTU_IGNORE_ACCESS)) {
- if (ptep_clear_flush_young_notify(vma, address,
- pvmw.pte)) {
- ret = false;
- page_vma_mapped_walk_done(&pvmw);
- break;
- }
- }
-
/* Nuke the page table entry. */
flush_cache_page(vma, address, pte_pfn(*pvmw.pte));
if (should_defer_flush(mm, flags)) {
diff --git a/mm/shmem.c b/mm/shmem.c
index 537c137698f8..7c6b6d8f6c39 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -246,7 +246,7 @@ static inline void shmem_inode_unacct_blocks(struct inode *inode, long pages)
}
static const struct super_operations shmem_ops;
-static const struct address_space_operations shmem_aops;
+const struct address_space_operations shmem_aops;
static const struct file_operations shmem_file_operations;
static const struct inode_operations shmem_inode_operations;
static const struct inode_operations shmem_dir_inode_operations;
@@ -713,7 +713,7 @@ next:
}
if (PageTransHuge(page)) {
count_vm_event(THP_FILE_ALLOC);
- __inc_node_page_state(page, NR_SHMEM_THPS);
+ __inc_lruvec_page_state(page, NR_SHMEM_THPS);
}
mapping->nrpages += nr;
__mod_lruvec_page_state(page, NR_FILE_PAGES, nr);
@@ -1152,7 +1152,7 @@ static void shmem_evict_inode(struct inode *inode)
struct shmem_inode_info *info = SHMEM_I(inode);
struct shmem_sb_info *sbinfo = SHMEM_SB(inode->i_sb);
- if (inode->i_mapping->a_ops == &shmem_aops) {
+ if (shmem_mapping(inode->i_mapping)) {
shmem_unacct_size(info->flags, inode->i_size);
inode->i_size = 0;
shmem_truncate_range(inode, 0, (loff_t)-1);
@@ -1858,7 +1858,7 @@ repeat:
}
/* shmem_symlink() */
- if (mapping->a_ops != &shmem_aops)
+ if (!shmem_mapping(mapping))
goto alloc_nohuge;
if (shmem_huge == SHMEM_HUGE_DENY || sgp_huge == SGP_NOHUGE)
goto alloc_nohuge;
@@ -2352,11 +2352,6 @@ static struct inode *shmem_get_inode(struct super_block *sb, const struct inode
return inode;
}
-bool shmem_mapping(struct address_space *mapping)
-{
- return mapping->a_ops == &shmem_aops;
-}
-
static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm,
pmd_t *dst_pmd,
struct vm_area_struct *dst_vma,
@@ -3865,7 +3860,7 @@ static void shmem_destroy_inodecache(void)
kmem_cache_destroy(shmem_inode_cachep);
}
-static const struct address_space_operations shmem_aops = {
+const struct address_space_operations shmem_aops = {
.writepage = shmem_writepage,
.set_page_dirty = __set_page_dirty_no_writeback,
#ifdef CONFIG_TMPFS
@@ -3877,6 +3872,7 @@ static const struct address_space_operations shmem_aops = {
#endif
.error_remove_page = generic_error_remove_page,
};
+EXPORT_SYMBOL(shmem_aops);
static const struct file_operations shmem_file_operations = {
.mmap = shmem_mmap,
@@ -4024,7 +4020,7 @@ out2:
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_SYSFS)
static ssize_t shmem_enabled_show(struct kobject *kobj,
- struct kobj_attribute *attr, char *buf)
+ struct kobj_attribute *attr, char *buf)
{
static const int values[] = {
SHMEM_HUGE_ALWAYS,
@@ -4034,16 +4030,19 @@ static ssize_t shmem_enabled_show(struct kobject *kobj,
SHMEM_HUGE_DENY,
SHMEM_HUGE_FORCE,
};
- int i, count;
-
- for (i = 0, count = 0; i < ARRAY_SIZE(values); i++) {
- const char *fmt = shmem_huge == values[i] ? "[%s] " : "%s ";
+ int len = 0;
+ int i;
- count += sprintf(buf + count, fmt,
- shmem_format_huge(values[i]));
+ for (i = 0; i < ARRAY_SIZE(values); i++) {
+ len += sysfs_emit_at(buf, len,
+ shmem_huge == values[i] ? "%s[%s]" : "%s%s",
+ i ? " " : "",
+ shmem_format_huge(values[i]));
}
- buf[count - 1] = '\n';
- return count;
+
+ len += sysfs_emit_at(buf, len, "\n");
+
+ return len;
}
static ssize_t shmem_enabled_store(struct kobject *kobj,
@@ -4312,7 +4311,7 @@ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping,
struct page *page;
int error;
- BUG_ON(mapping->a_ops != &shmem_aops);
+ BUG_ON(!shmem_mapping(mapping));
error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE,
gfp, NULL, NULL, NULL);
if (error)
diff --git a/mm/slab.c b/mm/slab.c
index b1113561b98b..d7c8da9319c7 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -1399,7 +1399,8 @@ static void kmem_freepages(struct kmem_cache *cachep, struct page *page)
__ClearPageSlabPfmemalloc(page);
__ClearPageSlab(page);
page_mapcount_reset(page);
- page->mapping = NULL;
+ /* In union with page->mapping where page allocator expects NULL */
+ page->slab_cache = NULL;
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += 1 << order;
@@ -1434,7 +1435,7 @@ static void slab_kernel_map(struct kmem_cache *cachep, void *objp, int map)
if (!is_debug_pagealloc_cache(cachep))
return;
- kernel_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE, map);
+ __kernel_map_pages(virt_to_page(objp), cachep->size / PAGE_SIZE, map);
}
#else
@@ -3416,6 +3417,9 @@ free_done:
static __always_inline void __cache_free(struct kmem_cache *cachep, void *objp,
unsigned long caller)
{
+ if (unlikely(slab_want_init_on_free(cachep)))
+ memset(objp, 0, cachep->object_size);
+
/* Put the object into the quarantine, don't touch it for now. */
if (kasan_slab_free(cachep, objp, _RET_IP_))
return;
@@ -3434,8 +3438,6 @@ void ___cache_free(struct kmem_cache *cachep, void *objp,
struct array_cache *ac = cpu_cache_get(cachep);
check_irq_off();
- if (unlikely(slab_want_init_on_free(cachep)))
- memset(objp, 0, cachep->object_size);
kmemleak_free_recursive(objp, cachep->flags);
objp = cache_free_debugcheck(cachep, objp, caller);
memcg_slab_free_hook(cachep, &objp, 1);
diff --git a/mm/slab.h b/mm/slab.h
index 6d7c6a5056ba..1a756a359fa8 100644
--- a/mm/slab.h
+++ b/mm/slab.h
@@ -204,7 +204,7 @@ ssize_t slabinfo_write(struct file *file, const char __user *buffer,
void __kmem_cache_free_bulk(struct kmem_cache *, size_t, void **);
int __kmem_cache_alloc_bulk(struct kmem_cache *, gfp_t, size_t, void **);
-static inline int cache_vmstat_idx(struct kmem_cache *s)
+static inline enum node_stat_item cache_vmstat_idx(struct kmem_cache *s)
{
return (s->flags & SLAB_RECLAIM_ACCOUNT) ?
NR_SLAB_RECLAIMABLE_B : NR_SLAB_UNRECLAIMABLE_B;
@@ -239,30 +239,13 @@ static inline bool kmem_cache_debug_flags(struct kmem_cache *s, slab_flags_t fla
}
#ifdef CONFIG_MEMCG_KMEM
-static inline struct obj_cgroup **page_obj_cgroups(struct page *page)
-{
- /*
- * page->mem_cgroup and page->obj_cgroups are sharing the same
- * space. To distinguish between them in case we don't know for sure
- * that the page is a slab page (e.g. page_cgroup_ino()), let's
- * always set the lowest bit of obj_cgroups.
- */
- return (struct obj_cgroup **)
- ((unsigned long)page->obj_cgroups & ~0x1UL);
-}
-
-static inline bool page_has_obj_cgroups(struct page *page)
-{
- return ((unsigned long)page->obj_cgroups & 0x1UL);
-}
-
int memcg_alloc_page_obj_cgroups(struct page *page, struct kmem_cache *s,
gfp_t gfp);
static inline void memcg_free_page_obj_cgroups(struct page *page)
{
- kfree(page_obj_cgroups(page));
- page->obj_cgroups = NULL;
+ kfree(page_objcgs(page));
+ page->memcg_data = 0;
}
static inline size_t obj_full_size(struct kmem_cache *s)
@@ -274,27 +257,37 @@ static inline size_t obj_full_size(struct kmem_cache *s)
return s->size + sizeof(struct obj_cgroup *);
}
-static inline struct obj_cgroup *memcg_slab_pre_alloc_hook(struct kmem_cache *s,
- size_t objects,
- gfp_t flags)
+/*
+ * Returns false if the allocation should fail.
+ */
+static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s,
+ struct obj_cgroup **objcgp,
+ size_t objects, gfp_t flags)
{
struct obj_cgroup *objcg;
+ if (!memcg_kmem_enabled())
+ return true;
+
+ if (!(flags & __GFP_ACCOUNT) && !(s->flags & SLAB_ACCOUNT))
+ return true;
+
objcg = get_obj_cgroup_from_current();
if (!objcg)
- return NULL;
+ return true;
if (obj_cgroup_charge(objcg, flags, objects * obj_full_size(s))) {
obj_cgroup_put(objcg);
- return NULL;
+ return false;
}
- return objcg;
+ *objcgp = objcg;
+ return true;
}
static inline void mod_objcg_state(struct obj_cgroup *objcg,
struct pglist_data *pgdat,
- int idx, int nr)
+ enum node_stat_item idx, int nr)
{
struct mem_cgroup *memcg;
struct lruvec *lruvec;
@@ -315,7 +308,7 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
unsigned long off;
size_t i;
- if (!objcg)
+ if (!memcg_kmem_enabled() || !objcg)
return;
flags &= ~__GFP_ACCOUNT;
@@ -323,7 +316,7 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
if (likely(p[i])) {
page = virt_to_head_page(p[i]);
- if (!page_has_obj_cgroups(page) &&
+ if (!page_objcgs(page) &&
memcg_alloc_page_obj_cgroups(page, s, flags)) {
obj_cgroup_uncharge(objcg, obj_full_size(s));
continue;
@@ -331,7 +324,7 @@ static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
off = obj_to_index(s, page, p[i]);
obj_cgroup_get(objcg);
- page_obj_cgroups(page)[off] = objcg;
+ page_objcgs(page)[off] = objcg;
mod_objcg_state(objcg, page_pgdat(page),
cache_vmstat_idx(s), obj_full_size(s));
} else {
@@ -345,6 +338,7 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s_orig,
void **p, int objects)
{
struct kmem_cache *s;
+ struct obj_cgroup **objcgs;
struct obj_cgroup *objcg;
struct page *page;
unsigned int off;
@@ -358,7 +352,8 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s_orig,
continue;
page = virt_to_head_page(p[i]);
- if (!page_has_obj_cgroups(page))
+ objcgs = page_objcgs(page);
+ if (!objcgs)
continue;
if (!s_orig)
@@ -367,11 +362,11 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s_orig,
s = s_orig;
off = obj_to_index(s, page, p[i]);
- objcg = page_obj_cgroups(page)[off];
+ objcg = objcgs[off];
if (!objcg)
continue;
- page_obj_cgroups(page)[off] = NULL;
+ objcgs[off] = NULL;
obj_cgroup_uncharge(objcg, obj_full_size(s));
mod_objcg_state(objcg, page_pgdat(page), cache_vmstat_idx(s),
-obj_full_size(s));
@@ -380,11 +375,6 @@ static inline void memcg_slab_free_hook(struct kmem_cache *s_orig,
}
#else /* CONFIG_MEMCG_KMEM */
-static inline bool page_has_obj_cgroups(struct page *page)
-{
- return false;
-}
-
static inline struct mem_cgroup *memcg_from_slab_obj(void *ptr)
{
return NULL;
@@ -400,11 +390,11 @@ static inline void memcg_free_page_obj_cgroups(struct page *page)
{
}
-static inline struct obj_cgroup *memcg_slab_pre_alloc_hook(struct kmem_cache *s,
- size_t objects,
- gfp_t flags)
+static inline bool memcg_slab_pre_alloc_hook(struct kmem_cache *s,
+ struct obj_cgroup **objcgp,
+ size_t objects, gfp_t flags)
{
- return NULL;
+ return true;
}
static inline void memcg_slab_post_alloc_hook(struct kmem_cache *s,
@@ -500,17 +490,13 @@ static inline struct kmem_cache *slab_pre_alloc_hook(struct kmem_cache *s,
{
flags &= gfp_allowed_mask;
- fs_reclaim_acquire(flags);
- fs_reclaim_release(flags);
-
- might_sleep_if(gfpflags_allow_blocking(flags));
+ might_alloc(flags);
if (should_failslab(s, flags))
return NULL;
- if (memcg_kmem_enabled() &&
- ((flags & __GFP_ACCOUNT) || (s->flags & SLAB_ACCOUNT)))
- *objcgp = memcg_slab_pre_alloc_hook(s, size, flags);
+ if (!memcg_slab_pre_alloc_hook(s, objcgp, size, flags))
+ return NULL;
return s;
}
@@ -529,8 +515,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s,
s->flags, flags);
}
- if (memcg_kmem_enabled())
- memcg_slab_post_alloc_hook(s, objcg, flags, size, p);
+ memcg_slab_post_alloc_hook(s, objcg, flags, size, p);
}
#ifndef CONFIG_SLOB
diff --git a/mm/slab_common.c b/mm/slab_common.c
index f9ccd5dc13f3..e981c80d216c 100644
--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -18,6 +18,7 @@
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
#include <linux/debugfs.h>
+#include <linux/kasan.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>
#include <asm/page.h>
@@ -53,7 +54,7 @@ static DECLARE_WORK(slab_caches_to_rcu_destroy_work,
*/
#define SLAB_NEVER_MERGE (SLAB_RED_ZONE | SLAB_POISON | SLAB_STORE_USER | \
SLAB_TRACE | SLAB_TYPESAFE_BY_RCU | SLAB_NOLEAKTRACE | \
- SLAB_FAILSLAB | SLAB_KASAN)
+ SLAB_FAILSLAB | kasan_never_merge())
#define SLAB_MERGE_SAME (SLAB_RECLAIM_ACCOUNT | SLAB_CACHE_DMA | \
SLAB_CACHE_DMA32 | SLAB_ACCOUNT)
@@ -978,7 +979,7 @@ static int slab_show(struct seq_file *m, void *p)
void dump_unreclaimable_slab(void)
{
- struct kmem_cache *s, *s2;
+ struct kmem_cache *s;
struct slabinfo sinfo;
/*
@@ -996,7 +997,7 @@ void dump_unreclaimable_slab(void)
pr_info("Unreclaimable slab info:\n");
pr_info("Name Used Total\n");
- list_for_each_entry_safe(s, s2, &slab_caches, list) {
+ list_for_each_entry(s, &slab_caches, list) {
if (s->flags & SLAB_RECLAIM_ACCOUNT)
continue;
@@ -1091,9 +1092,9 @@ static __always_inline void *__do_krealloc(const void *p, size_t new_size,
* @flags: the type of memory to allocate.
*
* The contents of the object pointed to are preserved up to the
- * lesser of the new and old sizes. If @p is %NULL, krealloc()
- * behaves exactly like kmalloc(). If @new_size is 0 and @p is not a
- * %NULL pointer, the object pointed to is freed.
+ * lesser of the new and old sizes (__GFP_ZERO flag is effectively ignored).
+ * If @p is %NULL, krealloc() behaves exactly like kmalloc(). If @new_size
+ * is 0 and @p is not a %NULL pointer, the object pointed to is freed.
*
* Return: pointer to the allocated memory or %NULL in case of error
*/
@@ -1176,7 +1177,7 @@ size_t ksize(const void *objp)
* We assume that ksize callers could use whole allocated area,
* so we need to unpoison this area.
*/
- kasan_unpoison_shadow(objp, size);
+ kasan_unpoison_range(objp, size);
return size;
}
EXPORT_SYMBOL(ksize);
diff --git a/mm/slob.c b/mm/slob.c
index 7cc9805c8091..8d4bfa46247f 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -474,8 +474,7 @@ __do_kmalloc_node(size_t size, gfp_t gfp, int node, unsigned long caller)
gfp &= gfp_allowed_mask;
- fs_reclaim_acquire(gfp);
- fs_reclaim_release(gfp);
+ might_alloc(gfp);
if (size < PAGE_SIZE - minalign) {
int align = minalign;
@@ -597,8 +596,7 @@ static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
flags &= gfp_allowed_mask;
- fs_reclaim_acquire(flags);
- fs_reclaim_release(flags);
+ might_alloc(flags);
if (c->size < PAGE_SIZE) {
b = slob_alloc(c->size, flags, c->align, node, 0);
diff --git a/mm/slub.c b/mm/slub.c
index 34dcc09e2ec9..0c8b43a5b3b0 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -249,7 +249,7 @@ static inline void *freelist_ptr(const struct kmem_cache *s, void *ptr,
{
#ifdef CONFIG_SLAB_FREELIST_HARDENED
/*
- * When CONFIG_KASAN_SW_TAGS is enabled, ptr_addr might be tagged.
+ * When CONFIG_KASAN_SW/HW_TAGS is enabled, ptr_addr might be tagged.
* Normally, this doesn't cause any issues, as both set_freepointer()
* and get_freepointer() are called with a pointer with the same tag.
* However, there are some issues with CONFIG_SLUB_DEBUG code. For
@@ -275,6 +275,7 @@ static inline void *freelist_dereference(const struct kmem_cache *s,
static inline void *get_freepointer(struct kmem_cache *s, void *object)
{
+ object = kasan_reset_tag(object);
return freelist_dereference(s, object + s->offset);
}
@@ -304,6 +305,7 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp)
BUG_ON(object == fp); /* naive detection of double free or corruption */
#endif
+ freeptr_addr = (unsigned long)kasan_reset_tag((void *)freeptr_addr);
*(void **)freeptr_addr = freelist_ptr(s, fp, freeptr_addr);
}
@@ -538,8 +540,8 @@ static void print_section(char *level, char *text, u8 *addr,
unsigned int length)
{
metadata_access_enable();
- print_hex_dump(level, text, DUMP_PREFIX_ADDRESS, 16, 1, addr,
- length, 1);
+ print_hex_dump(level, kasan_reset_tag(text), DUMP_PREFIX_ADDRESS,
+ 16, 1, addr, length, 1);
metadata_access_disable();
}
@@ -570,7 +572,7 @@ static struct track *get_track(struct kmem_cache *s, void *object,
p = object + get_info_end(s);
- return p + alloc;
+ return kasan_reset_tag(p + alloc);
}
static void set_track(struct kmem_cache *s, void *object,
@@ -583,7 +585,8 @@ static void set_track(struct kmem_cache *s, void *object,
unsigned int nr_entries;
metadata_access_enable();
- nr_entries = stack_trace_save(p->addrs, TRACK_ADDRS_COUNT, 3);
+ nr_entries = stack_trace_save(kasan_reset_tag(p->addrs),
+ TRACK_ADDRS_COUNT, 3);
metadata_access_disable();
if (nr_entries < TRACK_ADDRS_COUNT)
@@ -747,7 +750,7 @@ static __printf(3, 4) void slab_err(struct kmem_cache *s, struct page *page,
static void init_object(struct kmem_cache *s, void *object, u8 val)
{
- u8 *p = object;
+ u8 *p = kasan_reset_tag(object);
if (s->flags & SLAB_RED_ZONE)
memset(p - s->red_left_pad, val, s->red_left_pad);
@@ -777,7 +780,7 @@ static int check_bytes_and_report(struct kmem_cache *s, struct page *page,
u8 *addr = page_address(page);
metadata_access_enable();
- fault = memchr_inv(start, value, bytes);
+ fault = memchr_inv(kasan_reset_tag(start), value, bytes);
metadata_access_disable();
if (!fault)
return 1;
@@ -873,7 +876,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page)
pad = end - remainder;
metadata_access_enable();
- fault = memchr_inv(pad, POISON_INUSE, remainder);
+ fault = memchr_inv(kasan_reset_tag(pad), POISON_INUSE, remainder);
metadata_access_disable();
if (!fault)
return 1;
@@ -1118,7 +1121,7 @@ void setup_page_debug(struct kmem_cache *s, struct page *page, void *addr)
return;
metadata_access_enable();
- memset(addr, POISON_INUSE, page_size(page));
+ memset(kasan_reset_tag(addr), POISON_INUSE, page_size(page));
metadata_access_disable();
}
@@ -1566,10 +1569,10 @@ static inline bool slab_free_freelist_hook(struct kmem_cache *s,
* Clear the object and the metadata, but don't touch
* the redzone.
*/
- memset(object, 0, s->object_size);
+ memset(kasan_reset_tag(object), 0, s->object_size);
rsize = (s->flags & SLAB_RED_ZONE) ? s->red_left_pad
: 0;
- memset((char *)object + s->inuse, 0,
+ memset((char *)kasan_reset_tag(object) + s->inuse, 0,
s->size - s->inuse - rsize);
}
@@ -1836,8 +1839,8 @@ static void __free_slab(struct kmem_cache *s, struct page *page)
__ClearPageSlabPfmemalloc(page);
__ClearPageSlab(page);
-
- page->mapping = NULL;
+ /* In union with page->mapping where page allocator expects NULL */
+ page->slab_cache = NULL;
if (current->reclaim_state)
current->reclaim_state->reclaimed_slab += pages;
unaccount_slab_page(page, order, s);
@@ -2245,8 +2248,7 @@ redo:
}
} else {
m = M_FULL;
-#ifdef CONFIG_SLUB_DEBUG
- if ((s->flags & SLAB_STORE_USER) && !lock) {
+ if (kmem_cache_debug_flags(s, SLAB_STORE_USER) && !lock) {
lock = 1;
/*
* This also ensures that the scanning of full
@@ -2255,7 +2257,6 @@ redo:
*/
spin_lock(&n->list_lock);
}
-#endif
}
if (l != m) {
@@ -2883,10 +2884,10 @@ redo:
stat(s, ALLOC_FASTPATH);
}
- maybe_wipe_obj_freeptr(s, object);
+ maybe_wipe_obj_freeptr(s, kasan_reset_tag(object));
if (unlikely(slab_want_init_on_alloc(gfpflags, s)) && object)
- memset(object, 0, s->object_size);
+ memset(kasan_reset_tag(object), 0, s->object_size);
slab_post_alloc_hook(s, objcg, gfpflags, 1, &object);
@@ -3433,7 +3434,7 @@ static inline int calculate_order(unsigned int size)
*/
min_objects = slub_min_objects;
if (!min_objects)
- min_objects = 4 * (fls(nr_cpu_ids) + 1);
+ min_objects = 4 * (fls(num_online_cpus()) + 1);
max_objects = order_objects(slub_max_order, size);
min_objects = min(min_objects, max_objects);
@@ -4726,7 +4727,7 @@ static void process_slab(struct loc_track *t, struct kmem_cache *s,
}
static int list_locations(struct kmem_cache *s, char *buf,
- enum track_item alloc)
+ enum track_item alloc)
{
int len = 0;
unsigned long i;
@@ -4736,7 +4737,7 @@ static int list_locations(struct kmem_cache *s, char *buf,
if (!alloc_loc_track(&t, PAGE_SIZE / sizeof(struct location),
GFP_KERNEL)) {
- return sprintf(buf, "Out of memory\n");
+ return sysfs_emit(buf, "Out of memory\n");
}
/* Push back cpu slabs */
flush_all(s);
@@ -4759,50 +4760,45 @@ static int list_locations(struct kmem_cache *s, char *buf,
for (i = 0; i < t.count; i++) {
struct location *l = &t.loc[i];
- if (len > PAGE_SIZE - KSYM_SYMBOL_LEN - 100)
- break;
- len += sprintf(buf + len, "%7ld ", l->count);
+ len += sysfs_emit_at(buf, len, "%7ld ", l->count);
if (l->addr)
- len += sprintf(buf + len, "%pS", (void *)l->addr);
+ len += sysfs_emit_at(buf, len, "%pS", (void *)l->addr);
else
- len += sprintf(buf + len, "<not-available>");
-
- if (l->sum_time != l->min_time) {
- len += sprintf(buf + len, " age=%ld/%ld/%ld",
- l->min_time,
- (long)div_u64(l->sum_time, l->count),
- l->max_time);
- } else
- len += sprintf(buf + len, " age=%ld",
- l->min_time);
+ len += sysfs_emit_at(buf, len, "<not-available>");
+
+ if (l->sum_time != l->min_time)
+ len += sysfs_emit_at(buf, len, " age=%ld/%ld/%ld",
+ l->min_time,
+ (long)div_u64(l->sum_time,
+ l->count),
+ l->max_time);
+ else
+ len += sysfs_emit_at(buf, len, " age=%ld", l->min_time);
if (l->min_pid != l->max_pid)
- len += sprintf(buf + len, " pid=%ld-%ld",
- l->min_pid, l->max_pid);
+ len += sysfs_emit_at(buf, len, " pid=%ld-%ld",
+ l->min_pid, l->max_pid);
else
- len += sprintf(buf + len, " pid=%ld",
- l->min_pid);
+ len += sysfs_emit_at(buf, len, " pid=%ld",
+ l->min_pid);
if (num_online_cpus() > 1 &&
- !cpumask_empty(to_cpumask(l->cpus)) &&
- len < PAGE_SIZE - 60)
- len += scnprintf(buf + len, PAGE_SIZE - len - 50,
- " cpus=%*pbl",
- cpumask_pr_args(to_cpumask(l->cpus)));
-
- if (nr_online_nodes > 1 && !nodes_empty(l->nodes) &&
- len < PAGE_SIZE - 60)
- len += scnprintf(buf + len, PAGE_SIZE - len - 50,
- " nodes=%*pbl",
- nodemask_pr_args(&l->nodes));
-
- len += sprintf(buf + len, "\n");
+ !cpumask_empty(to_cpumask(l->cpus)))
+ len += sysfs_emit_at(buf, len, " cpus=%*pbl",
+ cpumask_pr_args(to_cpumask(l->cpus)));
+
+ if (nr_online_nodes > 1 && !nodes_empty(l->nodes))
+ len += sysfs_emit_at(buf, len, " nodes=%*pbl",
+ nodemask_pr_args(&l->nodes));
+
+ len += sysfs_emit_at(buf, len, "\n");
}
free_loc_track(&t);
if (!t.count)
- len += sprintf(buf, "No data\n");
+ len += sysfs_emit_at(buf, len, "No data\n");
+
return len;
}
#endif /* CONFIG_SLUB_DEBUG */
@@ -4899,12 +4895,13 @@ __setup("slub_memcg_sysfs=", setup_slub_memcg_sysfs);
#endif
static ssize_t show_slab_objects(struct kmem_cache *s,
- char *buf, unsigned long flags)
+ char *buf, unsigned long flags)
{
unsigned long total = 0;
int node;
int x;
unsigned long *nodes;
+ int len = 0;
nodes = kcalloc(nr_node_ids, sizeof(unsigned long), GFP_KERNEL);
if (!nodes)
@@ -4993,15 +4990,19 @@ static ssize_t show_slab_objects(struct kmem_cache *s,
nodes[node] += x;
}
}
- x = sprintf(buf, "%lu", total);
+
+ len += sysfs_emit_at(buf, len, "%lu", total);
#ifdef CONFIG_NUMA
- for (node = 0; node < nr_node_ids; node++)
+ for (node = 0; node < nr_node_ids; node++) {
if (nodes[node])
- x += sprintf(buf + x, " N%d=%lu",
- node, nodes[node]);
+ len += sysfs_emit_at(buf, len, " N%d=%lu",
+ node, nodes[node]);
+ }
#endif
+ len += sysfs_emit_at(buf, len, "\n");
kfree(nodes);
- return x + sprintf(buf + x, "\n");
+
+ return len;
}
#define to_slab_attr(n) container_of(n, struct slab_attribute, attr)
@@ -5023,37 +5024,37 @@ struct slab_attribute {
static ssize_t slab_size_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%u\n", s->size);
+ return sysfs_emit(buf, "%u\n", s->size);
}
SLAB_ATTR_RO(slab_size);
static ssize_t align_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%u\n", s->align);
+ return sysfs_emit(buf, "%u\n", s->align);
}
SLAB_ATTR_RO(align);
static ssize_t object_size_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%u\n", s->object_size);
+ return sysfs_emit(buf, "%u\n", s->object_size);
}
SLAB_ATTR_RO(object_size);
static ssize_t objs_per_slab_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%u\n", oo_objects(s->oo));
+ return sysfs_emit(buf, "%u\n", oo_objects(s->oo));
}
SLAB_ATTR_RO(objs_per_slab);
static ssize_t order_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%u\n", oo_order(s->oo));
+ return sysfs_emit(buf, "%u\n", oo_order(s->oo));
}
SLAB_ATTR_RO(order);
static ssize_t min_partial_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%lu\n", s->min_partial);
+ return sysfs_emit(buf, "%lu\n", s->min_partial);
}
static ssize_t min_partial_store(struct kmem_cache *s, const char *buf,
@@ -5073,7 +5074,7 @@ SLAB_ATTR(min_partial);
static ssize_t cpu_partial_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%u\n", slub_cpu_partial(s));
+ return sysfs_emit(buf, "%u\n", slub_cpu_partial(s));
}
static ssize_t cpu_partial_store(struct kmem_cache *s, const char *buf,
@@ -5098,13 +5099,13 @@ static ssize_t ctor_show(struct kmem_cache *s, char *buf)
{
if (!s->ctor)
return 0;
- return sprintf(buf, "%pS\n", s->ctor);
+ return sysfs_emit(buf, "%pS\n", s->ctor);
}
SLAB_ATTR_RO(ctor);
static ssize_t aliases_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%d\n", s->refcount < 0 ? 0 : s->refcount - 1);
+ return sysfs_emit(buf, "%d\n", s->refcount < 0 ? 0 : s->refcount - 1);
}
SLAB_ATTR_RO(aliases);
@@ -5137,7 +5138,7 @@ static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)
int objects = 0;
int pages = 0;
int cpu;
- int len;
+ int len = 0;
for_each_online_cpu(cpu) {
struct page *page;
@@ -5150,52 +5151,53 @@ static ssize_t slabs_cpu_partial_show(struct kmem_cache *s, char *buf)
}
}
- len = sprintf(buf, "%d(%d)", objects, pages);
+ len += sysfs_emit_at(buf, len, "%d(%d)", objects, pages);
#ifdef CONFIG_SMP
for_each_online_cpu(cpu) {
struct page *page;
page = slub_percpu_partial(per_cpu_ptr(s->cpu_slab, cpu));
-
- if (page && len < PAGE_SIZE - 20)
- len += sprintf(buf + len, " C%d=%d(%d)", cpu,
- page->pobjects, page->pages);
+ if (page)
+ len += sysfs_emit_at(buf, len, " C%d=%d(%d)",
+ cpu, page->pobjects, page->pages);
}
#endif
- return len + sprintf(buf + len, "\n");
+ len += sysfs_emit_at(buf, len, "\n");
+
+ return len;
}
SLAB_ATTR_RO(slabs_cpu_partial);
static ssize_t reclaim_account_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
+ return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_RECLAIM_ACCOUNT));
}
SLAB_ATTR_RO(reclaim_account);
static ssize_t hwcache_align_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN));
+ return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_HWCACHE_ALIGN));
}
SLAB_ATTR_RO(hwcache_align);
#ifdef CONFIG_ZONE_DMA
static ssize_t cache_dma_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA));
+ return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_CACHE_DMA));
}
SLAB_ATTR_RO(cache_dma);
#endif
static ssize_t usersize_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%u\n", s->usersize);
+ return sysfs_emit(buf, "%u\n", s->usersize);
}
SLAB_ATTR_RO(usersize);
static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%d\n", !!(s->flags & SLAB_TYPESAFE_BY_RCU));
+ return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_TYPESAFE_BY_RCU));
}
SLAB_ATTR_RO(destroy_by_rcu);
@@ -5214,33 +5216,33 @@ SLAB_ATTR_RO(total_objects);
static ssize_t sanity_checks_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%d\n", !!(s->flags & SLAB_CONSISTENCY_CHECKS));
+ return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_CONSISTENCY_CHECKS));
}
SLAB_ATTR_RO(sanity_checks);
static ssize_t trace_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%d\n", !!(s->flags & SLAB_TRACE));
+ return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_TRACE));
}
SLAB_ATTR_RO(trace);
static ssize_t red_zone_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE));
+ return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_RED_ZONE));
}
SLAB_ATTR_RO(red_zone);
static ssize_t poison_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%d\n", !!(s->flags & SLAB_POISON));
+ return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_POISON));
}
SLAB_ATTR_RO(poison);
static ssize_t store_user_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%d\n", !!(s->flags & SLAB_STORE_USER));
+ return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_STORE_USER));
}
SLAB_ATTR_RO(store_user);
@@ -5284,7 +5286,7 @@ SLAB_ATTR_RO(free_calls);
#ifdef CONFIG_FAILSLAB
static ssize_t failslab_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB));
+ return sysfs_emit(buf, "%d\n", !!(s->flags & SLAB_FAILSLAB));
}
SLAB_ATTR_RO(failslab);
#endif
@@ -5308,7 +5310,7 @@ SLAB_ATTR(shrink);
#ifdef CONFIG_NUMA
static ssize_t remote_node_defrag_ratio_show(struct kmem_cache *s, char *buf)
{
- return sprintf(buf, "%u\n", s->remote_node_defrag_ratio / 10);
+ return sysfs_emit(buf, "%u\n", s->remote_node_defrag_ratio / 10);
}
static ssize_t remote_node_defrag_ratio_store(struct kmem_cache *s,
@@ -5335,7 +5337,7 @@ static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
{
unsigned long sum = 0;
int cpu;
- int len;
+ int len = 0;
int *data = kmalloc_array(nr_cpu_ids, sizeof(int), GFP_KERNEL);
if (!data)
@@ -5348,16 +5350,19 @@ static int show_stat(struct kmem_cache *s, char *buf, enum stat_item si)
sum += x;
}
- len = sprintf(buf, "%lu", sum);
+ len += sysfs_emit_at(buf, len, "%lu", sum);
#ifdef CONFIG_SMP
for_each_online_cpu(cpu) {
- if (data[cpu] && len < PAGE_SIZE - 20)
- len += sprintf(buf + len, " C%d=%u", cpu, data[cpu]);
+ if (data[cpu])
+ len += sysfs_emit_at(buf, len, " C%d=%u",
+ cpu, data[cpu]);
}
#endif
kfree(data);
- return len + sprintf(buf + len, "\n");
+ len += sysfs_emit_at(buf, len, "\n");
+
+ return len;
}
static void clear_stat(struct kmem_cache *s, enum stat_item si)
diff --git a/mm/swap.c b/mm/swap.c
index 47a47681c86b..2cca7141470c 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -79,16 +79,14 @@ static DEFINE_PER_CPU(struct lru_pvecs, lru_pvecs) = {
static void __page_cache_release(struct page *page)
{
if (PageLRU(page)) {
- pg_data_t *pgdat = page_pgdat(page);
struct lruvec *lruvec;
unsigned long flags;
- spin_lock_irqsave(&pgdat->lru_lock, flags);
- lruvec = mem_cgroup_page_lruvec(page, pgdat);
+ lruvec = lock_page_lruvec_irqsave(page, &flags);
VM_BUG_ON_PAGE(!PageLRU(page), page);
__ClearPageLRU(page);
del_page_from_lru_list(page, lruvec, page_off_lru(page));
- spin_unlock_irqrestore(&pgdat->lru_lock, flags);
+ unlock_page_lruvec_irqrestore(lruvec, flags);
}
__ClearPageWaiters(page);
}
@@ -204,63 +202,46 @@ int get_kernel_page(unsigned long start, int write, struct page **pages)
EXPORT_SYMBOL_GPL(get_kernel_page);
static void pagevec_lru_move_fn(struct pagevec *pvec,
- void (*move_fn)(struct page *page, struct lruvec *lruvec, void *arg),
- void *arg)
+ void (*move_fn)(struct page *page, struct lruvec *lruvec))
{
int i;
- struct pglist_data *pgdat = NULL;
- struct lruvec *lruvec;
+ struct lruvec *lruvec = NULL;
unsigned long flags = 0;
for (i = 0; i < pagevec_count(pvec); i++) {
struct page *page = pvec->pages[i];
- struct pglist_data *pagepgdat = page_pgdat(page);
- if (pagepgdat != pgdat) {
- if (pgdat)
- spin_unlock_irqrestore(&pgdat->lru_lock, flags);
- pgdat = pagepgdat;
- spin_lock_irqsave(&pgdat->lru_lock, flags);
- }
+ /* block memcg migration during page moving between lru */
+ if (!TestClearPageLRU(page))
+ continue;
- lruvec = mem_cgroup_page_lruvec(page, pgdat);
- (*move_fn)(page, lruvec, arg);
+ lruvec = relock_page_lruvec_irqsave(page, lruvec, &flags);
+ (*move_fn)(page, lruvec);
+
+ SetPageLRU(page);
}
- if (pgdat)
- spin_unlock_irqrestore(&pgdat->lru_lock, flags);
+ if (lruvec)
+ unlock_page_lruvec_irqrestore(lruvec, flags);
release_pages(pvec->pages, pvec->nr);
pagevec_reinit(pvec);
}
-static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec,
- void *arg)
+static void pagevec_move_tail_fn(struct page *page, struct lruvec *lruvec)
{
- int *pgmoved = arg;
-
- if (PageLRU(page) && !PageUnevictable(page)) {
+ if (!PageUnevictable(page)) {
del_page_from_lru_list(page, lruvec, page_lru(page));
ClearPageActive(page);
add_page_to_lru_list_tail(page, lruvec, page_lru(page));
- (*pgmoved) += thp_nr_pages(page);
+ __count_vm_events(PGROTATED, thp_nr_pages(page));
}
}
/*
- * pagevec_move_tail() must be called with IRQ disabled.
- * Otherwise this may cause nasty races.
- */
-static void pagevec_move_tail(struct pagevec *pvec)
-{
- int pgmoved = 0;
-
- pagevec_lru_move_fn(pvec, pagevec_move_tail_fn, &pgmoved);
- __count_vm_events(PGROTATED, pgmoved);
-}
-
-/*
* Writeback is about to end against a page which has been marked for immediate
* reclaim. If it still appears to be reclaimable, move it to the tail of the
* inactive list.
+ *
+ * rotate_reclaimable_page() must disable IRQs, to prevent nasty races.
*/
void rotate_reclaimable_page(struct page *page)
{
@@ -273,7 +254,7 @@ void rotate_reclaimable_page(struct page *page)
local_lock_irqsave(&lru_rotate.lock, flags);
pvec = this_cpu_ptr(&lru_rotate.pvec);
if (!pagevec_add(pvec, page) || PageCompound(page))
- pagevec_move_tail(pvec);
+ pagevec_lru_move_fn(pvec, pagevec_move_tail_fn);
local_unlock_irqrestore(&lru_rotate.lock, flags);
}
}
@@ -283,6 +264,14 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
do {
unsigned long lrusize;
+ /*
+ * Hold lruvec->lru_lock is safe here, since
+ * 1) The pinned lruvec in reclaim, or
+ * 2) From a pre-LRU page during refault (which also holds the
+ * rcu lock, so would be safe even if the page was on the LRU
+ * and could move simultaneously to a new lruvec).
+ */
+ spin_lock_irq(&lruvec->lru_lock);
/* Record cost event */
if (file)
lruvec->file_cost += nr_pages;
@@ -306,6 +295,7 @@ void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages)
lruvec->file_cost /= 2;
lruvec->anon_cost /= 2;
}
+ spin_unlock_irq(&lruvec->lru_lock);
} while ((lruvec = parent_lruvec(lruvec)));
}
@@ -315,10 +305,9 @@ void lru_note_cost_page(struct page *page)
page_is_file_lru(page), thp_nr_pages(page));
}
-static void __activate_page(struct page *page, struct lruvec *lruvec,
- void *arg)
+static void __activate_page(struct page *page, struct lruvec *lruvec)
{
- if (PageLRU(page) && !PageActive(page) && !PageUnevictable(page)) {
+ if (!PageActive(page) && !PageUnevictable(page)) {
int lru = page_lru_base_type(page);
int nr_pages = thp_nr_pages(page);
@@ -340,7 +329,7 @@ static void activate_page_drain(int cpu)
struct pagevec *pvec = &per_cpu(lru_pvecs.activate_page, cpu);
if (pagevec_count(pvec))
- pagevec_lru_move_fn(pvec, __activate_page, NULL);
+ pagevec_lru_move_fn(pvec, __activate_page);
}
static bool need_activate_page_drain(int cpu)
@@ -358,7 +347,7 @@ static void activate_page(struct page *page)
pvec = this_cpu_ptr(&lru_pvecs.activate_page);
get_page(page);
if (!pagevec_add(pvec, page) || PageCompound(page))
- pagevec_lru_move_fn(pvec, __activate_page, NULL);
+ pagevec_lru_move_fn(pvec, __activate_page);
local_unlock(&lru_pvecs.lock);
}
}
@@ -370,12 +359,15 @@ static inline void activate_page_drain(int cpu)
static void activate_page(struct page *page)
{
- pg_data_t *pgdat = page_pgdat(page);
+ struct lruvec *lruvec;
page = compound_head(page);
- spin_lock_irq(&pgdat->lru_lock);
- __activate_page(page, mem_cgroup_page_lruvec(page, pgdat), NULL);
- spin_unlock_irq(&pgdat->lru_lock);
+ if (TestClearPageLRU(page)) {
+ lruvec = lock_page_lruvec_irq(page);
+ __activate_page(page, lruvec);
+ unlock_page_lruvec_irq(lruvec);
+ SetPageLRU(page);
+ }
}
#endif
@@ -525,16 +517,12 @@ void lru_cache_add_inactive_or_unevictable(struct page *page,
* be write it out by flusher threads as this is much more effective
* than the single-page writeout from reclaim.
*/
-static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
- void *arg)
+static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec)
{
int lru;
bool active;
int nr_pages = thp_nr_pages(page);
- if (!PageLRU(page))
- return;
-
if (PageUnevictable(page))
return;
@@ -573,10 +561,9 @@ static void lru_deactivate_file_fn(struct page *page, struct lruvec *lruvec,
}
}
-static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
- void *arg)
+static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec)
{
- if (PageLRU(page) && PageActive(page) && !PageUnevictable(page)) {
+ if (PageActive(page) && !PageUnevictable(page)) {
int lru = page_lru_base_type(page);
int nr_pages = thp_nr_pages(page);
@@ -591,10 +578,9 @@ static void lru_deactivate_fn(struct page *page, struct lruvec *lruvec,
}
}
-static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec,
- void *arg)
+static void lru_lazyfree_fn(struct page *page, struct lruvec *lruvec)
{
- if (PageLRU(page) && PageAnon(page) && PageSwapBacked(page) &&
+ if (PageAnon(page) && PageSwapBacked(page) &&
!PageSwapCache(page) && !PageUnevictable(page)) {
bool active = PageActive(page);
int nr_pages = thp_nr_pages(page);
@@ -636,21 +622,21 @@ void lru_add_drain_cpu(int cpu)
/* No harm done if a racing interrupt already did this */
local_lock_irqsave(&lru_rotate.lock, flags);
- pagevec_move_tail(pvec);
+ pagevec_lru_move_fn(pvec, pagevec_move_tail_fn);
local_unlock_irqrestore(&lru_rotate.lock, flags);
}
pvec = &per_cpu(lru_pvecs.lru_deactivate_file, cpu);
if (pagevec_count(pvec))
- pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
+ pagevec_lru_move_fn(pvec, lru_deactivate_file_fn);
pvec = &per_cpu(lru_pvecs.lru_deactivate, cpu);
if (pagevec_count(pvec))
- pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
+ pagevec_lru_move_fn(pvec, lru_deactivate_fn);
pvec = &per_cpu(lru_pvecs.lru_lazyfree, cpu);
if (pagevec_count(pvec))
- pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);
+ pagevec_lru_move_fn(pvec, lru_lazyfree_fn);
activate_page_drain(cpu);
}
@@ -679,7 +665,7 @@ void deactivate_file_page(struct page *page)
pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate_file);
if (!pagevec_add(pvec, page) || PageCompound(page))
- pagevec_lru_move_fn(pvec, lru_deactivate_file_fn, NULL);
+ pagevec_lru_move_fn(pvec, lru_deactivate_file_fn);
local_unlock(&lru_pvecs.lock);
}
}
@@ -701,7 +687,7 @@ void deactivate_page(struct page *page)
pvec = this_cpu_ptr(&lru_pvecs.lru_deactivate);
get_page(page);
if (!pagevec_add(pvec, page) || PageCompound(page))
- pagevec_lru_move_fn(pvec, lru_deactivate_fn, NULL);
+ pagevec_lru_move_fn(pvec, lru_deactivate_fn);
local_unlock(&lru_pvecs.lock);
}
}
@@ -723,7 +709,7 @@ void mark_page_lazyfree(struct page *page)
pvec = this_cpu_ptr(&lru_pvecs.lru_lazyfree);
get_page(page);
if (!pagevec_add(pvec, page) || PageCompound(page))
- pagevec_lru_move_fn(pvec, lru_lazyfree_fn, NULL);
+ pagevec_lru_move_fn(pvec, lru_lazyfree_fn);
local_unlock(&lru_pvecs.lock);
}
}
@@ -871,8 +857,7 @@ void release_pages(struct page **pages, int nr)
{
int i;
LIST_HEAD(pages_to_free);
- struct pglist_data *locked_pgdat = NULL;
- struct lruvec *lruvec;
+ struct lruvec *lruvec = NULL;
unsigned long flags;
unsigned int lock_batch;
@@ -882,11 +867,11 @@ void release_pages(struct page **pages, int nr)
/*
* Make sure the IRQ-safe lock-holding time does not get
* excessive with a continuous string of pages from the
- * same pgdat. The lock is held only if pgdat != NULL.
+ * same lruvec. The lock is held only if lruvec != NULL.
*/
- if (locked_pgdat && ++lock_batch == SWAP_CLUSTER_MAX) {
- spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
- locked_pgdat = NULL;
+ if (lruvec && ++lock_batch == SWAP_CLUSTER_MAX) {
+ unlock_page_lruvec_irqrestore(lruvec, flags);
+ lruvec = NULL;
}
page = compound_head(page);
@@ -894,10 +879,9 @@ void release_pages(struct page **pages, int nr)
continue;
if (is_zone_device_page(page)) {
- if (locked_pgdat) {
- spin_unlock_irqrestore(&locked_pgdat->lru_lock,
- flags);
- locked_pgdat = NULL;
+ if (lruvec) {
+ unlock_page_lruvec_irqrestore(lruvec, flags);
+ lruvec = NULL;
}
/*
* ZONE_DEVICE pages that return 'false' from
@@ -909,33 +893,31 @@ void release_pages(struct page **pages, int nr)
put_devmap_managed_page(page);
continue;
}
+ if (put_page_testzero(page))
+ put_dev_pagemap(page->pgmap);
+ continue;
}
if (!put_page_testzero(page))
continue;
if (PageCompound(page)) {
- if (locked_pgdat) {
- spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
- locked_pgdat = NULL;
+ if (lruvec) {
+ unlock_page_lruvec_irqrestore(lruvec, flags);
+ lruvec = NULL;
}
__put_compound_page(page);
continue;
}
if (PageLRU(page)) {
- struct pglist_data *pgdat = page_pgdat(page);
+ struct lruvec *prev_lruvec = lruvec;
- if (pgdat != locked_pgdat) {
- if (locked_pgdat)
- spin_unlock_irqrestore(&locked_pgdat->lru_lock,
- flags);
+ lruvec = relock_page_lruvec_irqsave(page, lruvec,
+ &flags);
+ if (prev_lruvec != lruvec)
lock_batch = 0;
- locked_pgdat = pgdat;
- spin_lock_irqsave(&locked_pgdat->lru_lock, flags);
- }
- lruvec = mem_cgroup_page_lruvec(page, locked_pgdat);
VM_BUG_ON_PAGE(!PageLRU(page), page);
__ClearPageLRU(page);
del_page_from_lru_list(page, lruvec, page_off_lru(page));
@@ -945,8 +927,8 @@ void release_pages(struct page **pages, int nr)
list_add(&page->lru, &pages_to_free);
}
- if (locked_pgdat)
- spin_unlock_irqrestore(&locked_pgdat->lru_lock, flags);
+ if (lruvec)
+ unlock_page_lruvec_irqrestore(lruvec, flags);
mem_cgroup_uncharge_list(&pages_to_free);
free_unref_page_list(&pages_to_free);
@@ -974,41 +956,7 @@ void __pagevec_release(struct pagevec *pvec)
}
EXPORT_SYMBOL(__pagevec_release);
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
-/* used by __split_huge_page_refcount() */
-void lru_add_page_tail(struct page *page, struct page *page_tail,
- struct lruvec *lruvec, struct list_head *list)
-{
- VM_BUG_ON_PAGE(!PageHead(page), page);
- VM_BUG_ON_PAGE(PageCompound(page_tail), page);
- VM_BUG_ON_PAGE(PageLRU(page_tail), page);
- lockdep_assert_held(&lruvec_pgdat(lruvec)->lru_lock);
-
- if (!list)
- SetPageLRU(page_tail);
-
- if (likely(PageLRU(page)))
- list_add_tail(&page_tail->lru, &page->lru);
- else if (list) {
- /* page reclaim is reclaiming a huge page */
- get_page(page_tail);
- list_add_tail(&page_tail->lru, list);
- } else {
- /*
- * Head page has not yet been counted, as an hpage,
- * so we must account for each subpage individually.
- *
- * Put page_tail on the list at the correct position
- * so they all end up in order.
- */
- add_page_to_lru_list_tail(page_tail, lruvec,
- page_lru(page_tail));
- }
-}
-#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
-
-static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
- void *arg)
+static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec)
{
enum lru_list lru;
int was_unevictable = TestClearPageUnevictable(page);
@@ -1067,7 +1015,20 @@ static void __pagevec_lru_add_fn(struct page *page, struct lruvec *lruvec,
*/
void __pagevec_lru_add(struct pagevec *pvec)
{
- pagevec_lru_move_fn(pvec, __pagevec_lru_add_fn, NULL);
+ int i;
+ struct lruvec *lruvec = NULL;
+ unsigned long flags = 0;
+
+ for (i = 0; i < pagevec_count(pvec); i++) {
+ struct page *page = pvec->pages[i];
+
+ lruvec = relock_page_lruvec_irqsave(page, lruvec, &flags);
+ __pagevec_lru_add_fn(page, lruvec);
+ }
+ if (lruvec)
+ unlock_page_lruvec_irqrestore(lruvec, flags);
+ release_pages(pvec->pages, pvec->nr);
+ pagevec_reinit(pvec);
}
/**
@@ -1164,15 +1125,6 @@ unsigned pagevec_lookup_range_tag(struct pagevec *pvec,
}
EXPORT_SYMBOL(pagevec_lookup_range_tag);
-unsigned pagevec_lookup_range_nr_tag(struct pagevec *pvec,
- struct address_space *mapping, pgoff_t *index, pgoff_t end,
- xa_mark_t tag, unsigned max_pages)
-{
- pvec->nr = find_get_pages_range_tag(mapping, index, end, tag,
- min_t(unsigned int, max_pages, PAGEVEC_SIZE), pvec->pages);
- return pagevec_count(pvec);
-}
-EXPORT_SYMBOL(pagevec_lookup_range_nr_tag);
/*
* Perform any setup for the swap system
*/
diff --git a/mm/swap_state.c b/mm/swap_state.c
index ee465827420e..751c1ef2fe0e 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -839,7 +839,9 @@ static struct page *swap_vma_readahead(swp_entry_t fentry, gfp_t gfp_mask,
swp_entry_t entry;
unsigned int i;
bool page_allocated;
- struct vma_swap_readahead ra_info = {0,};
+ struct vma_swap_readahead ra_info = {
+ .win = 1,
+ };
swap_ra_info(vmf, &ra_info);
if (ra_info.win == 1)
@@ -900,7 +902,8 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
static ssize_t vma_ra_enabled_show(struct kobject *kobj,
struct kobj_attribute *attr, char *buf)
{
- return sprintf(buf, "%s\n", enable_vma_readahead ? "true" : "false");
+ return sysfs_emit(buf, "%s\n",
+ enable_vma_readahead ? "true" : "false");
}
static ssize_t vma_ra_enabled_store(struct kobject *kobj,
struct kobj_attribute *attr,
diff --git a/mm/swapfile.c b/mm/swapfile.c
index c4a613688a17..9fffc5af29d1 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -975,8 +975,7 @@ static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
{
unsigned long idx;
struct swap_cluster_info *ci;
- unsigned long offset, i;
- unsigned char *map;
+ unsigned long offset;
/*
* Should not even be attempting cluster allocations when huge
@@ -996,9 +995,7 @@ static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot)
alloc_cluster(si, idx);
cluster_set_count_flag(ci, SWAPFILE_CLUSTER, CLUSTER_FLAG_HUGE);
- map = si->swap_map + offset;
- for (i = 0; i < SWAPFILE_CLUSTER; i++)
- map[i] = SWAP_HAS_CACHE;
+ memset(si->swap_map + offset, SWAP_HAS_CACHE, SWAPFILE_CLUSTER);
unlock_cluster(ci);
swap_range_alloc(si, offset, SWAPFILE_CLUSTER);
*slot = swp_entry(si->type, offset);
@@ -1045,16 +1042,18 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
/* Only single cluster request supported */
WARN_ON_ONCE(n_goal > 1 && size == SWAPFILE_CLUSTER);
+ spin_lock(&swap_avail_lock);
+
avail_pgs = atomic_long_read(&nr_swap_pages) / size;
- if (avail_pgs <= 0)
+ if (avail_pgs <= 0) {
+ spin_unlock(&swap_avail_lock);
goto noswap;
+ }
n_goal = min3((long)n_goal, (long)SWAP_BATCH, avail_pgs);
atomic_long_sub(n_goal * size, &nr_swap_pages);
- spin_lock(&swap_avail_lock);
-
start_over:
node = numa_node_id();
plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
@@ -1128,14 +1127,13 @@ swp_entry_t get_swap_page_of_type(int type)
spin_lock(&si->lock);
if (si->flags & SWP_WRITEOK) {
- atomic_long_dec(&nr_swap_pages);
/* This is called for allocating swap entry, not cache */
offset = scan_swap_map(si, 1);
if (offset) {
+ atomic_long_dec(&nr_swap_pages);
spin_unlock(&si->lock);
return swp_entry(type, offset);
}
- atomic_long_inc(&nr_swap_pages);
}
spin_unlock(&si->lock);
fail:
@@ -2867,6 +2865,7 @@ late_initcall(max_swapfiles_check);
static struct swap_info_struct *alloc_swap_info(void)
{
struct swap_info_struct *p;
+ struct swap_info_struct *defer = NULL;
unsigned int type;
int i;
@@ -2895,7 +2894,7 @@ static struct swap_info_struct *alloc_swap_info(void)
smp_wmb();
WRITE_ONCE(nr_swapfiles, nr_swapfiles + 1);
} else {
- kvfree(p);
+ defer = p;
p = swap_info[type];
/*
* Do not memset this entry: a racing procfs swap_next()
@@ -2908,6 +2907,7 @@ static struct swap_info_struct *alloc_swap_info(void)
plist_node_init(&p->avail_lists[i], 0);
p->flags = SWP_USED;
spin_unlock(&swap_lock);
+ kvfree(defer);
spin_lock_init(&p->lock);
spin_lock_init(&p->cont_lock);
@@ -3443,11 +3443,11 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
unsigned long offset;
unsigned char count;
unsigned char has_cache;
- int err = -EINVAL;
+ int err;
p = get_swap_device(entry);
if (!p)
- goto out;
+ return -EINVAL;
offset = swp_offset(entry);
ci = lock_cluster_or_swap_info(p, offset);
@@ -3494,7 +3494,6 @@ static int __swap_duplicate(swp_entry_t entry, unsigned char usage)
unlock_out:
unlock_cluster_or_swap_info(p, ci);
-out:
if (p)
put_swap_device(p);
return err;
@@ -3611,7 +3610,7 @@ int add_swap_count_continuation(swp_entry_t entry, gfp_t gfp_mask)
ci = lock_cluster(si, offset);
- count = si->swap_map[offset] & ~SWAP_HAS_CACHE;
+ count = swap_count(si->swap_map[offset]);
if ((count & ~COUNT_CONTINUED) != SWAP_MAP_MAX) {
/*
diff --git a/mm/truncate.c b/mm/truncate.c
index 960edf5803ca..8aa4907e06e0 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -637,9 +637,15 @@ unsigned long invalidate_mapping_pages(struct address_space *mapping,
EXPORT_SYMBOL(invalidate_mapping_pages);
/**
- * This helper is similar with the above one, except that it accounts for pages
- * that are likely on a pagevec and count them in @nr_pagevec, which will used by
- * the caller.
+ * invalidate_mapping_pagevec - Invalidate all the unlocked pages of one inode
+ * @mapping: the address_space which holds the pages to invalidate
+ * @start: the offset 'from' which to invalidate
+ * @end: the offset 'to' which to invalidate (inclusive)
+ * @nr_pagevec: invalidate failed page number for caller
+ *
+ * This helper is similar to invalidate_mapping_pages(), except that it accounts
+ * for pages that are likely on a pagevec and counts them in @nr_pagevec, which
+ * will be used by the caller.
*/
void invalidate_mapping_pagevec(struct address_space *mapping,
pgoff_t start, pgoff_t end, unsigned long *nr_pagevec)
diff --git a/mm/util.c b/mm/util.c
index 4ddb6e186dd5..8c9b7d1e7c49 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -311,6 +311,18 @@ int vma_is_stack_for_current(struct vm_area_struct *vma)
return (vma->vm_start <= KSTK_ESP(t) && vma->vm_end >= KSTK_ESP(t));
}
+/*
+ * Change backing file, only valid to use during initial VMA setup.
+ */
+void vma_set_file(struct vm_area_struct *vma, struct file *file)
+{
+ /* Changing an anonymous vma with this is illegal */
+ get_file(file);
+ swap(vma->vm_file, file);
+ fput(file);
+}
+EXPORT_SYMBOL(vma_set_file);
+
#ifndef STACK_RND_MASK
#define STACK_RND_MASK (0x7ff >> (PAGE_SHIFT - 12)) /* 8MB of VA */
#endif
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 6ae491a8b210..4d88fe5a277a 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -413,10 +413,13 @@ static DEFINE_SPINLOCK(vmap_area_lock);
static DEFINE_SPINLOCK(free_vmap_area_lock);
/* Export for kexec only */
LIST_HEAD(vmap_area_list);
-static LLIST_HEAD(vmap_purge_list);
static struct rb_root vmap_area_root = RB_ROOT;
static bool vmap_initialized __read_mostly;
+static struct rb_root purge_vmap_area_root = RB_ROOT;
+static LIST_HEAD(purge_vmap_area_list);
+static DEFINE_SPINLOCK(purge_vmap_area_lock);
+
/*
* This kmem_cache is used for vmap_area objects. Instead of
* allocating from slab we reuse an object from this cache to
@@ -820,10 +823,17 @@ insert:
if (!merged)
link_va(va, root, parent, link, head);
- /*
- * Last step is to check and update the tree.
- */
- augment_tree_propagate_from(va);
+ return va;
+}
+
+static __always_inline struct vmap_area *
+merge_or_add_vmap_area_augment(struct vmap_area *va,
+ struct rb_root *root, struct list_head *head)
+{
+ va = merge_or_add_vmap_area(va, root, head);
+ if (va)
+ augment_tree_propagate_from(va);
+
return va;
}
@@ -1138,7 +1148,7 @@ static void free_vmap_area(struct vmap_area *va)
* Insert/Merge it back to the free tree/list.
*/
spin_lock(&free_vmap_area_lock);
- merge_or_add_vmap_area(va, &free_vmap_area_root, &free_vmap_area_list);
+ merge_or_add_vmap_area_augment(va, &free_vmap_area_root, &free_vmap_area_list);
spin_unlock(&free_vmap_area_lock);
}
@@ -1326,32 +1336,32 @@ void set_iounmap_nonlazy(void)
static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
{
unsigned long resched_threshold;
- struct llist_node *valist;
- struct vmap_area *va;
- struct vmap_area *n_va;
+ struct list_head local_pure_list;
+ struct vmap_area *va, *n_va;
lockdep_assert_held(&vmap_purge_lock);
- valist = llist_del_all(&vmap_purge_list);
- if (unlikely(valist == NULL))
+ spin_lock(&purge_vmap_area_lock);
+ purge_vmap_area_root = RB_ROOT;
+ list_replace_init(&purge_vmap_area_list, &local_pure_list);
+ spin_unlock(&purge_vmap_area_lock);
+
+ if (unlikely(list_empty(&local_pure_list)))
return false;
- /*
- * TODO: to calculate a flush range without looping.
- * The list can be up to lazy_max_pages() elements.
- */
- llist_for_each_entry(va, valist, purge_list) {
- if (va->va_start < start)
- start = va->va_start;
- if (va->va_end > end)
- end = va->va_end;
- }
+ start = min(start,
+ list_first_entry(&local_pure_list,
+ struct vmap_area, list)->va_start);
+
+ end = max(end,
+ list_last_entry(&local_pure_list,
+ struct vmap_area, list)->va_end);
flush_tlb_kernel_range(start, end);
resched_threshold = lazy_max_pages() << 1;
spin_lock(&free_vmap_area_lock);
- llist_for_each_entry_safe(va, n_va, valist, purge_list) {
+ list_for_each_entry_safe(va, n_va, &local_pure_list, list) {
unsigned long nr = (va->va_end - va->va_start) >> PAGE_SHIFT;
unsigned long orig_start = va->va_start;
unsigned long orig_end = va->va_end;
@@ -1361,8 +1371,8 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)
* detached and there is no need to "unlink" it from
* anything.
*/
- va = merge_or_add_vmap_area(va, &free_vmap_area_root,
- &free_vmap_area_list);
+ va = merge_or_add_vmap_area_augment(va, &free_vmap_area_root,
+ &free_vmap_area_list);
if (!va)
continue;
@@ -1419,9 +1429,15 @@ static void free_vmap_area_noflush(struct vmap_area *va)
nr_lazy = atomic_long_add_return((va->va_end - va->va_start) >>
PAGE_SHIFT, &vmap_lazy_nr);
- /* After this point, we may free va at any time */
- llist_add(&va->purge_list, &vmap_purge_list);
+ /*
+ * Merge or place it to the purge tree/list.
+ */
+ spin_lock(&purge_vmap_area_lock);
+ merge_or_add_vmap_area(va,
+ &purge_vmap_area_root, &purge_vmap_area_list);
+ spin_unlock(&purge_vmap_area_lock);
+ /* After this point, we may free va at any time */
if (unlikely(nr_lazy > lazy_max_pages()))
try_purge_vmap_area_lazy();
}
@@ -2256,7 +2272,7 @@ static void __vunmap(const void *addr, int deallocate_pages)
debug_check_no_locks_freed(area->addr, get_vm_area_size(area));
debug_check_no_obj_freed(area->addr, get_vm_area_size(area));
- kasan_poison_vmalloc(area->addr, area->size);
+ kasan_poison_vmalloc(area->addr, get_vm_area_size(area));
vm_remove_mappings(area, deallocate_pages);
@@ -2275,7 +2291,6 @@ static void __vunmap(const void *addr, int deallocate_pages)
}
kfree(area);
- return;
}
static inline void __vfree_deferred(const void *addr)
@@ -2461,9 +2476,11 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
{
const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO;
unsigned int nr_pages = get_vm_area_size(area) >> PAGE_SHIFT;
- unsigned int array_size = nr_pages * sizeof(struct page *), i;
+ unsigned long array_size;
+ unsigned int i;
struct page **pages;
+ array_size = (unsigned long)nr_pages * sizeof(struct page *);
gfp_mask |= __GFP_NOWARN;
if (!(gfp_mask & (GFP_DMA | GFP_DMA32)))
gfp_mask |= __GFP_HIGHMEM;
@@ -2477,8 +2494,7 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
}
if (!pages) {
- remove_vm_area(area->addr);
- kfree(area);
+ free_vm_area(area);
return NULL;
}
@@ -3134,6 +3150,7 @@ pvm_find_va_enclose_addr(unsigned long addr)
* @va:
* in - the VA we start the search(reverse order);
* out - the VA with the highest aligned end address.
+ * @align: alignment for required highest address
*
* Returns: determined end address within vmap_area
*/
@@ -3350,8 +3367,8 @@ recovery:
while (area--) {
orig_start = vas[area]->va_start;
orig_end = vas[area]->va_end;
- va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root,
- &free_vmap_area_list);
+ va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root,
+ &free_vmap_area_list);
if (va)
kasan_release_vmalloc(orig_start, orig_end,
va->va_start, va->va_end);
@@ -3400,8 +3417,8 @@ err_free_shadow:
for (area = 0; area < nr_vms; area++) {
orig_start = vas[area]->va_start;
orig_end = vas[area]->va_end;
- va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root,
- &free_vmap_area_list);
+ va = merge_or_add_vmap_area_augment(vas[area], &free_vmap_area_root,
+ &free_vmap_area_list);
if (va)
kasan_release_vmalloc(orig_start, orig_end,
va->va_start, va->va_end);
@@ -3448,11 +3465,11 @@ static void *s_next(struct seq_file *m, void *p, loff_t *pos)
}
static void s_stop(struct seq_file *m, void *p)
- __releases(&vmap_purge_lock)
__releases(&vmap_area_lock)
+ __releases(&vmap_purge_lock)
{
- mutex_unlock(&vmap_purge_lock);
spin_unlock(&vmap_area_lock);
+ mutex_unlock(&vmap_purge_lock);
}
static void show_numa_info(struct seq_file *m, struct vm_struct *v)
@@ -3481,18 +3498,15 @@ static void show_numa_info(struct seq_file *m, struct vm_struct *v)
static void show_purge_info(struct seq_file *m)
{
- struct llist_node *head;
struct vmap_area *va;
- head = READ_ONCE(vmap_purge_list.first);
- if (head == NULL)
- return;
-
- llist_for_each_entry(va, head, purge_list) {
+ spin_lock(&purge_vmap_area_lock);
+ list_for_each_entry(va, &purge_vmap_area_list, list) {
seq_printf(m, "0x%pK-0x%pK %7ld unpurged vm_area\n",
(void *)va->va_start, (void *)va->va_end,
va->va_end - va->va_start);
}
+ spin_unlock(&purge_vmap_area_lock);
}
static int s_show(struct seq_file *m, void *p)
@@ -3550,10 +3564,7 @@ static int s_show(struct seq_file *m, void *p)
seq_putc(m, '\n');
/*
- * As a final step, dump "unpurged" areas. Note,
- * that entire "/proc/vmallocinfo" output will not
- * be address sorted, because the purge list is not
- * sorted.
+ * As a final step, dump "unpurged" areas.
*/
if (list_is_last(&va->list, &vmap_area_list))
show_purge_info(m);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 7b4e31eac2cf..257cba79a96d 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -1,7 +1,5 @@
// SPDX-License-Identifier: GPL-2.0
/*
- * linux/mm/vmscan.c
- *
* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
*
* Swap reorganised 29.12.95, Stephen Tweedie.
@@ -1072,7 +1070,6 @@ static void page_check_dirty_writeback(struct page *page,
static unsigned int shrink_page_list(struct list_head *page_list,
struct pglist_data *pgdat,
struct scan_control *sc,
- enum ttu_flags ttu_flags,
struct reclaim_stat *stat,
bool ignore_references)
{
@@ -1297,7 +1294,7 @@ static unsigned int shrink_page_list(struct list_head *page_list,
* processes. Try to unmap it here.
*/
if (page_mapped(page)) {
- enum ttu_flags flags = ttu_flags | TTU_BATCH_FLUSH;
+ enum ttu_flags flags = TTU_BATCH_FLUSH;
bool was_swapbacked = PageSwapBacked(page);
if (unlikely(PageTransHuge(page)))
@@ -1372,6 +1369,7 @@ static unsigned int shrink_page_list(struct list_head *page_list,
if (PageDirty(page) || PageWriteback(page))
goto keep_locked;
mapping = page_mapping(page);
+ fallthrough;
case PAGE_CLEAN:
; /* try to free the page below */
}
@@ -1393,7 +1391,7 @@ static unsigned int shrink_page_list(struct list_head *page_list,
*
* Rarely, pages can have buffers and no ->mapping. These are
* the pages which were not successfully invalidated in
- * truncate_complete_page(). We try to drop those buffers here
+ * truncate_cleanup_page(). We try to drop those buffers here
* and if that worked, and the page is no longer mapped into
* process address space (page_count == 1) it can be freed.
* Otherwise, leave the page on the LRU so it is swappable.
@@ -1514,7 +1512,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
}
nr_reclaimed = shrink_page_list(&clean_pages, zone->zone_pgdat, &sc,
- TTU_IGNORE_ACCESS, &stat, true);
+ &stat, true);
list_splice(&clean_pages, page_list);
mod_node_page_state(zone->zone_pgdat, NR_ISOLATED_FILE,
-(long)nr_reclaimed);
@@ -1541,9 +1539,9 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone,
*
* returns 0 on success, -ve errno on failure.
*/
-int __isolate_lru_page(struct page *page, isolate_mode_t mode)
+int __isolate_lru_page_prepare(struct page *page, isolate_mode_t mode)
{
- int ret = -EINVAL;
+ int ret = -EBUSY;
/* Only take pages on the LRU. */
if (!PageLRU(page))
@@ -1553,8 +1551,6 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode)
if (PageUnevictable(page) && !(mode & ISOLATE_UNEVICTABLE))
return ret;
- ret = -EBUSY;
-
/*
* To minimise LRU disruption, the caller can indicate that it only
* wants to isolate pages it will be able to operate on without
@@ -1595,20 +1591,9 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode)
if ((mode & ISOLATE_UNMAPPED) && page_mapped(page))
return ret;
- if (likely(get_page_unless_zero(page))) {
- /*
- * Be careful not to clear PageLRU until after we're
- * sure the page is not being freed elsewhere -- the
- * page release code relies on it.
- */
- ClearPageLRU(page);
- ret = 0;
- }
-
- return ret;
+ return 0;
}
-
/*
* Update LRU sizes after isolating pages. The LRU size updates must
* be complete before mem_cgroup_update_lru_size due to a sanity check.
@@ -1628,14 +1613,16 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec,
}
/**
- * pgdat->lru_lock is heavily contended. Some of the functions that
+ * Isolating page from the lruvec to fill in @dst list by nr_to_scan times.
+ *
+ * lruvec->lru_lock is heavily contended. Some of the functions that
* shrink the lists perform better by taking out a batch of pages
* and working on them outside the LRU lock.
*
* For pagecache intensive workloads, this function is the hottest
* spot in the kernel (apart from copy_*_user functions).
*
- * Appropriate locks must be held before calling this function.
+ * Lru_lock must be held before calling this function.
*
* @nr_to_scan: The number of eligible pages to look through on the list.
* @lruvec: The LRU vector to pull pages from.
@@ -1668,8 +1655,6 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
page = lru_to_page(src);
prefetchw_prev_lru_page(page, src, flags);
- VM_BUG_ON_PAGE(!PageLRU(page), page);
-
nr_pages = compound_nr(page);
total_scan += nr_pages;
@@ -1690,20 +1675,34 @@ static unsigned long isolate_lru_pages(unsigned long nr_to_scan,
* only when the page is being freed somewhere else.
*/
scan += nr_pages;
- switch (__isolate_lru_page(page, mode)) {
+ switch (__isolate_lru_page_prepare(page, mode)) {
case 0:
+ /*
+ * Be careful not to clear PageLRU until after we're
+ * sure the page is not being freed elsewhere -- the
+ * page release code relies on it.
+ */
+ if (unlikely(!get_page_unless_zero(page)))
+ goto busy;
+
+ if (!TestClearPageLRU(page)) {
+ /*
+ * This page may in other isolation path,
+ * but we still hold lru_lock.
+ */
+ put_page(page);
+ goto busy;
+ }
+
nr_taken += nr_pages;
nr_zone_taken[page_zonenum(page)] += nr_pages;
list_move(&page->lru, dst);
break;
- case -EBUSY:
+ default:
+busy:
/* else it is being freed elsewhere */
list_move(&page->lru, src);
- continue;
-
- default:
- BUG();
}
}
@@ -1766,21 +1765,16 @@ int isolate_lru_page(struct page *page)
VM_BUG_ON_PAGE(!page_count(page), page);
WARN_RATELIMIT(PageTail(page), "trying to isolate tail page");
- if (PageLRU(page)) {
- pg_data_t *pgdat = page_pgdat(page);
+ if (TestClearPageLRU(page)) {
struct lruvec *lruvec;
- spin_lock_irq(&pgdat->lru_lock);
- lruvec = mem_cgroup_page_lruvec(page, pgdat);
- if (PageLRU(page)) {
- int lru = page_lru(page);
- get_page(page);
- ClearPageLRU(page);
- del_page_from_lru_list(page, lruvec, lru);
- ret = 0;
- }
- spin_unlock_irq(&pgdat->lru_lock);
+ get_page(page);
+ lruvec = lock_page_lruvec_irq(page);
+ del_page_from_lru_list(page, lruvec, page_lru(page));
+ unlock_page_lruvec_irq(lruvec);
+ ret = 0;
}
+
return ret;
}
@@ -1822,29 +1816,14 @@ static int too_many_isolated(struct pglist_data *pgdat, int file,
}
/*
- * This moves pages from @list to corresponding LRU list.
- *
- * We move them the other way if the page is referenced by one or more
- * processes, from rmap.
- *
- * If the pages are mostly unmapped, the processing is fast and it is
- * appropriate to hold zone_lru_lock across the whole operation. But if
- * the pages are mapped, the processing is slow (page_referenced()) so we
- * should drop zone_lru_lock around each page. It's impossible to balance
- * this, so instead we remove the pages from the LRU while processing them.
- * It is safe to rely on PG_active against the non-LRU pages in here because
- * nobody will play with that bit on a non-LRU page.
- *
- * The downside is that we have to touch page->_refcount against each page.
- * But we had to alter page->flags anyway.
+ * move_pages_to_lru() moves pages from private @list to appropriate LRU list.
+ * On return, @list is reused as a list of pages to be freed by the caller.
*
* Returns the number of pages moved to the given lruvec.
*/
-
static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
struct list_head *list)
{
- struct pglist_data *pgdat = lruvec_pgdat(lruvec);
int nr_pages, nr_moved = 0;
LIST_HEAD(pages_to_free);
struct page *page;
@@ -1853,38 +1832,54 @@ static unsigned noinline_for_stack move_pages_to_lru(struct lruvec *lruvec,
while (!list_empty(list)) {
page = lru_to_page(list);
VM_BUG_ON_PAGE(PageLRU(page), page);
+ list_del(&page->lru);
if (unlikely(!page_evictable(page))) {
- list_del(&page->lru);
- spin_unlock_irq(&pgdat->lru_lock);
+ spin_unlock_irq(&lruvec->lru_lock);
putback_lru_page(page);
- spin_lock_irq(&pgdat->lru_lock);
+ spin_lock_irq(&lruvec->lru_lock);
continue;
}
- lruvec = mem_cgroup_page_lruvec(page, pgdat);
+ /*
+ * The SetPageLRU needs to be kept here for list integrity.
+ * Otherwise:
+ * #0 move_pages_to_lru #1 release_pages
+ * if !put_page_testzero
+ * if (put_page_testzero())
+ * !PageLRU //skip lru_lock
+ * SetPageLRU()
+ * list_add(&page->lru,)
+ * list_add(&page->lru,)
+ */
SetPageLRU(page);
- lru = page_lru(page);
- nr_pages = thp_nr_pages(page);
- update_lru_size(lruvec, lru, page_zonenum(page), nr_pages);
- list_move(&page->lru, &lruvec->lists[lru]);
-
- if (put_page_testzero(page)) {
+ if (unlikely(put_page_testzero(page))) {
__ClearPageLRU(page);
__ClearPageActive(page);
- del_page_from_lru_list(page, lruvec, lru);
if (unlikely(PageCompound(page))) {
- spin_unlock_irq(&pgdat->lru_lock);
+ spin_unlock_irq(&lruvec->lru_lock);
destroy_compound_page(page);
- spin_lock_irq(&pgdat->lru_lock);
+ spin_lock_irq(&lruvec->lru_lock);
} else
list_add(&page->lru, &pages_to_free);
- } else {
- nr_moved += nr_pages;
- if (PageActive(page))
- workingset_age_nonresident(lruvec, nr_pages);
+
+ continue;
}
+
+ /*
+ * All pages were isolated from the same lruvec (and isolation
+ * inhibits memcg migration).
+ */
+ VM_BUG_ON_PAGE(!lruvec_holds_page_lru_lock(page, lruvec), page);
+ lru = page_lru(page);
+ nr_pages = thp_nr_pages(page);
+
+ update_lru_size(lruvec, lru, page_zonenum(page), nr_pages);
+ list_add(&page->lru, &lruvec->lists[lru]);
+ nr_moved += nr_pages;
+ if (PageActive(page))
+ workingset_age_nonresident(lruvec, nr_pages);
}
/*
@@ -1941,7 +1936,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
lru_add_drain();
- spin_lock_irq(&pgdat->lru_lock);
+ spin_lock_irq(&lruvec->lru_lock);
nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list,
&nr_scanned, sc, lru);
@@ -1953,28 +1948,25 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
__count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
__count_vm_events(PGSCAN_ANON + file, nr_scanned);
- spin_unlock_irq(&pgdat->lru_lock);
+ spin_unlock_irq(&lruvec->lru_lock);
if (nr_taken == 0)
return 0;
- nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, 0,
- &stat, false);
-
- spin_lock_irq(&pgdat->lru_lock);
+ nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, &stat, false);
+ spin_lock_irq(&lruvec->lru_lock);
move_pages_to_lru(lruvec, &page_list);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
- lru_note_cost(lruvec, file, stat.nr_pageout);
item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
if (!cgroup_reclaim(sc))
__count_vm_events(item, nr_reclaimed);
__count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
__count_vm_events(PGSTEAL_ANON + file, nr_reclaimed);
+ spin_unlock_irq(&lruvec->lru_lock);
- spin_unlock_irq(&pgdat->lru_lock);
-
+ lru_note_cost(lruvec, file, stat.nr_pageout);
mem_cgroup_uncharge_list(&page_list);
free_unref_page_list(&page_list);
@@ -2006,6 +1998,23 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec,
return nr_reclaimed;
}
+/*
+ * shrink_active_list() moves pages from the active LRU to the inactive LRU.
+ *
+ * We move them the other way if the page is referenced by one or more
+ * processes.
+ *
+ * If the pages are mostly unmapped, the processing is fast and it is
+ * appropriate to hold lru_lock across the whole operation. But if
+ * the pages are mapped, the processing is slow (page_referenced()), so
+ * we should drop lru_lock around each page. It's impossible to balance
+ * this, so instead we remove the pages from the LRU while processing them.
+ * It is safe to rely on PG_active against the non-LRU pages in here because
+ * nobody will play with that bit on a non-LRU page.
+ *
+ * The downside is that we have to touch page->_refcount against each page.
+ * But we had to alter page->flags anyway.
+ */
static void shrink_active_list(unsigned long nr_to_scan,
struct lruvec *lruvec,
struct scan_control *sc,
@@ -2025,7 +2034,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
lru_add_drain();
- spin_lock_irq(&pgdat->lru_lock);
+ spin_lock_irq(&lruvec->lru_lock);
nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold,
&nr_scanned, sc, lru);
@@ -2036,7 +2045,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
__count_vm_events(PGREFILL, nr_scanned);
__count_memcg_events(lruvec_memcg(lruvec), PGREFILL, nr_scanned);
- spin_unlock_irq(&pgdat->lru_lock);
+ spin_unlock_irq(&lruvec->lru_lock);
while (!list_empty(&l_hold)) {
cond_resched();
@@ -2082,7 +2091,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
/*
* Move pages back to the lru list.
*/
- spin_lock_irq(&pgdat->lru_lock);
+ spin_lock_irq(&lruvec->lru_lock);
nr_activate = move_pages_to_lru(lruvec, &l_active);
nr_deactivate = move_pages_to_lru(lruvec, &l_inactive);
@@ -2093,7 +2102,7 @@ static void shrink_active_list(unsigned long nr_to_scan,
__count_memcg_events(lruvec_memcg(lruvec), PGDEACTIVATE, nr_deactivate);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
- spin_unlock_irq(&pgdat->lru_lock);
+ spin_unlock_irq(&lruvec->lru_lock);
mem_cgroup_uncharge_list(&l_active);
free_unref_page_list(&l_active);
@@ -2131,8 +2140,7 @@ unsigned long reclaim_pages(struct list_head *page_list)
nr_reclaimed += shrink_page_list(&node_page_list,
NODE_DATA(nid),
- &sc, 0,
- &dummy_stat, false);
+ &sc, &dummy_stat, false);
while (!list_empty(&node_page_list)) {
page = lru_to_page(&node_page_list);
list_del(&page->lru);
@@ -2145,8 +2153,7 @@ unsigned long reclaim_pages(struct list_head *page_list)
if (!list_empty(&node_page_list)) {
nr_reclaimed += shrink_page_list(&node_page_list,
NODE_DATA(nid),
- &sc, 0,
- &dummy_stat, false);
+ &sc, &dummy_stat, false);
while (!list_empty(&node_page_list)) {
page = lru_to_page(&node_page_list);
list_del(&page->lru);
@@ -2683,10 +2690,10 @@ again:
/*
* Determine the scan balance between anon and file LRUs.
*/
- spin_lock_irq(&pgdat->lru_lock);
+ spin_lock_irq(&target_lruvec->lru_lock);
sc->anon_cost = target_lruvec->anon_cost;
sc->file_cost = target_lruvec->file_cost;
- spin_unlock_irq(&pgdat->lru_lock);
+ spin_unlock_irq(&target_lruvec->lru_lock);
/*
* Target desirable inactive:active list ratios for the anon
@@ -3899,7 +3906,7 @@ kswapd_try_sleep:
highest_zoneidx);
/* Read the new order and highest_zoneidx */
- alloc_order = reclaim_order = READ_ONCE(pgdat->kswapd_order);
+ alloc_order = READ_ONCE(pgdat->kswapd_order);
highest_zoneidx = kswapd_highest_zoneidx(pgdat,
highest_zoneidx);
WRITE_ONCE(pgdat->kswapd_order, 0);
@@ -4262,15 +4269,13 @@ int node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned int order)
*/
void check_move_unevictable_pages(struct pagevec *pvec)
{
- struct lruvec *lruvec;
- struct pglist_data *pgdat = NULL;
+ struct lruvec *lruvec = NULL;
int pgscanned = 0;
int pgrescued = 0;
int i;
for (i = 0; i < pvec->nr; i++) {
struct page *page = pvec->pages[i];
- struct pglist_data *pagepgdat = page_pgdat(page);
int nr_pages;
if (PageTransTail(page))
@@ -4279,18 +4284,12 @@ void check_move_unevictable_pages(struct pagevec *pvec)
nr_pages = thp_nr_pages(page);
pgscanned += nr_pages;
- if (pagepgdat != pgdat) {
- if (pgdat)
- spin_unlock_irq(&pgdat->lru_lock);
- pgdat = pagepgdat;
- spin_lock_irq(&pgdat->lru_lock);
- }
- lruvec = mem_cgroup_page_lruvec(page, pgdat);
-
- if (!PageLRU(page) || !PageUnevictable(page))
+ /* block memcg migration during page moving between lru */
+ if (!TestClearPageLRU(page))
continue;
- if (page_evictable(page)) {
+ lruvec = relock_page_lruvec_irq(page, lruvec);
+ if (page_evictable(page) && PageUnevictable(page)) {
enum lru_list lru = page_lru_base_type(page);
VM_BUG_ON_PAGE(PageActive(page), page);
@@ -4299,12 +4298,15 @@ void check_move_unevictable_pages(struct pagevec *pvec)
add_page_to_lru_list(page, lruvec, lru);
pgrescued += nr_pages;
}
+ SetPageLRU(page);
}
- if (pgdat) {
+ if (lruvec) {
__count_vm_events(UNEVICTABLE_PGRESCUED, pgrescued);
__count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
- spin_unlock_irq(&pgdat->lru_lock);
+ unlock_page_lruvec_irq(lruvec);
+ } else if (pgscanned) {
+ count_vm_events(UNEVICTABLE_PGSCANNED, pgscanned);
}
}
EXPORT_SYMBOL_GPL(check_move_unevictable_pages);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 698bc0bc18d1..f8942160fc95 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -1157,7 +1157,6 @@ const char * const vmstat_text[] = {
"nr_zone_unevictable",
"nr_zone_write_pending",
"nr_mlock",
- "nr_page_table_pages",
"nr_bounce",
#if IS_ENABLED(CONFIG_ZSMALLOC)
"nr_zspages",
@@ -1215,6 +1214,7 @@ const char * const vmstat_text[] = {
#if IS_ENABLED(CONFIG_SHADOW_CALL_STACK)
"nr_shadow_call_stack",
#endif
+ "nr_page_table_pages",
/* enum writeback_stat_item counters */
"nr_dirty_threshold",
@@ -1503,10 +1503,6 @@ static void pagetypeinfo_showblockcount_print(struct seq_file *m,
if (!page)
continue;
- /* Watch for unexpected holes punched in the memmap */
- if (!memmap_valid_within(pfn, page, zone))
- continue;
-
if (page_zone(page) != zone)
continue;
diff --git a/mm/workingset.c b/mm/workingset.c
index 975a4d2dd02e..10e96de945b3 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -257,7 +257,7 @@ void *workingset_eviction(struct page *page, struct mem_cgroup *target_memcg)
struct lruvec *lruvec;
int memcgid;
- /* Page is fully exclusive and pins page->mem_cgroup */
+ /* Page is fully exclusive and pins page's memory cgroup pointer */
VM_BUG_ON_PAGE(PageLRU(page), page);
VM_BUG_ON_PAGE(page_count(page), page);
VM_BUG_ON_PAGE(!PageLocked(page), page);
@@ -381,9 +381,7 @@ void workingset_refault(struct page *page, void *shadow)
if (workingset) {
SetPageWorkingset(page);
/* XXX: Move to lru_cache_add() when it supports new vs putback */
- spin_lock_irq(&page_pgdat(page)->lru_lock);
lru_note_cost_page(page);
- spin_unlock_irq(&page_pgdat(page)->lru_lock);
inc_lruvec_state(lruvec, WORKINGSET_RESTORE_BASE + file);
}
out:
@@ -445,12 +443,12 @@ void workingset_update_node(struct xa_node *node)
if (node->count && node->count == node->nr_values) {
if (list_empty(&node->private_list)) {
list_lru_add(&shadow_nodes, &node->private_list);
- __inc_lruvec_slab_state(node, WORKINGSET_NODES);
+ __inc_lruvec_kmem_state(node, WORKINGSET_NODES);
}
} else {
if (!list_empty(&node->private_list)) {
list_lru_del(&shadow_nodes, &node->private_list);
- __dec_lruvec_slab_state(node, WORKINGSET_NODES);
+ __dec_lruvec_kmem_state(node, WORKINGSET_NODES);
}
}
}
@@ -544,7 +542,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
}
list_lru_isolate(lru, item);
- __dec_lruvec_slab_state(node, WORKINGSET_NODES);
+ __dec_lruvec_kmem_state(node, WORKINGSET_NODES);
spin_unlock(lru_lock);
@@ -559,7 +557,7 @@ static enum lru_status shadow_lru_isolate(struct list_head *item,
goto out_invalid;
mapping->nrexceptional -= node->nr_values;
xa_delete_node(node, workingset_update_node);
- __inc_lruvec_slab_state(node, WORKINGSET_NODERECLAIM);
+ __inc_lruvec_kmem_state(node, WORKINGSET_NODERECLAIM);
out_invalid:
xa_unlock_irq(&mapping->i_pages);
diff --git a/mm/z3fold.c b/mm/z3fold.c
index 18feaa0bc537..dacb0d70fa61 100644
--- a/mm/z3fold.c
+++ b/mm/z3fold.c
@@ -90,7 +90,7 @@ struct z3fold_buddy_slots {
* be enough slots to hold all possible variants
*/
unsigned long slot[BUDDY_MASK + 1];
- unsigned long pool; /* back link + flags */
+ unsigned long pool; /* back link */
rwlock_t lock;
};
#define HANDLE_FLAG_MASK (0x03)
@@ -185,7 +185,7 @@ enum z3fold_page_flags {
* handle flags, go under HANDLE_FLAG_MASK
*/
enum z3fold_handle_flags {
- HANDLES_ORPHANED = 0,
+ HANDLES_NOFREE = 0,
};
/*
@@ -303,10 +303,9 @@ static inline void put_z3fold_header(struct z3fold_header *zhdr)
z3fold_page_unlock(zhdr);
}
-static inline void free_handle(unsigned long handle)
+static inline void free_handle(unsigned long handle, struct z3fold_header *zhdr)
{
struct z3fold_buddy_slots *slots;
- struct z3fold_header *zhdr;
int i;
bool is_free;
@@ -316,22 +315,19 @@ static inline void free_handle(unsigned long handle)
if (WARN_ON(*(unsigned long *)handle == 0))
return;
- zhdr = handle_to_z3fold_header(handle);
slots = handle_to_slots(handle);
write_lock(&slots->lock);
*(unsigned long *)handle = 0;
- if (zhdr->slots == slots) {
+
+ if (test_bit(HANDLES_NOFREE, &slots->pool)) {
write_unlock(&slots->lock);
return; /* simple case, nothing else to do */
}
- /* we are freeing a foreign handle if we are here */
- zhdr->foreign_handles--;
+ if (zhdr->slots != slots)
+ zhdr->foreign_handles--;
+
is_free = true;
- if (!test_bit(HANDLES_ORPHANED, &slots->pool)) {
- write_unlock(&slots->lock);
- return;
- }
for (i = 0; i <= BUDDY_MASK; i++) {
if (slots->slot[i]) {
is_free = false;
@@ -343,6 +339,8 @@ static inline void free_handle(unsigned long handle)
if (is_free) {
struct z3fold_pool *pool = slots_to_pool(slots);
+ if (zhdr->slots == slots)
+ zhdr->slots = NULL;
kmem_cache_free(pool->c_handle, slots);
}
}
@@ -525,8 +523,6 @@ static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
{
struct page *page = virt_to_page(zhdr);
struct z3fold_pool *pool = zhdr_to_pool(zhdr);
- bool is_free = true;
- int i;
WARN_ON(!list_empty(&zhdr->buddy));
set_bit(PAGE_STALE, &page->private);
@@ -536,21 +532,6 @@ static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
list_del_init(&page->lru);
spin_unlock(&pool->lock);
- /* If there are no foreign handles, free the handles array */
- read_lock(&zhdr->slots->lock);
- for (i = 0; i <= BUDDY_MASK; i++) {
- if (zhdr->slots->slot[i]) {
- is_free = false;
- break;
- }
- }
- if (!is_free)
- set_bit(HANDLES_ORPHANED, &zhdr->slots->pool);
- read_unlock(&zhdr->slots->lock);
-
- if (is_free)
- kmem_cache_free(pool->c_handle, zhdr->slots);
-
if (locked)
z3fold_page_unlock(zhdr);
@@ -642,15 +623,39 @@ static inline void add_to_unbuddied(struct z3fold_pool *pool,
{
if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 ||
zhdr->middle_chunks == 0) {
- struct list_head *unbuddied = get_cpu_ptr(pool->unbuddied);
-
+ struct list_head *unbuddied;
int freechunks = num_free_chunks(zhdr);
+
+ migrate_disable();
+ unbuddied = this_cpu_ptr(pool->unbuddied);
spin_lock(&pool->lock);
list_add(&zhdr->buddy, &unbuddied[freechunks]);
spin_unlock(&pool->lock);
zhdr->cpu = smp_processor_id();
- put_cpu_ptr(pool->unbuddied);
+ migrate_enable();
+ }
+}
+
+static inline enum buddy get_free_buddy(struct z3fold_header *zhdr, int chunks)
+{
+ enum buddy bud = HEADLESS;
+
+ if (zhdr->middle_chunks) {
+ if (!zhdr->first_chunks &&
+ chunks <= zhdr->start_middle - ZHDR_CHUNKS)
+ bud = FIRST;
+ else if (!zhdr->last_chunks)
+ bud = LAST;
+ } else {
+ if (!zhdr->first_chunks)
+ bud = FIRST;
+ else if (!zhdr->last_chunks)
+ bud = LAST;
+ else
+ bud = MIDDLE;
}
+
+ return bud;
}
static inline void *mchunk_memmove(struct z3fold_header *zhdr,
@@ -714,18 +719,7 @@ static struct z3fold_header *compact_single_buddy(struct z3fold_header *zhdr)
if (WARN_ON(new_zhdr == zhdr))
goto out_fail;
- if (new_zhdr->first_chunks == 0) {
- if (new_zhdr->middle_chunks != 0 &&
- chunks >= new_zhdr->start_middle) {
- new_bud = LAST;
- } else {
- new_bud = FIRST;
- }
- } else if (new_zhdr->last_chunks == 0) {
- new_bud = LAST;
- } else if (new_zhdr->middle_chunks == 0) {
- new_bud = MIDDLE;
- }
+ new_bud = get_free_buddy(new_zhdr, chunks);
q = new_zhdr;
switch (new_bud) {
case FIRST:
@@ -847,9 +841,8 @@ static void do_compact_page(struct z3fold_header *zhdr, bool locked)
return;
}
- if (unlikely(PageIsolated(page) ||
- test_bit(PAGE_CLAIMED, &page->private) ||
- test_bit(PAGE_STALE, &page->private))) {
+ if (test_bit(PAGE_STALE, &page->private) ||
+ test_and_set_bit(PAGE_CLAIMED, &page->private)) {
z3fold_page_unlock(zhdr);
return;
}
@@ -858,13 +851,16 @@ static void do_compact_page(struct z3fold_header *zhdr, bool locked)
zhdr->mapped_count == 0 && compact_single_buddy(zhdr)) {
if (kref_put(&zhdr->refcount, release_z3fold_page_locked))
atomic64_dec(&pool->pages_nr);
- else
+ else {
+ clear_bit(PAGE_CLAIMED, &page->private);
z3fold_page_unlock(zhdr);
+ }
return;
}
z3fold_compact_page(zhdr);
add_to_unbuddied(pool, zhdr);
+ clear_bit(PAGE_CLAIMED, &page->private);
z3fold_page_unlock(zhdr);
}
@@ -886,8 +882,9 @@ static inline struct z3fold_header *__z3fold_alloc(struct z3fold_pool *pool,
int chunks = size_to_chunks(size), i;
lookup:
+ migrate_disable();
/* First, try to find an unbuddied z3fold page. */
- unbuddied = get_cpu_ptr(pool->unbuddied);
+ unbuddied = this_cpu_ptr(pool->unbuddied);
for_each_unbuddied_list(i, chunks) {
struct list_head *l = &unbuddied[i];
@@ -905,7 +902,7 @@ lookup:
!z3fold_page_trylock(zhdr)) {
spin_unlock(&pool->lock);
zhdr = NULL;
- put_cpu_ptr(pool->unbuddied);
+ migrate_enable();
if (can_sleep)
cond_resched();
goto lookup;
@@ -919,7 +916,7 @@ lookup:
test_bit(PAGE_CLAIMED, &page->private)) {
z3fold_page_unlock(zhdr);
zhdr = NULL;
- put_cpu_ptr(pool->unbuddied);
+ migrate_enable();
if (can_sleep)
cond_resched();
goto lookup;
@@ -934,7 +931,7 @@ lookup:
kref_get(&zhdr->refcount);
break;
}
- put_cpu_ptr(pool->unbuddied);
+ migrate_enable();
if (!zhdr) {
int cpu;
@@ -973,6 +970,9 @@ lookup:
}
}
+ if (zhdr && !zhdr->slots)
+ zhdr->slots = alloc_slots(pool,
+ can_sleep ? GFP_NOIO : GFP_ATOMIC);
return zhdr;
}
@@ -1109,17 +1109,8 @@ static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
retry:
zhdr = __z3fold_alloc(pool, size, can_sleep);
if (zhdr) {
- if (zhdr->first_chunks == 0) {
- if (zhdr->middle_chunks != 0 &&
- chunks >= zhdr->start_middle)
- bud = LAST;
- else
- bud = FIRST;
- } else if (zhdr->last_chunks == 0)
- bud = LAST;
- else if (zhdr->middle_chunks == 0)
- bud = MIDDLE;
- else {
+ bud = get_free_buddy(zhdr, chunks);
+ if (bud == HEADLESS) {
if (kref_put(&zhdr->refcount,
release_z3fold_page_locked))
atomic64_dec(&pool->pages_nr);
@@ -1265,12 +1256,11 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
pr_err("%s: unknown bud %d\n", __func__, bud);
WARN_ON(1);
put_z3fold_header(zhdr);
- clear_bit(PAGE_CLAIMED, &page->private);
return;
}
if (!page_claimed)
- free_handle(handle);
+ free_handle(handle, zhdr);
if (kref_put(&zhdr->refcount, release_z3fold_page_locked_list)) {
atomic64_dec(&pool->pages_nr);
return;
@@ -1280,8 +1270,7 @@ static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
z3fold_page_unlock(zhdr);
return;
}
- if (unlikely(PageIsolated(page)) ||
- test_and_set_bit(NEEDS_COMPACTING, &page->private)) {
+ if (test_and_set_bit(NEEDS_COMPACTING, &page->private)) {
put_z3fold_header(zhdr);
clear_bit(PAGE_CLAIMED, &page->private);
return;
@@ -1345,6 +1334,10 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
struct page *page = NULL;
struct list_head *pos;
unsigned long first_handle = 0, middle_handle = 0, last_handle = 0;
+ struct z3fold_buddy_slots slots __attribute__((aligned(SLOTS_ALIGN)));
+
+ rwlock_init(&slots.lock);
+ slots.pool = (unsigned long)pool | (1 << HANDLES_NOFREE);
spin_lock(&pool->lock);
if (!pool->ops || !pool->ops->evict || retries == 0) {
@@ -1359,35 +1352,36 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
list_for_each_prev(pos, &pool->lru) {
page = list_entry(pos, struct page, lru);
- /* this bit could have been set by free, in which case
- * we pass over to the next page in the pool.
- */
- if (test_and_set_bit(PAGE_CLAIMED, &page->private)) {
- page = NULL;
- continue;
- }
-
- if (unlikely(PageIsolated(page))) {
- clear_bit(PAGE_CLAIMED, &page->private);
- page = NULL;
- continue;
- }
zhdr = page_address(page);
if (test_bit(PAGE_HEADLESS, &page->private))
break;
+ if (kref_get_unless_zero(&zhdr->refcount) == 0) {
+ zhdr = NULL;
+ break;
+ }
if (!z3fold_page_trylock(zhdr)) {
- clear_bit(PAGE_CLAIMED, &page->private);
+ if (kref_put(&zhdr->refcount,
+ release_z3fold_page))
+ atomic64_dec(&pool->pages_nr);
zhdr = NULL;
continue; /* can't evict at this point */
}
- if (zhdr->foreign_handles) {
- clear_bit(PAGE_CLAIMED, &page->private);
- z3fold_page_unlock(zhdr);
+
+ /* test_and_set_bit is of course atomic, but we still
+ * need to do it under page lock, otherwise checking
+ * that bit in __z3fold_alloc wouldn't make sense
+ */
+ if (zhdr->foreign_handles ||
+ test_and_set_bit(PAGE_CLAIMED, &page->private)) {
+ if (kref_put(&zhdr->refcount,
+ release_z3fold_page))
+ atomic64_dec(&pool->pages_nr);
+ else
+ z3fold_page_unlock(zhdr);
zhdr = NULL;
continue; /* can't evict such page */
}
- kref_get(&zhdr->refcount);
list_del_init(&zhdr->buddy);
zhdr->cpu = -1;
break;
@@ -1409,12 +1403,16 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
first_handle = 0;
last_handle = 0;
middle_handle = 0;
+ memset(slots.slot, 0, sizeof(slots.slot));
if (zhdr->first_chunks)
- first_handle = encode_handle(zhdr, FIRST);
+ first_handle = __encode_handle(zhdr, &slots,
+ FIRST);
if (zhdr->middle_chunks)
- middle_handle = encode_handle(zhdr, MIDDLE);
+ middle_handle = __encode_handle(zhdr, &slots,
+ MIDDLE);
if (zhdr->last_chunks)
- last_handle = encode_handle(zhdr, LAST);
+ last_handle = __encode_handle(zhdr, &slots,
+ LAST);
/*
* it's safe to unlock here because we hold a
* reference to this page
@@ -1429,19 +1427,16 @@ static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
ret = pool->ops->evict(pool, middle_handle);
if (ret)
goto next;
- free_handle(middle_handle);
}
if (first_handle) {
ret = pool->ops->evict(pool, first_handle);
if (ret)
goto next;
- free_handle(first_handle);
}
if (last_handle) {
ret = pool->ops->evict(pool, last_handle);
if (ret)
goto next;
- free_handle(last_handle);
}
next:
if (test_bit(PAGE_HEADLESS, &page->private)) {
@@ -1455,9 +1450,11 @@ next:
spin_unlock(&pool->lock);
clear_bit(PAGE_CLAIMED, &page->private);
} else {
+ struct z3fold_buddy_slots *slots = zhdr->slots;
z3fold_page_lock(zhdr);
if (kref_put(&zhdr->refcount,
release_z3fold_page_locked)) {
+ kmem_cache_free(pool->c_handle, slots);
atomic64_dec(&pool->pages_nr);
return 0;
}
@@ -1573,8 +1570,7 @@ static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode)
VM_BUG_ON_PAGE(!PageMovable(page), page);
VM_BUG_ON_PAGE(PageIsolated(page), page);
- if (test_bit(PAGE_HEADLESS, &page->private) ||
- test_bit(PAGE_CLAIMED, &page->private))
+ if (test_bit(PAGE_HEADLESS, &page->private))
return false;
zhdr = page_address(page);
@@ -1586,6 +1582,8 @@ static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode)
if (zhdr->mapped_count != 0 || zhdr->foreign_handles != 0)
goto out;
+ if (test_and_set_bit(PAGE_CLAIMED, &page->private))
+ goto out;
pool = zhdr_to_pool(zhdr);
spin_lock(&pool->lock);
if (!list_empty(&zhdr->buddy))
@@ -1612,16 +1610,17 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa
VM_BUG_ON_PAGE(!PageMovable(page), page);
VM_BUG_ON_PAGE(!PageIsolated(page), page);
+ VM_BUG_ON_PAGE(!test_bit(PAGE_CLAIMED, &page->private), page);
VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
zhdr = page_address(page);
pool = zhdr_to_pool(zhdr);
- if (!z3fold_page_trylock(zhdr)) {
+ if (!z3fold_page_trylock(zhdr))
return -EAGAIN;
- }
if (zhdr->mapped_count != 0 || zhdr->foreign_handles != 0) {
z3fold_page_unlock(zhdr);
+ clear_bit(PAGE_CLAIMED, &page->private);
return -EBUSY;
}
if (work_pending(&zhdr->work)) {
@@ -1663,6 +1662,7 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa
queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work);
page_mapcount_reset(page);
+ clear_bit(PAGE_CLAIMED, &page->private);
put_page(page);
return 0;
}
@@ -1686,6 +1686,7 @@ static void z3fold_page_putback(struct page *page)
spin_lock(&pool->lock);
list_add(&page->lru, &pool->lru);
spin_unlock(&pool->lock);
+ clear_bit(PAGE_CLAIMED, &page->private);
z3fold_page_unlock(zhdr);
}
diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c
index 918c7b019b3d..7289f502ffac 100644
--- a/mm/zsmalloc.c
+++ b/mm/zsmalloc.c
@@ -293,11 +293,7 @@ struct zspage {
};
struct mapping_area {
-#ifdef CONFIG_ZSMALLOC_PGTABLE_MAPPING
- struct vm_struct *vm; /* vm area for mapping object that span pages */
-#else
char *vm_buf; /* copy buffer for objects that span pages */
-#endif
char *vm_addr; /* address of kmap_atomic()'ed pages */
enum zs_mapmode vm_mm; /* mapping mode */
};
@@ -730,13 +726,10 @@ static void insert_zspage(struct size_class *class,
* We want to see more ZS_FULL pages and less almost empty/full.
* Put pages with higher ->inuse first.
*/
- if (head) {
- if (get_zspage_inuse(zspage) < get_zspage_inuse(head)) {
- list_add(&zspage->list, &head->list);
- return;
- }
- }
- list_add(&zspage->list, &class->fullness_list[fullness]);
+ if (head && get_zspage_inuse(zspage) < get_zspage_inuse(head))
+ list_add(&zspage->list, &head->list);
+ else
+ list_add(&zspage->list, &class->fullness_list[fullness]);
}
/*
@@ -1113,54 +1106,6 @@ static struct zspage *find_get_zspage(struct size_class *class)
return zspage;
}
-#ifdef CONFIG_ZSMALLOC_PGTABLE_MAPPING
-static inline int __zs_cpu_up(struct mapping_area *area)
-{
- /*
- * Make sure we don't leak memory if a cpu UP notification
- * and zs_init() race and both call zs_cpu_up() on the same cpu
- */
- if (area->vm)
- return 0;
- area->vm = get_vm_area(PAGE_SIZE * 2, 0);
- if (!area->vm)
- return -ENOMEM;
-
- /*
- * Populate ptes in advance to avoid pte allocation with GFP_KERNEL
- * in non-preemtible context of zs_map_object.
- */
- return apply_to_page_range(&init_mm, (unsigned long)area->vm->addr,
- PAGE_SIZE * 2, NULL, NULL);
-}
-
-static inline void __zs_cpu_down(struct mapping_area *area)
-{
- if (area->vm)
- free_vm_area(area->vm);
- area->vm = NULL;
-}
-
-static inline void *__zs_map_object(struct mapping_area *area,
- struct page *pages[2], int off, int size)
-{
- unsigned long addr = (unsigned long)area->vm->addr;
-
- BUG_ON(map_kernel_range(addr, PAGE_SIZE * 2, PAGE_KERNEL, pages) < 0);
- area->vm_addr = area->vm->addr;
- return area->vm_addr + off;
-}
-
-static inline void __zs_unmap_object(struct mapping_area *area,
- struct page *pages[2], int off, int size)
-{
- unsigned long addr = (unsigned long)area->vm_addr;
-
- unmap_kernel_range(addr, PAGE_SIZE * 2);
-}
-
-#else /* CONFIG_ZSMALLOC_PGTABLE_MAPPING */
-
static inline int __zs_cpu_up(struct mapping_area *area)
{
/*
@@ -1241,8 +1186,6 @@ out:
pagefault_enable();
}
-#endif /* CONFIG_ZSMALLOC_PGTABLE_MAPPING */
-
static int zs_cpu_prepare(unsigned int cpu)
{
struct mapping_area *area;
diff --git a/mm/zswap.c b/mm/zswap.c
index fbb782924ccc..182f6ad5aa69 100644
--- a/mm/zswap.c
+++ b/mm/zswap.c
@@ -24,8 +24,10 @@
#include <linux/rbtree.h>
#include <linux/swap.h>
#include <linux/crypto.h>
+#include <linux/scatterlist.h>
#include <linux/mempool.h>
#include <linux/zpool.h>
+#include <crypto/acompress.h>
#include <linux/mm_types.h>
#include <linux/page-flags.h>
@@ -81,7 +83,7 @@ static bool zswap_pool_reached_full;
static bool zswap_enabled = IS_ENABLED(CONFIG_ZSWAP_DEFAULT_ON);
static int zswap_enabled_param_set(const char *,
const struct kernel_param *);
-static struct kernel_param_ops zswap_enabled_param_ops = {
+static const struct kernel_param_ops zswap_enabled_param_ops = {
.set = zswap_enabled_param_set,
.get = param_get_bool,
};
@@ -91,7 +93,7 @@ module_param_cb(enabled, &zswap_enabled_param_ops, &zswap_enabled, 0644);
static char *zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT;
static int zswap_compressor_param_set(const char *,
const struct kernel_param *);
-static struct kernel_param_ops zswap_compressor_param_ops = {
+static const struct kernel_param_ops zswap_compressor_param_ops = {
.set = zswap_compressor_param_set,
.get = param_get_charp,
.free = param_free_charp,
@@ -102,7 +104,7 @@ module_param_cb(compressor, &zswap_compressor_param_ops,
/* Compressed storage zpool to use */
static char *zswap_zpool_type = CONFIG_ZSWAP_ZPOOL_DEFAULT;
static int zswap_zpool_param_set(const char *, const struct kernel_param *);
-static struct kernel_param_ops zswap_zpool_param_ops = {
+static const struct kernel_param_ops zswap_zpool_param_ops = {
.set = zswap_zpool_param_set,
.get = param_get_charp,
.free = param_free_charp,
@@ -127,9 +129,17 @@ module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled,
* data structures
**********************************/
+struct crypto_acomp_ctx {
+ struct crypto_acomp *acomp;
+ struct acomp_req *req;
+ struct crypto_wait wait;
+ u8 *dstmem;
+ struct mutex *mutex;
+};
+
struct zswap_pool {
struct zpool *zpool;
- struct crypto_comp * __percpu *tfm;
+ struct crypto_acomp_ctx __percpu *acomp_ctx;
struct kref kref;
struct list_head list;
struct work_struct release_work;
@@ -388,23 +398,43 @@ static struct zswap_entry *zswap_entry_find_get(struct rb_root *root,
* per-cpu code
**********************************/
static DEFINE_PER_CPU(u8 *, zswap_dstmem);
+/*
+ * If users dynamically change the zpool type and compressor at runtime, i.e.
+ * zswap is running, zswap can have more than one zpool on one cpu, but they
+ * are sharing dtsmem. So we need this mutex to be per-cpu.
+ */
+static DEFINE_PER_CPU(struct mutex *, zswap_mutex);
static int zswap_dstmem_prepare(unsigned int cpu)
{
+ struct mutex *mutex;
u8 *dst;
dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
if (!dst)
return -ENOMEM;
+ mutex = kmalloc_node(sizeof(*mutex), GFP_KERNEL, cpu_to_node(cpu));
+ if (!mutex) {
+ kfree(dst);
+ return -ENOMEM;
+ }
+
+ mutex_init(mutex);
per_cpu(zswap_dstmem, cpu) = dst;
+ per_cpu(zswap_mutex, cpu) = mutex;
return 0;
}
static int zswap_dstmem_dead(unsigned int cpu)
{
+ struct mutex *mutex;
u8 *dst;
+ mutex = per_cpu(zswap_mutex, cpu);
+ kfree(mutex);
+ per_cpu(zswap_mutex, cpu) = NULL;
+
dst = per_cpu(zswap_dstmem, cpu);
kfree(dst);
per_cpu(zswap_dstmem, cpu) = NULL;
@@ -415,30 +445,54 @@ static int zswap_dstmem_dead(unsigned int cpu)
static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
{
struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
- struct crypto_comp *tfm;
-
- if (WARN_ON(*per_cpu_ptr(pool->tfm, cpu)))
- return 0;
+ struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
+ struct crypto_acomp *acomp;
+ struct acomp_req *req;
+
+ acomp = crypto_alloc_acomp_node(pool->tfm_name, 0, 0, cpu_to_node(cpu));
+ if (IS_ERR(acomp)) {
+ pr_err("could not alloc crypto acomp %s : %ld\n",
+ pool->tfm_name, PTR_ERR(acomp));
+ return PTR_ERR(acomp);
+ }
+ acomp_ctx->acomp = acomp;
- tfm = crypto_alloc_comp(pool->tfm_name, 0, 0);
- if (IS_ERR_OR_NULL(tfm)) {
- pr_err("could not alloc crypto comp %s : %ld\n",
- pool->tfm_name, PTR_ERR(tfm));
+ req = acomp_request_alloc(acomp_ctx->acomp);
+ if (!req) {
+ pr_err("could not alloc crypto acomp_request %s\n",
+ pool->tfm_name);
+ crypto_free_acomp(acomp_ctx->acomp);
return -ENOMEM;
}
- *per_cpu_ptr(pool->tfm, cpu) = tfm;
+ acomp_ctx->req = req;
+
+ crypto_init_wait(&acomp_ctx->wait);
+ /*
+ * if the backend of acomp is async zip, crypto_req_done() will wakeup
+ * crypto_wait_req(); if the backend of acomp is scomp, the callback
+ * won't be called, crypto_wait_req() will return without blocking.
+ */
+ acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
+ crypto_req_done, &acomp_ctx->wait);
+
+ acomp_ctx->mutex = per_cpu(zswap_mutex, cpu);
+ acomp_ctx->dstmem = per_cpu(zswap_dstmem, cpu);
+
return 0;
}
static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
{
struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
- struct crypto_comp *tfm;
+ struct crypto_acomp_ctx *acomp_ctx = per_cpu_ptr(pool->acomp_ctx, cpu);
+
+ if (!IS_ERR_OR_NULL(acomp_ctx)) {
+ if (!IS_ERR_OR_NULL(acomp_ctx->req))
+ acomp_request_free(acomp_ctx->req);
+ if (!IS_ERR_OR_NULL(acomp_ctx->acomp))
+ crypto_free_acomp(acomp_ctx->acomp);
+ }
- tfm = *per_cpu_ptr(pool->tfm, cpu);
- if (!IS_ERR_OR_NULL(tfm))
- crypto_free_comp(tfm);
- *per_cpu_ptr(pool->tfm, cpu) = NULL;
return 0;
}
@@ -561,8 +615,9 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
pr_debug("using %s zpool\n", zpool_get_type(pool->zpool));
strlcpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));
- pool->tfm = alloc_percpu(struct crypto_comp *);
- if (!pool->tfm) {
+
+ pool->acomp_ctx = alloc_percpu(*pool->acomp_ctx);
+ if (!pool->acomp_ctx) {
pr_err("percpu alloc failed\n");
goto error;
}
@@ -585,7 +640,8 @@ static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
return pool;
error:
- free_percpu(pool->tfm);
+ if (pool->acomp_ctx)
+ free_percpu(pool->acomp_ctx);
if (pool->zpool)
zpool_destroy_pool(pool->zpool);
kfree(pool);
@@ -596,14 +652,14 @@ static __init struct zswap_pool *__zswap_pool_create_fallback(void)
{
bool has_comp, has_zpool;
- has_comp = crypto_has_comp(zswap_compressor, 0, 0);
+ has_comp = crypto_has_acomp(zswap_compressor, 0, 0);
if (!has_comp && strcmp(zswap_compressor,
CONFIG_ZSWAP_COMPRESSOR_DEFAULT)) {
pr_err("compressor %s not available, using default %s\n",
zswap_compressor, CONFIG_ZSWAP_COMPRESSOR_DEFAULT);
param_free_charp(&zswap_compressor);
zswap_compressor = CONFIG_ZSWAP_COMPRESSOR_DEFAULT;
- has_comp = crypto_has_comp(zswap_compressor, 0, 0);
+ has_comp = crypto_has_acomp(zswap_compressor, 0, 0);
}
if (!has_comp) {
pr_err("default compressor %s not available\n",
@@ -639,7 +695,7 @@ static void zswap_pool_destroy(struct zswap_pool *pool)
zswap_pool_debug("destroying", pool);
cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
- free_percpu(pool->tfm);
+ free_percpu(pool->acomp_ctx);
zpool_destroy_pool(pool->zpool);
kfree(pool);
}
@@ -723,7 +779,7 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp,
}
type = s;
} else if (!compressor) {
- if (!crypto_has_comp(s, 0, 0)) {
+ if (!crypto_has_acomp(s, 0, 0)) {
pr_err("compressor %s not available\n", s);
return -ENOENT;
}
@@ -774,7 +830,7 @@ static int __zswap_param_set(const char *val, const struct kernel_param *kp,
* failed, maybe both compressor and zpool params were bad.
* Allow changing this param, so pool creation will succeed
* when the other param is changed. We already verified this
- * param is ok in the zpool_has_pool() or crypto_has_comp()
+ * param is ok in the zpool_has_pool() or crypto_has_acomp()
* checks above.
*/
ret = param_set_charp(s, kp);
@@ -876,8 +932,10 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
pgoff_t offset;
struct zswap_entry *entry;
struct page *page;
- struct crypto_comp *tfm;
- u8 *src, *dst;
+ struct scatterlist input, output;
+ struct crypto_acomp_ctx *acomp_ctx;
+
+ u8 *src;
unsigned int dlen;
int ret;
struct writeback_control wbc = {
@@ -916,14 +974,20 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
case ZSWAP_SWAPCACHE_NEW: /* page is locked */
/* decompress */
+ acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
+
dlen = PAGE_SIZE;
src = (u8 *)zhdr + sizeof(struct zswap_header);
- dst = kmap_atomic(page);
- tfm = *get_cpu_ptr(entry->pool->tfm);
- ret = crypto_comp_decompress(tfm, src, entry->length,
- dst, &dlen);
- put_cpu_ptr(entry->pool->tfm);
- kunmap_atomic(dst);
+
+ mutex_lock(acomp_ctx->mutex);
+ sg_init_one(&input, src, entry->length);
+ sg_init_table(&output, 1);
+ sg_set_page(&output, page, PAGE_SIZE, 0);
+ acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, dlen);
+ ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait);
+ dlen = acomp_ctx->req->dlen;
+ mutex_unlock(acomp_ctx->mutex);
+
BUG_ON(ret);
BUG_ON(dlen != PAGE_SIZE);
@@ -1004,7 +1068,8 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
{
struct zswap_tree *tree = zswap_trees[type];
struct zswap_entry *entry, *dupentry;
- struct crypto_comp *tfm;
+ struct scatterlist input, output;
+ struct crypto_acomp_ctx *acomp_ctx;
int ret;
unsigned int hlen, dlen = PAGE_SIZE;
unsigned long handle, value;
@@ -1074,12 +1139,32 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
}
/* compress */
- dst = get_cpu_var(zswap_dstmem);
- tfm = *get_cpu_ptr(entry->pool->tfm);
- src = kmap_atomic(page);
- ret = crypto_comp_compress(tfm, src, PAGE_SIZE, dst, &dlen);
- kunmap_atomic(src);
- put_cpu_ptr(entry->pool->tfm);
+ acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
+
+ mutex_lock(acomp_ctx->mutex);
+
+ dst = acomp_ctx->dstmem;
+ sg_init_table(&input, 1);
+ sg_set_page(&input, page, PAGE_SIZE, 0);
+
+ /* zswap_dstmem is of size (PAGE_SIZE * 2). Reflect same in sg_list */
+ sg_init_one(&output, dst, PAGE_SIZE * 2);
+ acomp_request_set_params(acomp_ctx->req, &input, &output, PAGE_SIZE, dlen);
+ /*
+ * it maybe looks a little bit silly that we send an asynchronous request,
+ * then wait for its completion synchronously. This makes the process look
+ * synchronous in fact.
+ * Theoretically, acomp supports users send multiple acomp requests in one
+ * acomp instance, then get those requests done simultaneously. but in this
+ * case, frontswap actually does store and load page by page, there is no
+ * existing method to send the second page before the first page is done
+ * in one thread doing frontswap.
+ * but in different threads running on different cpu, we have different
+ * acomp instance, so multiple threads can do (de)compression in parallel.
+ */
+ ret = crypto_wait_req(crypto_acomp_compress(acomp_ctx->req), &acomp_ctx->wait);
+ dlen = acomp_ctx->req->dlen;
+
if (ret) {
ret = -EINVAL;
goto put_dstmem;
@@ -1103,7 +1188,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset,
memcpy(buf, &zhdr, hlen);
memcpy(buf + hlen, dst, dlen);
zpool_unmap_handle(entry->pool->zpool, handle);
- put_cpu_var(zswap_dstmem);
+ mutex_unlock(acomp_ctx->mutex);
/* populate entry */
entry->offset = offset;
@@ -1131,7 +1216,7 @@ insert_entry:
return 0;
put_dstmem:
- put_cpu_var(zswap_dstmem);
+ mutex_unlock(acomp_ctx->mutex);
zswap_pool_put(entry->pool);
freepage:
zswap_entry_cache_free(entry);
@@ -1148,7 +1233,8 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
{
struct zswap_tree *tree = zswap_trees[type];
struct zswap_entry *entry;
- struct crypto_comp *tfm;
+ struct scatterlist input, output;
+ struct crypto_acomp_ctx *acomp_ctx;
u8 *src, *dst;
unsigned int dlen;
int ret;
@@ -1175,11 +1261,16 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset,
src = zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO);
if (zpool_evictable(entry->pool->zpool))
src += sizeof(struct zswap_header);
- dst = kmap_atomic(page);
- tfm = *get_cpu_ptr(entry->pool->tfm);
- ret = crypto_comp_decompress(tfm, src, entry->length, dst, &dlen);
- put_cpu_ptr(entry->pool->tfm);
- kunmap_atomic(dst);
+
+ acomp_ctx = raw_cpu_ptr(entry->pool->acomp_ctx);
+ mutex_lock(acomp_ctx->mutex);
+ sg_init_one(&input, src, entry->length);
+ sg_init_table(&output, 1);
+ sg_set_page(&output, page, PAGE_SIZE, 0);
+ acomp_request_set_params(acomp_ctx->req, &input, &output, entry->length, dlen);
+ ret = crypto_wait_req(crypto_acomp_decompress(acomp_ctx->req), &acomp_ctx->wait);
+ mutex_unlock(acomp_ctx->mutex);
+
zpool_unmap_handle(entry->pool->zpool, entry->handle);
BUG_ON(ret);