Diffstat (limited to 'mm/vmscan.c')
-rw-r--r--  mm/vmscan.c  207
1 file changed, 138 insertions(+), 69 deletions(-)
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 4936a88bb26a..bd6637fcd8f9 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -54,6 +54,7 @@
#include <linux/shmem_fs.h>
#include <linux/ctype.h>
#include <linux/debugfs.h>
+#include <linux/khugepaged.h>
#include <asm/tlbflush.h>
#include <asm/div64.h>
@@ -1020,31 +1021,52 @@ out:
return freed;
}
-static void drop_slab_node(int nid)
+static unsigned long drop_slab_node(int nid)
{
- unsigned long freed;
- int shift = 0;
+ unsigned long freed = 0;
+ struct mem_cgroup *memcg = NULL;
+ memcg = mem_cgroup_iter(NULL, NULL, NULL);
do {
- struct mem_cgroup *memcg = NULL;
+ freed += shrink_slab(GFP_KERNEL, nid, memcg, 0);
+ } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
- if (fatal_signal_pending(current))
- return;
+ return freed;
+}
+void drop_slab(void)
+{
+ int nid;
+ int shift = 0;
+ unsigned long freed;
+
+ do {
freed = 0;
- memcg = mem_cgroup_iter(NULL, NULL, NULL);
- do {
- freed += shrink_slab(GFP_KERNEL, nid, memcg, 0);
- } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL);
+ for_each_online_node(nid) {
+ if (fatal_signal_pending(current))
+ return;
+
+ freed += drop_slab_node(nid);
+ }
} while ((freed >> shift++) > 1);
}
-void drop_slab(void)
+static int reclaimer_offset(void)
{
- int nid;
+ BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
+ PGDEMOTE_DIRECT - PGDEMOTE_KSWAPD);
+ BUILD_BUG_ON(PGSTEAL_DIRECT - PGSTEAL_KSWAPD !=
+ PGSCAN_DIRECT - PGSCAN_KSWAPD);
+ BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD !=
+ PGDEMOTE_KHUGEPAGED - PGDEMOTE_KSWAPD);
+ BUILD_BUG_ON(PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD !=
+ PGSCAN_KHUGEPAGED - PGSCAN_KSWAPD);
- for_each_online_node(nid)
- drop_slab_node(nid);
+ if (current_is_kswapd())
+ return 0;
+ if (current_is_khugepaged())
+ return PGSTEAL_KHUGEPAGED - PGSTEAL_KSWAPD;
+ return PGSTEAL_DIRECT - PGSTEAL_KSWAPD;
}
static inline int is_page_cache_freeable(struct folio *folio)
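
The reworked drop_slab() keeps the old back-off heuristic but hoists it out of the per-node helper: it repeatedly sweeps every online node and stops once a full sweep frees less than a per-pass doubling threshold, i.e. when (freed >> shift++) is no longer above 1. Below is a minimal user-space sketch of just that termination rule; fake_sweep_all_nodes() is an invented stand-in for the per-node sweeps, and the reclaim numbers are made up for illustration.

#include <stdio.h>

/* Stand-in for sweeping all nodes: pretend each full pass frees
 * half as much as the previous one, as a draining cache might. */
static unsigned long fake_sweep_all_nodes(void)
{
	static unsigned long next = 1024;
	unsigned long freed = next;

	next /= 2;
	return freed;
}

int main(void)
{
	int shift = 0;
	unsigned long freed;

	/*
	 * Same shape as the new drop_slab(): keep sweeping while the
	 * amount freed, scaled down by a factor that doubles every
	 * pass, is still above 1. The growing shift guarantees the
	 * loop converges even if shrinkers keep trickling out objects.
	 */
	do {
		freed = fake_sweep_all_nodes();
		printf("pass %d freed %lu\n", shift, freed);
	} while ((freed >> shift++) > 1);

	return 0;
}

The do/while condition is the one the patch keeps in drop_slab(); only the body is faked here.
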
@@ -1346,11 +1368,10 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio,
if (folio_test_swapcache(folio)) {
swp_entry_t swap = folio_swap_entry(folio);
- /* get a shadow entry before mem_cgroup_swapout() clears folio_memcg() */
if (reclaimed && !mapping_exiting(mapping))
shadow = workingset_eviction(folio, target_memcg);
- mem_cgroup_swapout(folio, swap);
__delete_from_swap_cache(folio, swap, shadow);
+ mem_cgroup_swapout(folio, swap);
xa_unlock_irq(&mapping->i_pages);
put_swap_folio(folio, swap);
} else {
@@ -1599,10 +1620,7 @@ static unsigned int demote_folio_list(struct list_head *demote_folios,
(unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION,
&nr_succeeded);
- if (current_is_kswapd())
- __count_vm_events(PGDEMOTE_KSWAPD, nr_succeeded);
- else
- __count_vm_events(PGDEMOTE_DIRECT, nr_succeeded);
+ __count_vm_events(PGDEMOTE_KSWAPD + reclaimer_offset(), nr_succeeded);
return nr_succeeded;
}
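
The new reclaimer_offset() helper, first used above for PGDEMOTE accounting, only works because the kswapd/direct/khugepaged members of each vmstat counter family are spaced identically, which the BUILD_BUG_ON()s assert at compile time. The stand-alone sketch below reproduces the parallel-enum-offset idea with invented counter names (STEAL_*, SCAN_*), purely for illustration; it is not the kernel's vm_event_item layout.

#include <stdio.h>

/* Illustrative counter families, each ordered KSWAPD, DIRECT,
 * KHUGEPAGED. Names are made up for this sketch. */
enum counter {
	STEAL_KSWAPD, STEAL_DIRECT, STEAL_KHUGEPAGED,
	SCAN_KSWAPD,  SCAN_DIRECT,  SCAN_KHUGEPAGED,
	NR_COUNTERS,
};

static unsigned long events[NR_COUNTERS];

enum reclaimer { KSWAPD, DIRECT, KHUGEPAGED };

/* Same trick as reclaimer_offset(): return the distance from the
 * KSWAPD member of a family to the member for the current context,
 * so one "base + offset" expression indexes every family. */
static int reclaimer_offset(enum reclaimer who)
{
	/* Layout assumptions the kernel checks with BUILD_BUG_ON(). */
	_Static_assert(STEAL_DIRECT - STEAL_KSWAPD ==
		       SCAN_DIRECT - SCAN_KSWAPD, "families must match");
	_Static_assert(STEAL_KHUGEPAGED - STEAL_KSWAPD ==
		       SCAN_KHUGEPAGED - SCAN_KSWAPD, "families must match");

	switch (who) {
	case KHUGEPAGED:
		return STEAL_KHUGEPAGED - STEAL_KSWAPD;
	case DIRECT:
		return STEAL_DIRECT - STEAL_KSWAPD;
	default:
		return 0;
	}
}

int main(void)
{
	enum reclaimer who = KHUGEPAGED;

	/* One expression replaces the old if/else counter selection. */
	events[STEAL_KSWAPD + reclaimer_offset(who)] += 32;
	events[SCAN_KSWAPD + reclaimer_offset(who)] += 64;

	printf("khugepaged steal=%lu scan=%lu\n",
	       events[STEAL_KHUGEPAGED], events[SCAN_KHUGEPAGED]);
	return 0;
}

In the patch the same base-plus-offset expression recurs as PGDEMOTE_KSWAPD + reclaimer_offset(), PGSCAN_KSWAPD + reclaimer_offset() and PGSTEAL_KSWAPD + reclaimer_offset(), replacing the old current_is_kswapd() selections.
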
@@ -2069,10 +2087,29 @@ keep:
nr_reclaimed += demote_folio_list(&demote_folios, pgdat);
/* Folios that could not be demoted are still in @demote_folios */
if (!list_empty(&demote_folios)) {
- /* Folios which weren't demoted go back on @folio_list for retry: */
+ /* Folios which weren't demoted go back on @folio_list */
list_splice_init(&demote_folios, folio_list);
- do_demote_pass = false;
- goto retry;
+
+ /*
+ * goto retry to reclaim the undemoted folios in folio_list if
+ * desired.
+ *
+ * Reclaiming directly from top tier nodes is not often desired
+ * due to it breaking the LRU ordering: in general memory
+ * should be reclaimed from lower tier nodes and demoted from
+ * top tier nodes.
+ *
+ * However, disabling reclaim from top tier nodes entirely
+ * would cause ooms in edge scenarios where lower tier memory
+ * is unreclaimable for whatever reason, eg memory being
+ * mlocked or too hot to reclaim. We can disable reclaim
+ * from top tier nodes in proactive reclaim though as that is
+ * not real memory pressure.
+ */
+ if (!sc->proactive) {
+ do_demote_pass = false;
+ goto retry;
+ }
}
pgactivate = stat->nr_activate[0] + stat->nr_activate[1];
@@ -2475,7 +2512,7 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
&nr_scanned, sc, lru);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken);
- item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
+ item = PGSCAN_KSWAPD + reclaimer_offset();
if (!cgroup_reclaim(sc))
__count_vm_events(item, nr_scanned);
__count_memcg_events(lruvec_memcg(lruvec), item, nr_scanned);
@@ -2492,14 +2529,14 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
move_folios_to_lru(lruvec, &folio_list);
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
- item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
+ item = PGSTEAL_KSWAPD + reclaimer_offset();
if (!cgroup_reclaim(sc))
__count_vm_events(item, nr_reclaimed);
__count_memcg_events(lruvec_memcg(lruvec), item, nr_reclaimed);
__count_vm_events(PGSTEAL_ANON + file, nr_reclaimed);
spin_unlock_irq(&lruvec->lru_lock);
- lru_note_cost(lruvec, file, stat.nr_pageout);
+ lru_note_cost(lruvec, file, stat.nr_pageout, nr_scanned - nr_reclaimed);
mem_cgroup_uncharge_list(&folio_list);
free_unref_page_list(&folio_list);
@@ -2514,8 +2551,20 @@ static unsigned long shrink_inactive_list(unsigned long nr_to_scan,
* the flushers simply cannot keep up with the allocation
* rate. Nudge the flusher threads in case they are asleep.
*/
- if (stat.nr_unqueued_dirty == nr_taken)
+ if (stat.nr_unqueued_dirty == nr_taken) {
wakeup_flusher_threads(WB_REASON_VMSCAN);
+ /*
+ * For cgroupv1 dirty throttling is achieved by waking up
+ * the kernel flusher here and later waiting on folios
+ * which are in writeback to finish (see shrink_folio_list()).
+ *
+ * Flusher may not be able to issue writeback quickly
+ * enough for cgroupv1 writeback throttling to work
+ * on a large system.
+ */
+ if (!writeback_throttling_sane(sc))
+ reclaim_throttle(pgdat, VMSCAN_THROTTLE_WRITEBACK);
+ }
sc->nr.dirty += stat.nr_dirty;
sc->nr.congested += stat.nr_congested;
@@ -2639,6 +2688,8 @@ static void shrink_active_list(unsigned long nr_to_scan,
__mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken);
spin_unlock_irq(&lruvec->lru_lock);
+ if (nr_rotated)
+ lru_note_cost(lruvec, file, 0, nr_rotated);
mem_cgroup_uncharge_list(&l_active);
free_unref_page_list(&l_active);
trace_mm_vmscan_lru_shrink_active(pgdat->node_id, nr_taken, nr_activate,
@@ -3133,7 +3184,7 @@ static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid)
if (memcg) {
struct lruvec *lruvec = &memcg->nodeinfo[nid]->lruvec;
- /* for hotadd_new_pgdat() */
+ /* see the comment in mem_cgroup_lruvec() */
if (!lruvec->pgdat)
lruvec->pgdat = pgdat;
@@ -3142,7 +3193,7 @@ static struct lruvec *get_lruvec(struct mem_cgroup *memcg, int nid)
#endif
VM_WARN_ON_ONCE(!mem_cgroup_disabled());
- return pgdat ? &pgdat->__lruvec : NULL;
+ return &pgdat->__lruvec;
}
static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc)
@@ -3206,9 +3257,6 @@ void lru_gen_add_mm(struct mm_struct *mm)
for_each_node_state(nid, N_MEMORY) {
struct lruvec *lruvec = get_lruvec(memcg, nid);
- if (!lruvec)
- continue;
-
/* the first addition since the last iteration */
if (lruvec->mm_state.tail == &mm_list->fifo)
lruvec->mm_state.tail = &mm->lru_gen.list;
@@ -3238,9 +3286,6 @@ void lru_gen_del_mm(struct mm_struct *mm)
for_each_node(nid) {
struct lruvec *lruvec = get_lruvec(memcg, nid);
- if (!lruvec)
- continue;
-
/* where the last iteration ended (exclusive) */
if (lruvec->mm_state.tail == &mm->lru_gen.list)
lruvec->mm_state.tail = lruvec->mm_state.tail->next;
@@ -3975,7 +4020,7 @@ static void walk_pmd_range_locked(pud_t *pud, unsigned long next, struct vm_area
goto next;
if (!pmd_trans_huge(pmd[i])) {
- if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) &&
+ if (arch_has_hw_nonleaf_pmd_young() &&
get_cap(LRU_GEN_NONLEAF_YOUNG))
pmdp_test_and_clear_young(vma, addr, pmd + i);
goto next;
@@ -4070,14 +4115,14 @@ restart:
#endif
walk->mm_stats[MM_NONLEAF_TOTAL]++;
-#ifdef CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG
- if (get_cap(LRU_GEN_NONLEAF_YOUNG)) {
+ if (arch_has_hw_nonleaf_pmd_young() &&
+ get_cap(LRU_GEN_NONLEAF_YOUNG)) {
if (!pmd_young(val))
continue;
walk_pmd_range_locked(pud, addr, vma, args, bitmap, &pos);
}
-#endif
+
if (!walk->force_scan && !test_bloom_filter(walk->lruvec, walk->max_seq, pmd + i))
continue;
@@ -4483,7 +4528,7 @@ static bool age_lruvec(struct lruvec *lruvec, struct scan_control *sc, unsigned
mem_cgroup_calculate_protection(NULL, memcg);
- if (mem_cgroup_below_min(memcg))
+ if (mem_cgroup_below_min(NULL, memcg))
return false;
need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan);
@@ -4854,7 +4899,7 @@ static int scan_folios(struct lruvec *lruvec, struct scan_control *sc,
break;
}
- item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT;
+ item = PGSCAN_KSWAPD + reclaimer_offset();
if (!cgroup_reclaim(sc)) {
__count_vm_events(item, isolated);
__count_vm_events(PGREFILL, sorted);
@@ -4968,10 +5013,13 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
int scanned;
int reclaimed;
LIST_HEAD(list);
+ LIST_HEAD(clean);
struct folio *folio;
+ struct folio *next;
enum vm_event_item item;
struct reclaim_stat stat;
struct lru_gen_mm_walk *walk;
+ bool skip_retry = false;
struct mem_cgroup *memcg = lruvec_memcg(lruvec);
struct pglist_data *pgdat = lruvec_pgdat(lruvec);
@@ -4988,20 +5036,37 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
if (list_empty(&list))
return scanned;
-
+retry:
reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false);
+ sc->nr_reclaimed += reclaimed;
- list_for_each_entry(folio, &list, lru) {
- /* restore LRU_REFS_FLAGS cleared by isolate_folio() */
- if (folio_test_workingset(folio))
- folio_set_referenced(folio);
+ list_for_each_entry_safe_reverse(folio, next, &list, lru) {
+ if (!folio_evictable(folio)) {
+ list_del(&folio->lru);
+ folio_putback_lru(folio);
+ continue;
+ }
- /* don't add rejected pages to the oldest generation */
if (folio_test_reclaim(folio) &&
- (folio_test_dirty(folio) || folio_test_writeback(folio)))
- folio_clear_active(folio);
- else
- folio_set_active(folio);
+ (folio_test_dirty(folio) || folio_test_writeback(folio))) {
+ /* restore LRU_REFS_FLAGS cleared by isolate_folio() */
+ if (folio_test_workingset(folio))
+ folio_set_referenced(folio);
+ continue;
+ }
+
+ if (skip_retry || folio_test_active(folio) || folio_test_referenced(folio) ||
+ folio_mapped(folio) || folio_test_locked(folio) ||
+ folio_test_dirty(folio) || folio_test_writeback(folio)) {
+ /* don't add rejected folios to the oldest generation */
+ set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS,
+ BIT(PG_active));
+ continue;
+ }
+
+ /* retry folios that may have missed folio_rotate_reclaimable() */
+ list_move(&folio->lru, &clean);
+ sc->nr_scanned -= folio_nr_pages(folio);
}
spin_lock_irq(&lruvec->lru_lock);
@@ -5012,7 +5077,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
if (walk && walk->batched)
reset_batch_size(lruvec, walk);
- item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT;
+ item = PGSTEAL_KSWAPD + reclaimer_offset();
if (!cgroup_reclaim(sc))
__count_vm_events(item, reclaimed);
__count_memcg_events(memcg, item, reclaimed);
@@ -5023,7 +5088,13 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap
mem_cgroup_uncharge_list(&list);
free_unref_page_list(&list);
- sc->nr_reclaimed += reclaimed;
+ INIT_LIST_HEAD(&list);
+ list_splice_init(&clean, &list);
+
+ if (!list_empty(&list)) {
+ skip_retry = true;
+ goto retry;
+ }
if (need_swapping && type == LRU_GEN_ANON)
*need_swapping = true;
@@ -5044,8 +5115,9 @@ static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *
DEFINE_MAX_SEQ(lruvec);
DEFINE_MIN_SEQ(lruvec);
- if (mem_cgroup_below_min(memcg) ||
- (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim))
+ if (mem_cgroup_below_min(sc->target_mem_cgroup, memcg) ||
+ (mem_cgroup_below_low(sc->target_mem_cgroup, memcg) &&
+ !sc->memcg_low_reclaim))
return 0;
*need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan);
@@ -5286,9 +5358,6 @@ static void lru_gen_change_state(bool enabled)
for_each_node(nid) {
struct lruvec *lruvec = get_lruvec(memcg, nid);
- if (!lruvec)
- continue;
-
spin_lock_irq(&lruvec->lru_lock);
VM_WARN_ON_ONCE(!seq_is_valid(lruvec));
@@ -5351,10 +5420,10 @@ static ssize_t show_enabled(struct kobject *kobj, struct kobj_attribute *attr, c
if (arch_has_hw_pte_young() && get_cap(LRU_GEN_MM_WALK))
caps |= BIT(LRU_GEN_MM_WALK);
- if (IS_ENABLED(CONFIG_ARCH_HAS_NONLEAF_PMD_YOUNG) && get_cap(LRU_GEN_NONLEAF_YOUNG))
+ if (arch_has_hw_nonleaf_pmd_young() && get_cap(LRU_GEN_NONLEAF_YOUNG))
caps |= BIT(LRU_GEN_NONLEAF_YOUNG);
- return snprintf(buf, PAGE_SIZE, "0x%04x\n", caps);
+ return sysfs_emit(buf, "0x%04x\n", caps);
}
/* see Documentation/admin-guide/mm/multigen_lru.rst for details */
@@ -5841,8 +5910,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
enum lru_list lru;
unsigned long nr_reclaimed = 0;
unsigned long nr_to_reclaim = sc->nr_to_reclaim;
+ bool proportional_reclaim;
struct blk_plug plug;
- bool scan_adjusted;
if (lru_gen_enabled()) {
lru_gen_shrink_lruvec(lruvec, sc);
@@ -5865,8 +5934,8 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
* abort proportional reclaim if either the file or anon lru has already
* dropped to zero at the first pass.
*/
- scan_adjusted = (!cgroup_reclaim(sc) && !current_is_kswapd() &&
- sc->priority == DEF_PRIORITY);
+ proportional_reclaim = (!cgroup_reclaim(sc) && !current_is_kswapd() &&
+ sc->priority == DEF_PRIORITY);
blk_start_plug(&plug);
while (nr[LRU_INACTIVE_ANON] || nr[LRU_ACTIVE_FILE] ||
@@ -5886,7 +5955,7 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
cond_resched();
- if (nr_reclaimed < nr_to_reclaim || scan_adjusted)
+ if (nr_reclaimed < nr_to_reclaim || proportional_reclaim)
continue;
/*
@@ -5937,8 +6006,6 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc)
nr_scanned = targets[lru] - nr[lru];
nr[lru] = targets[lru] * (100 - percentage) / 100;
nr[lru] -= min(nr[lru], nr_scanned);
-
- scan_adjusted = true;
}
blk_finish_plug(&plug);
sc->nr_reclaimed += nr_reclaimed;
@@ -6045,13 +6112,13 @@ static void shrink_node_memcgs(pg_data_t *pgdat, struct scan_control *sc)
mem_cgroup_calculate_protection(target_memcg, memcg);
- if (mem_cgroup_below_min(memcg)) {
+ if (mem_cgroup_below_min(target_memcg, memcg)) {
/*
* Hard protection.
* If there is no reclaimable memory, OOM.
*/
continue;
- } else if (mem_cgroup_below_low(memcg)) {
+ } else if (mem_cgroup_below_low(target_memcg, memcg)) {
/*
* Soft protection.
* Respect the protection only as long as
@@ -6687,7 +6754,8 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
unsigned long nr_pages,
gfp_t gfp_mask,
- unsigned int reclaim_options)
+ unsigned int reclaim_options,
+ nodemask_t *nodemask)
{
unsigned long nr_reclaimed;
unsigned int noreclaim_flag;
@@ -6702,6 +6770,7 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
.may_unmap = 1,
.may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP),
.proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE),
+ .nodemask = nodemask,
};
/*
* Traverse the ZONELIST_FALLBACK zonelist of the current node to put