Diffstat (limited to 'mm/vmscan.c')
-rw-r--r-- | mm/vmscan.c | 792
1 file changed, 784 insertions, 8 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c index 680ad52090e1..674d336dfe00 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1334,9 +1334,11 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio, if (folio_test_swapcache(folio)) { swp_entry_t swap = folio_swap_entry(folio); - mem_cgroup_swapout(folio, swap); + + /* get a shadow entry before mem_cgroup_swapout() clears folio_memcg() */ if (reclaimed && !mapping_exiting(mapping)) shadow = workingset_eviction(folio, target_memcg); + mem_cgroup_swapout(folio, swap); __delete_from_swap_cache(folio, swap, shadow); xa_unlock_irq(&mapping->i_pages); put_swap_page(&folio->page, swap); @@ -2733,6 +2735,9 @@ static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc) unsigned long file; struct lruvec *target_lruvec; + if (lru_gen_enabled()) + return; + target_lruvec = mem_cgroup_lruvec(sc->target_mem_cgroup, pgdat); /* @@ -3056,6 +3061,17 @@ static bool can_age_anon_pages(struct pglist_data *pgdat, * shorthand helpers ******************************************************************************/ +#define LRU_REFS_FLAGS (BIT(PG_referenced) | BIT(PG_workingset)) + +#define DEFINE_MAX_SEQ(lruvec) \ + unsigned long max_seq = READ_ONCE((lruvec)->lrugen.max_seq) + +#define DEFINE_MIN_SEQ(lruvec) \ + unsigned long min_seq[ANON_AND_FILE] = { \ + READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_ANON]), \ + READ_ONCE((lruvec)->lrugen.min_seq[LRU_GEN_FILE]), \ + } + #define for_each_gen_type_zone(gen, type, zone) \ for ((gen) = 0; (gen) < MAX_NR_GENS; (gen)++) \ for ((type) = 0; (type) < ANON_AND_FILE; (type)++) \ @@ -3081,6 +3097,745 @@ static struct lruvec __maybe_unused *get_lruvec(struct mem_cgroup *memcg, int ni return pgdat ? &pgdat->__lruvec : NULL; } +static int get_swappiness(struct lruvec *lruvec, struct scan_control *sc) +{ + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + + if (!can_demote(pgdat->node_id, sc) && + mem_cgroup_get_nr_swap_pages(memcg) < MIN_LRU_BATCH) + return 0; + + return mem_cgroup_swappiness(memcg); +} + +static int get_nr_gens(struct lruvec *lruvec, int type) +{ + return lruvec->lrugen.max_seq - lruvec->lrugen.min_seq[type] + 1; +} + +static bool __maybe_unused seq_is_valid(struct lruvec *lruvec) +{ + /* see the comment on lru_gen_struct */ + return get_nr_gens(lruvec, LRU_GEN_FILE) >= MIN_NR_GENS && + get_nr_gens(lruvec, LRU_GEN_FILE) <= get_nr_gens(lruvec, LRU_GEN_ANON) && + get_nr_gens(lruvec, LRU_GEN_ANON) <= MAX_NR_GENS; +} + +/****************************************************************************** + * refault feedback loop + ******************************************************************************/ + +/* + * A feedback loop based on Proportional-Integral-Derivative (PID) controller. + * + * The P term is refaulted/(evicted+protected) from a tier in the generation + * currently being evicted; the I term is the exponential moving average of the + * P term over the generations previously evicted, using the smoothing factor + * 1/2; the D term isn't supported. + * + * The setpoint (SP) is always the first tier of one type; the process variable + * (PV) is either any tier of the other type or any other tier of the same + * type. + * + * The error is the difference between the SP and the PV; the correction is to + * turn off protection when SP>PV or turn on protection when SP<PV. + * + * For future optimizations: + * 1. 
The D term may discount the other two terms over time so that long-lived + * generations can resist stale information. + */ +struct ctrl_pos { + unsigned long refaulted; + unsigned long total; + int gain; +}; + +static void read_ctrl_pos(struct lruvec *lruvec, int type, int tier, int gain, + struct ctrl_pos *pos) +{ + struct lru_gen_struct *lrugen = &lruvec->lrugen; + int hist = lru_hist_from_seq(lrugen->min_seq[type]); + + pos->refaulted = lrugen->avg_refaulted[type][tier] + + atomic_long_read(&lrugen->refaulted[hist][type][tier]); + pos->total = lrugen->avg_total[type][tier] + + atomic_long_read(&lrugen->evicted[hist][type][tier]); + if (tier) + pos->total += lrugen->protected[hist][type][tier - 1]; + pos->gain = gain; +} + +static void reset_ctrl_pos(struct lruvec *lruvec, int type, bool carryover) +{ + int hist, tier; + struct lru_gen_struct *lrugen = &lruvec->lrugen; + bool clear = carryover ? NR_HIST_GENS == 1 : NR_HIST_GENS > 1; + unsigned long seq = carryover ? lrugen->min_seq[type] : lrugen->max_seq + 1; + + lockdep_assert_held(&lruvec->lru_lock); + + if (!carryover && !clear) + return; + + hist = lru_hist_from_seq(seq); + + for (tier = 0; tier < MAX_NR_TIERS; tier++) { + if (carryover) { + unsigned long sum; + + sum = lrugen->avg_refaulted[type][tier] + + atomic_long_read(&lrugen->refaulted[hist][type][tier]); + WRITE_ONCE(lrugen->avg_refaulted[type][tier], sum / 2); + + sum = lrugen->avg_total[type][tier] + + atomic_long_read(&lrugen->evicted[hist][type][tier]); + if (tier) + sum += lrugen->protected[hist][type][tier - 1]; + WRITE_ONCE(lrugen->avg_total[type][tier], sum / 2); + } + + if (clear) { + atomic_long_set(&lrugen->refaulted[hist][type][tier], 0); + atomic_long_set(&lrugen->evicted[hist][type][tier], 0); + if (tier) + WRITE_ONCE(lrugen->protected[hist][type][tier - 1], 0); + } + } +} + +static bool positive_ctrl_err(struct ctrl_pos *sp, struct ctrl_pos *pv) +{ + /* + * Return true if the PV has a limited number of refaults or a lower + * refaulted/total than the SP. 
+ */ + return pv->refaulted < MIN_LRU_BATCH || + pv->refaulted * (sp->total + MIN_LRU_BATCH) * sp->gain <= + (sp->refaulted + 1) * pv->total * pv->gain; +} + +/****************************************************************************** + * the aging + ******************************************************************************/ + +/* protect pages accessed multiple times through file descriptors */ +static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclaiming) +{ + int type = folio_is_file_lru(folio); + struct lru_gen_struct *lrugen = &lruvec->lrugen; + int new_gen, old_gen = lru_gen_from_seq(lrugen->min_seq[type]); + unsigned long new_flags, old_flags = READ_ONCE(folio->flags); + + VM_WARN_ON_ONCE_FOLIO(!(old_flags & LRU_GEN_MASK), folio); + + do { + new_gen = (old_gen + 1) % MAX_NR_GENS; + + new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS); + new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF; + /* for folio_end_writeback() */ + if (reclaiming) + new_flags |= BIT(PG_reclaim); + } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); + + lru_gen_update_size(lruvec, folio, old_gen, new_gen); + + return new_gen; +} + +static void inc_min_seq(struct lruvec *lruvec, int type) +{ + struct lru_gen_struct *lrugen = &lruvec->lrugen; + + reset_ctrl_pos(lruvec, type, true); + WRITE_ONCE(lrugen->min_seq[type], lrugen->min_seq[type] + 1); +} + +static bool try_to_inc_min_seq(struct lruvec *lruvec, bool can_swap) +{ + int gen, type, zone; + bool success = false; + struct lru_gen_struct *lrugen = &lruvec->lrugen; + DEFINE_MIN_SEQ(lruvec); + + VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); + + /* find the oldest populated generation */ + for (type = !can_swap; type < ANON_AND_FILE; type++) { + while (min_seq[type] + MIN_NR_GENS <= lrugen->max_seq) { + gen = lru_gen_from_seq(min_seq[type]); + + for (zone = 0; zone < MAX_NR_ZONES; zone++) { + if (!list_empty(&lrugen->lists[gen][type][zone])) + goto next; + } + + min_seq[type]++; + } +next: + ; + } + + /* see the comment on lru_gen_struct */ + if (can_swap) { + min_seq[LRU_GEN_ANON] = min(min_seq[LRU_GEN_ANON], min_seq[LRU_GEN_FILE]); + min_seq[LRU_GEN_FILE] = max(min_seq[LRU_GEN_ANON], lrugen->min_seq[LRU_GEN_FILE]); + } + + for (type = !can_swap; type < ANON_AND_FILE; type++) { + if (min_seq[type] == lrugen->min_seq[type]) + continue; + + reset_ctrl_pos(lruvec, type, true); + WRITE_ONCE(lrugen->min_seq[type], min_seq[type]); + success = true; + } + + return success; +} + +static void inc_max_seq(struct lruvec *lruvec, unsigned long max_seq, bool can_swap) +{ + int prev, next; + int type, zone; + struct lru_gen_struct *lrugen = &lruvec->lrugen; + + spin_lock_irq(&lruvec->lru_lock); + + VM_WARN_ON_ONCE(!seq_is_valid(lruvec)); + + if (max_seq != lrugen->max_seq) + goto unlock; + + for (type = ANON_AND_FILE - 1; type >= 0; type--) { + if (get_nr_gens(lruvec, type) != MAX_NR_GENS) + continue; + + VM_WARN_ON_ONCE(type == LRU_GEN_FILE || can_swap); + + inc_min_seq(lruvec, type); + } + + /* + * Update the active/inactive LRU sizes for compatibility. Both sides of + * the current max_seq need to be covered, since max_seq+1 can overlap + * with min_seq[LRU_GEN_ANON] if swapping is constrained. And if they do + * overlap, cold/hot inversion happens. 
+ */ + prev = lru_gen_from_seq(lrugen->max_seq - 1); + next = lru_gen_from_seq(lrugen->max_seq + 1); + + for (type = 0; type < ANON_AND_FILE; type++) { + for (zone = 0; zone < MAX_NR_ZONES; zone++) { + enum lru_list lru = type * LRU_INACTIVE_FILE; + long delta = lrugen->nr_pages[prev][type][zone] - + lrugen->nr_pages[next][type][zone]; + + if (!delta) + continue; + + __update_lru_size(lruvec, lru, zone, delta); + __update_lru_size(lruvec, lru + LRU_ACTIVE, zone, -delta); + } + } + + for (type = 0; type < ANON_AND_FILE; type++) + reset_ctrl_pos(lruvec, type, false); + + /* make sure preceding modifications appear */ + smp_store_release(&lrugen->max_seq, lrugen->max_seq + 1); +unlock: + spin_unlock_irq(&lruvec->lru_lock); +} + +static bool should_run_aging(struct lruvec *lruvec, unsigned long max_seq, unsigned long *min_seq, + struct scan_control *sc, bool can_swap, unsigned long *nr_to_scan) +{ + int gen, type, zone; + unsigned long old = 0; + unsigned long young = 0; + unsigned long total = 0; + struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + + for (type = !can_swap; type < ANON_AND_FILE; type++) { + unsigned long seq; + + for (seq = min_seq[type]; seq <= max_seq; seq++) { + unsigned long size = 0; + + gen = lru_gen_from_seq(seq); + + for (zone = 0; zone < MAX_NR_ZONES; zone++) + size += max(READ_ONCE(lrugen->nr_pages[gen][type][zone]), 0L); + + total += size; + if (seq == max_seq) + young += size; + else if (seq + MIN_NR_GENS == max_seq) + old += size; + } + } + + /* try to scrape all its memory if this memcg was deleted */ + *nr_to_scan = mem_cgroup_online(memcg) ? (total >> sc->priority) : total; + + /* + * The aging tries to be lazy to reduce the overhead, while the eviction + * stalls when the number of generations reaches MIN_NR_GENS. Hence, the + * ideal number of generations is MIN_NR_GENS+1. + */ + if (min_seq[!can_swap] + MIN_NR_GENS > max_seq) + return true; + if (min_seq[!can_swap] + MIN_NR_GENS < max_seq) + return false; + + /* + * It's also ideal to spread pages out evenly, i.e., 1/(MIN_NR_GENS+1) + * of the total number of pages for each generation. A reasonable range + * for this average portion is [1/MIN_NR_GENS, 1/(MIN_NR_GENS+2)]. The + * aging cares about the upper bound of hot pages, while the eviction + * cares about the lower bound of cold pages. 
+ */ + if (young * MIN_NR_GENS > total) + return true; + if (old * (MIN_NR_GENS + 2) < total) + return true; + + return false; +} + +static void age_lruvec(struct lruvec *lruvec, struct scan_control *sc) +{ + bool need_aging; + unsigned long nr_to_scan; + int swappiness = get_swappiness(lruvec, sc); + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + DEFINE_MAX_SEQ(lruvec); + DEFINE_MIN_SEQ(lruvec); + + VM_WARN_ON_ONCE(sc->memcg_low_reclaim); + + mem_cgroup_calculate_protection(NULL, memcg); + + if (mem_cgroup_below_min(memcg)) + return; + + need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, swappiness, &nr_to_scan); + if (need_aging) + inc_max_seq(lruvec, max_seq, swappiness); +} + +static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) +{ + struct mem_cgroup *memcg; + + VM_WARN_ON_ONCE(!current_is_kswapd()); + + memcg = mem_cgroup_iter(NULL, NULL, NULL); + do { + struct lruvec *lruvec = mem_cgroup_lruvec(memcg, pgdat); + + age_lruvec(lruvec, sc); + + cond_resched(); + } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL))); +} + +/****************************************************************************** + * the eviction + ******************************************************************************/ + +static bool sort_folio(struct lruvec *lruvec, struct folio *folio, int tier_idx) +{ + bool success; + int gen = folio_lru_gen(folio); + int type = folio_is_file_lru(folio); + int zone = folio_zonenum(folio); + int delta = folio_nr_pages(folio); + int refs = folio_lru_refs(folio); + int tier = lru_tier_from_refs(refs); + struct lru_gen_struct *lrugen = &lruvec->lrugen; + + VM_WARN_ON_ONCE_FOLIO(gen >= MAX_NR_GENS, folio); + + /* unevictable */ + if (!folio_evictable(folio)) { + success = lru_gen_del_folio(lruvec, folio, true); + VM_WARN_ON_ONCE_FOLIO(!success, folio); + folio_set_unevictable(folio); + lruvec_add_folio(lruvec, folio); + __count_vm_events(UNEVICTABLE_PGCULLED, delta); + return true; + } + + /* dirty lazyfree */ + if (type == LRU_GEN_FILE && folio_test_anon(folio) && folio_test_dirty(folio)) { + success = lru_gen_del_folio(lruvec, folio, true); + VM_WARN_ON_ONCE_FOLIO(!success, folio); + folio_set_swapbacked(folio); + lruvec_add_folio_tail(lruvec, folio); + return true; + } + + /* protected */ + if (tier > tier_idx) { + int hist = lru_hist_from_seq(lrugen->min_seq[type]); + + gen = folio_inc_gen(lruvec, folio, false); + list_move_tail(&folio->lru, &lrugen->lists[gen][type][zone]); + + WRITE_ONCE(lrugen->protected[hist][type][tier - 1], + lrugen->protected[hist][type][tier - 1] + delta); + __mod_lruvec_state(lruvec, WORKINGSET_ACTIVATE_BASE + type, delta); + return true; + } + + /* waiting for writeback */ + if (folio_test_locked(folio) || folio_test_writeback(folio) || + (type == LRU_GEN_FILE && folio_test_dirty(folio))) { + gen = folio_inc_gen(lruvec, folio, true); + list_move(&folio->lru, &lrugen->lists[gen][type][zone]); + return true; + } + + return false; +} + +static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct scan_control *sc) +{ + bool success; + + /* unmapping inhibited */ + if (!sc->may_unmap && folio_mapped(folio)) + return false; + + /* swapping inhibited */ + if (!(sc->may_writepage && (sc->gfp_mask & __GFP_IO)) && + (folio_test_dirty(folio) || + (folio_test_anon(folio) && !folio_test_swapcache(folio)))) + return false; + + /* raced with release_pages() */ + if (!folio_try_get(folio)) + return false; + + /* raced with another isolation */ + if (!folio_test_clear_lru(folio)) { + folio_put(folio); + 
return false; + } + + /* see the comment on MAX_NR_TIERS */ + if (!folio_test_referenced(folio)) + set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0); + + /* for shrink_page_list() */ + folio_clear_reclaim(folio); + folio_clear_referenced(folio); + + success = lru_gen_del_folio(lruvec, folio, true); + VM_WARN_ON_ONCE_FOLIO(!success, folio); + + return true; +} + +static int scan_folios(struct lruvec *lruvec, struct scan_control *sc, + int type, int tier, struct list_head *list) +{ + int gen, zone; + enum vm_event_item item; + int sorted = 0; + int scanned = 0; + int isolated = 0; + int remaining = MAX_LRU_BATCH; + struct lru_gen_struct *lrugen = &lruvec->lrugen; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + + VM_WARN_ON_ONCE(!list_empty(list)); + + if (get_nr_gens(lruvec, type) == MIN_NR_GENS) + return 0; + + gen = lru_gen_from_seq(lrugen->min_seq[type]); + + for (zone = sc->reclaim_idx; zone >= 0; zone--) { + LIST_HEAD(moved); + int skipped = 0; + struct list_head *head = &lrugen->lists[gen][type][zone]; + + while (!list_empty(head)) { + struct folio *folio = lru_to_folio(head); + int delta = folio_nr_pages(folio); + + VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); + VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); + VM_WARN_ON_ONCE_FOLIO(folio_is_file_lru(folio) != type, folio); + VM_WARN_ON_ONCE_FOLIO(folio_zonenum(folio) != zone, folio); + + scanned += delta; + + if (sort_folio(lruvec, folio, tier)) + sorted += delta; + else if (isolate_folio(lruvec, folio, sc)) { + list_add(&folio->lru, list); + isolated += delta; + } else { + list_move(&folio->lru, &moved); + skipped += delta; + } + + if (!--remaining || max(isolated, skipped) >= MIN_LRU_BATCH) + break; + } + + if (skipped) { + list_splice(&moved, head); + __count_zid_vm_events(PGSCAN_SKIP, zone, skipped); + } + + if (!remaining || isolated >= MIN_LRU_BATCH) + break; + } + + item = current_is_kswapd() ? PGSCAN_KSWAPD : PGSCAN_DIRECT; + if (!cgroup_reclaim(sc)) { + __count_vm_events(item, isolated); + __count_vm_events(PGREFILL, sorted); + } + __count_memcg_events(memcg, item, isolated); + __count_memcg_events(memcg, PGREFILL, sorted); + __count_vm_events(PGSCAN_ANON + type, isolated); + + /* + * There might not be eligible pages due to reclaim_idx, may_unmap and + * may_writepage. Check the remaining to prevent livelock if it's not + * making progress. + */ + return isolated || !remaining ? scanned : 0; +} + +static int get_tier_idx(struct lruvec *lruvec, int type) +{ + int tier; + struct ctrl_pos sp, pv; + + /* + * To leave a margin for fluctuations, use a larger gain factor (1:2). + * This value is chosen because any other tier would have at least twice + * as many refaults as the first tier. + */ + read_ctrl_pos(lruvec, type, 0, 1, &sp); + for (tier = 1; tier < MAX_NR_TIERS; tier++) { + read_ctrl_pos(lruvec, type, tier, 2, &pv); + if (!positive_ctrl_err(&sp, &pv)) + break; + } + + return tier - 1; +} + +static int get_type_to_scan(struct lruvec *lruvec, int swappiness, int *tier_idx) +{ + int type, tier; + struct ctrl_pos sp, pv; + int gain[ANON_AND_FILE] = { swappiness, 200 - swappiness }; + + /* + * Compare the first tier of anon with that of file to determine which + * type to scan. Also need to compare other tiers of the selected type + * with the first tier of the other type to determine the last tier (of + * the selected type) to evict. 
+ */ + read_ctrl_pos(lruvec, LRU_GEN_ANON, 0, gain[LRU_GEN_ANON], &sp); + read_ctrl_pos(lruvec, LRU_GEN_FILE, 0, gain[LRU_GEN_FILE], &pv); + type = positive_ctrl_err(&sp, &pv); + + read_ctrl_pos(lruvec, !type, 0, gain[!type], &sp); + for (tier = 1; tier < MAX_NR_TIERS; tier++) { + read_ctrl_pos(lruvec, type, tier, gain[type], &pv); + if (!positive_ctrl_err(&sp, &pv)) + break; + } + + *tier_idx = tier - 1; + + return type; +} + +static int isolate_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness, + int *type_scanned, struct list_head *list) +{ + int i; + int type; + int scanned; + int tier = -1; + DEFINE_MIN_SEQ(lruvec); + + /* + * Try to make the obvious choice first. When anon and file are both + * available from the same generation, interpret swappiness 1 as file + * first and 200 as anon first. + */ + if (!swappiness) + type = LRU_GEN_FILE; + else if (min_seq[LRU_GEN_ANON] < min_seq[LRU_GEN_FILE]) + type = LRU_GEN_ANON; + else if (swappiness == 1) + type = LRU_GEN_FILE; + else if (swappiness == 200) + type = LRU_GEN_ANON; + else + type = get_type_to_scan(lruvec, swappiness, &tier); + + for (i = !swappiness; i < ANON_AND_FILE; i++) { + if (tier < 0) + tier = get_tier_idx(lruvec, type); + + scanned = scan_folios(lruvec, sc, type, tier, list); + if (scanned) + break; + + type = !type; + tier = -1; + } + + *type_scanned = type; + + return scanned; +} + +static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swappiness) +{ + int type; + int scanned; + int reclaimed; + LIST_HEAD(list); + struct folio *folio; + enum vm_event_item item; + struct reclaim_stat stat; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + + spin_lock_irq(&lruvec->lru_lock); + + scanned = isolate_folios(lruvec, sc, swappiness, &type, &list); + + scanned += try_to_inc_min_seq(lruvec, swappiness); + + if (get_nr_gens(lruvec, !swappiness) == MIN_NR_GENS) + scanned = 0; + + spin_unlock_irq(&lruvec->lru_lock); + + if (list_empty(&list)) + return scanned; + + reclaimed = shrink_page_list(&list, pgdat, sc, &stat, false); + + list_for_each_entry(folio, &list, lru) { + /* restore LRU_REFS_FLAGS cleared by isolate_folio() */ + if (folio_test_workingset(folio)) + folio_set_referenced(folio); + + /* don't add rejected pages to the oldest generation */ + if (folio_test_reclaim(folio) && + (folio_test_dirty(folio) || folio_test_writeback(folio))) + folio_clear_active(folio); + else + folio_set_active(folio); + } + + spin_lock_irq(&lruvec->lru_lock); + + move_pages_to_lru(lruvec, &list); + + item = current_is_kswapd() ? 
PGSTEAL_KSWAPD : PGSTEAL_DIRECT; + if (!cgroup_reclaim(sc)) + __count_vm_events(item, reclaimed); + __count_memcg_events(memcg, item, reclaimed); + __count_vm_events(PGSTEAL_ANON + type, reclaimed); + + spin_unlock_irq(&lruvec->lru_lock); + + mem_cgroup_uncharge_list(&list); + free_unref_page_list(&list); + + sc->nr_reclaimed += reclaimed; + + return scanned; +} + +static unsigned long get_nr_to_scan(struct lruvec *lruvec, struct scan_control *sc, + bool can_swap) +{ + bool need_aging; + unsigned long nr_to_scan; + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + DEFINE_MAX_SEQ(lruvec); + DEFINE_MIN_SEQ(lruvec); + + if (mem_cgroup_below_min(memcg) || + (mem_cgroup_below_low(memcg) && !sc->memcg_low_reclaim)) + return 0; + + need_aging = should_run_aging(lruvec, max_seq, min_seq, sc, can_swap, &nr_to_scan); + if (!need_aging) + return nr_to_scan; + + /* skip the aging path at the default priority */ + if (sc->priority == DEF_PRIORITY) + goto done; + + /* leave the work to lru_gen_age_node() */ + if (current_is_kswapd()) + return 0; + + inc_max_seq(lruvec, max_seq, can_swap); +done: + return min_seq[!can_swap] + MIN_NR_GENS <= max_seq ? nr_to_scan : 0; +} + +static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +{ + struct blk_plug plug; + unsigned long scanned = 0; + + lru_add_drain(); + + blk_start_plug(&plug); + + while (true) { + int delta; + int swappiness; + unsigned long nr_to_scan; + + if (sc->may_swap) + swappiness = get_swappiness(lruvec, sc); + else if (!cgroup_reclaim(sc) && get_swappiness(lruvec, sc)) + swappiness = 1; + else + swappiness = 0; + + nr_to_scan = get_nr_to_scan(lruvec, sc, swappiness); + if (!nr_to_scan) + break; + + delta = evict_folios(lruvec, sc, swappiness); + if (!delta) + break; + + scanned += delta; + if (scanned >= nr_to_scan) + break; + + cond_resched(); + } + + blk_finish_plug(&plug); +} + /****************************************************************************** * initialization ******************************************************************************/ @@ -3123,6 +3878,16 @@ static int __init init_lru_gen(void) }; late_initcall(init_lru_gen); +#else /* !CONFIG_LRU_GEN */ + +static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) +{ +} + +static void lru_gen_shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) +{ +} + #endif /* CONFIG_LRU_GEN */ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) @@ -3136,6 +3901,11 @@ static void shrink_lruvec(struct lruvec *lruvec, struct scan_control *sc) struct blk_plug plug; bool scan_adjusted; + if (lru_gen_enabled()) { + lru_gen_shrink_lruvec(lruvec, sc); + return; + } + get_scan_count(lruvec, sc, nr); /* Record the original scan target for proportional adjustments later */ @@ -3642,6 +4412,9 @@ static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat) struct lruvec *target_lruvec; unsigned long refaults; + if (lru_gen_enabled()) + return; + target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat); refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON); target_lruvec->refaults[WORKINGSET_ANON] = refaults; @@ -4008,12 +4781,16 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, } #endif -static void age_active_anon(struct pglist_data *pgdat, - struct scan_control *sc) +static void kswapd_age_node(struct pglist_data *pgdat, struct scan_control *sc) { struct mem_cgroup *memcg; struct lruvec *lruvec; + if (lru_gen_enabled()) { + lru_gen_age_node(pgdat, sc); + return; + 
} + if (!can_age_anon_pages(pgdat, sc)) return; @@ -4333,12 +5110,11 @@ restart: sc.may_swap = !nr_boost_reclaim; /* - * Do some background aging of the anon list, to give - * pages a chance to be referenced before reclaiming. All - * pages are rotated regardless of classzone as this is - * about consistent aging. + * Do some background aging, to give pages a chance to be + * referenced before reclaiming. All pages are rotated + * regardless of classzone as this is about consistent aging. */ - age_active_anon(pgdat, &sc); + kswapd_age_node(pgdat, &sc); /* * If we're getting trouble reclaiming, start doing writepage
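For readers following the new aging and eviction code, a few standalone sketches follow. First, the generation bookkeeping: max_seq and min_seq[] are monotonically increasing sequence counters, and a generation's slot in the lists[]/nr_pages[] arrays is simply its sequence number modulo MAX_NR_GENS (what lru_gen_from_seq() computes elsewhere in the series). The C program below models that mapping and the bounds checked by seq_is_valid(); the MIN_NR_GENS/MAX_NR_GENS values are assumed defaults and the code is illustrative userspace C, not part of the patch.

/* standalone model of the seq -> ring-slot mapping; not kernel code */
#include <assert.h>
#include <stdio.h>

#define MIN_NR_GENS 2UL   /* assumed default from the patch */
#define MAX_NR_GENS 4UL   /* assumed default from the patch */

/* mirrors lru_gen_from_seq(): a sequence number maps to a ring slot */
static unsigned long gen_from_seq(unsigned long seq)
{
        return seq % MAX_NR_GENS;
}

/* mirrors get_nr_gens(): how many generations currently exist */
static unsigned long nr_gens(unsigned long max_seq, unsigned long min_seq)
{
        return max_seq - min_seq + 1;
}

int main(void)
{
        unsigned long min_seq = 0, max_seq = 3;

        /* the bounds checked by seq_is_valid() */
        assert(nr_gens(max_seq, min_seq) >= MIN_NR_GENS);
        assert(nr_gens(max_seq, min_seq) <= MAX_NR_GENS);

        printf("oldest slot %lu, youngest slot %lu\n",
               gen_from_seq(min_seq), gen_from_seq(max_seq));

        /* aging: max_seq+1 reuses the slot just freed by min_seq+1 */
        min_seq++;              /* inc_min_seq() retires the oldest generation */
        max_seq++;              /* inc_max_seq() opens a new youngest generation */
        printf("after aging: oldest slot %lu, youngest slot %lu\n",
               gen_from_seq(min_seq), gen_from_seq(max_seq));
        return 0;
}

Because the counters never wrap relative to each other by more than MAX_NR_GENS, a slot is only reused once the generation that owned it has been emptied.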
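positive_ctrl_err() compares the refault rate of the process variable (PV) against the setpoint (SP) without dividing: it cross-multiplies the refaulted/total ratios and applies the gain factors. The sketch below restates that comparison in standalone C; MIN_LRU_BATCH is assumed to be BITS_PER_LONG (64 on 64-bit), and the example numbers are invented.

/* standalone sketch of the SP-vs-PV comparison; not kernel code */
#include <stdbool.h>
#include <stdio.h>

#define MIN_LRU_BATCH 64UL      /* assumed: BITS_PER_LONG on 64-bit */

struct ctrl_pos {
        unsigned long refaulted;        /* refaults seen from this tier */
        unsigned long total;            /* evicted + protected from this tier */
        unsigned long gain;             /* 1:2 between tiers, swappiness-based between types */
};

/* true: PV does not refault more than SP, so it needs no extra protection */
static bool positive_ctrl_err(const struct ctrl_pos *sp, const struct ctrl_pos *pv)
{
        return pv->refaulted < MIN_LRU_BATCH ||
               pv->refaulted * (sp->total + MIN_LRU_BATCH) * sp->gain <=
               (sp->refaulted + 1) * pv->total * pv->gain;
}

int main(void)
{
        /* SP: first tier; PV: a higher tier that refaults far more often */
        struct ctrl_pos sp = { .refaulted = 100, .total = 10000, .gain = 1 };
        struct ctrl_pos pv = { .refaulted = 900, .total = 10000, .gain = 2 };

        printf("protect the higher tier: %s\n",
               positive_ctrl_err(&sp, &pv) ? "no" : "yes");
        return 0;
}

get_tier_idx() and get_type_to_scan() both walk tiers with this test, stopping at the first tier whose refault rate exceeds the setpoint's; sort_folio() then protects folios above that tier by moving them to a younger generation.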
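The carryover branch of reset_ctrl_pos() is the I term of the controller: when a generation is retired, its refaulted and evicted+protected counters are folded into long-term averages with a smoothing factor of 1/2. A minimal userspace version of just that step follows; the field names loosely mirror lru_gen_struct, and the first tier's missing protected counter is ignored for brevity.

/* standalone model of the avg_refaulted/avg_total carryover; not kernel code */
#include <stdio.h>

struct tier_stats {
        unsigned long avg_refaulted;    /* long-term average: the I term */
        unsigned long avg_total;
        unsigned long refaulted;        /* counters of the generation being retired */
        unsigned long evicted;
        unsigned long protected_pages;
};

/* mirrors the carryover branch of reset_ctrl_pos(): avg = (avg + new) / 2 */
static void carry_over(struct tier_stats *t)
{
        t->avg_refaulted = (t->avg_refaulted + t->refaulted) / 2;
        t->avg_total = (t->avg_total + t->evicted + t->protected_pages) / 2;
        t->refaulted = t->evicted = t->protected_pages = 0;
}

int main(void)
{
        struct tier_stats t = {
                .avg_refaulted = 100, .avg_total = 1000,
                .refaulted = 300, .evicted = 900, .protected_pages = 100,
        };

        carry_over(&t);
        printf("avg_refaulted=%lu avg_total=%lu\n", t.avg_refaulted, t.avg_total);
        return 0;
}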
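folio_inc_gen() moves a folio to a younger generation by rewriting the generation bits packed into folio->flags inside a try_cmpxchg() loop, so no lock is needed even though other flag bits may change concurrently. The sketch below shows the same read-modify-retry pattern with C11 atomics; the field layout (shift and width) is made up for illustration, whereas the kernel uses LRU_GEN_MASK/LRU_GEN_PGOFF and stores gen+1 so that 0 means "not on a multi-gen list".

/* standalone model of a lock-free flag-field update; not kernel code */
#include <stdatomic.h>
#include <stdio.h>

#define GEN_SHIFT 8
#define GEN_MASK  (0xfUL << GEN_SHIFT)  /* hypothetical 4-bit generation field */

static _Atomic unsigned long flags;

/* store gen+1 in the field, so an all-zero field means "no generation" */
static void set_gen(unsigned long gen)
{
        unsigned long old = atomic_load(&flags);
        unsigned long new;

        do {
                new = (old & ~GEN_MASK) | ((gen + 1) << GEN_SHIFT);
                /* on failure, old is refreshed and new is recomputed */
        } while (!atomic_compare_exchange_weak(&flags, &old, new));
}

int main(void)
{
        set_gen(2);
        printf("flags = %#lx\n", atomic_load(&flags));
        return 0;
}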
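After its MIN_NR_GENS bounds checks, should_run_aging() reduces to two proportion tests: age if the youngest generation holds more than 1/MIN_NR_GENS of the pages (the hot set is growing), or if the generation next in line for eviction holds less than 1/(MIN_NR_GENS+2) of them (the cold supply is running out). A standalone version of those two tests, assuming the patch's default MIN_NR_GENS of 2:

/* standalone model of the proportion checks in should_run_aging(); not kernel code */
#include <stdbool.h>
#include <stdio.h>

#define MIN_NR_GENS 2UL         /* assumed default from the patch */

static bool need_aging(unsigned long young, unsigned long old, unsigned long total)
{
        /* too many pages in the youngest generation: hot set is growing */
        if (young * MIN_NR_GENS > total)
                return true;
        /* too few pages left in the generation about to become evictable */
        if (old * (MIN_NR_GENS + 2) < total)
                return true;
        return false;
}

int main(void)
{
        printf("%d\n", need_aging(600, 200, 1000));  /* 1: young > 1/2 of total */
        printf("%d\n", need_aging(300, 100, 1000));  /* 1: old < 1/4 of total */
        printf("%d\n", need_aging(300, 300, 1000));  /* 0: balanced, no aging */
        return 0;
}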
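isolate_folios() resolves the anon/file choice before consulting the feedback loop: swappiness 0 means file only, an older anon min_seq then wins outright, and otherwise swappiness 1 and 200 pick file-first and anon-first for pages of the same generation. That decision order is restated below as standalone C; LRU_GEN_ANON/LRU_GEN_FILE are assumed to be 0 and 1, and -1 stands for "fall through to get_type_to_scan()".

/* standalone model of the type choice in isolate_folios(); not kernel code */
#include <stdio.h>

enum { LRU_GEN_ANON, LRU_GEN_FILE };    /* assumed 0 and 1, as in the patch */

static int type_to_scan(int swappiness, unsigned long min_seq_anon,
                        unsigned long min_seq_file)
{
        if (!swappiness)
                return LRU_GEN_FILE;            /* swapping disabled */
        if (min_seq_anon < min_seq_file)
                return LRU_GEN_ANON;            /* anon has the older generation */
        if (swappiness == 1)
                return LRU_GEN_FILE;            /* same generation: file first */
        if (swappiness == 200)
                return LRU_GEN_ANON;            /* same generation: anon first */
        return -1;                              /* otherwise ask the feedback loop */
}

int main(void)
{
        printf("%d\n", type_to_scan(0, 5, 9));  /* 1: file only */
        printf("%d\n", type_to_scan(1, 5, 9));  /* 0: older anon generation wins */
        printf("%d\n", type_to_scan(60, 9, 9)); /* -1: defer to get_type_to_scan() */
        return 0;
}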
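Finally, lru_gen_shrink_lruvec() itself is a small loop: compute a scan target with get_nr_to_scan(), evict a batch with evict_folios(), and stop when the target is met, the target drops to zero, or a batch makes no progress. The skeleton below reproduces only that control flow; both helpers are replaced by trivial stand-ins that simulate progress, and the priority value is hypothetical.

/* standalone skeleton of the lru_gen_shrink_lruvec() control flow; not kernel code */
#include <stdio.h>

static unsigned long pool = 500;        /* reclaimable pages in this made-up lruvec */

/* stand-in for get_nr_to_scan(): 0 ends the loop, like a protected memcg would */
static unsigned long get_nr_to_scan(int priority)
{
        return pool >> priority;
}

/* stand-in for evict_folios(): 0 means nothing eligible, which also ends the loop */
static unsigned long evict_folios(void)
{
        unsigned long batch = pool < 64 ? pool : 64;

        pool -= batch;
        return batch;
}

int main(void)
{
        unsigned long scanned = 0;
        int priority = 2;               /* hypothetical sc->priority */

        for (;;) {
                unsigned long nr_to_scan = get_nr_to_scan(priority);

                if (!nr_to_scan)
                        break;

                unsigned long delta = evict_folios();

                if (!delta)
                        break;

                scanned += delta;
                if (scanned >= nr_to_scan)
                        break;
        }
        printf("scanned %lu of an initial pool of 500\n", scanned);
        return 0;
}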