Diffstat (limited to 'mm')
-rw-r--r-- | mm/backing-dev.c        | 120
-rw-r--r-- | mm/balloon_compaction.c |   3
-rw-r--r-- | mm/fadvise.c            |   4
-rw-r--r-- | mm/filemap.c            |   3
-rw-r--r-- | mm/huge_memory.c        |   4
-rw-r--r-- | mm/kasan/common.c       |  10
-rw-r--r-- | mm/khugepaged.c         |   2
-rw-r--r-- | mm/madvise.c            |  22
-rw-r--r-- | mm/memcontrol.c         | 226
-rw-r--r-- | mm/memory.c             |   4
-rw-r--r-- | mm/mmap.c               |   8
-rw-r--r-- | mm/page-writeback.c     |   4
-rw-r--r-- | mm/page_alloc.c         |  21
-rw-r--r-- | mm/percpu.c             |  23
-rw-r--r-- | mm/shmem.c              | 385
-rw-r--r-- | mm/swapfile.c           |  41
-rw-r--r-- | mm/vmalloc.c            |   5
-rw-r--r-- | mm/vmscan.c             |   5
-rw-r--r-- | mm/z3fold.c             |  90
-rw-r--r-- | mm/zsmalloc.c           |  80
20 files changed, 830 insertions, 230 deletions
diff --git a/mm/backing-dev.c b/mm/backing-dev.c index e8e89158adec..d9daa3e422d0 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0-only #include <linux/wait.h> +#include <linux/rbtree.h> #include <linux/backing-dev.h> #include <linux/kthread.h> #include <linux/freezer.h> @@ -22,10 +23,12 @@ EXPORT_SYMBOL_GPL(noop_backing_dev_info); static struct class *bdi_class; /* - * bdi_lock protects updates to bdi_list. bdi_list has RCU reader side - * locking. + * bdi_lock protects bdi_tree and updates to bdi_list. bdi_list has RCU + * reader side locking. */ DEFINE_SPINLOCK(bdi_lock); +static u64 bdi_id_cursor; +static struct rb_root bdi_tree = RB_ROOT; LIST_HEAD(bdi_list); /* bdi_wq serves all asynchronous writeback tasks */ @@ -615,13 +618,12 @@ out_put: } /** - * wb_get_create - get wb for a given memcg, create if necessary + * wb_get_lookup - get wb for a given memcg * @bdi: target bdi * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref) - * @gfp: allocation mask to use * - * Try to get the wb for @memcg_css on @bdi. If it doesn't exist, try to - * create one. The returned wb has its refcount incremented. + * Try to get the wb for @memcg_css on @bdi. The returned wb has its + * refcount incremented. * * This function uses css_get() on @memcg_css and thus expects its refcnt * to be positive on invocation. IOW, rcu_read_lock() protection on @@ -638,6 +640,39 @@ out_put: * each lookup. On mismatch, the existing wb is discarded and a new one is * created. */ +struct bdi_writeback *wb_get_lookup(struct backing_dev_info *bdi, + struct cgroup_subsys_state *memcg_css) +{ + struct bdi_writeback *wb; + + if (!memcg_css->parent) + return &bdi->wb; + + rcu_read_lock(); + wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id); + if (wb) { + struct cgroup_subsys_state *blkcg_css; + + /* see whether the blkcg association has changed */ + blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys); + if (unlikely(wb->blkcg_css != blkcg_css || !wb_tryget(wb))) + wb = NULL; + css_put(blkcg_css); + } + rcu_read_unlock(); + + return wb; +} + +/** + * wb_get_create - get wb for a given memcg, create if necessary + * @bdi: target bdi + * @memcg_css: cgroup_subsys_state of the target memcg (must have positive ref) + * @gfp: allocation mask to use + * + * Try to get the wb for @memcg_css on @bdi. If it doesn't exist, try to + * create one. See wb_get_lookup() for more details. 
+ */ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, struct cgroup_subsys_state *memcg_css, gfp_t gfp) @@ -650,20 +685,7 @@ struct bdi_writeback *wb_get_create(struct backing_dev_info *bdi, return &bdi->wb; do { - rcu_read_lock(); - wb = radix_tree_lookup(&bdi->cgwb_tree, memcg_css->id); - if (wb) { - struct cgroup_subsys_state *blkcg_css; - - /* see whether the blkcg association has changed */ - blkcg_css = cgroup_get_e_css(memcg_css->cgroup, - &io_cgrp_subsys); - if (unlikely(wb->blkcg_css != blkcg_css || - !wb_tryget(wb))) - wb = NULL; - css_put(blkcg_css); - } - rcu_read_unlock(); + wb = wb_get_lookup(bdi, memcg_css); } while (!wb && !cgwb_create(bdi, memcg_css, gfp)); return wb; @@ -859,9 +881,58 @@ struct backing_dev_info *bdi_alloc_node(gfp_t gfp_mask, int node_id) } EXPORT_SYMBOL(bdi_alloc_node); +static struct rb_node **bdi_lookup_rb_node(u64 id, struct rb_node **parentp) +{ + struct rb_node **p = &bdi_tree.rb_node; + struct rb_node *parent = NULL; + struct backing_dev_info *bdi; + + lockdep_assert_held(&bdi_lock); + + while (*p) { + parent = *p; + bdi = rb_entry(parent, struct backing_dev_info, rb_node); + + if (bdi->id > id) + p = &(*p)->rb_left; + else if (bdi->id < id) + p = &(*p)->rb_right; + else + break; + } + + if (parentp) + *parentp = parent; + return p; +} + +/** + * bdi_get_by_id - lookup and get bdi from its id + * @id: bdi id to lookup + * + * Find bdi matching @id and get it. Returns NULL if the matching bdi + * doesn't exist or is already unregistered. + */ +struct backing_dev_info *bdi_get_by_id(u64 id) +{ + struct backing_dev_info *bdi = NULL; + struct rb_node **p; + + spin_lock_bh(&bdi_lock); + p = bdi_lookup_rb_node(id, NULL); + if (*p) { + bdi = rb_entry(*p, struct backing_dev_info, rb_node); + bdi_get(bdi); + } + spin_unlock_bh(&bdi_lock); + + return bdi; +} + int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args) { struct device *dev; + struct rb_node *parent, **p; if (bdi->dev) /* The driver needs to use separate queues per device */ return 0; @@ -877,7 +948,15 @@ int bdi_register_va(struct backing_dev_info *bdi, const char *fmt, va_list args) set_bit(WB_registered, &bdi->wb.state); spin_lock_bh(&bdi_lock); + + bdi->id = ++bdi_id_cursor; + + p = bdi_lookup_rb_node(bdi->id, &parent); + rb_link_node(&bdi->rb_node, parent, p); + rb_insert_color(&bdi->rb_node, &bdi_tree); + list_add_tail_rcu(&bdi->bdi_list, &bdi_list); + spin_unlock_bh(&bdi_lock); trace_writeback_bdi_register(bdi); @@ -918,6 +997,7 @@ EXPORT_SYMBOL(bdi_register_owner); static void bdi_remove_from_list(struct backing_dev_info *bdi) { spin_lock_bh(&bdi_lock); + rb_erase(&bdi->rb_node, &bdi_tree); list_del_rcu(&bdi->bdi_list); spin_unlock_bh(&bdi_lock); diff --git a/mm/balloon_compaction.c b/mm/balloon_compaction.c index 798275a51887..26de020aae7b 100644 --- a/mm/balloon_compaction.c +++ b/mm/balloon_compaction.c @@ -124,7 +124,8 @@ EXPORT_SYMBOL_GPL(balloon_page_list_dequeue); struct page *balloon_page_alloc(void) { struct page *page = alloc_page(balloon_mapping_gfp_mask() | - __GFP_NOMEMALLOC | __GFP_NORETRY); + __GFP_NOMEMALLOC | __GFP_NORETRY | + __GFP_NOWARN); return page; } EXPORT_SYMBOL_GPL(balloon_page_alloc); diff --git a/mm/fadvise.c b/mm/fadvise.c index 467bcd032037..4f17c83db575 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -27,8 +27,7 @@ * deactivate the pages and clear PG_Referenced. 
*/ -static int generic_fadvise(struct file *file, loff_t offset, loff_t len, - int advice) +int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice) { struct inode *inode; struct address_space *mapping; @@ -178,6 +177,7 @@ static int generic_fadvise(struct file *file, loff_t offset, loff_t len, } return 0; } +EXPORT_SYMBOL(generic_fadvise); int vfs_fadvise(struct file *file, loff_t offset, loff_t len, int advice) { diff --git a/mm/filemap.c b/mm/filemap.c index d0cf700bf201..40667c2f3383 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2988,6 +2988,9 @@ inline ssize_t generic_write_checks(struct kiocb *iocb, struct iov_iter *from) loff_t count; int ret; + if (IS_SWAPFILE(inode)) + return -ETXTBSY; + if (!iov_iter_count(from)) return 0; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 738065f765ab..de1f15969e27 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -32,6 +32,7 @@ #include <linux/shmem_fs.h> #include <linux/oom.h> #include <linux/numa.h> +#include <linux/page_owner.h> #include <asm/tlb.h> #include <asm/pgalloc.h> @@ -2516,6 +2517,9 @@ static void __split_huge_page(struct page *page, struct list_head *list, } ClearPageCompound(head); + + split_page_owner(head, HPAGE_PMD_ORDER); + /* See comment in __split_huge_page_tail() */ if (PageAnon(head)) { /* Additional pin to swap cache */ diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 2277b82902d8..95d16a42db6b 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -407,8 +407,14 @@ static inline bool shadow_invalid(u8 tag, s8 shadow_byte) if (IS_ENABLED(CONFIG_KASAN_GENERIC)) return shadow_byte < 0 || shadow_byte >= KASAN_SHADOW_SCALE_SIZE; - else - return tag != (u8)shadow_byte; + + /* else CONFIG_KASAN_SW_TAGS: */ + if ((u8)shadow_byte == KASAN_TAG_INVALID) + return true; + if ((tag != KASAN_TAG_KERNEL) && (tag != (u8)shadow_byte)) + return true; + + return false; } static bool __kasan_slab_free(struct kmem_cache *cache, void *object, diff --git a/mm/khugepaged.c b/mm/khugepaged.c index eaaa21b23215..ccede2425c3f 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -710,7 +710,7 @@ static bool khugepaged_scan_abort(int nid) for (i = 0; i < MAX_NUMNODES; i++) { if (!khugepaged_node_load[i]) continue; - if (node_distance(nid, i) > RECLAIM_DISTANCE) + if (node_distance(nid, i) > node_reclaim_distance) return true; } return false; diff --git a/mm/madvise.c b/mm/madvise.c index afe2b015ea58..88babcc384b9 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -14,6 +14,7 @@ #include <linux/userfaultfd_k.h> #include <linux/hugetlb.h> #include <linux/falloc.h> +#include <linux/fadvise.h> #include <linux/sched.h> #include <linux/ksm.h> #include <linux/fs.h> @@ -266,6 +267,7 @@ static long madvise_willneed(struct vm_area_struct *vma, unsigned long start, unsigned long end) { struct file *file = vma->vm_file; + loff_t offset; *prev = vma; #ifdef CONFIG_SWAP @@ -290,12 +292,20 @@ static long madvise_willneed(struct vm_area_struct *vma, return 0; } - start = ((start - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; - if (end > vma->vm_end) - end = vma->vm_end; - end = ((end - vma->vm_start) >> PAGE_SHIFT) + vma->vm_pgoff; - - force_page_cache_readahead(file->f_mapping, file, start, end - start); + /* + * Filesystem's fadvise may need to take various locks. We need to + * explicitly grab a reference because the vma (and hence the + * vma's reference to the file) can go away as soon as we drop + * mmap_sem. 
+ */ + *prev = NULL; /* tell sys_madvise we drop mmap_sem */ + get_file(file); + up_read(&current->mm->mmap_sem); + offset = (loff_t)(start - vma->vm_start) + + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); + vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED); + fput(file); + down_read(&current->mm->mmap_sem); return 0; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9b2516a76be2..f3c15bb07cce 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -87,6 +87,10 @@ int do_swap_account __read_mostly; #define do_swap_account 0 #endif +#ifdef CONFIG_CGROUP_WRITEBACK +static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq); +#endif + /* Whether legacy memory+swap accounting is active */ static bool do_memsw_account(void) { @@ -752,15 +756,13 @@ void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, /* Update memcg */ __mod_memcg_state(memcg, idx, val); + /* Update lruvec */ + __this_cpu_add(pn->lruvec_stat_local->count[idx], val); + x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]); if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) { struct mem_cgroup_per_node *pi; - /* - * Batch local counters to keep them in sync with - * the hierarchical ones. - */ - __this_cpu_add(pn->lruvec_stat_local->count[idx], x); for (pi = pn; pi; pi = parent_nodeinfo(pi, pgdat->node_id)) atomic_long_add(x, &pi->lruvec_stat[idx]); x = 0; @@ -3260,6 +3262,72 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, } } +static void memcg_flush_percpu_vmstats(struct mem_cgroup *memcg, bool slab_only) +{ + unsigned long stat[MEMCG_NR_STAT]; + struct mem_cgroup *mi; + int node, cpu, i; + int min_idx, max_idx; + + if (slab_only) { + min_idx = NR_SLAB_RECLAIMABLE; + max_idx = NR_SLAB_UNRECLAIMABLE; + } else { + min_idx = 0; + max_idx = MEMCG_NR_STAT; + } + + for (i = min_idx; i < max_idx; i++) + stat[i] = 0; + + for_each_online_cpu(cpu) + for (i = min_idx; i < max_idx; i++) + stat[i] += per_cpu(memcg->vmstats_percpu->stat[i], cpu); + + for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) + for (i = min_idx; i < max_idx; i++) + atomic_long_add(stat[i], &mi->vmstats[i]); + + if (!slab_only) + max_idx = NR_VM_NODE_STAT_ITEMS; + + for_each_node(node) { + struct mem_cgroup_per_node *pn = memcg->nodeinfo[node]; + struct mem_cgroup_per_node *pi; + + for (i = min_idx; i < max_idx; i++) + stat[i] = 0; + + for_each_online_cpu(cpu) + for (i = min_idx; i < max_idx; i++) + stat[i] += per_cpu( + pn->lruvec_stat_cpu->count[i], cpu); + + for (pi = pn; pi; pi = parent_nodeinfo(pi, node)) + for (i = min_idx; i < max_idx; i++) + atomic_long_add(stat[i], &pi->lruvec_stat[i]); + } +} + +static void memcg_flush_percpu_vmevents(struct mem_cgroup *memcg) +{ + unsigned long events[NR_VM_EVENT_ITEMS]; + struct mem_cgroup *mi; + int cpu, i; + + for (i = 0; i < NR_VM_EVENT_ITEMS; i++) + events[i] = 0; + + for_each_online_cpu(cpu) + for (i = 0; i < NR_VM_EVENT_ITEMS; i++) + events[i] += per_cpu(memcg->vmstats_percpu->events[i], + cpu); + + for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) + for (i = 0; i < NR_VM_EVENT_ITEMS; i++) + atomic_long_add(events[i], &mi->vmevents[i]); +} + #ifdef CONFIG_MEMCG_KMEM static int memcg_online_kmem(struct mem_cgroup *memcg) { @@ -3309,7 +3377,14 @@ static void memcg_offline_kmem(struct mem_cgroup *memcg) if (!parent) parent = root_mem_cgroup; + /* + * Deactivate and reparent kmem_caches. Then flush percpu + * slab statistics to have precise values at the parent and + * all ancestor levels. It's required to keep slab stats + * accurate after the reparenting of kmem_caches. 
+ */ memcg_deactivate_kmem_caches(memcg, parent); + memcg_flush_percpu_vmstats(memcg, true); kmemcg_id = memcg->kmemcg_id; BUG_ON(kmemcg_id < 0); @@ -4101,6 +4176,8 @@ static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, #ifdef CONFIG_CGROUP_WRITEBACK +#include <trace/events/writeback.h> + static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) { return wb_domain_init(&memcg->cgwb_domain, gfp); @@ -4184,6 +4261,130 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, } } +/* + * Foreign dirty flushing + * + * There's an inherent mismatch between memcg and writeback. The former + * trackes ownership per-page while the latter per-inode. This was a + * deliberate design decision because honoring per-page ownership in the + * writeback path is complicated, may lead to higher CPU and IO overheads + * and deemed unnecessary given that write-sharing an inode across + * different cgroups isn't a common use-case. + * + * Combined with inode majority-writer ownership switching, this works well + * enough in most cases but there are some pathological cases. For + * example, let's say there are two cgroups A and B which keep writing to + * different but confined parts of the same inode. B owns the inode and + * A's memory is limited far below B's. A's dirty ratio can rise enough to + * trigger balance_dirty_pages() sleeps but B's can be low enough to avoid + * triggering background writeback. A will be slowed down without a way to + * make writeback of the dirty pages happen. + * + * Conditions like the above can lead to a cgroup getting repatedly and + * severely throttled after making some progress after each + * dirty_expire_interval while the underyling IO device is almost + * completely idle. + * + * Solving this problem completely requires matching the ownership tracking + * granularities between memcg and writeback in either direction. However, + * the more egregious behaviors can be avoided by simply remembering the + * most recent foreign dirtying events and initiating remote flushes on + * them when local writeback isn't enough to keep the memory clean enough. + * + * The following two functions implement such mechanism. When a foreign + * page - a page whose memcg and writeback ownerships don't match - is + * dirtied, mem_cgroup_track_foreign_dirty() records the inode owning + * bdi_writeback on the page owning memcg. When balance_dirty_pages() + * decides that the memcg needs to sleep due to high dirty ratio, it calls + * mem_cgroup_flush_foreign() which queues writeback on the recorded + * foreign bdi_writebacks which haven't expired. Both the numbers of + * recorded bdi_writebacks and concurrent in-flight foreign writebacks are + * limited to MEMCG_CGWB_FRN_CNT. + * + * The mechanism only remembers IDs and doesn't hold any object references. + * As being wrong occasionally doesn't matter, updates and accesses to the + * records are lockless and racy. + */ +void mem_cgroup_track_foreign_dirty_slowpath(struct page *page, + struct bdi_writeback *wb) +{ + struct mem_cgroup *memcg = page->mem_cgroup; + struct memcg_cgwb_frn *frn; + u64 now = get_jiffies_64(); + u64 oldest_at = now; + int oldest = -1; + int i; + + trace_track_foreign_dirty(page, wb); + + /* + * Pick the slot to use. If there is already a slot for @wb, keep + * using it. If not replace the oldest one which isn't being + * written out. 
+ */ + for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) { + frn = &memcg->cgwb_frn[i]; + if (frn->bdi_id == wb->bdi->id && + frn->memcg_id == wb->memcg_css->id) + break; + if (time_before64(frn->at, oldest_at) && + atomic_read(&frn->done.cnt) == 1) { + oldest = i; + oldest_at = frn->at; + } + } + + if (i < MEMCG_CGWB_FRN_CNT) { + /* + * Re-using an existing one. Update timestamp lazily to + * avoid making the cacheline hot. We want them to be + * reasonably up-to-date and significantly shorter than + * dirty_expire_interval as that's what expires the record. + * Use the shorter of 1s and dirty_expire_interval / 8. + */ + unsigned long update_intv = + min_t(unsigned long, HZ, + msecs_to_jiffies(dirty_expire_interval * 10) / 8); + + if (time_before64(frn->at, now - update_intv)) + frn->at = now; + } else if (oldest >= 0) { + /* replace the oldest free one */ + frn = &memcg->cgwb_frn[oldest]; + frn->bdi_id = wb->bdi->id; + frn->memcg_id = wb->memcg_css->id; + frn->at = now; + } +} + +/* issue foreign writeback flushes for recorded foreign dirtying events */ +void mem_cgroup_flush_foreign(struct bdi_writeback *wb) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css); + unsigned long intv = msecs_to_jiffies(dirty_expire_interval * 10); + u64 now = jiffies_64; + int i; + + for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) { + struct memcg_cgwb_frn *frn = &memcg->cgwb_frn[i]; + + /* + * If the record is older than dirty_expire_interval, + * writeback on it has already started. No need to kick it + * off again. Also, don't start a new one if there's + * already one in flight. + */ + if (time_after64(frn->at, now - intv) && + atomic_read(&frn->done.cnt) == 1) { + frn->at = 0; + trace_flush_foreign(wb, frn->bdi_id, frn->memcg_id); + cgroup_writeback_by_id(frn->bdi_id, frn->memcg_id, 0, + WB_REASON_FOREIGN_FLUSH, + &frn->done); + } + } +} + #else /* CONFIG_CGROUP_WRITEBACK */ static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) @@ -4682,6 +4883,12 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) { int node; + /* + * Flush percpu vmstats and vmevents to guarantee the value correctness + * on parent's and all ancestor levels. 
+ */ + memcg_flush_percpu_vmstats(memcg, false); + memcg_flush_percpu_vmevents(memcg); for_each_node(node) free_mem_cgroup_per_node_info(memcg, node); free_percpu(memcg->vmstats_percpu); @@ -4700,6 +4907,7 @@ static struct mem_cgroup *mem_cgroup_alloc(void) struct mem_cgroup *memcg; unsigned int size; int node; + int __maybe_unused i; size = sizeof(struct mem_cgroup); size += nr_node_ids * sizeof(struct mem_cgroup_per_node *); @@ -4743,6 +4951,9 @@ static struct mem_cgroup *mem_cgroup_alloc(void) #endif #ifdef CONFIG_CGROUP_WRITEBACK INIT_LIST_HEAD(&memcg->cgwb_list); + for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) + memcg->cgwb_frn[i].done = + __WB_COMPLETION_INIT(&memcg_cgwb_frn_waitq); #endif idr_replace(&mem_cgroup_idr, memcg, memcg->id.id); return memcg; @@ -4872,7 +5083,12 @@ static void mem_cgroup_css_released(struct cgroup_subsys_state *css) static void mem_cgroup_css_free(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); + int __maybe_unused i; +#ifdef CONFIG_CGROUP_WRITEBACK + for (i = 0; i < MEMCG_CGWB_FRN_CNT; i++) + wb_wait_for_completion(&memcg->cgwb_frn[i].done); +#endif if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket) static_branch_dec(&memcg_sockets_enabled_key); diff --git a/mm/memory.c b/mm/memory.c index e2bb51b6242e..b1dff75640b7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2196,6 +2196,10 @@ static vm_fault_t do_page_mkwrite(struct vm_fault *vmf) vmf->flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; + if (vmf->vma->vm_file && + IS_SWAPFILE(vmf->vma->vm_file->f_mapping->host)) + return VM_FAULT_SIGBUS; + ret = vmf->vma->vm_ops->page_mkwrite(vmf); /* Restore original flags so that caller is not surprised */ vmf->flags = old_flags; diff --git a/mm/mmap.c b/mm/mmap.c index 7e8c3e8ae75f..6bc21fca20bc 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1483,8 +1483,12 @@ unsigned long do_mmap(struct file *file, unsigned long addr, case MAP_SHARED_VALIDATE: if (flags & ~flags_mask) return -EOPNOTSUPP; - if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE)) - return -EACCES; + if (prot & PROT_WRITE) { + if (!(file->f_mode & FMODE_WRITE)) + return -EACCES; + if (IS_SWAPFILE(file->f_mapping->host)) + return -ETXTBSY; + } /* * Make sure we don't allow writing to an append-only diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 1804f64ff43c..50055d2e4ea8 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -1667,6 +1667,8 @@ static void balance_dirty_pages(struct bdi_writeback *wb, if (unlikely(!writeback_in_progress(wb))) wb_start_background_writeback(wb); + mem_cgroup_flush_foreign(wb); + /* * Calculate global domain's pos_ratio and select the * global dtc by default. @@ -2427,6 +2429,8 @@ void account_page_dirtied(struct page *page, struct address_space *mapping) task_io_account_write(PAGE_SIZE); current->nr_dirtied++; this_cpu_inc(bdp_ratelimits); + + mem_cgroup_track_foreign_dirty(page, wb); } } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b39baa2b1faf..ff5484fdbdf9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2238,27 +2238,12 @@ static int move_freepages(struct zone *zone, unsigned int order; int pages_moved = 0; -#ifndef CONFIG_HOLES_IN_ZONE - /* - * page_zone is not safe to call in this context when - * CONFIG_HOLES_IN_ZONE is set. This bug check is probably redundant - * anyway as we check zone boundaries in move_freepages_block(). 
- * Remove at a later date when no bug reports exist related to - * grouping pages by mobility - */ - VM_BUG_ON(pfn_valid(page_to_pfn(start_page)) && - pfn_valid(page_to_pfn(end_page)) && - page_zone(start_page) != page_zone(end_page)); -#endif for (page = start_page; page <= end_page;) { if (!pfn_valid_within(page_to_pfn(page))) { page++; continue; } - /* Make sure we are not inadvertently changing nodes */ - VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page); - if (!PageBuddy(page)) { /* * We assume that pages that could be isolated for @@ -2273,6 +2258,10 @@ static int move_freepages(struct zone *zone, continue; } + /* Make sure we are not inadvertently changing nodes */ + VM_BUG_ON_PAGE(page_to_nid(page) != zone_to_nid(zone), page); + VM_BUG_ON_PAGE(page_zone(page) != zone, page); + order = page_order(page); move_to_free_area(page, &zone->free_area[order], migratetype); page += 1 << order; @@ -3522,7 +3511,7 @@ bool zone_watermark_ok_safe(struct zone *z, unsigned int order, static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) { return node_distance(zone_to_nid(local_zone), zone_to_nid(zone)) <= - RECLAIM_DISTANCE; + node_reclaim_distance; } #else /* CONFIG_NUMA */ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone) diff --git a/mm/percpu.c b/mm/percpu.c index 9821241fdede..7e06a1e58720 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -2125,7 +2125,7 @@ struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups, void *ptr; int unit; - base_size = ALIGN(sizeof(*ai) + nr_groups * sizeof(ai->groups[0]), + base_size = ALIGN(struct_size(ai, groups, nr_groups), __alignof__(ai->groups[0].cpu_map[0])); ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]); @@ -2220,7 +2220,7 @@ static void pcpu_dump_alloc_info(const char *lvl, * @base_addr: mapped address * * Initialize the first percpu chunk which contains the kernel static - * perpcu area. This function is to be called from arch percpu area + * percpu area. This function is to be called from arch percpu area * setup path. * * @ai contains all information necessary to initialize the first @@ -2267,12 +2267,9 @@ static void pcpu_dump_alloc_info(const char *lvl, * share the same vm, but use offset regions in the area allocation map. * The chunk serving the dynamic region is circulated in the chunk slots * and available for dynamic allocation like any other chunk. - * - * RETURNS: - * 0 on success, -errno on failure. 
*/ -int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, - void *base_addr) +void __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, + void *base_addr) { size_t size_sum = ai->static_size + ai->reserved_size + ai->dyn_size; size_t static_size, dyn_size; @@ -2457,7 +2454,6 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, /* we're done */ pcpu_base_addr = base_addr; - return 0; } #ifdef CONFIG_SMP @@ -2710,7 +2706,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, struct pcpu_alloc_info *ai; size_t size_sum, areas_size; unsigned long max_distance; - int group, i, highest_group, rc; + int group, i, highest_group, rc = 0; ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size, cpu_distance_fn); @@ -2795,7 +2791,7 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, PFN_DOWN(size_sum), ai->static_size, ai->reserved_size, ai->dyn_size, ai->unit_size); - rc = pcpu_setup_first_chunk(ai, base); + pcpu_setup_first_chunk(ai, base); goto out_free; out_free_areas: @@ -2839,7 +2835,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, int unit_pages; size_t pages_size; struct page **pages; - int unit, i, j, rc; + int unit, i, j, rc = 0; int upa; int nr_g0_units; @@ -2920,7 +2916,7 @@ int __init pcpu_page_first_chunk(size_t reserved_size, unit_pages, psize_str, ai->static_size, ai->reserved_size, ai->dyn_size); - rc = pcpu_setup_first_chunk(ai, vm.addr); + pcpu_setup_first_chunk(ai, vm.addr); goto out_free_ar; enomem: @@ -3014,8 +3010,7 @@ void __init setup_per_cpu_areas(void) ai->groups[0].nr_units = 1; ai->groups[0].cpu_map[0] = 0; - if (pcpu_setup_first_chunk(ai, fc) < 0) - panic("Failed to initialize percpu areas."); + pcpu_setup_first_chunk(ai, fc); pcpu_free_alloc_info(ai); } diff --git a/mm/shmem.c b/mm/shmem.c index 2bed4761f279..0f7fd4a85db6 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -37,6 +37,7 @@ #include <linux/khugepaged.h> #include <linux/hugetlb.h> #include <linux/frontswap.h> +#include <linux/fs_parser.h> #include <asm/tlbflush.h> /* for arch/microblaze update_mmu_cache() */ @@ -107,6 +108,20 @@ struct shmem_falloc { pgoff_t nr_unswapped; /* how often writepage refused to swap out */ }; +struct shmem_options { + unsigned long long blocks; + unsigned long long inodes; + struct mempolicy *mpol; + kuid_t uid; + kgid_t gid; + umode_t mode; + int huge; + int seen; +#define SHMEM_SEEN_BLOCKS 1 +#define SHMEM_SEEN_INODES 2 +#define SHMEM_SEEN_HUGE 4 +}; + #ifdef CONFIG_TMPFS static unsigned long shmem_default_max_blocks(void) { @@ -3349,16 +3364,126 @@ static const struct export_operations shmem_export_ops = { .fh_to_dentry = shmem_fh_to_dentry, }; -static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, - bool remount) +enum shmem_param { + Opt_gid, + Opt_huge, + Opt_mode, + Opt_mpol, + Opt_nr_blocks, + Opt_nr_inodes, + Opt_size, + Opt_uid, +}; + +static const struct fs_parameter_spec shmem_param_specs[] = { + fsparam_u32 ("gid", Opt_gid), + fsparam_enum ("huge", Opt_huge), + fsparam_u32oct("mode", Opt_mode), + fsparam_string("mpol", Opt_mpol), + fsparam_string("nr_blocks", Opt_nr_blocks), + fsparam_string("nr_inodes", Opt_nr_inodes), + fsparam_string("size", Opt_size), + fsparam_u32 ("uid", Opt_uid), + {} +}; + +static const struct fs_parameter_enum shmem_param_enums[] = { + { Opt_huge, "never", SHMEM_HUGE_NEVER }, + { Opt_huge, "always", SHMEM_HUGE_ALWAYS }, + { Opt_huge, "within_size", SHMEM_HUGE_WITHIN_SIZE }, + { Opt_huge, "advise", SHMEM_HUGE_ADVISE }, + {} 
+}; + +const struct fs_parameter_description shmem_fs_parameters = { + .name = "tmpfs", + .specs = shmem_param_specs, + .enums = shmem_param_enums, +}; + +static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param) { - char *this_char, *value, *rest; - struct mempolicy *mpol = NULL; - uid_t uid; - gid_t gid; + struct shmem_options *ctx = fc->fs_private; + struct fs_parse_result result; + unsigned long long size; + char *rest; + int opt; + + opt = fs_parse(fc, &shmem_fs_parameters, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_size: + size = memparse(param->string, &rest); + if (*rest == '%') { + size <<= PAGE_SHIFT; + size *= totalram_pages(); + do_div(size, 100); + rest++; + } + if (*rest) + goto bad_value; + ctx->blocks = DIV_ROUND_UP(size, PAGE_SIZE); + ctx->seen |= SHMEM_SEEN_BLOCKS; + break; + case Opt_nr_blocks: + ctx->blocks = memparse(param->string, &rest); + if (*rest) + goto bad_value; + ctx->seen |= SHMEM_SEEN_BLOCKS; + break; + case Opt_nr_inodes: + ctx->inodes = memparse(param->string, &rest); + if (*rest) + goto bad_value; + ctx->seen |= SHMEM_SEEN_INODES; + break; + case Opt_mode: + ctx->mode = result.uint_32 & 07777; + break; + case Opt_uid: + ctx->uid = make_kuid(current_user_ns(), result.uint_32); + if (!uid_valid(ctx->uid)) + goto bad_value; + break; + case Opt_gid: + ctx->gid = make_kgid(current_user_ns(), result.uint_32); + if (!gid_valid(ctx->gid)) + goto bad_value; + break; + case Opt_huge: + ctx->huge = result.uint_32; + if (ctx->huge != SHMEM_HUGE_NEVER && + !(IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && + has_transparent_hugepage())) + goto unsupported_parameter; + ctx->seen |= SHMEM_SEEN_HUGE; + break; + case Opt_mpol: + if (IS_ENABLED(CONFIG_NUMA)) { + mpol_put(ctx->mpol); + ctx->mpol = NULL; + if (mpol_parse_str(param->string, &ctx->mpol)) + goto bad_value; + break; + } + goto unsupported_parameter; + } + return 0; + +unsupported_parameter: + return invalf(fc, "tmpfs: Unsupported parameter '%s'", param->key); +bad_value: + return invalf(fc, "tmpfs: Bad value for '%s'", param->key); +} + +static int shmem_parse_options(struct fs_context *fc, void *data) +{ + char *options = data; while (options != NULL) { - this_char = options; + char *this_char = options; for (;;) { /* * NUL-terminate this option: unfortunately, @@ -3374,139 +3499,83 @@ static int shmem_parse_options(char *options, struct shmem_sb_info *sbinfo, break; } } - if (!*this_char) - continue; - if ((value = strchr(this_char,'=')) != NULL) { - *value++ = 0; - } else { - pr_err("tmpfs: No value for mount option '%s'\n", - this_char); - goto error; - } - - if (!strcmp(this_char,"size")) { - unsigned long long size; - size = memparse(value,&rest); - if (*rest == '%') { - size <<= PAGE_SHIFT; - size *= totalram_pages(); - do_div(size, 100); - rest++; + if (*this_char) { + char *value = strchr(this_char,'='); + size_t len = 0; + int err; + + if (value) { + *value++ = '\0'; + len = strlen(value); } - if (*rest) - goto bad_val; - sbinfo->max_blocks = - DIV_ROUND_UP(size, PAGE_SIZE); - } else if (!strcmp(this_char,"nr_blocks")) { - sbinfo->max_blocks = memparse(value, &rest); - if (*rest) - goto bad_val; - } else if (!strcmp(this_char,"nr_inodes")) { - sbinfo->max_inodes = memparse(value, &rest); - if (*rest) - goto bad_val; - } else if (!strcmp(this_char,"mode")) { - if (remount) - continue; - sbinfo->mode = simple_strtoul(value, &rest, 8) & 07777; - if (*rest) - goto bad_val; - } else if (!strcmp(this_char,"uid")) { - if (remount) - continue; - uid = 
simple_strtoul(value, &rest, 0); - if (*rest) - goto bad_val; - sbinfo->uid = make_kuid(current_user_ns(), uid); - if (!uid_valid(sbinfo->uid)) - goto bad_val; - } else if (!strcmp(this_char,"gid")) { - if (remount) - continue; - gid = simple_strtoul(value, &rest, 0); - if (*rest) - goto bad_val; - sbinfo->gid = make_kgid(current_user_ns(), gid); - if (!gid_valid(sbinfo->gid)) - goto bad_val; -#ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE - } else if (!strcmp(this_char, "huge")) { - int huge; - huge = shmem_parse_huge(value); - if (huge < 0) - goto bad_val; - if (!has_transparent_hugepage() && - huge != SHMEM_HUGE_NEVER) - goto bad_val; - sbinfo->huge = huge; -#endif -#ifdef CONFIG_NUMA - } else if (!strcmp(this_char,"mpol")) { - mpol_put(mpol); - mpol = NULL; - if (mpol_parse_str(value, &mpol)) - goto bad_val; -#endif - } else { - pr_err("tmpfs: Bad mount option %s\n", this_char); - goto error; + err = vfs_parse_fs_string(fc, this_char, value, len); + if (err < 0) + return err; } } - sbinfo->mpol = mpol; return 0; - -bad_val: - pr_err("tmpfs: Bad value '%s' for mount option '%s'\n", - value, this_char); -error: - mpol_put(mpol); - return 1; - } -static int shmem_remount_fs(struct super_block *sb, int *flags, char *data) +/* + * Reconfigure a shmem filesystem. + * + * Note that we disallow change from limited->unlimited blocks/inodes while any + * are in use; but we must separately disallow unlimited->limited, because in + * that case we have no record of how much is already in use. + */ +static int shmem_reconfigure(struct fs_context *fc) { - struct shmem_sb_info *sbinfo = SHMEM_SB(sb); - struct shmem_sb_info config = *sbinfo; + struct shmem_options *ctx = fc->fs_private; + struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb); unsigned long inodes; - int error = -EINVAL; - - config.mpol = NULL; - if (shmem_parse_options(data, &config, true)) - return error; + const char *err; spin_lock(&sbinfo->stat_lock); inodes = sbinfo->max_inodes - sbinfo->free_inodes; - if (percpu_counter_compare(&sbinfo->used_blocks, config.max_blocks) > 0) - goto out; - if (config.max_inodes < inodes) - goto out; - /* - * Those tests disallow limited->unlimited while any are in use; - * but we must separately disallow unlimited->limited, because - * in that case we have no record of how much is already in use. - */ - if (config.max_blocks && !sbinfo->max_blocks) - goto out; - if (config.max_inodes && !sbinfo->max_inodes) - goto out; + if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) { + if (!sbinfo->max_blocks) { + err = "Cannot retroactively limit size"; + goto out; + } + if (percpu_counter_compare(&sbinfo->used_blocks, + ctx->blocks) > 0) { + err = "Too small a size for current use"; + goto out; + } + } + if ((ctx->seen & SHMEM_SEEN_INODES) && ctx->inodes) { + if (!sbinfo->max_inodes) { + err = "Cannot retroactively limit inodes"; + goto out; + } + if (ctx->inodes < inodes) { + err = "Too few inodes for current use"; + goto out; + } + } - error = 0; - sbinfo->huge = config.huge; - sbinfo->max_blocks = config.max_blocks; - sbinfo->max_inodes = config.max_inodes; - sbinfo->free_inodes = config.max_inodes - inodes; + if (ctx->seen & SHMEM_SEEN_HUGE) + sbinfo->huge = ctx->huge; + if (ctx->seen & SHMEM_SEEN_BLOCKS) + sbinfo->max_blocks = ctx->blocks; + if (ctx->seen & SHMEM_SEEN_INODES) { + sbinfo->max_inodes = ctx->inodes; + sbinfo->free_inodes = ctx->inodes - inodes; + } /* * Preserve previous mempolicy unless mpol remount option was specified. 
*/ - if (config.mpol) { + if (ctx->mpol) { mpol_put(sbinfo->mpol); - sbinfo->mpol = config.mpol; /* transfers initial ref */ + sbinfo->mpol = ctx->mpol; /* transfers initial ref */ + ctx->mpol = NULL; } + spin_unlock(&sbinfo->stat_lock); + return 0; out: spin_unlock(&sbinfo->stat_lock); - return error; + return invalf(fc, "tmpfs: %s", err); } static int shmem_show_options(struct seq_file *seq, struct dentry *root) @@ -3547,8 +3616,9 @@ static void shmem_put_super(struct super_block *sb) sb->s_fs_info = NULL; } -int shmem_fill_super(struct super_block *sb, void *data, int silent) +static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) { + struct shmem_options *ctx = fc->fs_private; struct inode *inode; struct shmem_sb_info *sbinfo; int err = -ENOMEM; @@ -3559,9 +3629,6 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent) if (!sbinfo) return -ENOMEM; - sbinfo->mode = 0777 | S_ISVTX; - sbinfo->uid = current_fsuid(); - sbinfo->gid = current_fsgid(); sb->s_fs_info = sbinfo; #ifdef CONFIG_TMPFS @@ -3571,12 +3638,10 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent) * but the internal instance is left unlimited. */ if (!(sb->s_flags & SB_KERNMOUNT)) { - sbinfo->max_blocks = shmem_default_max_blocks(); - sbinfo->max_inodes = shmem_default_max_inodes(); - if (shmem_parse_options(data, sbinfo, false)) { - err = -EINVAL; - goto failed; - } + if (!(ctx->seen & SHMEM_SEEN_BLOCKS)) + ctx->blocks = shmem_default_max_blocks(); + if (!(ctx->seen & SHMEM_SEEN_INODES)) + ctx->inodes = shmem_default_max_inodes(); } else { sb->s_flags |= SB_NOUSER; } @@ -3585,11 +3650,18 @@ int shmem_fill_super(struct super_block *sb, void *data, int silent) #else sb->s_flags |= SB_NOUSER; #endif + sbinfo->max_blocks = ctx->blocks; + sbinfo->free_inodes = sbinfo->max_inodes = ctx->inodes; + sbinfo->uid = ctx->uid; + sbinfo->gid = ctx->gid; + sbinfo->mode = ctx->mode; + sbinfo->huge = ctx->huge; + sbinfo->mpol = ctx->mpol; + ctx->mpol = NULL; spin_lock_init(&sbinfo->stat_lock); if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL)) goto failed; - sbinfo->free_inodes = sbinfo->max_inodes; spin_lock_init(&sbinfo->shrinklist_lock); INIT_LIST_HEAD(&sbinfo->shrinklist); @@ -3622,6 +3694,31 @@ failed: return err; } +static int shmem_get_tree(struct fs_context *fc) +{ + return get_tree_nodev(fc, shmem_fill_super); +} + +static void shmem_free_fc(struct fs_context *fc) +{ + struct shmem_options *ctx = fc->fs_private; + + if (ctx) { + mpol_put(ctx->mpol); + kfree(ctx); + } +} + +static const struct fs_context_operations shmem_fs_context_ops = { + .free = shmem_free_fc, + .get_tree = shmem_get_tree, +#ifdef CONFIG_TMPFS + .parse_monolithic = shmem_parse_options, + .parse_param = shmem_parse_one, + .reconfigure = shmem_reconfigure, +#endif +}; + static struct kmem_cache *shmem_inode_cachep; static struct inode *shmem_alloc_inode(struct super_block *sb) @@ -3738,7 +3835,6 @@ static const struct super_operations shmem_ops = { .destroy_inode = shmem_destroy_inode, #ifdef CONFIG_TMPFS .statfs = shmem_statfs, - .remount_fs = shmem_remount_fs, .show_options = shmem_show_options, #endif .evict_inode = shmem_evict_inode, @@ -3759,16 +3855,30 @@ static const struct vm_operations_struct shmem_vm_ops = { #endif }; -static struct dentry *shmem_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +int shmem_init_fs_context(struct fs_context *fc) { - return mount_nodev(fs_type, flags, data, shmem_fill_super); + struct shmem_options *ctx; + + ctx = 
kzalloc(sizeof(struct shmem_options), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->mode = 0777 | S_ISVTX; + ctx->uid = current_fsuid(); + ctx->gid = current_fsgid(); + + fc->fs_private = ctx; + fc->ops = &shmem_fs_context_ops; + return 0; } static struct file_system_type shmem_fs_type = { .owner = THIS_MODULE, .name = "tmpfs", - .mount = shmem_mount, + .init_fs_context = shmem_init_fs_context, +#ifdef CONFIG_TMPFS + .parameters = &shmem_fs_parameters, +#endif .kill_sb = kill_litter_super, .fs_flags = FS_USERNS_MOUNT, }; @@ -3912,7 +4022,8 @@ bool shmem_huge_enabled(struct vm_area_struct *vma) static struct file_system_type shmem_fs_type = { .name = "tmpfs", - .mount = ramfs_mount, + .init_fs_context = ramfs_init_fs_context, + .parameters = &ramfs_fs_parameters, .kill_sb = kill_litter_super, .fs_flags = FS_USERNS_MOUNT, }; diff --git a/mm/swapfile.c b/mm/swapfile.c index 0789a762ce2f..dab43523afdd 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2368,9 +2368,8 @@ EXPORT_SYMBOL_GPL(add_swap_extent); * requirements, they are simply tossed out - we will never use those blocks * for swapping. * - * For S_ISREG swapfiles we set S_SWAPFILE across the life of the swapon. This - * prevents root from shooting her foot off by ftruncating an in-use swapfile, - * which will scribble on the fs. + * For all swap devices we set S_SWAPFILE across the life of the swapon. This + * prevents users from writing to the swap device, which will corrupt memory. * * The amount of disk space which a single swap extent represents varies. * Typically it is in the 1-4 megabyte range. So we can have hundreds of @@ -2661,13 +2660,14 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) inode = mapping->host; if (S_ISBLK(inode->i_mode)) { struct block_device *bdev = I_BDEV(inode); + set_blocksize(bdev, old_block_size); blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); - } else { - inode_lock(inode); - inode->i_flags &= ~S_SWAPFILE; - inode_unlock(inode); } + + inode_lock(inode); + inode->i_flags &= ~S_SWAPFILE; + inode_unlock(inode); filp_close(swap_file, NULL); /* @@ -2890,11 +2890,11 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode) p->flags |= SWP_BLKDEV; } else if (S_ISREG(inode->i_mode)) { p->bdev = inode->i_sb->s_bdev; - inode_lock(inode); - if (IS_SWAPFILE(inode)) - return -EBUSY; - } else - return -EINVAL; + } + + inode_lock(inode); + if (IS_SWAPFILE(inode)) + return -EBUSY; return 0; } @@ -3275,6 +3275,17 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (error) goto bad_swap; + /* + * Flush any pending IO and dirty mappings before we start using this + * swap device. 
+ */ + inode->i_flags |= S_SWAPFILE; + error = inode_drain_writes(inode); + if (error) { + inode->i_flags &= ~S_SWAPFILE; + goto bad_swap; + } + mutex_lock(&swapon_mutex); prio = -1; if (swap_flags & SWAP_FLAG_PREFER) @@ -3295,8 +3306,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) atomic_inc(&proc_poll_event); wake_up_interruptible(&proc_poll_wait); - if (S_ISREG(inode->i_mode)) - inode->i_flags |= S_SWAPFILE; error = 0; goto out; bad_swap: @@ -3318,7 +3327,7 @@ bad_swap: if (inced_nr_rotate_swap) atomic_dec(&nr_rotate_swap); if (swap_file) { - if (inode && S_ISREG(inode->i_mode)) { + if (inode) { inode_unlock(inode); inode = NULL; } @@ -3331,7 +3340,7 @@ out: } if (name) putname(name); - if (inode && S_ISREG(inode->i_mode)) + if (inode) inode_unlock(inode); if (!error) enable_swap_slots_cache(); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 7ba11e12a11f..c1246d77cf75 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2993,7 +2993,7 @@ int remap_vmalloc_range_partial(struct vm_area_struct *vma, unsigned long uaddr, if (!area) return -EINVAL; - if (!(area->flags & VM_USERMAP)) + if (!(area->flags & (VM_USERMAP | VM_DMA_COHERENT))) return -EINVAL; if (kaddr + size > area->addr + get_vm_area_size(area)) @@ -3496,6 +3496,9 @@ static int s_show(struct seq_file *m, void *p) if (v->flags & VM_USERMAP) seq_puts(m, " user"); + if (v->flags & VM_DMA_COHERENT) + seq_puts(m, " dma-coherent"); + if (is_vmalloc_addr(v->pages)) seq_puts(m, " vpages"); diff --git a/mm/vmscan.c b/mm/vmscan.c index c77d1e3761a7..a6c5d0b28321 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3220,6 +3220,7 @@ unsigned long try_to_free_pages(struct zonelist *zonelist, int order, #ifdef CONFIG_MEMCG +/* Only used by soft limit reclaim. Do not reuse for anything else. */ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, gfp_t gfp_mask, bool noswap, pg_data_t *pgdat, @@ -3235,7 +3236,8 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, }; unsigned long lru_pages; - set_task_reclaim_state(current, &sc.reclaim_state); + WARN_ON_ONCE(!current->reclaim_state); + sc.gfp_mask = (gfp_mask & GFP_RECLAIM_MASK) | (GFP_HIGHUSER_MOVABLE & ~GFP_RECLAIM_MASK); @@ -3253,7 +3255,6 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, trace_mm_vmscan_memcg_softlimit_reclaim_end(sc.nr_reclaimed); - set_task_reclaim_state(current, NULL); *nr_scanned = sc.nr_scanned; return sc.nr_reclaimed; diff --git a/mm/z3fold.c b/mm/z3fold.c index ed19d98c9dcd..75b7962439ff 100644 --- a/mm/z3fold.c +++ b/mm/z3fold.c @@ -41,6 +41,7 @@ #include <linux/workqueue.h> #include <linux/slab.h> #include <linux/spinlock.h> +#include <linux/wait.h> #include <linux/zpool.h> #include <linux/magic.h> @@ -145,6 +146,8 @@ struct z3fold_header { * @release_wq: workqueue for safe page release * @work: work_struct for safe page release * @inode: inode for z3fold pseudo filesystem + * @destroying: bool to stop migration once we start destruction + * @isolated: int to count the number of pages currently in isolation * * This structure is allocated at pool creation time and maintains metadata * pertaining to a particular z3fold pool. 
@@ -163,8 +166,11 @@ struct z3fold_pool { const struct zpool_ops *zpool_ops; struct workqueue_struct *compact_wq; struct workqueue_struct *release_wq; + struct wait_queue_head isolate_wait; struct work_struct work; struct inode *inode; + bool destroying; + int isolated; }; /* @@ -769,6 +775,7 @@ static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp, goto out_c; spin_lock_init(&pool->lock); spin_lock_init(&pool->stale_lock); + init_waitqueue_head(&pool->isolate_wait); pool->unbuddied = __alloc_percpu(sizeof(struct list_head)*NCHUNKS, 2); if (!pool->unbuddied) goto out_pool; @@ -808,6 +815,15 @@ out: return NULL; } +static bool pool_isolated_are_drained(struct z3fold_pool *pool) +{ + bool ret; + + spin_lock(&pool->lock); + ret = pool->isolated == 0; + spin_unlock(&pool->lock); + return ret; +} /** * z3fold_destroy_pool() - destroys an existing z3fold pool * @pool: the z3fold pool to be destroyed @@ -817,6 +833,22 @@ out: static void z3fold_destroy_pool(struct z3fold_pool *pool) { kmem_cache_destroy(pool->c_handle); + /* + * We set pool-> destroying under lock to ensure that + * z3fold_page_isolate() sees any changes to destroying. This way we + * avoid the need for any memory barriers. + */ + + spin_lock(&pool->lock); + pool->destroying = true; + spin_unlock(&pool->lock); + + /* + * We need to ensure that no pages are being migrated while we destroy + * these workqueues, as migration can queue work on either of the + * workqueues. + */ + wait_event(pool->isolate_wait, !pool_isolated_are_drained(pool)); /* * We need to destroy pool->compact_wq before pool->release_wq, @@ -1307,6 +1339,28 @@ static u64 z3fold_get_pool_size(struct z3fold_pool *pool) return atomic64_read(&pool->pages_nr); } +/* + * z3fold_dec_isolated() expects to be called while pool->lock is held. + */ +static void z3fold_dec_isolated(struct z3fold_pool *pool) +{ + assert_spin_locked(&pool->lock); + VM_BUG_ON(pool->isolated <= 0); + pool->isolated--; + + /* + * If we have no more isolated pages, we have to see if + * z3fold_destroy_pool() is waiting for a signal. + */ + if (pool->isolated == 0 && waitqueue_active(&pool->isolate_wait)) + wake_up_all(&pool->isolate_wait); +} + +static void z3fold_inc_isolated(struct z3fold_pool *pool) +{ + pool->isolated++; +} + static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode) { struct z3fold_header *zhdr; @@ -1333,6 +1387,34 @@ static bool z3fold_page_isolate(struct page *page, isolate_mode_t mode) spin_lock(&pool->lock); if (!list_empty(&page->lru)) list_del(&page->lru); + /* + * We need to check for destruction while holding pool->lock, as + * otherwise destruction could see 0 isolated pages, and + * proceed. + */ + if (unlikely(pool->destroying)) { + spin_unlock(&pool->lock); + /* + * If this page isn't stale, somebody else holds a + * reference to it. Let't drop our refcount so that they + * can call the release logic. + */ + if (unlikely(kref_put(&zhdr->refcount, + release_z3fold_page_locked))) { + /* + * If we get here we have kref problems, so we + * should freak out. 
+ */ + WARN(1, "Z3fold is experiencing kref problems\n"); + z3fold_page_unlock(zhdr); + return false; + } + z3fold_page_unlock(zhdr); + return false; + } + + + z3fold_inc_isolated(pool); spin_unlock(&pool->lock); z3fold_page_unlock(zhdr); return true; @@ -1401,6 +1483,10 @@ static int z3fold_page_migrate(struct address_space *mapping, struct page *newpa queue_work_on(new_zhdr->cpu, pool->compact_wq, &new_zhdr->work); + spin_lock(&pool->lock); + z3fold_dec_isolated(pool); + spin_unlock(&pool->lock); + page_mapcount_reset(page); put_page(page); return 0; @@ -1420,10 +1506,14 @@ static void z3fold_page_putback(struct page *page) INIT_LIST_HEAD(&page->lru); if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) { atomic64_dec(&pool->pages_nr); + spin_lock(&pool->lock); + z3fold_dec_isolated(pool); + spin_unlock(&pool->lock); return; } spin_lock(&pool->lock); list_add(&page->lru, &pool->lru); + z3fold_dec_isolated(pool); spin_unlock(&pool->lock); z3fold_page_unlock(zhdr); } diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 57fbb7ced69f..e98bb6ab4f7e 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -54,6 +54,7 @@ #include <linux/mount.h> #include <linux/pseudo_fs.h> #include <linux/migrate.h> +#include <linux/wait.h> #include <linux/pagemap.h> #include <linux/fs.h> @@ -268,6 +269,10 @@ struct zs_pool { #ifdef CONFIG_COMPACTION struct inode *inode; struct work_struct free_work; + /* A wait queue for when migration races with async_free_zspage() */ + struct wait_queue_head migration_wait; + atomic_long_t isolated_pages; + bool destroying; #endif }; @@ -1862,6 +1867,31 @@ static void dec_zspage_isolation(struct zspage *zspage) zspage->isolated--; } +static void putback_zspage_deferred(struct zs_pool *pool, + struct size_class *class, + struct zspage *zspage) +{ + enum fullness_group fg; + + fg = putback_zspage(class, zspage); + if (fg == ZS_EMPTY) + schedule_work(&pool->free_work); + +} + +static inline void zs_pool_dec_isolated(struct zs_pool *pool) +{ + VM_BUG_ON(atomic_long_read(&pool->isolated_pages) <= 0); + atomic_long_dec(&pool->isolated_pages); + /* + * There's no possibility of racing, since wait_for_isolated_drain() + * checks the isolated count under &class->lock after enqueuing + * on migration_wait. + */ + if (atomic_long_read(&pool->isolated_pages) == 0 && pool->destroying) + wake_up_all(&pool->migration_wait); +} + static void replace_sub_page(struct size_class *class, struct zspage *zspage, struct page *newpage, struct page *oldpage) { @@ -1931,6 +1961,7 @@ static bool zs_page_isolate(struct page *page, isolate_mode_t mode) */ if (!list_empty(&zspage->list) && !is_zspage_isolated(zspage)) { get_zspage_mapping(zspage, &class_idx, &fullness); + atomic_long_inc(&pool->isolated_pages); remove_zspage(class, zspage, fullness); } @@ -2030,8 +2061,16 @@ static int zs_page_migrate(struct address_space *mapping, struct page *newpage, * Page migration is done so let's putback isolated zspage to * the list if @page is final isolated subpage in the zspage. */ - if (!is_zspage_isolated(zspage)) - putback_zspage(class, zspage); + if (!is_zspage_isolated(zspage)) { + /* + * We cannot race with zs_destroy_pool() here because we wait + * for isolation to hit zero before we start destroying. + * Also, we ensure that everyone can see pool->destroying before + * we start waiting. 
+ */ + putback_zspage_deferred(pool, class, zspage); + zs_pool_dec_isolated(pool); + } reset_page(page); put_page(page); @@ -2077,13 +2116,12 @@ static void zs_page_putback(struct page *page) spin_lock(&class->lock); dec_zspage_isolation(zspage); if (!is_zspage_isolated(zspage)) { - fg = putback_zspage(class, zspage); /* * Due to page_lock, we cannot free zspage immediately * so let's defer. */ - if (fg == ZS_EMPTY) - schedule_work(&pool->free_work); + putback_zspage_deferred(pool, class, zspage); + zs_pool_dec_isolated(pool); } spin_unlock(&class->lock); } @@ -2107,8 +2145,36 @@ static int zs_register_migration(struct zs_pool *pool) return 0; } +static bool pool_isolated_are_drained(struct zs_pool *pool) +{ + return atomic_long_read(&pool->isolated_pages) == 0; +} + +/* Function for resolving migration */ +static void wait_for_isolated_drain(struct zs_pool *pool) +{ + + /* + * We're in the process of destroying the pool, so there are no + * active allocations. zs_page_isolate() fails for completely free + * zspages, so we need only wait for the zs_pool's isolated + * count to hit zero. + */ + wait_event(pool->migration_wait, + pool_isolated_are_drained(pool)); +} + static void zs_unregister_migration(struct zs_pool *pool) { + pool->destroying = true; + /* + * We need a memory barrier here to ensure global visibility of + * pool->destroying. Thus pool->isolated pages will either be 0 in which + * case we don't care, or it will be > 0 and pool->destroying will + * ensure that we wake up once isolation hits 0. + */ + smp_mb(); + wait_for_isolated_drain(pool); /* This can block */ flush_work(&pool->free_work); iput(pool->inode); } @@ -2346,6 +2412,10 @@ struct zs_pool *zs_create_pool(const char *name) if (!pool->name) goto err; +#ifdef CONFIG_COMPACTION + init_waitqueue_head(&pool->migration_wait); +#endif + if (create_cache(pool)) goto err; |