From a3df69278c5052acf0a5335b3fc614e0a7e5ea93 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Mon, 1 Jun 2020 21:46:00 -0700 Subject: Documentation/vm/slub.rst: s/Toggle/Enable/ "toggle" means to change a boolean thing's state. This operation doesn't do that - it sets it to "true". Signed-off-by: Andrew Morton Acked-by: Rafael Aquini Cc: Christoph Lameter Cc: David Rientjes Cc: Joonsoo Kim Cc: Pekka Enberg Signed-off-by: Linus Torvalds --- Documentation/vm/slub.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'Documentation') diff --git a/Documentation/vm/slub.rst b/Documentation/vm/slub.rst index 933ada4368ff..4eee598555c9 100644 --- a/Documentation/vm/slub.rst +++ b/Documentation/vm/slub.rst @@ -49,7 +49,7 @@ Possible debug options are:: P Poisoning (object and padding) U User tracking (free and alloc) T Trace (please only use on single slabs) - A Toggle failslab filter mark for the cache + A Enable failslab filter mark for the cache O Switch debugging off for caches that would have caused higher minimum slab orders - Switch all debugging off (useful if the kernel is -- cgit v1.2.3 From 8151b4c8bee43cea7a28cb0300123df90880e60c Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Mon, 1 Jun 2020 21:46:44 -0700 Subject: mm: add readahead address space operation This replaces ->readpages with a saner interface: - Return void instead of an ignored error code. - Page cache is already populated with locked pages when ->readahead is called. - New arguments can be passed to the implementation without changing all the filesystems that use a common helper function like mpage_readahead(). Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Reviewed-by: John Hubbard Reviewed-by: Christoph Hellwig Reviewed-by: William Kucharski Cc: Chao Yu Cc: Cong Wang Cc: Darrick J. Wong Cc: Dave Chinner Cc: Eric Biggers Cc: Gao Xiang Cc: Jaegeuk Kim Cc: Joseph Qi Cc: Junxiao Bi Cc: Michal Hocko Cc: Zi Yan Cc: Johannes Thumshirn Cc: Miklos Szeredi Link: http://lkml.kernel.org/r/20200414150233.24495-12-willy@infradead.org Signed-off-by: Linus Torvalds --- Documentation/filesystems/locking.rst | 6 +++++- Documentation/filesystems/vfs.rst | 15 +++++++++++++++ include/linux/fs.h | 2 ++ mm/readahead.c | 12 ++++++++++-- 4 files changed, 32 insertions(+), 3 deletions(-) (limited to 'Documentation') diff --git a/Documentation/filesystems/locking.rst b/Documentation/filesystems/locking.rst index 5057e4d9dcd1..0af2e0e11461 100644 --- a/Documentation/filesystems/locking.rst +++ b/Documentation/filesystems/locking.rst @@ -239,6 +239,7 @@ prototypes:: int (*readpage)(struct file *, struct page *); int (*writepages)(struct address_space *, struct writeback_control *); int (*set_page_dirty)(struct page *page); + void (*readahead)(struct readahead_control *); int (*readpages)(struct file *filp, struct address_space *mapping, struct list_head *pages, unsigned nr_pages); int (*write_begin)(struct file *, struct address_space *mapping, @@ -271,7 +272,8 @@ writepage: yes, unlocks (see below) readpage: yes, unlocks writepages: set_page_dirty no -readpages: +readahead: yes, unlocks +readpages: no write_begin: locks the page exclusive write_end: yes, unlocks exclusive bmap: @@ -295,6 +297,8 @@ the request handler (/dev/loop). ->readpage() unlocks the page, either synchronously or via I/O completion. +->readahead() unlocks the pages that I/O is attempted on like ->readpage(). + ->readpages() populates the pagecache with the passed pages and starts I/O against them. They come unlocked upon I/O completion. diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index 7d4d09dd5e6d..ed17771c212b 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -706,6 +706,7 @@ cache in your filesystem. The following members are defined: int (*readpage)(struct file *, struct page *); int (*writepages)(struct address_space *, struct writeback_control *); int (*set_page_dirty)(struct page *page); + void (*readahead)(struct readahead_control *); int (*readpages)(struct file *filp, struct address_space *mapping, struct list_head *pages, unsigned nr_pages); int (*write_begin)(struct file *, struct address_space *mapping, @@ -781,12 +782,26 @@ cache in your filesystem. The following members are defined: If defined, it should set the PageDirty flag, and the PAGECACHE_TAG_DIRTY tag in the radix tree. +``readahead`` + Called by the VM to read pages associated with the address_space + object. The pages are consecutive in the page cache and are + locked. The implementation should decrement the page refcount + after starting I/O on each page. Usually the page will be + unlocked by the I/O completion handler. If the filesystem decides + to stop attempting I/O before reaching the end of the readahead + window, it can simply return. The caller will decrement the page + refcount and unlock the remaining pages for you. Set PageUptodate + if the I/O completes successfully. Setting PageError on any page + will be ignored; simply unlock the page if an I/O error occurs. + ``readpages`` called by the VM to read pages associated with the address_space object. This is essentially just a vector version of readpage. Instead of just one page, several pages are requested. readpages is only used for read-ahead, so read errors are ignored. If anything goes wrong, feel free to give up. + This interface is deprecated and will be removed by the end of + 2020; implement readahead instead. ``write_begin`` Called by the generic buffered write code to ask the filesystem diff --git a/include/linux/fs.h b/include/linux/fs.h index f2fb5b7406b9..1434ed801b80 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -292,6 +292,7 @@ enum positive_aop_returns { struct page; struct address_space; struct writeback_control; +struct readahead_control; /* * Write life time hint values. @@ -375,6 +376,7 @@ struct address_space_operations { */ int (*readpages)(struct file *filp, struct address_space *mapping, struct list_head *pages, unsigned nr_pages); + void (*readahead)(struct readahead_control *); int (*write_begin)(struct file *, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, diff --git a/mm/readahead.c b/mm/readahead.c index e52b3a7b9da5..d01531ef9f3c 100644 --- a/mm/readahead.c +++ b/mm/readahead.c @@ -125,7 +125,14 @@ static void read_pages(struct readahead_control *rac, struct list_head *pages, blk_start_plug(&plug); - if (aops->readpages) { + if (aops->readahead) { + aops->readahead(rac); + /* Clean up the remaining pages */ + while ((page = readahead_page(rac))) { + unlock_page(page); + put_page(page); + } + } else if (aops->readpages) { aops->readpages(rac->file, rac->mapping, pages, readahead_count(rac)); /* Clean up the remaining pages */ @@ -233,7 +240,8 @@ void force_page_cache_readahead(struct address_space *mapping, struct file_ra_state *ra = &filp->f_ra; unsigned long max_pages; - if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages)) + if (unlikely(!mapping->a_ops->readpage && !mapping->a_ops->readpages && + !mapping->a_ops->readahead)) return; /* -- cgit v1.2.3 From 8d92890bd6b8502d6aee4b37430ae6444ade7a8c Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Mon, 1 Jun 2020 21:48:21 -0700 Subject: mm/writeback: discard NR_UNSTABLE_NFS, use NR_WRITEBACK instead After an NFS page has been written it is considered "unstable" until a COMMIT request succeeds. If the COMMIT fails, the page will be re-written. These "unstable" pages are currently accounted as "reclaimable", either in WB_RECLAIMABLE, or in NR_UNSTABLE_NFS which is included in a 'reclaimable' count. This might have made sense when sending the COMMIT required a separate action by the VFS/MM (e.g. releasepage() used to send a COMMIT). However now that all writes generated by ->writepages() will automatically be followed by a COMMIT (since commit 919e3bd9a875 ("NFS: Ensure we commit after writeback is complete")) it makes more sense to treat them as writeback pages. So this patch removes NR_UNSTABLE_NFS and accounts unstable pages in NR_WRITEBACK and WB_WRITEBACK. A particular effect of this change is that when wb_check_background_flush() calls wb_over_bg_threshold(), the latter will report 'true' a lot less often as the 'unstable' pages are no longer considered 'dirty' (as there is nothing that writeback can do about them anyway). Currently wb_check_background_flush() will trigger writeback to NFS even when there are relatively few dirty pages (if there are lots of unstable pages), this can result in small writes going to the server (10s of Kilobytes rather than a Megabyte) which hurts throughput. With this patch, there are fewer writes which are each larger on average. Where the NR_UNSTABLE_NFS count was included in statistics virtual-files, the entry is retained, but the value is hard-coded as zero. static trace points and warning printks which mentioned this counter no longer report it. [akpm@linux-foundation.org: re-layout comment] [akpm@linux-foundation.org: fix printk warning] Signed-off-by: NeilBrown Signed-off-by: Andrew Morton Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Acked-by: Trond Myklebust Acked-by: Michal Hocko [mm] Cc: Christoph Hellwig Cc: Chuck Lever Link: http://lkml.kernel.org/r/87d06j7gqa.fsf@notabene.neil.brown.name Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.rst | 4 ++-- drivers/base/node.c | 2 +- fs/fs-writeback.c | 1 - fs/nfs/internal.h | 10 +++++++--- fs/nfs/write.c | 4 ++-- fs/proc/meminfo.c | 3 +-- include/linux/mmzone.h | 1 - include/trace/events/writeback.h | 5 +---- mm/memcontrol.c | 1 - mm/page-writeback.c | 17 ++++------------- mm/page_alloc.c | 5 +---- mm/vmstat.c | 11 +++++++++-- 12 files changed, 28 insertions(+), 36 deletions(-) (limited to 'Documentation') diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 38b606991065..092b7b44d158 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -1042,8 +1042,8 @@ PageTables amount of memory dedicated to the lowest level of page tables. NFS_Unstable - NFS pages sent to the server, but not yet committed to stable - storage + Always zero. Previous counted pages which had been written to + the server, but has not been committed to stable storage. Bounce Memory used for block device "bounce buffers" WritebackTmp diff --git a/drivers/base/node.c b/drivers/base/node.c index 10d7e818e118..6012574913f7 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -439,7 +439,7 @@ static ssize_t node_read_meminfo(struct device *dev, nid, K(i.sharedram), nid, sum_zone_node_page_state(nid, NR_KERNEL_STACK_KB), nid, K(sum_zone_node_page_state(nid, NR_PAGETABLE)), - nid, K(node_page_state(pgdat, NR_UNSTABLE_NFS)), + nid, 0UL, nid, K(sum_zone_node_page_state(nid, NR_BOUNCE)), nid, K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), nid, K(sreclaimable + diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 76ac9c7d32ec..c5bdf46e3b4b 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1070,7 +1070,6 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi, static unsigned long get_nr_dirty_pages(void) { return global_node_page_state(NR_FILE_DIRTY) + - global_node_page_state(NR_UNSTABLE_NFS) + get_nr_dirty_inodes(); } diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 1f32a9fbfdaf..6673a77884d9 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -668,7 +668,8 @@ void nfs_super_set_maxbytes(struct super_block *sb, __u64 maxfilesize) } /* - * Record the page as unstable and mark its inode as dirty. + * Record the page as unstable (an extra writeback period) and mark its + * inode as dirty. */ static inline void nfs_mark_page_unstable(struct page *page, struct nfs_commit_info *cinfo) @@ -676,8 +677,11 @@ void nfs_mark_page_unstable(struct page *page, struct nfs_commit_info *cinfo) if (!cinfo->dreq) { struct inode *inode = page_file_mapping(page)->host; - inc_node_page_state(page, NR_UNSTABLE_NFS); - inc_wb_stat(&inode_to_bdi(inode)->wb, WB_RECLAIMABLE); + /* This page is really still in write-back - just that the + * writeback is happening on the server now. + */ + inc_node_page_state(page, NR_WRITEBACK); + inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); __mark_inode_dirty(inode, I_DIRTY_DATASYNC); } } diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 1e767f779c49..639c34fec04a 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -946,9 +946,9 @@ nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, static void nfs_clear_page_commit(struct page *page) { - dec_node_page_state(page, NR_UNSTABLE_NFS); + dec_node_page_state(page, NR_WRITEBACK); dec_wb_stat(&inode_to_bdi(page_file_mapping(page)->host)->wb, - WB_RECLAIMABLE); + WB_WRITEBACK); } /* Called holding the request lock on @req */ diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 8c1f1bb1a5ce..9bd94b5a9658 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -106,8 +106,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "PageTables: ", global_zone_page_state(NR_PAGETABLE)); - show_val_kb(m, "NFS_Unstable: ", - global_node_page_state(NR_UNSTABLE_NFS)); + show_val_kb(m, "NFS_Unstable: ", 0); show_val_kb(m, "Bounce: ", global_zone_page_state(NR_BOUNCE)); show_val_kb(m, "WritebackTmp: ", diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 1b9de7d220fb..a89f47515eb1 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -193,7 +193,6 @@ enum node_stat_item { NR_FILE_THPS, NR_FILE_PMDMAPPED, NR_ANON_THPS, - NR_UNSTABLE_NFS, /* NFS unstable pages */ NR_VMSCAN_WRITE, NR_VMSCAN_IMMEDIATE, /* Prioritise for reclaim when writeback ends */ NR_DIRTIED, /* page dirtyings since bootup */ diff --git a/include/trace/events/writeback.h b/include/trace/events/writeback.h index 85a33bea76f1..10f5d1fa7347 100644 --- a/include/trace/events/writeback.h +++ b/include/trace/events/writeback.h @@ -541,7 +541,6 @@ TRACE_EVENT(global_dirty_state, TP_STRUCT__entry( __field(unsigned long, nr_dirty) __field(unsigned long, nr_writeback) - __field(unsigned long, nr_unstable) __field(unsigned long, background_thresh) __field(unsigned long, dirty_thresh) __field(unsigned long, dirty_limit) @@ -552,7 +551,6 @@ TRACE_EVENT(global_dirty_state, TP_fast_assign( __entry->nr_dirty = global_node_page_state(NR_FILE_DIRTY); __entry->nr_writeback = global_node_page_state(NR_WRITEBACK); - __entry->nr_unstable = global_node_page_state(NR_UNSTABLE_NFS); __entry->nr_dirtied = global_node_page_state(NR_DIRTIED); __entry->nr_written = global_node_page_state(NR_WRITTEN); __entry->background_thresh = background_thresh; @@ -560,12 +558,11 @@ TRACE_EVENT(global_dirty_state, __entry->dirty_limit = global_wb_domain.dirty_limit; ), - TP_printk("dirty=%lu writeback=%lu unstable=%lu " + TP_printk("dirty=%lu writeback=%lu " "bg_thresh=%lu thresh=%lu limit=%lu " "dirtied=%lu written=%lu", __entry->nr_dirty, __entry->nr_writeback, - __entry->nr_unstable, __entry->background_thresh, __entry->dirty_thresh, __entry->dirty_limit, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a3b97f103966..1db4b285c407 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4330,7 +4330,6 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, *pdirty = memcg_exact_page_state(memcg, NR_FILE_DIRTY); - /* this should eventually include NR_UNSTABLE_NFS */ *pwriteback = memcg_exact_page_state(memcg, NR_WRITEBACK); *pfilepages = memcg_exact_page_state(memcg, NR_INACTIVE_FILE) + memcg_exact_page_state(memcg, NR_ACTIVE_FILE); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 7ff2290cf43d..718565266257 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -504,7 +504,6 @@ bool node_dirty_ok(struct pglist_data *pgdat) unsigned long nr_pages = 0; nr_pages += node_page_state(pgdat, NR_FILE_DIRTY); - nr_pages += node_page_state(pgdat, NR_UNSTABLE_NFS); nr_pages += node_page_state(pgdat, NR_WRITEBACK); return nr_pages <= limit; @@ -758,7 +757,7 @@ static void mdtc_calc_avail(struct dirty_throttle_control *mdtc, * bounded by the bdi->min_ratio and/or bdi->max_ratio parameters, if set. * * Return: @wb's dirty limit in pages. The term "dirty" in the context of - * dirty balancing includes all PG_dirty, PG_writeback and NFS unstable pages. + * dirty balancing includes all PG_dirty and PG_writeback pages. */ static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc) { @@ -1566,7 +1565,7 @@ static void balance_dirty_pages(struct bdi_writeback *wb, struct dirty_throttle_control * const mdtc = mdtc_valid(&mdtc_stor) ? &mdtc_stor : NULL; struct dirty_throttle_control *sdtc; - unsigned long nr_reclaimable; /* = file_dirty + unstable_nfs */ + unsigned long nr_reclaimable; /* = file_dirty */ long period; long pause; long max_pause; @@ -1586,14 +1585,7 @@ static void balance_dirty_pages(struct bdi_writeback *wb, unsigned long m_thresh = 0; unsigned long m_bg_thresh = 0; - /* - * Unstable writes are a feature of certain networked - * filesystems (i.e. NFS) in which data may have been - * written to the server's write cache, but has not yet - * been flushed to permanent storage. - */ - nr_reclaimable = global_node_page_state(NR_FILE_DIRTY) + - global_node_page_state(NR_UNSTABLE_NFS); + nr_reclaimable = global_node_page_state(NR_FILE_DIRTY); gdtc->avail = global_dirtyable_memory(); gdtc->dirty = nr_reclaimable + global_node_page_state(NR_WRITEBACK); @@ -1963,8 +1955,7 @@ bool wb_over_bg_thresh(struct bdi_writeback *wb) * as we're trying to decide whether to put more under writeback. */ gdtc->avail = global_dirtyable_memory(); - gdtc->dirty = global_node_page_state(NR_FILE_DIRTY) + - global_node_page_state(NR_UNSTABLE_NFS); + gdtc->dirty = global_node_page_state(NR_FILE_DIRTY); domain_dirty_limits(gdtc); if (gdtc->dirty > gdtc->bg_thresh) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 13cc653122b7..cc406ee17ad9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5319,7 +5319,7 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) printk("active_anon:%lu inactive_anon:%lu isolated_anon:%lu\n" " active_file:%lu inactive_file:%lu isolated_file:%lu\n" - " unevictable:%lu dirty:%lu writeback:%lu unstable:%lu\n" + " unevictable:%lu dirty:%lu writeback:%lu\n" " slab_reclaimable:%lu slab_unreclaimable:%lu\n" " mapped:%lu shmem:%lu pagetables:%lu bounce:%lu\n" " free:%lu free_pcp:%lu free_cma:%lu\n", @@ -5332,7 +5332,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) global_node_page_state(NR_UNEVICTABLE), global_node_page_state(NR_FILE_DIRTY), global_node_page_state(NR_WRITEBACK), - global_node_page_state(NR_UNSTABLE_NFS), global_node_page_state(NR_SLAB_RECLAIMABLE), global_node_page_state(NR_SLAB_UNRECLAIMABLE), global_node_page_state(NR_FILE_MAPPED), @@ -5365,7 +5364,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) " anon_thp: %lukB" #endif " writeback_tmp:%lukB" - " unstable:%lukB" " all_unreclaimable? %s" "\n", pgdat->node_id, @@ -5387,7 +5385,6 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) K(node_page_state(pgdat, NR_ANON_THPS) * HPAGE_PMD_NR), #endif K(node_page_state(pgdat, NR_WRITEBACK_TEMP)), - K(node_page_state(pgdat, NR_UNSTABLE_NFS)), pgdat->kswapd_failures >= MAX_RECLAIM_RETRIES ? "yes" : "no"); } diff --git a/mm/vmstat.c b/mm/vmstat.c index 96d21a792b57..b1582fdf757c 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1108,7 +1108,7 @@ int fragmentation_index(struct zone *zone, unsigned int order) TEXT_FOR_HIGHMEM(xx) xx "_movable", const char * const vmstat_text[] = { - /* enum zone_stat_item countes */ + /* enum zone_stat_item counters */ "nr_free_pages", "nr_zone_inactive_anon", "nr_zone_active_anon", @@ -1162,7 +1162,6 @@ const char * const vmstat_text[] = { "nr_file_hugepages", "nr_file_pmdmapped", "nr_anon_transparent_hugepages", - "nr_unstable", "nr_vmscan_write", "nr_vmscan_immediate_reclaim", "nr_dirtied", @@ -1723,6 +1722,14 @@ static int vmstat_show(struct seq_file *m, void *arg) seq_puts(m, vmstat_text[off]); seq_put_decimal_ull(m, " ", *l); seq_putc(m, '\n'); + + if (off == NR_VMSTAT_ITEMS - 1) { + /* + * We've come to the end - add any deprecated counters to avoid + * breaking userspace which might depend on them being present. + */ + seq_puts(m, "nr_unstable 0\n"); + } return 0; } -- cgit v1.2.3 From a6f5576bb195c3b7508e3e1c98d2dcf6691f96e8 Mon Sep 17 00:00:00 2001 From: Yafang Shao Date: Mon, 1 Jun 2020 21:49:32 -0700 Subject: mm, memcg: add workingset_restore in memory.stat There's a new workingset counter introduced in commit 1899ad18c607 ("mm: workingset: tell cache transitions from workingset thrashing"). With the help of this counter we can know the workingset is transitioning or thrashing. To leverage the benifit of this counter to memcg, we should introduce it into memory.stat. Then we could know the workingset of the workload inside a memcg better. Bellow is the verification of this new counter in memory.stat. Read a file into the memory and then read it again to make these pages be active. The size of this file is 1G. (memory.max is greater than file size) The counters in memory.stat will be inactive_file 0 active_file 1073639424 workingset_refault 0 workingset_activate 0 workingset_restore 0 workingset_nodereclaim 0 Trigger the memcg reclaim by setting a lower value to memory.high, and then some pages will be demoted into inactive list, and then some pages in the inactive list will be evicted into the storage. inactive_file 498094080 active_file 310063104 workingset_refault 0 workingset_activate 0 workingset_restore 0 workingset_nodereclaim 0 Then recover the memory.high and read the file into memory again. As a result of it, the transitioning will occur. Bellow is the result of this transitioning, inactive_file 498094080 active_file 575397888 workingset_refault 64746 workingset_activate 64746 workingset_restore 64746 workingset_nodereclaim 0 Signed-off-by: Yafang Shao Signed-off-by: Andrew Morton Acked-by: Johannes Weiner Acked-by: Michal Hocko Acked-by: Chris Down Cc: Peter Zijlstra (Intel) Cc: Suren Baghdasaryan Cc: Shakeel Butt Link: http://lkml.kernel.org/r/20200504153522.11553-1-laoar.shao@gmail.com Signed-off-by: Linus Torvalds --- Documentation/admin-guide/cgroup-v2.rst | 4 ++++ mm/memcontrol.c | 2 ++ 2 files changed, 6 insertions(+) (limited to 'Documentation') diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index bcc80269bb6a..5f12f203822e 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1329,6 +1329,10 @@ PAGE_SIZE multiple when read back. workingset_activate Number of refaulted pages that were immediately activated + workingset_restore + Number of restored pages which have been detected as an active + workingset before they got reclaimed. + workingset_nodereclaim Number of times a shadow node has been reclaimed diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1db4b285c407..4bb922c02521 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1451,6 +1451,8 @@ static char *memory_stat_format(struct mem_cgroup *memcg) memcg_page_state(memcg, WORKINGSET_REFAULT)); seq_buf_printf(&s, "workingset_activate %lu\n", memcg_page_state(memcg, WORKINGSET_ACTIVATE)); + seq_buf_printf(&s, "workingset_restore %lu\n", + memcg_page_state(memcg, WORKINGSET_RESTORE)); seq_buf_printf(&s, "workingset_nodereclaim %lu\n", memcg_page_state(memcg, WORKINGSET_NODERECLAIM)); -- cgit v1.2.3 From 4b82ab4f28836646eca12cb37f408568d3cdc5c3 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Mon, 1 Jun 2020 21:49:52 -0700 Subject: mm/memcg: automatically penalize tasks with high swap use Add a memory.swap.high knob, which can be used to protect the system from SWAP exhaustion. The mechanism used for penalizing is similar to memory.high penalty (sleep on return to user space). That is not to say that the knob itself is equivalent to memory.high. The objective is more to protect the system from potentially buggy tasks consuming a lot of swap and impacting other tasks, or even bringing the whole system to stand still with complete SWAP exhaustion. Hopefully without the need to find per-task hard limits. Slowing misbehaving tasks down gradually allows user space oom killers or other protection mechanisms to react. oomd and earlyoom already do killing based on swap exhaustion, and memory.swap.high protection will help implement such userspace oom policies more reliably. We can use one counter for number of pages allocated under pressure to save struct task space and avoid two separate hierarchy walks on the hot path. The exact overage is calculated on return to user space, anyway. Take the new high limit into account when determining if swap is "full". Borrowing the explanation from Johannes: The idea behind "swap full" is that as long as the workload has plenty of swap space available and it's not changing its memory contents, it makes sense to generously hold on to copies of data in the swap device, even after the swapin. A later reclaim cycle can drop the page without any IO. Trading disk space for IO. But the only two ways to reclaim a swap slot is when they're faulted in and the references go away, or by scanning the virtual address space like swapoff does - which is very expensive (one could argue it's too expensive even for swapoff, it's often more practical to just reboot). So at some point in the fill level, we have to start freeing up swap slots on fault/swapin. Otherwise we could eventually run out of swap slots while they're filled with copies of data that is also in RAM. We don't want to OOM a workload because its available swap space is filled with redundant cache. Signed-off-by: Jakub Kicinski Signed-off-by: Andrew Morton Acked-by: Johannes Weiner Cc: Tejun Heo Cc: Chris Down Cc: Shakeel Butt Cc: Michal Hocko Cc: Hugh Dickins Link: http://lkml.kernel.org/r/20200527195846.102707-5-kuba@kernel.org Signed-off-by: Linus Torvalds --- Documentation/admin-guide/cgroup-v2.rst | 20 ++++++++ include/linux/memcontrol.h | 1 + mm/memcontrol.c | 88 ++++++++++++++++++++++++++++++--- 3 files changed, 102 insertions(+), 7 deletions(-) (limited to 'Documentation') diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 5f12f203822e..b8c0460730f3 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1374,6 +1374,22 @@ PAGE_SIZE multiple when read back. The total amount of swap currently being used by the cgroup and its descendants. + memory.swap.high + A read-write single value file which exists on non-root + cgroups. The default is "max". + + Swap usage throttle limit. If a cgroup's swap usage exceeds + this limit, all its further allocations will be throttled to + allow userspace to implement custom out-of-memory procedures. + + This limit marks a point of no return for the cgroup. It is NOT + designed to manage the amount of swapping a workload does + during regular operation. Compare to memory.swap.max, which + prohibits swapping past a set amount, but lets the cgroup + continue unimpeded as long as other memory can be reclaimed. + + Healthy workloads are not expected to reach this limit. + memory.swap.max A read-write single value file which exists on non-root cgroups. The default is "max". @@ -1387,6 +1403,10 @@ PAGE_SIZE multiple when read back. otherwise, a value change in this file generates a file modified event. + high + The number of times the cgroup's swap usage was over + the high threshold. + max The number of times the cgroup's swap usage was about to go over the max boundary and swap allocation diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 95a09a7ec412..bfe9533bb67e 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -45,6 +45,7 @@ enum memcg_memory_event { MEMCG_MAX, MEMCG_OOM, MEMCG_OOM_KILL, + MEMCG_SWAP_HIGH, MEMCG_SWAP_MAX, MEMCG_SWAP_FAIL, MEMCG_NR_MEMORY_EVENTS, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 08cf17b186fb..f3087e22dfa9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2354,6 +2354,22 @@ static u64 mem_find_max_overage(struct mem_cgroup *memcg) return max_overage; } +static u64 swap_find_max_overage(struct mem_cgroup *memcg) +{ + u64 overage, max_overage = 0; + + do { + overage = calculate_overage(page_counter_read(&memcg->swap), + READ_ONCE(memcg->swap.high)); + if (overage) + memcg_memory_event(memcg, MEMCG_SWAP_HIGH); + max_overage = max(overage, max_overage); + } while ((memcg = parent_mem_cgroup(memcg)) && + !mem_cgroup_is_root(memcg)); + + return max_overage; +} + /* * Get the number of jiffies that we should penalise a mischievous cgroup which * is exceeding its memory.high by checking both it and its ancestors. @@ -2415,6 +2431,9 @@ void mem_cgroup_handle_over_high(void) penalty_jiffies = calculate_high_delay(memcg, nr_pages, mem_find_max_overage(memcg)); + penalty_jiffies += calculate_high_delay(memcg, nr_pages, + swap_find_max_overage(memcg)); + /* * Clamp the max delay per usermode return so as to still keep the * application moving forwards and also permit diagnostics, albeit @@ -2605,13 +2624,32 @@ done_restock: * reclaim, the cost of mismatch is negligible. */ do { - if (page_counter_read(&memcg->memory) > - READ_ONCE(memcg->memory.high)) { - /* Don't bother a random interrupted task */ - if (in_interrupt()) { + bool mem_high, swap_high; + + mem_high = page_counter_read(&memcg->memory) > + READ_ONCE(memcg->memory.high); + swap_high = page_counter_read(&memcg->swap) > + READ_ONCE(memcg->swap.high); + + /* Don't bother a random interrupted task */ + if (in_interrupt()) { + if (mem_high) { schedule_work(&memcg->high_work); break; } + continue; + } + + if (mem_high || swap_high) { + /* + * The allocating tasks in this cgroup will need to do + * reclaim or be throttled to prevent further growth + * of the memory or swap footprints. + * + * Target some best-effort fairness between the tasks, + * and distribute reclaim work and delay penalties + * based on how much each task is actually allocating. + */ current->memcg_nr_pages_over_high += batch; set_notify_resume(current); break; @@ -5076,6 +5114,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX); memcg->soft_limit = PAGE_COUNTER_MAX; + page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX); if (parent) { memcg->swappiness = mem_cgroup_swappiness(parent); memcg->oom_kill_disable = parent->oom_kill_disable; @@ -5229,6 +5268,7 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) page_counter_set_low(&memcg->memory, 0); page_counter_set_high(&memcg->memory, PAGE_COUNTER_MAX); memcg->soft_limit = PAGE_COUNTER_MAX; + page_counter_set_high(&memcg->swap, PAGE_COUNTER_MAX); memcg_wb_domain_size_changed(memcg); } @@ -7142,10 +7182,13 @@ bool mem_cgroup_swap_full(struct page *page) if (!memcg) return false; - for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) - if (page_counter_read(&memcg->swap) * 2 >= - READ_ONCE(memcg->swap.max)) + for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) { + unsigned long usage = page_counter_read(&memcg->swap); + + if (usage * 2 >= READ_ONCE(memcg->swap.high) || + usage * 2 >= READ_ONCE(memcg->swap.max)) return true; + } return false; } @@ -7175,6 +7218,29 @@ static u64 swap_current_read(struct cgroup_subsys_state *css, return (u64)page_counter_read(&memcg->swap) * PAGE_SIZE; } +static int swap_high_show(struct seq_file *m, void *v) +{ + return seq_puts_memcg_tunable(m, + READ_ONCE(mem_cgroup_from_seq(m)->swap.high)); +} + +static ssize_t swap_high_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long high; + int err; + + buf = strstrip(buf); + err = page_counter_memparse(buf, "max", &high); + if (err) + return err; + + page_counter_set_high(&memcg->swap, high); + + return nbytes; +} + static int swap_max_show(struct seq_file *m, void *v) { return seq_puts_memcg_tunable(m, @@ -7202,6 +7268,8 @@ static int swap_events_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_seq(m); + seq_printf(m, "high %lu\n", + atomic_long_read(&memcg->memory_events[MEMCG_SWAP_HIGH])); seq_printf(m, "max %lu\n", atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX])); seq_printf(m, "fail %lu\n", @@ -7216,6 +7284,12 @@ static struct cftype swap_files[] = { .flags = CFTYPE_NOT_ON_ROOT, .read_u64 = swap_current_read, }, + { + .name = "swap.high", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = swap_high_show, + .write = swap_high_write, + }, { .name = "swap.max", .flags = CFTYPE_NOT_ON_ROOT, -- cgit v1.2.3 From ed1f324c5fed06c91f30a36aedb66f34244ab86e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 1 Jun 2020 21:51:19 -0700 Subject: mm: remove map_vm_range Switch all callers to map_kernel_range, which symmetric to the unmap side (as well as the _noflush versions). Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Acked-by: Peter Zijlstra (Intel) Cc: Christian Borntraeger Cc: Christophe Leroy Cc: Daniel Vetter Cc: David Airlie Cc: Gao Xiang Cc: Greg Kroah-Hartman Cc: Haiyang Zhang Cc: Johannes Weiner Cc: "K. Y. Srinivasan" Cc: Laura Abbott Cc: Mark Rutland Cc: Michael Kelley Cc: Minchan Kim Cc: Nitin Gupta Cc: Robin Murphy Cc: Sakari Ailus Cc: Stephen Hemminger Cc: Sumit Semwal Cc: Wei Liu Cc: Benjamin Herrenschmidt Cc: Catalin Marinas Cc: Heiko Carstens Cc: Paul Mackerras Cc: Vasily Gorbik Cc: Will Deacon Link: http://lkml.kernel.org/r/20200414131348.444715-17-hch@lst.de Signed-off-by: Linus Torvalds --- Documentation/core-api/cachetlb.rst | 2 +- include/linux/vmalloc.h | 10 ++++------ mm/vmalloc.c | 21 +++++++-------------- mm/zsmalloc.c | 4 +++- net/ceph/ceph_common.c | 3 +-- 5 files changed, 16 insertions(+), 24 deletions(-) (limited to 'Documentation') diff --git a/Documentation/core-api/cachetlb.rst b/Documentation/core-api/cachetlb.rst index 93cb65d52720..a1582cc79f0f 100644 --- a/Documentation/core-api/cachetlb.rst +++ b/Documentation/core-api/cachetlb.rst @@ -213,7 +213,7 @@ Here are the routines, one by one: there will be no entries in the cache for the kernel address space for virtual addresses in the range 'start' to 'end-1'. - The first of these two routines is invoked after map_vm_area() + The first of these two routines is invoked after map_kernel_range() has installed the page table entries. The second is invoked before unmap_kernel_range() deletes the page table entries. diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index ff69d1e037ca..a1e9bdc3ad9e 100644 --- a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -168,11 +168,11 @@ extern struct vm_struct *__get_vm_area_caller(unsigned long size, extern struct vm_struct *remove_vm_area(const void *addr); extern struct vm_struct *find_vm_area(const void *addr); -extern int map_vm_area(struct vm_struct *area, pgprot_t prot, - struct page **pages); #ifdef CONFIG_MMU extern int map_kernel_range_noflush(unsigned long start, unsigned long size, pgprot_t prot, struct page **pages); +int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot, + struct page **pages); extern void unmap_kernel_range_noflush(unsigned long addr, unsigned long size); extern void unmap_kernel_range(unsigned long addr, unsigned long size); static inline void set_vm_flush_reset_perms(void *addr) @@ -189,14 +189,12 @@ map_kernel_range_noflush(unsigned long start, unsigned long size, { return size >> PAGE_SHIFT; } +#define map_kernel_range map_kernel_range_noflush static inline void unmap_kernel_range_noflush(unsigned long addr, unsigned long size) { } -static inline void -unmap_kernel_range(unsigned long addr, unsigned long size) -{ -} +#define unmap_kernel_range unmap_kernel_range_noflush static inline void set_vm_flush_reset_perms(void *addr) { } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index aab00ddee686..49ca687d8853 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -273,8 +273,8 @@ int map_kernel_range_noflush(unsigned long addr, unsigned long size, return 0; } -static int map_kernel_range(unsigned long start, unsigned long size, - pgprot_t prot, struct page **pages) +int map_kernel_range(unsigned long start, unsigned long size, pgprot_t prot, + struct page **pages) { int ret; @@ -2028,16 +2028,6 @@ void unmap_kernel_range(unsigned long addr, unsigned long size) flush_tlb_kernel_range(addr, end); } -int map_vm_area(struct vm_struct *area, pgprot_t prot, struct page **pages) -{ - unsigned long addr = (unsigned long)area->addr; - int err; - - err = map_kernel_range(addr, get_vm_area_size(area), prot, pages); - - return err > 0 ? 0 : err; -} - static inline void setup_vmalloc_vm_locked(struct vm_struct *vm, struct vmap_area *va, unsigned long flags, const void *caller) { @@ -2409,7 +2399,8 @@ void *vmap(struct page **pages, unsigned int count, if (!area) return NULL; - if (map_vm_area(area, prot, pages)) { + if (map_kernel_range((unsigned long)area->addr, size, prot, + pages) < 0) { vunmap(area->addr); return NULL; } @@ -2472,8 +2463,10 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, } atomic_long_add(area->nr_pages, &nr_vmalloc_pages); - if (map_vm_area(area, prot, pages)) + if (map_kernel_range((unsigned long)area->addr, get_vm_area_size(area), + prot, pages) < 0) goto fail; + return area->addr; fail: diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index ac0524330b9b..f6dc0673e62c 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1138,7 +1138,9 @@ static inline void __zs_cpu_down(struct mapping_area *area) static inline void *__zs_map_object(struct mapping_area *area, struct page *pages[2], int off, int size) { - BUG_ON(map_vm_area(area->vm, PAGE_KERNEL, pages)); + unsigned long addr = (unsigned long)area->vm->addr; + + BUG_ON(map_kernel_range(addr, PAGE_SIZE * 2, PAGE_KERNEL, pages) < 0); area->vm_addr = area->vm->addr; return area->vm_addr + off; } diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index a0e97f6c1072..66f22e8aa529 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -190,8 +190,7 @@ EXPORT_SYMBOL(ceph_compare_options); * kvmalloc() doesn't fall back to the vmalloc allocator unless flags are * compatible with (a superset of) GFP_KERNEL. This is because while the * actual pages are allocated with the specified flags, the page table pages - * are always allocated with GFP_KERNEL. map_vm_area() doesn't even take - * flags because GFP_KERNEL is hard-coded in {p4d,pud,pmd,pte}_alloc(). + * are always allocated with GFP_KERNEL. * * ceph_kvmalloc() may be called with GFP_KERNEL, GFP_NOFS or GFP_NOIO. */ -- cgit v1.2.3