From 8df4a44cc46bf9dcfb80a37a59ae5dea2232dc58 Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Tue, 21 Aug 2018 21:51:49 -0700 Subject: mm: check shrinker is memcg-aware in register_shrinker_prepared() There is a sad BUG introduced in patch adding SHRINKER_REGISTERING. shrinker_idr business is only for memcg-aware shrinkers. Only such type of shrinkers have id and they must be finaly installed via idr_replace() in this function. For !memcg-aware shrinkers we never initialize shrinker->id field. But there are all types of shrinkers passed to idr_replace(), and every !memcg-aware shrinker with random ID (most probably, its id is 0) replaces memcg-aware shrinker pointed by the ID in IDR. This patch fixes the problem. Link: http://lkml.kernel.org/r/8ff8a793-8211-713a-4ed9-d6e52390c2fc@virtuozzo.com Fixes: 7e010df53c80 "mm: use special value SHRINKER_REGISTERING instead of list_empty() check" Signed-off-by: Kirill Tkhai Reported-by: Cc: Andrey Ryabinin Cc: Johannes Weiner Cc: Josef Bacik Cc: Mel Gorman Cc: Michal Hocko Cc: Tetsuo Handa Cc: Shakeel Butt Cc: Cc: Huang Ying Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'mm') diff --git a/mm/vmscan.c b/mm/vmscan.c index 4375b1e9bd56..3c6e2bfee427 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -408,7 +408,8 @@ void register_shrinker_prepared(struct shrinker *shrinker) down_write(&shrinker_rwsem); list_add_tail(&shrinker->list, &shrinker_list); #ifdef CONFIG_MEMCG_KMEM - idr_replace(&shrinker_idr, shrinker, shrinker->id); + if (shrinker->flags & SHRINKER_MEMCG_AWARE) + idr_replace(&shrinker_idr, shrinker, shrinker->id); #endif up_write(&shrinker_rwsem); } -- cgit v1.2.3 From 59d98bf3c2b9218ff4cdb2a70aa52fffedf1786c Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Tue, 21 Aug 2018 21:52:01 -0700 Subject: mm: swap: add comments to lock_cluster_or_swap_info() Patch series "swap: THP optimizing refactoring", v4. Now the THP (Transparent Huge Page) swap optimizing is implemented in the way like below, #ifdef CONFIG_THP_SWAP huge_function(...) { } #else normal_function(...) { } #endif general_function(...) { if (huge) return thp_function(...); else return normal_function(...); } As pointed out by Dave Hansen, this will, 1. Create a new, wholly untested code path for huge page 2. Create two places to patch bugs 3. Are not reusing code when possible This patchset is to address these problems via merging huge/normal code path/functions if possible. One concern is that this may cause code size to dilate when !CONFIG_TRANSPARENT_HUGEPAGE. The data shows that most refactoring will only cause quite slight code size increase. This patch (of 8): To improve code readability. Link: http://lkml.kernel.org/r/20180720071845.17920-2-ying.huang@intel.com Signed-off-by: "Huang, Ying" Suggested-and-acked-by: Dave Hansen Reviewed-by: Daniel Jordan Reviewed-by: Andrew Morton Cc: Michal Hocko Cc: Johannes Weiner Cc: Shaohua Li Cc: Hugh Dickins Cc: Minchan Kim Cc: Rik van Riel Cc: Dan Williams Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 8837b22c848d..52a9dd9dab8e 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -296,13 +296,18 @@ static inline void unlock_cluster(struct swap_cluster_info *ci) spin_unlock(&ci->lock); } +/* + * Determine the locking method in use for this device. Return + * swap_cluster_info if SSD-style cluster-based locking is in place. + */ static inline struct swap_cluster_info *lock_cluster_or_swap_info( - struct swap_info_struct *si, - unsigned long offset) + struct swap_info_struct *si, unsigned long offset) { struct swap_cluster_info *ci; + /* Try to use fine-grained SSD-style locking if available: */ ci = lock_cluster(si, offset); + /* Otherwise, fall back to traditional, coarse locking: */ if (!ci) spin_lock(&si->lock); -- cgit v1.2.3 From fe5266d5d5948abc6e71cdabb98e3f5cba811205 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Tue, 21 Aug 2018 21:52:05 -0700 Subject: mm/swapfile.c: replace some #ifdef with IS_ENABLED() In mm/swapfile.c, THP (Transparent Huge Page) swap specific code is enclosed by #ifdef CONFIG_THP_SWAP/#endif to avoid code dilating when THP isn't enabled. But #ifdef/#endif in .c file hurt the code readability, so Dave suggested to use IS_ENABLED(CONFIG_THP_SWAP) instead and let compiler to do the dirty job for us. This has potential to remove some duplicated code too. From output of `size`, text data bss dec hex filename THP=y: 26269 2076 340 28685 700d mm/swapfile.o ifdef/endif: 24115 2028 340 26483 6773 mm/swapfile.o IS_ENABLED: 24179 2028 340 26547 67b3 mm/swapfile.o IS_ENABLED() based solution works quite well, almost as good as that of #ifdef/#endif. And from the diffstat, the removed lines are more than added lines. One #ifdef for split_swap_cluster() is kept. Because it is a public function with a stub implementation for CONFIG_THP_SWAP=n in swap.h. Link: http://lkml.kernel.org/r/20180720071845.17920-3-ying.huang@intel.com Signed-off-by: "Huang, Ying" Suggested-and-acked-by: Dave Hansen Reviewed-by: Daniel Jordan Reviewed-by: Andrew Morton Cc: Michal Hocko Cc: Johannes Weiner Cc: Shaohua Li Cc: Hugh Dickins Cc: Minchan Kim Cc: Rik van Riel Cc: Dan Williams Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 60 ++++++++++++++++++++--------------------------------------- 1 file changed, 20 insertions(+), 40 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 52a9dd9dab8e..618358ad464b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -868,7 +868,6 @@ no_page: return n_ret; } -#ifdef CONFIG_THP_SWAP static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot) { unsigned long idx; @@ -876,6 +875,15 @@ static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot) unsigned long offset, i; unsigned char *map; + /* + * Should not even be attempting cluster allocations when huge + * page swap is disabled. Warn and fail the allocation. + */ + if (!IS_ENABLED(CONFIG_THP_SWAP)) { + VM_WARN_ON_ONCE(1); + return 0; + } + if (cluster_list_empty(&si->free_clusters)) return 0; @@ -906,13 +914,6 @@ static void swap_free_cluster(struct swap_info_struct *si, unsigned long idx) unlock_cluster(ci); swap_range_free(si, offset, SWAPFILE_CLUSTER); } -#else -static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot) -{ - VM_WARN_ON_ONCE(1); - return 0; -} -#endif /* CONFIG_THP_SWAP */ static unsigned long scan_swap_map(struct swap_info_struct *si, unsigned char usage) @@ -1200,7 +1201,6 @@ static void swapcache_free(swp_entry_t entry) } } -#ifdef CONFIG_THP_SWAP static void swapcache_free_cluster(swp_entry_t entry) { unsigned long offset = swp_offset(entry); @@ -1211,6 +1211,9 @@ static void swapcache_free_cluster(swp_entry_t entry) unsigned int i, free_entries = 0; unsigned char val; + if (!IS_ENABLED(CONFIG_THP_SWAP)) + return; + si = _swap_info_get(entry); if (!si) return; @@ -1246,6 +1249,7 @@ static void swapcache_free_cluster(swp_entry_t entry) } } +#ifdef CONFIG_THP_SWAP int split_swap_cluster(swp_entry_t entry) { struct swap_info_struct *si; @@ -1260,11 +1264,7 @@ int split_swap_cluster(swp_entry_t entry) unlock_cluster(ci); return 0; } -#else -static inline void swapcache_free_cluster(swp_entry_t entry) -{ -} -#endif /* CONFIG_THP_SWAP */ +#endif void put_swap_page(struct page *page, swp_entry_t entry) { @@ -1414,7 +1414,6 @@ out: return count; } -#ifdef CONFIG_THP_SWAP static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, swp_entry_t entry) { @@ -1425,6 +1424,9 @@ static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, int i; bool ret = false; + if (!IS_ENABLED(CONFIG_THP_SWAP)) + return swap_swapcount(si, entry) != 0; + ci = lock_cluster_or_swap_info(si, offset); if (!ci || !cluster_is_huge(ci)) { if (map[roffset] != SWAP_HAS_CACHE) @@ -1447,7 +1449,7 @@ static bool page_swapped(struct page *page) swp_entry_t entry; struct swap_info_struct *si; - if (likely(!PageTransCompound(page))) + if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page))) return page_swapcount(page) != 0; page = compound_head(page); @@ -1471,10 +1473,8 @@ static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount, /* hugetlbfs shouldn't call it */ VM_BUG_ON_PAGE(PageHuge(page), page); - if (likely(!PageTransCompound(page))) { - mapcount = atomic_read(&page->_mapcount) + 1; - if (total_mapcount) - *total_mapcount = mapcount; + if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page))) { + mapcount = page_trans_huge_mapcount(page, total_mapcount); if (PageSwapCache(page)) swapcount = page_swapcount(page); if (total_swapcount) @@ -1521,26 +1521,6 @@ static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount, return map_swapcount; } -#else -#define swap_page_trans_huge_swapped(si, entry) swap_swapcount(si, entry) -#define page_swapped(page) (page_swapcount(page) != 0) - -static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount, - int *total_swapcount) -{ - int mapcount, swapcount = 0; - - /* hugetlbfs shouldn't call it */ - VM_BUG_ON_PAGE(PageHuge(page), page); - - mapcount = page_trans_huge_mapcount(page, total_mapcount); - if (PageSwapCache(page)) - swapcount = page_swapcount(page); - if (total_swapcount) - *total_swapcount = swapcount; - return mapcount + swapcount; -} -#endif /* * We can write to an anon page without COW if there are no other references -- cgit v1.2.3 From afa4711ef1399beb60964dcb2cd0369dfb0ac20a Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Tue, 21 Aug 2018 21:52:09 -0700 Subject: mm/swapfile.c: use swap_count() in swap_page_trans_huge_swapped() In swap_page_trans_huge_swapped(), to identify whether there's any page table mapping for a 4k sized swap entry, "si->swap_map[i] != SWAP_HAS_CACHE" is used. This works correctly now, because all users of the function will only call it after checking SWAP_HAS_CACHE. But as pointed out by Daniel, it is better to use "swap_count(map[i])" here, because it works for "map[i] == 0" case too. And this makes the implementation more consistent between normal and huge swap entry. Link: http://lkml.kernel.org/r/20180720071845.17920-4-ying.huang@intel.com Signed-off-by: "Huang, Ying" Suggested-and-reviewed-by: Daniel Jordan Acked-by: Dave Hansen Cc: Michal Hocko Cc: Johannes Weiner Cc: Shaohua Li Cc: Hugh Dickins Cc: Minchan Kim Cc: Rik van Riel Cc: Dan Williams Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 618358ad464b..c291400c4463 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1429,12 +1429,12 @@ static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, ci = lock_cluster_or_swap_info(si, offset); if (!ci || !cluster_is_huge(ci)) { - if (map[roffset] != SWAP_HAS_CACHE) + if (swap_count(map[roffset])) ret = true; goto unlock_out; } for (i = 0; i < SWAPFILE_CLUSTER; i++) { - if (map[offset + i] != SWAP_HAS_CACHE) { + if (swap_count(map[offset + i])) { ret = true; break; } -- cgit v1.2.3 From 33ee011e5656edef6a58952006c486d342b7bbb5 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Tue, 21 Aug 2018 21:52:13 -0700 Subject: mm/swapfile.c: unify normal/huge code path in swap_page_trans_huge_swapped() As suggested by Dave, we should unify the code path for normal and huge swap support if possible to avoid duplicated code, bugs, etc. and make it easier to review code. In this patch, the normal/huge code path in swap_page_trans_huge_swapped() is unified, the added and removed lines are same. And the binary size is kept almost same when CONFIG_TRANSPARENT_HUGEPAGE=n. text data bss dec hex filename base: 24179 2028 340 26547 67b3 mm/swapfile.o unified: 24215 2028 340 26583 67d7 mm/swapfile.o Link: http://lkml.kernel.org/r/20180720071845.17920-5-ying.huang@intel.com Signed-off-by: "Huang, Ying" Suggested-and-acked-by: Dave Hansen Reviewed-by: Daniel Jordan Cc: Michal Hocko Cc: Johannes Weiner Cc: Shaohua Li Cc: Hugh Dickins Cc: Minchan Kim Cc: Rik van Riel Cc: Dan Williams Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index c291400c4463..3c1172b53436 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -269,7 +269,9 @@ static inline void cluster_set_null(struct swap_cluster_info *info) static inline bool cluster_is_huge(struct swap_cluster_info *info) { - return info->flags & CLUSTER_FLAG_HUGE; + if (IS_ENABLED(CONFIG_THP_SWAP)) + return info->flags & CLUSTER_FLAG_HUGE; + return false; } static inline void cluster_clear_huge(struct swap_cluster_info *info) @@ -1424,9 +1426,6 @@ static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, int i; bool ret = false; - if (!IS_ENABLED(CONFIG_THP_SWAP)) - return swap_swapcount(si, entry) != 0; - ci = lock_cluster_or_swap_info(si, offset); if (!ci || !cluster_is_huge(ci)) { if (swap_count(map[roffset])) -- cgit v1.2.3 From a448f2d07f891ac65cc48017f17735ec73086bf0 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Tue, 21 Aug 2018 21:52:17 -0700 Subject: mm/swapfile.c: unify normal/huge code path in put_swap_page() In this patch, the normal/huge code path in put_swap_page() and several helper functions are unified to avoid duplicated code, bugs, etc. and make it easier to review the code. The removed lines are more than added lines. And the binary size is kept exactly same when CONFIG_TRANSPARENT_HUGEPAGE=n. Link: http://lkml.kernel.org/r/20180720071845.17920-6-ying.huang@intel.com Signed-off-by: "Huang, Ying" Suggested-by: Dave Hansen Acked-by: Dave Hansen Reviewed-by: Daniel Jordan Cc: Michal Hocko Cc: Johannes Weiner Cc: Shaohua Li Cc: Hugh Dickins Cc: Minchan Kim Cc: Rik van Riel Cc: Dan Williams Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 83 ++++++++++++++++++++++++++--------------------------------- 1 file changed, 37 insertions(+), 46 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index 3c1172b53436..043645e7f0b5 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -204,8 +204,16 @@ static void discard_swap_cluster(struct swap_info_struct *si, #ifdef CONFIG_THP_SWAP #define SWAPFILE_CLUSTER HPAGE_PMD_NR + +#define swap_entry_size(size) (size) #else #define SWAPFILE_CLUSTER 256 + +/* + * Define swap_entry_size() as constant to let compiler to optimize + * out some code if !CONFIG_THP_SWAP + */ +#define swap_entry_size(size) 1 #endif #define LATENCY_LIMIT 256 @@ -1192,18 +1200,7 @@ void swap_free(swp_entry_t entry) /* * Called after dropping swapcache to decrease refcnt to swap entries. */ -static void swapcache_free(swp_entry_t entry) -{ - struct swap_info_struct *p; - - p = _swap_info_get(entry); - if (p) { - if (!__swap_entry_free(p, entry, SWAP_HAS_CACHE)) - free_swap_slot(entry); - } -} - -static void swapcache_free_cluster(swp_entry_t entry) +void put_swap_page(struct page *page, swp_entry_t entry) { unsigned long offset = swp_offset(entry); unsigned long idx = offset / SWAPFILE_CLUSTER; @@ -1212,39 +1209,41 @@ static void swapcache_free_cluster(swp_entry_t entry) unsigned char *map; unsigned int i, free_entries = 0; unsigned char val; - - if (!IS_ENABLED(CONFIG_THP_SWAP)) - return; + int size = swap_entry_size(hpage_nr_pages(page)); si = _swap_info_get(entry); if (!si) return; - ci = lock_cluster(si, offset); - VM_BUG_ON(!cluster_is_huge(ci)); - map = si->swap_map + offset; - for (i = 0; i < SWAPFILE_CLUSTER; i++) { - val = map[i]; - VM_BUG_ON(!(val & SWAP_HAS_CACHE)); - if (val == SWAP_HAS_CACHE) - free_entries++; - } - if (!free_entries) { - for (i = 0; i < SWAPFILE_CLUSTER; i++) - map[i] &= ~SWAP_HAS_CACHE; - } - cluster_clear_huge(ci); - unlock_cluster(ci); - if (free_entries == SWAPFILE_CLUSTER) { - spin_lock(&si->lock); + if (size == SWAPFILE_CLUSTER) { ci = lock_cluster(si, offset); - memset(map, 0, SWAPFILE_CLUSTER); + VM_BUG_ON(!cluster_is_huge(ci)); + map = si->swap_map + offset; + for (i = 0; i < SWAPFILE_CLUSTER; i++) { + val = map[i]; + VM_BUG_ON(!(val & SWAP_HAS_CACHE)); + if (val == SWAP_HAS_CACHE) + free_entries++; + } + if (!free_entries) { + for (i = 0; i < SWAPFILE_CLUSTER; i++) + map[i] &= ~SWAP_HAS_CACHE; + } + cluster_clear_huge(ci); unlock_cluster(ci); - mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER); - swap_free_cluster(si, idx); - spin_unlock(&si->lock); - } else if (free_entries) { - for (i = 0; i < SWAPFILE_CLUSTER; i++, entry.val++) { + if (free_entries == SWAPFILE_CLUSTER) { + spin_lock(&si->lock); + ci = lock_cluster(si, offset); + memset(map, 0, SWAPFILE_CLUSTER); + unlock_cluster(ci); + mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER); + swap_free_cluster(si, idx); + spin_unlock(&si->lock); + return; + } + } + if (size == 1 || free_entries) { + for (i = 0; i < size; i++, entry.val++) { if (!__swap_entry_free(si, entry, SWAP_HAS_CACHE)) free_swap_slot(entry); } @@ -1268,14 +1267,6 @@ int split_swap_cluster(swp_entry_t entry) } #endif -void put_swap_page(struct page *page, swp_entry_t entry) -{ - if (!PageTransHuge(page)) - swapcache_free(entry); - else - swapcache_free_cluster(entry); -} - static int swp_entry_cmp(const void *ent1, const void *ent2) { const swp_entry_t *e1 = ent1, *e2 = ent2; -- cgit v1.2.3 From 5d5e8f19544a35ae1609bd96ae6b28c9fcd1baf6 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Tue, 21 Aug 2018 21:52:20 -0700 Subject: mm, swap, get_swap_pages: use entry_size instead of cluster in parameter As suggested by Matthew Wilcox, it is better to use "int entry_size" instead of "bool cluster" as parameter to specify whether to operate for huge or normal swap entries. Because this improve the flexibility to support other swap entry size. And Dave Hansen thinks that this improves code readability too. So in this patch, the "bool cluster" parameter of get_swap_pages() is replaced by "int entry_size". And nr_swap_entries() trick is used to reduce the binary size when !CONFIG_TRANSPARENT_HUGE_PAGE. text data bss dec hex filename base 24215 2028 340 26583 67d7 mm/swapfile.o head 24123 2004 340 26467 6763 mm/swapfile.o Link: http://lkml.kernel.org/r/20180720071845.17920-7-ying.huang@intel.com Signed-off-by: "Huang, Ying" Suggested-by: Matthew Wilcox Acked-by: Dave Hansen Cc: Daniel Jordan Cc: Michal Hocko Cc: Johannes Weiner Cc: Shaohua Li Cc: Hugh Dickins Cc: Minchan Kim Cc: Rik van Riel Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 2 +- mm/swap_slots.c | 8 ++++---- mm/swapfile.c | 16 ++++++++-------- 3 files changed, 13 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/include/linux/swap.h b/include/linux/swap.h index 1a8bd05a335e..8e2c11e692ba 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -447,7 +447,7 @@ extern void si_swapinfo(struct sysinfo *); extern swp_entry_t get_swap_page(struct page *page); extern void put_swap_page(struct page *page, swp_entry_t entry); extern swp_entry_t get_swap_page_of_type(int); -extern int get_swap_pages(int n, bool cluster, swp_entry_t swp_entries[]); +extern int get_swap_pages(int n, swp_entry_t swp_entries[], int entry_size); extern int add_swap_count_continuation(swp_entry_t, gfp_t); extern void swap_shmem_alloc(swp_entry_t); extern int swap_duplicate(swp_entry_t); diff --git a/mm/swap_slots.c b/mm/swap_slots.c index 008ccb22fee6..63a7b4563a57 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -269,8 +269,8 @@ static int refill_swap_slots_cache(struct swap_slots_cache *cache) cache->cur = 0; if (swap_slot_cache_active) - cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE, false, - cache->slots); + cache->nr = get_swap_pages(SWAP_SLOTS_CACHE_SIZE, + cache->slots, 1); return cache->nr; } @@ -316,7 +316,7 @@ swp_entry_t get_swap_page(struct page *page) if (PageTransHuge(page)) { if (IS_ENABLED(CONFIG_THP_SWAP)) - get_swap_pages(1, true, &entry); + get_swap_pages(1, &entry, HPAGE_PMD_NR); goto out; } @@ -350,7 +350,7 @@ repeat: goto out; } - get_swap_pages(1, false, &entry); + get_swap_pages(1, &entry, 1); out: if (mem_cgroup_try_charge_swap(page, entry)) { put_swap_page(page, entry); diff --git a/mm/swapfile.c b/mm/swapfile.c index 043645e7f0b5..b30dd0642ccf 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -940,18 +940,18 @@ static unsigned long scan_swap_map(struct swap_info_struct *si, } -int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[]) +int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size) { - unsigned long nr_pages = cluster ? SWAPFILE_CLUSTER : 1; + unsigned long size = swap_entry_size(entry_size); struct swap_info_struct *si, *next; long avail_pgs; int n_ret = 0; int node; /* Only single cluster request supported */ - WARN_ON_ONCE(n_goal > 1 && cluster); + WARN_ON_ONCE(n_goal > 1 && size == SWAPFILE_CLUSTER); - avail_pgs = atomic_long_read(&nr_swap_pages) / nr_pages; + avail_pgs = atomic_long_read(&nr_swap_pages) / size; if (avail_pgs <= 0) goto noswap; @@ -961,7 +961,7 @@ int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[]) if (n_goal > avail_pgs) n_goal = avail_pgs; - atomic_long_sub(n_goal * nr_pages, &nr_swap_pages); + atomic_long_sub(n_goal * size, &nr_swap_pages); spin_lock(&swap_avail_lock); @@ -988,14 +988,14 @@ start_over: spin_unlock(&si->lock); goto nextsi; } - if (cluster) { + if (size == SWAPFILE_CLUSTER) { if (!(si->flags & SWP_FILE)) n_ret = swap_alloc_cluster(si, swp_entries); } else n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, n_goal, swp_entries); spin_unlock(&si->lock); - if (n_ret || cluster) + if (n_ret || size == SWAPFILE_CLUSTER) goto check_out; pr_debug("scan_swap_map of si %d failed to find offset\n", si->type); @@ -1021,7 +1021,7 @@ nextsi: check_out: if (n_ret < n_goal) - atomic_long_add((long)(n_goal - n_ret) * nr_pages, + atomic_long_add((long)(n_goal - n_ret) * size, &nr_swap_pages); noswap: return n_ret; -- cgit v1.2.3 From b32d5f32b9dbe64970f41602066f7904df15f3e9 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Tue, 21 Aug 2018 21:52:24 -0700 Subject: mm/swapfile.c: add __swap_entry_free_locked() The part of __swap_entry_free() with lock held is separated into a new function __swap_entry_free_locked(). Because we want to reuse that piece of code in some other places. Just mechanical code refactoring, there is no any functional change in this function. Link: http://lkml.kernel.org/r/20180720071845.17920-8-ying.huang@intel.com Signed-off-by: "Huang, Ying" Reviewed-by: Daniel Jordan Acked-by: Dave Hansen Cc: Michal Hocko Cc: Johannes Weiner Cc: Shaohua Li Cc: Hugh Dickins Cc: Minchan Kim Cc: Rik van Riel Cc: Dan Williams Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index b30dd0642ccf..d44b2d60a66a 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1123,16 +1123,13 @@ static struct swap_info_struct *swap_info_get_cont(swp_entry_t entry, return p; } -static unsigned char __swap_entry_free(struct swap_info_struct *p, - swp_entry_t entry, unsigned char usage) +static unsigned char __swap_entry_free_locked(struct swap_info_struct *p, + unsigned long offset, + unsigned char usage) { - struct swap_cluster_info *ci; - unsigned long offset = swp_offset(entry); unsigned char count; unsigned char has_cache; - ci = lock_cluster_or_swap_info(p, offset); - count = p->swap_map[offset]; has_cache = count & SWAP_HAS_CACHE; @@ -1160,6 +1157,17 @@ static unsigned char __swap_entry_free(struct swap_info_struct *p, usage = count | has_cache; p->swap_map[offset] = usage ? : SWAP_HAS_CACHE; + return usage; +} + +static unsigned char __swap_entry_free(struct swap_info_struct *p, + swp_entry_t entry, unsigned char usage) +{ + struct swap_cluster_info *ci; + unsigned long offset = swp_offset(entry); + + ci = lock_cluster_or_swap_info(p, offset); + usage = __swap_entry_free_locked(p, offset, usage); unlock_cluster_or_swap_info(p, ci); return usage; -- cgit v1.2.3 From c2343d2761f86ae1b857f78c7cdb9f51e5fa1641 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Tue, 21 Aug 2018 21:52:29 -0700 Subject: mm/swapfile.c: put_swap_page: share more between huge/normal code path In this patch, locking related code is shared between huge/normal code path in put_swap_page() to reduce code duplication. The `free_entries == 0` case is merged into the more general `free_entries != SWAPFILE_CLUSTER` case, because the new locking method makes it easy. The added lines is same as the removed lines. But the code size is increased when CONFIG_TRANSPARENT_HUGEPAGE=n. text data bss dec hex filename base: 24123 2004 340 26467 6763 mm/swapfile.o unified: 24485 2004 340 26829 68cd mm/swapfile.o Dig on step deeper with `size -A mm/swapfile.o` for base and unified kernel and compare the result, yields, -.text 17723 0 +.text 17835 0 -.orc_unwind_ip 1380 0 +.orc_unwind_ip 1480 0 -.orc_unwind 2070 0 +.orc_unwind 2220 0 -Total 26686 +Total 27048 The total difference is the same. The text segment difference is much smaller: 112. More difference comes from the ORC unwinder segments: (1480 + 2220) - (1380 + 2070) = 250. If the frame pointer unwinder is used, this costs nothing. Link: http://lkml.kernel.org/r/20180720071845.17920-9-ying.huang@intel.com Signed-off-by: "Huang, Ying" Reviewed-by: Daniel Jordan Acked-by: Dave Hansen Cc: Michal Hocko Cc: Johannes Weiner Cc: Shaohua Li Cc: Hugh Dickins Cc: Minchan Kim Cc: Rik van Riel Cc: Dan Williams Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/mm/swapfile.c b/mm/swapfile.c index d44b2d60a66a..d954b71c4f9c 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1223,8 +1223,8 @@ void put_swap_page(struct page *page, swp_entry_t entry) if (!si) return; + ci = lock_cluster_or_swap_info(si, offset); if (size == SWAPFILE_CLUSTER) { - ci = lock_cluster(si, offset); VM_BUG_ON(!cluster_is_huge(ci)); map = si->swap_map + offset; for (i = 0; i < SWAPFILE_CLUSTER; i++) { @@ -1233,13 +1233,9 @@ void put_swap_page(struct page *page, swp_entry_t entry) if (val == SWAP_HAS_CACHE) free_entries++; } - if (!free_entries) { - for (i = 0; i < SWAPFILE_CLUSTER; i++) - map[i] &= ~SWAP_HAS_CACHE; - } cluster_clear_huge(ci); - unlock_cluster(ci); if (free_entries == SWAPFILE_CLUSTER) { + unlock_cluster_or_swap_info(si, ci); spin_lock(&si->lock); ci = lock_cluster(si, offset); memset(map, 0, SWAPFILE_CLUSTER); @@ -1250,12 +1246,16 @@ void put_swap_page(struct page *page, swp_entry_t entry) return; } } - if (size == 1 || free_entries) { - for (i = 0; i < size; i++, entry.val++) { - if (!__swap_entry_free(si, entry, SWAP_HAS_CACHE)) - free_swap_slot(entry); + for (i = 0; i < size; i++, entry.val++) { + if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE)) { + unlock_cluster_or_swap_info(si, ci); + free_swap_slot(entry); + if (i == size - 1) + return; + lock_cluster_or_swap_info(si, offset); } } + unlock_cluster_or_swap_info(si, ci); } #ifdef CONFIG_THP_SWAP -- cgit v1.2.3 From 93065ac753e4443840a057bfef4be71ec766fde9 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 21 Aug 2018 21:52:33 -0700 Subject: mm, oom: distinguish blockable mode for mmu notifiers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are several blockable mmu notifiers which might sleep in mmu_notifier_invalidate_range_start and that is a problem for the oom_reaper because it needs to guarantee a forward progress so it cannot depend on any sleepable locks. Currently we simply back off and mark an oom victim with blockable mmu notifiers as done after a short sleep. That can result in selecting a new oom victim prematurely because the previous one still hasn't torn its memory down yet. We can do much better though. Even if mmu notifiers use sleepable locks there is no reason to automatically assume those locks are held. Moreover majority of notifiers only care about a portion of the address space and there is absolutely zero reason to fail when we are unmapping an unrelated range. Many notifiers do really block and wait for HW which is harder to handle and we have to bail out though. This patch handles the low hanging fruit. __mmu_notifier_invalidate_range_start gets a blockable flag and callbacks are not allowed to sleep if the flag is set to false. This is achieved by using trylock instead of the sleepable lock for most callbacks and continue as long as we do not block down the call chain. I think we can improve that even further because there is a common pattern to do a range lookup first and then do something about that. The first part can be done without a sleeping lock in most cases AFAICS. The oom_reaper end then simply retries if there is at least one notifier which couldn't make any progress in !blockable mode. A retry loop is already implemented to wait for the mmap_sem and this is basically the same thing. The simplest way for driver developers to test this code path is to wrap userspace code which uses these notifiers into a memcg and set the hard limit to hit the oom. This can be done e.g. after the test faults in all the mmu notifier managed memory and set the hard limit to something really small. Then we are looking for a proper process tear down. [akpm@linux-foundation.org: coding style fixes] [akpm@linux-foundation.org: minor code simplification] Link: http://lkml.kernel.org/r/20180716115058.5559-1-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Christian König # AMD notifiers Acked-by: Leon Romanovsky # mlx and umem_odp Reported-by: David Rientjes Cc: "David (ChunMing) Zhou" Cc: Paolo Bonzini Cc: Alex Deucher Cc: David Airlie Cc: Jani Nikula Cc: Joonas Lahtinen Cc: Rodrigo Vivi Cc: Doug Ledford Cc: Jason Gunthorpe Cc: Mike Marciniszyn Cc: Dennis Dalessandro Cc: Sudeep Dutt Cc: Ashutosh Dixit Cc: Dimitri Sivanich Cc: Boris Ostrovsky Cc: Juergen Gross Cc: "Jérôme Glisse" Cc: Andrea Arcangeli Cc: Felix Kuehling Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kvm/x86.c | 7 ++++-- drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c | 43 ++++++++++++++++++++++++++------ drivers/gpu/drm/i915/i915_gem_userptr.c | 13 +++++++--- drivers/gpu/drm/radeon/radeon_mn.c | 22 ++++++++++++++--- drivers/infiniband/core/umem_odp.c | 33 +++++++++++++++++++------ drivers/infiniband/hw/hfi1/mmu_rb.c | 11 ++++++--- drivers/infiniband/hw/mlx5/odp.c | 2 +- drivers/misc/mic/scif/scif_dma.c | 7 ++++-- drivers/misc/sgi-gru/grutlbpurge.c | 7 ++++-- drivers/xen/gntdev.c | 44 ++++++++++++++++++++++++++------- include/linux/kvm_host.h | 4 +-- include/linux/mmu_notifier.h | 33 +++++++++++++++++++------ include/linux/oom.h | 2 +- include/rdma/ib_umem_odp.h | 3 ++- mm/hmm.c | 7 ++++-- mm/mmap.c | 2 +- mm/mmu_notifier.c | 19 +++++++++++--- mm/oom_kill.c | 29 +++++++++++----------- virt/kvm/kvm_main.c | 15 +++++++---- 19 files changed, 223 insertions(+), 80 deletions(-) (limited to 'mm') diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index f7dff0457846..4a74a7cf0a8b 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -7305,8 +7305,9 @@ static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu) kvm_x86_ops->load_eoi_exitmap(vcpu, eoi_exit_bitmap); } -void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, - unsigned long start, unsigned long end) +int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, + unsigned long start, unsigned long end, + bool blockable) { unsigned long apic_address; @@ -7317,6 +7318,8 @@ void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, apic_address = gfn_to_hva(kvm, APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT); if (start <= apic_address && apic_address < end) kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD); + + return 0; } void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c index a365ea2383d1..e55508b39496 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mn.c @@ -178,12 +178,18 @@ void amdgpu_mn_unlock(struct amdgpu_mn *mn) * * @amn: our notifier */ -static void amdgpu_mn_read_lock(struct amdgpu_mn *amn) +static int amdgpu_mn_read_lock(struct amdgpu_mn *amn, bool blockable) { - mutex_lock(&amn->read_lock); + if (blockable) + mutex_lock(&amn->read_lock); + else if (!mutex_trylock(&amn->read_lock)) + return -EAGAIN; + if (atomic_inc_return(&amn->recursion) == 1) down_read_non_owner(&amn->lock); mutex_unlock(&amn->read_lock); + + return 0; } /** @@ -239,10 +245,11 @@ static void amdgpu_mn_invalidate_node(struct amdgpu_mn_node *node, * Block for operations on BOs to finish and mark pages as accessed and * potentially dirty. */ -static void amdgpu_mn_invalidate_range_start_gfx(struct mmu_notifier *mn, +static int amdgpu_mn_invalidate_range_start_gfx(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, - unsigned long end) + unsigned long end, + bool blockable) { struct amdgpu_mn *amn = container_of(mn, struct amdgpu_mn, mn); struct interval_tree_node *it; @@ -250,17 +257,28 @@ static void amdgpu_mn_invalidate_range_start_gfx(struct mmu_notifier *mn, /* notification is exclusive, but interval is inclusive */ end -= 1; - amdgpu_mn_read_lock(amn); + /* TODO we should be able to split locking for interval tree and + * amdgpu_mn_invalidate_node + */ + if (amdgpu_mn_read_lock(amn, blockable)) + return -EAGAIN; it = interval_tree_iter_first(&amn->objects, start, end); while (it) { struct amdgpu_mn_node *node; + if (!blockable) { + amdgpu_mn_read_unlock(amn); + return -EAGAIN; + } + node = container_of(it, struct amdgpu_mn_node, it); it = interval_tree_iter_next(it, start, end); amdgpu_mn_invalidate_node(node, start, end); } + + return 0; } /** @@ -275,10 +293,11 @@ static void amdgpu_mn_invalidate_range_start_gfx(struct mmu_notifier *mn, * necessitates evicting all user-mode queues of the process. The BOs * are restorted in amdgpu_mn_invalidate_range_end_hsa. */ -static void amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn, +static int amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, - unsigned long end) + unsigned long end, + bool blockable) { struct amdgpu_mn *amn = container_of(mn, struct amdgpu_mn, mn); struct interval_tree_node *it; @@ -286,13 +305,19 @@ static void amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn, /* notification is exclusive, but interval is inclusive */ end -= 1; - amdgpu_mn_read_lock(amn); + if (amdgpu_mn_read_lock(amn, blockable)) + return -EAGAIN; it = interval_tree_iter_first(&amn->objects, start, end); while (it) { struct amdgpu_mn_node *node; struct amdgpu_bo *bo; + if (!blockable) { + amdgpu_mn_read_unlock(amn); + return -EAGAIN; + } + node = container_of(it, struct amdgpu_mn_node, it); it = interval_tree_iter_next(it, start, end); @@ -304,6 +329,8 @@ static void amdgpu_mn_invalidate_range_start_hsa(struct mmu_notifier *mn, amdgpu_amdkfd_evict_userptr(mem, mm); } } + + return 0; } /** diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c index dcd6e230d16a..2c9b284036d1 100644 --- a/drivers/gpu/drm/i915/i915_gem_userptr.c +++ b/drivers/gpu/drm/i915/i915_gem_userptr.c @@ -112,10 +112,11 @@ static void del_object(struct i915_mmu_object *mo) mo->attached = false; } -static void i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn, +static int i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn, struct mm_struct *mm, unsigned long start, - unsigned long end) + unsigned long end, + bool blockable) { struct i915_mmu_notifier *mn = container_of(_mn, struct i915_mmu_notifier, mn); @@ -124,7 +125,7 @@ static void i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn, LIST_HEAD(cancelled); if (RB_EMPTY_ROOT(&mn->objects.rb_root)) - return; + return 0; /* interval ranges are inclusive, but invalidate range is exclusive */ end--; @@ -132,6 +133,10 @@ static void i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn, spin_lock(&mn->lock); it = interval_tree_iter_first(&mn->objects, start, end); while (it) { + if (!blockable) { + spin_unlock(&mn->lock); + return -EAGAIN; + } /* The mmu_object is released late when destroying the * GEM object so it is entirely possible to gain a * reference on an object in the process of being freed @@ -154,6 +159,8 @@ static void i915_gem_userptr_mn_invalidate_range_start(struct mmu_notifier *_mn, if (!list_empty(&cancelled)) flush_workqueue(mn->wq); + + return 0; } static const struct mmu_notifier_ops i915_gem_userptr_notifier = { diff --git a/drivers/gpu/drm/radeon/radeon_mn.c b/drivers/gpu/drm/radeon/radeon_mn.c index abd24975c9b1..f8b35df44c60 100644 --- a/drivers/gpu/drm/radeon/radeon_mn.c +++ b/drivers/gpu/drm/radeon/radeon_mn.c @@ -118,19 +118,27 @@ static void radeon_mn_release(struct mmu_notifier *mn, * We block for all BOs between start and end to be idle and * unmap them by move them into system domain again. */ -static void radeon_mn_invalidate_range_start(struct mmu_notifier *mn, +static int radeon_mn_invalidate_range_start(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, - unsigned long end) + unsigned long end, + bool blockable) { struct radeon_mn *rmn = container_of(mn, struct radeon_mn, mn); struct ttm_operation_ctx ctx = { false, false }; struct interval_tree_node *it; + int ret = 0; /* notification is exclusive, but interval is inclusive */ end -= 1; - mutex_lock(&rmn->lock); + /* TODO we should be able to split locking for interval tree and + * the tear down. + */ + if (blockable) + mutex_lock(&rmn->lock); + else if (!mutex_trylock(&rmn->lock)) + return -EAGAIN; it = interval_tree_iter_first(&rmn->objects, start, end); while (it) { @@ -138,6 +146,11 @@ static void radeon_mn_invalidate_range_start(struct mmu_notifier *mn, struct radeon_bo *bo; long r; + if (!blockable) { + ret = -EAGAIN; + goto out_unlock; + } + node = container_of(it, struct radeon_mn_node, it); it = interval_tree_iter_next(it, start, end); @@ -166,7 +179,10 @@ static void radeon_mn_invalidate_range_start(struct mmu_notifier *mn, } } +out_unlock: mutex_unlock(&rmn->lock); + + return ret; } static const struct mmu_notifier_ops radeon_mn_ops = { diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 182436b92ba9..6ec748eccff7 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -186,6 +186,7 @@ static void ib_umem_notifier_release(struct mmu_notifier *mn, rbt_ib_umem_for_each_in_range(&context->umem_tree, 0, ULLONG_MAX, ib_umem_notifier_release_trampoline, + true, NULL); up_read(&context->umem_rwsem); } @@ -207,22 +208,31 @@ static int invalidate_range_start_trampoline(struct ib_umem *item, u64 start, return 0; } -static void ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn, +static int ib_umem_notifier_invalidate_range_start(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, - unsigned long end) + unsigned long end, + bool blockable) { struct ib_ucontext *context = container_of(mn, struct ib_ucontext, mn); + int ret; if (!context->invalidate_range) - return; + return 0; + + if (blockable) + down_read(&context->umem_rwsem); + else if (!down_read_trylock(&context->umem_rwsem)) + return -EAGAIN; ib_ucontext_notifier_start_account(context); - down_read(&context->umem_rwsem); - rbt_ib_umem_for_each_in_range(&context->umem_tree, start, + ret = rbt_ib_umem_for_each_in_range(&context->umem_tree, start, end, - invalidate_range_start_trampoline, NULL); + invalidate_range_start_trampoline, + blockable, NULL); up_read(&context->umem_rwsem); + + return ret; } static int invalidate_range_end_trampoline(struct ib_umem *item, u64 start, @@ -242,10 +252,15 @@ static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn, if (!context->invalidate_range) return; + /* + * TODO: we currently bail out if there is any sleepable work to be done + * in ib_umem_notifier_invalidate_range_start so we shouldn't really block + * here. But this is ugly and fragile. + */ down_read(&context->umem_rwsem); rbt_ib_umem_for_each_in_range(&context->umem_tree, start, end, - invalidate_range_end_trampoline, NULL); + invalidate_range_end_trampoline, true, NULL); up_read(&context->umem_rwsem); ib_ucontext_notifier_end_account(context); } @@ -798,6 +813,7 @@ EXPORT_SYMBOL(ib_umem_odp_unmap_dma_pages); int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root, u64 start, u64 last, umem_call_back cb, + bool blockable, void *cookie) { int ret_val = 0; @@ -809,6 +825,9 @@ int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root, for (node = rbt_ib_umem_iter_first(root, start, last - 1); node; node = next) { + /* TODO move the blockable decision up to the callback */ + if (!blockable) + return -EAGAIN; next = rbt_ib_umem_iter_next(node, start, last - 1); umem = container_of(node, struct ib_umem_odp, interval_tree); ret_val = cb(umem->umem, start, last, cookie) || ret_val; diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c index 70aceefe14d5..e1c7996c018e 100644 --- a/drivers/infiniband/hw/hfi1/mmu_rb.c +++ b/drivers/infiniband/hw/hfi1/mmu_rb.c @@ -67,9 +67,9 @@ struct mmu_rb_handler { static unsigned long mmu_node_start(struct mmu_rb_node *); static unsigned long mmu_node_last(struct mmu_rb_node *); -static void mmu_notifier_range_start(struct mmu_notifier *, +static int mmu_notifier_range_start(struct mmu_notifier *, struct mm_struct *, - unsigned long, unsigned long); + unsigned long, unsigned long, bool); static struct mmu_rb_node *__mmu_rb_search(struct mmu_rb_handler *, unsigned long, unsigned long); static void do_remove(struct mmu_rb_handler *handler, @@ -284,10 +284,11 @@ void hfi1_mmu_rb_remove(struct mmu_rb_handler *handler, handler->ops->remove(handler->ops_arg, node); } -static void mmu_notifier_range_start(struct mmu_notifier *mn, +static int mmu_notifier_range_start(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, - unsigned long end) + unsigned long end, + bool blockable) { struct mmu_rb_handler *handler = container_of(mn, struct mmu_rb_handler, mn); @@ -313,6 +314,8 @@ static void mmu_notifier_range_start(struct mmu_notifier *mn, if (added) queue_work(handler->wq, &handler->del_work); + + return 0; } /* diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index f1a87a690a4c..d216e0d2921d 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -488,7 +488,7 @@ void mlx5_ib_free_implicit_mr(struct mlx5_ib_mr *imr) down_read(&ctx->umem_rwsem); rbt_ib_umem_for_each_in_range(&ctx->umem_tree, 0, ULLONG_MAX, - mr_leaf_free, imr); + mr_leaf_free, true, imr); up_read(&ctx->umem_rwsem); wait_event(imr->q_leaf_free, !atomic_read(&imr->num_leaf_free)); diff --git a/drivers/misc/mic/scif/scif_dma.c b/drivers/misc/mic/scif/scif_dma.c index 63d6246d6dff..6369aeaa7056 100644 --- a/drivers/misc/mic/scif/scif_dma.c +++ b/drivers/misc/mic/scif/scif_dma.c @@ -200,15 +200,18 @@ static void scif_mmu_notifier_release(struct mmu_notifier *mn, schedule_work(&scif_info.misc_work); } -static void scif_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, +static int scif_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, - unsigned long end) + unsigned long end, + bool blockable) { struct scif_mmu_notif *mmn; mmn = container_of(mn, struct scif_mmu_notif, ep_mmu_notifier); scif_rma_destroy_tcw(mmn, start, end - start); + + return 0; } static void scif_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, diff --git a/drivers/misc/sgi-gru/grutlbpurge.c b/drivers/misc/sgi-gru/grutlbpurge.c index a3454eb56fbf..be28f05bfafa 100644 --- a/drivers/misc/sgi-gru/grutlbpurge.c +++ b/drivers/misc/sgi-gru/grutlbpurge.c @@ -219,9 +219,10 @@ void gru_flush_all_tlb(struct gru_state *gru) /* * MMUOPS notifier callout functions */ -static void gru_invalidate_range_start(struct mmu_notifier *mn, +static int gru_invalidate_range_start(struct mmu_notifier *mn, struct mm_struct *mm, - unsigned long start, unsigned long end) + unsigned long start, unsigned long end, + bool blockable) { struct gru_mm_struct *gms = container_of(mn, struct gru_mm_struct, ms_notifier); @@ -231,6 +232,8 @@ static void gru_invalidate_range_start(struct mmu_notifier *mn, gru_dbg(grudev, "gms %p, start 0x%lx, end 0x%lx, act %d\n", gms, start, end, atomic_read(&gms->ms_range_active)); gru_flush_tlb_range(gms, start, end - start); + + return 0; } static void gru_invalidate_range_end(struct mmu_notifier *mn, diff --git a/drivers/xen/gntdev.c b/drivers/xen/gntdev.c index c866a62f766d..57390c7666e5 100644 --- a/drivers/xen/gntdev.c +++ b/drivers/xen/gntdev.c @@ -479,18 +479,25 @@ static const struct vm_operations_struct gntdev_vmops = { /* ------------------------------------------------------------------ */ +static bool in_range(struct gntdev_grant_map *map, + unsigned long start, unsigned long end) +{ + if (!map->vma) + return false; + if (map->vma->vm_start >= end) + return false; + if (map->vma->vm_end <= start) + return false; + + return true; +} + static void unmap_if_in_range(struct gntdev_grant_map *map, unsigned long start, unsigned long end) { unsigned long mstart, mend; int err; - if (!map->vma) - return; - if (map->vma->vm_start >= end) - return; - if (map->vma->vm_end <= start) - return; mstart = max(start, map->vma->vm_start); mend = min(end, map->vma->vm_end); pr_debug("map %d+%d (%lx %lx), range %lx %lx, mrange %lx %lx\n", @@ -503,21 +510,40 @@ static void unmap_if_in_range(struct gntdev_grant_map *map, WARN_ON(err); } -static void mn_invl_range_start(struct mmu_notifier *mn, +static int mn_invl_range_start(struct mmu_notifier *mn, struct mm_struct *mm, - unsigned long start, unsigned long end) + unsigned long start, unsigned long end, + bool blockable) { struct gntdev_priv *priv = container_of(mn, struct gntdev_priv, mn); struct gntdev_grant_map *map; + int ret = 0; + + /* TODO do we really need a mutex here? */ + if (blockable) + mutex_lock(&priv->lock); + else if (!mutex_trylock(&priv->lock)) + return -EAGAIN; - mutex_lock(&priv->lock); list_for_each_entry(map, &priv->maps, next) { + if (in_range(map, start, end)) { + ret = -EAGAIN; + goto out_unlock; + } unmap_if_in_range(map, start, end); } list_for_each_entry(map, &priv->freeable_maps, next) { + if (in_range(map, start, end)) { + ret = -EAGAIN; + goto out_unlock; + } unmap_if_in_range(map, start, end); } + +out_unlock: mutex_unlock(&priv->lock); + + return ret; } static void mn_release(struct mmu_notifier *mn, diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 7c7362dd2faa..0205aee44ded 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -1289,8 +1289,8 @@ static inline long kvm_arch_vcpu_async_ioctl(struct file *filp, } #endif /* CONFIG_HAVE_KVM_VCPU_ASYNC_IOCTL */ -void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, - unsigned long start, unsigned long end); +int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, + unsigned long start, unsigned long end, bool blockable); #ifdef CONFIG_HAVE_KVM_VCPU_RUN_PID_CHANGE int kvm_arch_vcpu_run_pid_change(struct kvm_vcpu *vcpu); diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index 392e6af82701..133ba78820ee 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -151,13 +151,15 @@ struct mmu_notifier_ops { * address space but may still be referenced by sptes until * the last refcount is dropped. * - * If both of these callbacks cannot block, and invalidate_range - * cannot block, mmu_notifier_ops.flags should have - * MMU_INVALIDATE_DOES_NOT_BLOCK set. + * If blockable argument is set to false then the callback cannot + * sleep and has to return with -EAGAIN. 0 should be returned + * otherwise. + * */ - void (*invalidate_range_start)(struct mmu_notifier *mn, + int (*invalidate_range_start)(struct mmu_notifier *mn, struct mm_struct *mm, - unsigned long start, unsigned long end); + unsigned long start, unsigned long end, + bool blockable); void (*invalidate_range_end)(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, unsigned long end); @@ -229,8 +231,9 @@ extern int __mmu_notifier_test_young(struct mm_struct *mm, unsigned long address); extern void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, pte_t pte); -extern void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, - unsigned long start, unsigned long end); +extern int __mmu_notifier_invalidate_range_start(struct mm_struct *mm, + unsigned long start, unsigned long end, + bool blockable); extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, unsigned long start, unsigned long end, bool only_end); @@ -281,7 +284,15 @@ static inline void mmu_notifier_invalidate_range_start(struct mm_struct *mm, unsigned long start, unsigned long end) { if (mm_has_notifiers(mm)) - __mmu_notifier_invalidate_range_start(mm, start, end); + __mmu_notifier_invalidate_range_start(mm, start, end, true); +} + +static inline int mmu_notifier_invalidate_range_start_nonblock(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + if (mm_has_notifiers(mm)) + return __mmu_notifier_invalidate_range_start(mm, start, end, false); + return 0; } static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm, @@ -461,6 +472,12 @@ static inline void mmu_notifier_invalidate_range_start(struct mm_struct *mm, { } +static inline int mmu_notifier_invalidate_range_start_nonblock(struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + return 0; +} + static inline void mmu_notifier_invalidate_range_end(struct mm_struct *mm, unsigned long start, unsigned long end) { diff --git a/include/linux/oom.h b/include/linux/oom.h index 6adac113e96d..92f70e4c6252 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -95,7 +95,7 @@ static inline int check_stable_address_space(struct mm_struct *mm) return 0; } -void __oom_reap_task_mm(struct mm_struct *mm); +bool __oom_reap_task_mm(struct mm_struct *mm); extern unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg, const nodemask_t *nodemask, diff --git a/include/rdma/ib_umem_odp.h b/include/rdma/ib_umem_odp.h index 6a17f856f841..381cdf5a9bd1 100644 --- a/include/rdma/ib_umem_odp.h +++ b/include/rdma/ib_umem_odp.h @@ -119,7 +119,8 @@ typedef int (*umem_call_back)(struct ib_umem *item, u64 start, u64 end, */ int rbt_ib_umem_for_each_in_range(struct rb_root_cached *root, u64 start, u64 end, - umem_call_back cb, void *cookie); + umem_call_back cb, + bool blockable, void *cookie); /* * Find first region intersecting with address range. diff --git a/mm/hmm.c b/mm/hmm.c index 76e7a058b32f..0b0554591610 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -177,16 +177,19 @@ static void hmm_release(struct mmu_notifier *mn, struct mm_struct *mm) up_write(&hmm->mirrors_sem); } -static void hmm_invalidate_range_start(struct mmu_notifier *mn, +static int hmm_invalidate_range_start(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, - unsigned long end) + unsigned long end, + bool blockable) { struct hmm *hmm = mm->hmm; VM_BUG_ON(!hmm); atomic_inc(&hmm->sequence); + + return 0; } static void hmm_invalidate_range_end(struct mmu_notifier *mn, diff --git a/mm/mmap.c b/mm/mmap.c index 8d6449e74431..bb2a7e097c7d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3064,7 +3064,7 @@ void exit_mmap(struct mm_struct *mm) * reliably test it. */ mutex_lock(&oom_lock); - __oom_reap_task_mm(mm); + (void)__oom_reap_task_mm(mm); mutex_unlock(&oom_lock); set_bit(MMF_OOM_SKIP, &mm->flags); diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index eff6b88a993f..82bb1a939c0e 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -174,18 +174,29 @@ void __mmu_notifier_change_pte(struct mm_struct *mm, unsigned long address, srcu_read_unlock(&srcu, id); } -void __mmu_notifier_invalidate_range_start(struct mm_struct *mm, - unsigned long start, unsigned long end) +int __mmu_notifier_invalidate_range_start(struct mm_struct *mm, + unsigned long start, unsigned long end, + bool blockable) { struct mmu_notifier *mn; + int ret = 0; int id; id = srcu_read_lock(&srcu); hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { - if (mn->ops->invalidate_range_start) - mn->ops->invalidate_range_start(mn, mm, start, end); + if (mn->ops->invalidate_range_start) { + int _ret = mn->ops->invalidate_range_start(mn, mm, start, end, blockable); + if (_ret) { + pr_info("%pS callback failed with %d in %sblockable context.\n", + mn->ops->invalidate_range_start, _ret, + !blockable ? "non-" : ""); + ret = _ret; + } + } } srcu_read_unlock(&srcu, id); + + return ret; } EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range_start); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 412f43453a68..be31a1e0fe78 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -487,9 +487,10 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait); static struct task_struct *oom_reaper_list; static DEFINE_SPINLOCK(oom_reaper_lock); -void __oom_reap_task_mm(struct mm_struct *mm) +bool __oom_reap_task_mm(struct mm_struct *mm) { struct vm_area_struct *vma; + bool ret = true; /* * Tell all users of get_user/copy_from_user etc... that the content @@ -519,12 +520,17 @@ void __oom_reap_task_mm(struct mm_struct *mm) struct mmu_gather tlb; tlb_gather_mmu(&tlb, mm, start, end); - mmu_notifier_invalidate_range_start(mm, start, end); + if (mmu_notifier_invalidate_range_start_nonblock(mm, start, end)) { + ret = false; + continue; + } unmap_page_range(&tlb, vma, start, end, NULL); mmu_notifier_invalidate_range_end(mm, start, end); tlb_finish_mmu(&tlb, start, end); } } + + return ret; } static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) @@ -553,18 +559,6 @@ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) goto unlock_oom; } - /* - * If the mm has invalidate_{start,end}() notifiers that could block, - * sleep to give the oom victim some more time. - * TODO: we really want to get rid of this ugly hack and make sure that - * notifiers cannot block for unbounded amount of time - */ - if (mm_has_blockable_invalidate_notifiers(mm)) { - up_read(&mm->mmap_sem); - schedule_timeout_idle(HZ); - goto unlock_oom; - } - /* * MMF_OOM_SKIP is set by exit_mmap when the OOM reaper can't * work on the mm anymore. The check for MMF_OOM_SKIP must run @@ -579,7 +573,12 @@ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) trace_start_task_reaping(tsk->pid); - __oom_reap_task_mm(mm); + /* failed to reap part of the address space. Try again later */ + if (!__oom_reap_task_mm(mm)) { + up_read(&mm->mmap_sem); + ret = false; + goto unlock_oom; + } pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", task_pid_nr(tsk), tsk->comm, diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 9263ead9fd32..0116b449b993 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -140,9 +140,10 @@ static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm); static unsigned long long kvm_createvm_count; static unsigned long long kvm_active_vms; -__weak void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, - unsigned long start, unsigned long end) +__weak int kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, + unsigned long start, unsigned long end, bool blockable) { + return 0; } bool kvm_is_reserved_pfn(kvm_pfn_t pfn) @@ -360,13 +361,15 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, srcu_read_unlock(&kvm->srcu, idx); } -static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, +static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, - unsigned long end) + unsigned long end, + bool blockable) { struct kvm *kvm = mmu_notifier_to_kvm(mn); int need_tlb_flush = 0, idx; + int ret; idx = srcu_read_lock(&kvm->srcu); spin_lock(&kvm->mmu_lock); @@ -384,9 +387,11 @@ static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, spin_unlock(&kvm->mmu_lock); - kvm_arch_mmu_notifier_invalidate_range(kvm, start, end); + ret = kvm_arch_mmu_notifier_invalidate_range(kvm, start, end, blockable); srcu_read_unlock(&kvm->srcu, idx); + + return ret; } static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, -- cgit v1.2.3 From af5679fbc669f31f7ebd0d473bca76c24c07de30 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 21 Aug 2018 21:52:37 -0700 Subject: mm, oom: remove oom_lock from oom_reaper oom_reaper used to rely on the oom_lock since e2fe14564d33 ("oom_reaper: close race with exiting task"). We do not really need the lock anymore though. 212925802454 ("mm: oom: let oom_reap_task and exit_mmap run concurrently") has removed serialization with the exit path based on the mm reference count and so we do not really rely on the oom_lock anymore. Tetsuo was arguing that at least MMF_OOM_SKIP should be set under the lock to prevent from races when the page allocator didn't manage to get the freed (reaped) memory in __alloc_pages_may_oom but it sees the flag later on and move on to another victim. Although this is possible in principle let's wait for it to actually happen in real life before we make the locking more complex again. Therefore remove the oom_lock for oom_reaper paths (both exit_mmap and oom_reap_task_mm). The reaper serializes with exit_mmap by mmap_sem + MMF_OOM_SKIP flag. There is no synchronization with out_of_memory path now. [mhocko@kernel.org: oom_reap_task_mm should return false when __oom_reap_task_mm did] Link: http://lkml.kernel.org/r/20180724141747.GP28386@dhcp22.suse.cz Link: http://lkml.kernel.org/r/20180719075922.13784-1-mhocko@kernel.org Signed-off-by: Michal Hocko Suggested-by: David Rientjes Acked-by: David Rientjes Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 2 -- mm/oom_kill.c | 30 ++++-------------------------- 2 files changed, 4 insertions(+), 28 deletions(-) (limited to 'mm') diff --git a/mm/mmap.c b/mm/mmap.c index bb2a7e097c7d..5f2b2b184c60 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3063,9 +3063,7 @@ void exit_mmap(struct mm_struct *mm) * which clears VM_LOCKED, otherwise the oom reaper cannot * reliably test it. */ - mutex_lock(&oom_lock); (void)__oom_reap_task_mm(mm); - mutex_unlock(&oom_lock); set_bit(MMF_OOM_SKIP, &mm->flags); down_write(&mm->mmap_sem); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index be31a1e0fe78..66a86dd049a0 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -535,28 +535,9 @@ bool __oom_reap_task_mm(struct mm_struct *mm) static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) { - bool ret = true; - - /* - * We have to make sure to not race with the victim exit path - * and cause premature new oom victim selection: - * oom_reap_task_mm exit_mm - * mmget_not_zero - * mmput - * atomic_dec_and_test - * exit_oom_victim - * [...] - * out_of_memory - * select_bad_process - * # no TIF_MEMDIE task selects new victim - * unmap_page_range # frees some memory - */ - mutex_lock(&oom_lock); - if (!down_read_trylock(&mm->mmap_sem)) { - ret = false; trace_skip_task_reaping(tsk->pid); - goto unlock_oom; + return false; } /* @@ -568,7 +549,7 @@ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) if (test_bit(MMF_OOM_SKIP, &mm->flags)) { up_read(&mm->mmap_sem); trace_skip_task_reaping(tsk->pid); - goto unlock_oom; + return true; } trace_start_task_reaping(tsk->pid); @@ -576,8 +557,7 @@ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) /* failed to reap part of the address space. Try again later */ if (!__oom_reap_task_mm(mm)) { up_read(&mm->mmap_sem); - ret = false; - goto unlock_oom; + return false; } pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", @@ -588,9 +568,7 @@ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) up_read(&mm->mmap_sem); trace_finish_task_reaping(tsk->pid); -unlock_oom: - mutex_unlock(&oom_lock); - return ret; + return true; } #define MAX_OOM_REAP_RETRIES 10 -- cgit v1.2.3 From c3b78b11efbb2865433abf9d22c004ffe4a73f5c Mon Sep 17 00:00:00 2001 From: Rodrigo Freire Date: Tue, 21 Aug 2018 21:52:41 -0700 Subject: mm, oom: describe task memory unit, larger PID pad The default page memory unit of OOM task dump events might not be intuitive and potentially misleading for the non-initiated when debugging OOM events: These are pages and not kBs. Add a small printk prior to the task dump informing that the memory units are actually memory _pages_. Also extends PID field to align on up to 7 characters. Reference https://lkml.org/lkml/2018/7/3/1201 Link: http://lkml.kernel.org/r/c795eb5129149ed8a6345c273aba167ff1bbd388.1530715938.git.rfreire@redhat.com Signed-off-by: Rodrigo Freire Acked-by: David Rientjes Acked-by: Rafael Aquini Cc: Michal Hocko Cc: Tetsuo Handa Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 66a86dd049a0..ed5e6a221a8e 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -400,7 +400,8 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) struct task_struct *p; struct task_struct *task; - pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n"); + pr_info("Tasks state (memory values in pages):\n"); + pr_info("[ pid ] uid tgid total_vm rss pgtables_bytes swapents oom_score_adj name\n"); rcu_read_lock(); for_each_process(p) { if (oom_unkillable_task(p, memcg, nodemask)) @@ -416,7 +417,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask) continue; } - pr_info("[%5d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n", + pr_info("[%7d] %5d %5d %8lu %8lu %8ld %8lu %5hd %s\n", task->pid, from_kuid(&init_user_ns, task_uid(task)), task->tgid, task->mm->total_vm, get_mm_rss(task->mm), mm_pgtables_bytes(task->mm), -- cgit v1.2.3 From 431f42fdfdb36f06f43c711fc59be9b814d8fb22 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 21 Aug 2018 21:52:45 -0700 Subject: mm/oom_kill.c: clean up oom_reap_task_mm() Andrew has noticed some inconsistencies in oom_reap_task_mm. Notably - Undocumented return value. - comment "failed to reap part..." is misleading - sounds like it's referring to something which happened in the past, is in fact referring to something which might happen in the future. - fails to call trace_finish_task_reaping() in one case - code duplication. - Increases mmap_sem hold time a little by moving trace_finish_task_reaping() inside the locked region. So sue me ;) - Sharing the finish: path means that the trace event won't distinguish between the two sources of finishing. Add a short explanation for the return value and fix the rest by reorganizing the function a bit to have unified function exit paths. Link: http://lkml.kernel.org/r/20180724141747.GP28386@dhcp22.suse.cz Suggested-by: Andrew Morton Signed-off-by: Michal Hocko Reviewed-by: Andrew Morton Cc: Tetsuo Handa Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index ed5e6a221a8e..20600779f5db 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -534,8 +534,16 @@ bool __oom_reap_task_mm(struct mm_struct *mm) return ret; } +/* + * Reaps the address space of the give task. + * + * Returns true on success and false if none or part of the address space + * has been reclaimed and the caller should retry later. + */ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) { + bool ret = true; + if (!down_read_trylock(&mm->mmap_sem)) { trace_skip_task_reaping(tsk->pid); return false; @@ -548,28 +556,28 @@ static bool oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) * down_write();up_write() cycle in exit_mmap(). */ if (test_bit(MMF_OOM_SKIP, &mm->flags)) { - up_read(&mm->mmap_sem); trace_skip_task_reaping(tsk->pid); - return true; + goto out_unlock; } trace_start_task_reaping(tsk->pid); /* failed to reap part of the address space. Try again later */ - if (!__oom_reap_task_mm(mm)) { - up_read(&mm->mmap_sem); - return false; - } + ret = __oom_reap_task_mm(mm); + if (!ret) + goto out_finish; pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", task_pid_nr(tsk), tsk->comm, K(get_mm_counter(mm, MM_ANONPAGES)), K(get_mm_counter(mm, MM_FILEPAGES)), K(get_mm_counter(mm, MM_SHMEMPAGES))); +out_finish: + trace_finish_task_reaping(tsk->pid); +out_unlock: up_read(&mm->mmap_sem); - trace_finish_task_reaping(tsk->pid); - return true; + return ret; } #define MAX_OOM_REAP_RETRIES 10 -- cgit v1.2.3 From a3bf6ce366496016990d8578af74673ea04178ff Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Tue, 21 Aug 2018 21:53:03 -0700 Subject: mm/mempool.c: add missing parameter description The kernel-doc for mempool_init function is missing the description of the pool parameter. Add it. Link: http://lkml.kernel.org/r/1532336274-26228-1-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Acked-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempool.c | 1 + 1 file changed, 1 insertion(+) (limited to 'mm') diff --git a/mm/mempool.c b/mm/mempool.c index 44f5fa98c1e7..0ef8cc8d1602 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -213,6 +213,7 @@ EXPORT_SYMBOL(mempool_init_node); /** * mempool_init - initialize a memory pool + * @pool: pointer to the memory pool that should be initialized * @min_nr: the minimum number of elements guaranteed to be * allocated for this pool. * @alloc_fn: user-defined element-allocation function. -- cgit v1.2.3 From a670468f5e0b5fad4db6e4d195f15915dc2a35c1 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 21 Aug 2018 21:53:06 -0700 Subject: mm: zero out the vma in vma_init() Rather than in vm_area_alloc(). To ensure that the various oddball stack-based vmas are in a good state. Some of the callers were zeroing them out, others were not. Acked-by: Kirill A. Shutemov Cc: Russell King Cc: Dmitry Vyukov Cc: Oleg Nesterov Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/kernel/process.c | 9 ++++----- fs/hugetlbfs/inode.c | 2 -- include/linux/mm.h | 1 + kernel/fork.c | 3 ++- mm/mempolicy.c | 1 - mm/shmem.c | 1 - 6 files changed, 7 insertions(+), 10 deletions(-) (limited to 'mm') diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index d9c299133111..82ab015bf42b 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -330,16 +330,15 @@ unsigned long arch_randomize_brk(struct mm_struct *mm) * atomic helpers. Insert it into the gate_vma so that it is visible * through ptrace and /proc//mem. */ -static struct vm_area_struct gate_vma = { - .vm_start = 0xffff0000, - .vm_end = 0xffff0000 + PAGE_SIZE, - .vm_flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYEXEC, -}; +static struct vm_area_struct gate_vma; static int __init gate_vma_init(void) { vma_init(&gate_vma, NULL); gate_vma.vm_page_prot = PAGE_READONLY_EXEC; + gate_vma.vm_start = 0xffff0000; + gate_vma.vm_end = 0xffff0000 + PAGE_SIZE; + gate_vma.vm_flags = VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYEXEC; return 0; } arch_initcall(gate_vma_init); diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 346a146c7617..32920a10100e 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -410,7 +410,6 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, int i, freed = 0; bool truncate_op = (lend == LLONG_MAX); - memset(&pseudo_vma, 0, sizeof(struct vm_area_struct)); vma_init(&pseudo_vma, current->mm); pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED); pagevec_init(&pvec); @@ -595,7 +594,6 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, * allocation routines. If NUMA is configured, use page index * as input to create an allocation policy. */ - memset(&pseudo_vma, 0, sizeof(struct vm_area_struct)); vma_init(&pseudo_vma, mm); pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED); pseudo_vma.vm_file = file; diff --git a/include/linux/mm.h b/include/linux/mm.h index a3cae495f9ce..3a4b87d1a59a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -456,6 +456,7 @@ static inline void vma_init(struct vm_area_struct *vma, struct mm_struct *mm) { static const struct vm_operations_struct dummy_vm_ops = {}; + memset(vma, 0, sizeof(*vma)); vma->vm_mm = mm; vma->vm_ops = &dummy_vm_ops; INIT_LIST_HEAD(&vma->anon_vma_chain); diff --git a/kernel/fork.c b/kernel/fork.c index 5ee74c113381..8c760effa42e 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -310,8 +310,9 @@ static struct kmem_cache *mm_cachep; struct vm_area_struct *vm_area_alloc(struct mm_struct *mm) { - struct vm_area_struct *vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL); + struct vm_area_struct *vma; + vma = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); if (vma) vma_init(vma, mm); return vma; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 01f1a14facc4..4861ba738d6f 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2504,7 +2504,6 @@ void mpol_shared_policy_init(struct shared_policy *sp, struct mempolicy *mpol) goto put_new; /* Create pseudo-vma that contains just the policy */ - memset(&pvma, 0, sizeof(struct vm_area_struct)); vma_init(&pvma, NULL); pvma.vm_end = TASK_SIZE; /* policy covers entire file */ mpol_set_shared_policy(sp, &pvma, new); /* adds ref */ diff --git a/mm/shmem.c b/mm/shmem.c index c48c79018a7c..fb04baacc9fa 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1421,7 +1421,6 @@ static void shmem_pseudo_vma_init(struct vm_area_struct *vma, struct shmem_inode_info *info, pgoff_t index) { /* Create a pseudo vma that just contains the policy */ - memset(vma, 0, sizeof(*vma)); vma_init(vma, NULL); /* Bias interleave by inode number to distribute better across nodes */ vma->vm_pgoff = index + info->vfs_inode.i_ino; -- cgit v1.2.3 From 8c9a134cae6f8d66f35321b07baa202f75ef2199 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 21 Aug 2018 21:53:10 -0700 Subject: mm: clarify CONFIG_PAGE_POISONING and usage The Kconfig text for CONFIG_PAGE_POISONING doesn't mention that it has to be enabled explicitly. This updates the documentation for that and adds a note about CONFIG_PAGE_POISONING to the "page_poison" command line docs. While here, change description of CONFIG_PAGE_POISONING_ZERO too, as it's not "random" data, but rather the fixed debugging value that would be used when not zeroing. Additionally removes a stray "bool" in the Kconfig. Link: http://lkml.kernel.org/r/20180725223832.GA43733@beast Signed-off-by: Kees Cook Reviewed-by: Andrew Morton Cc: Jonathan Corbet Cc: Laura Abbott Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/kernel-parameters.txt | 5 +++-- mm/Kconfig.debug | 6 +++--- 2 files changed, 6 insertions(+), 5 deletions(-) (limited to 'mm') diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index bffb0caa3693..970d837bd57f 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -3041,8 +3041,9 @@ on: enable the feature page_poison= [KNL] Boot-time parameter changing the state of - poisoning on the buddy allocator. - off: turn off poisoning + poisoning on the buddy allocator, available with + CONFIG_PAGE_POISONING=y. + off: turn off poisoning (default) on: turn on poisoning panic= [KNL] Kernel behaviour on panic: delay diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index e5e606ee5f71..9a7b8b049d04 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -46,7 +46,8 @@ config PAGE_POISONING Fill the pages with poison patterns after free_pages() and verify the patterns before alloc_pages. The filling of the memory helps reduce the risk of information leaks from freed data. This does - have a potential performance impact. + have a potential performance impact if enabled with the + "page_poison=1" kernel boot option. Note that "poison" here is not the same thing as the "HWPoison" for CONFIG_MEMORY_FAILURE. This is software poisoning only. @@ -65,7 +66,7 @@ config PAGE_POISONING_NO_SANITY say N. config PAGE_POISONING_ZERO - bool "Use zero for poisoning instead of random data" + bool "Use zero for poisoning instead of debugging value" depends on PAGE_POISONING ---help--- Instead of using the existing poison value, fill the pages with @@ -75,7 +76,6 @@ config PAGE_POISONING_ZERO allocation. If unsure, say N - bool config DEBUG_PAGE_REF bool "Enable tracepoint to track down page reference manipulation" -- cgit v1.2.3 From 1c4c3b99c03d3e72ac643b01edcb83c0c0aafd46 Mon Sep 17 00:00:00 2001 From: Jiang Biao Date: Tue, 21 Aug 2018 21:53:13 -0700 Subject: mm: fix page_freeze_refs and page_unfreeze_refs in comments page_freeze_refs/page_unfreeze_refs have already been relplaced by page_ref_freeze/page_ref_unfreeze , but they are not modified in the comments. Link: http://lkml.kernel.org/r/1532590226-106038-1-git-send-email-jiang.biao2@zte.com.cn Signed-off-by: Jiang Biao Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/ksm.c | 4 ++-- mm/memory-failure.c | 2 +- mm/vmscan.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/ksm.c b/mm/ksm.c index 2621be57bd95..1bd514c9e5d0 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -703,7 +703,7 @@ again: * We cannot do anything with the page while its refcount is 0. * Usually 0 means free, or tail of a higher-order page: in which * case this node is no longer referenced, and should be freed; - * however, it might mean that the page is under page_freeze_refs(). + * however, it might mean that the page is under page_ref_freeze(). * The __remove_mapping() case is easy, again the node is now stale; * but if page is swapcache in migrate_page_move_mapping(), it might * still be our page, in which case it's essential to keep the node. @@ -714,7 +714,7 @@ again: * work here too. We have chosen the !PageSwapCache test to * optimize the common case, when the page is or is about to * be freed: PageSwapCache is cleared (under spin_lock_irq) - * in the freeze_refs section of __remove_mapping(); but Anon + * in the ref_freeze section of __remove_mapping(); but Anon * page->mapping reset to NULL later, in free_pages_prepare(). */ if (!PageSwapCache(page)) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 9d142b9b86dc..c83a1746812f 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1167,7 +1167,7 @@ int memory_failure(unsigned long pfn, int flags) * R/W the page; let's pray that the page has been * used and will be freed some time later. * In fact it's dangerous to directly bump up page count from 0, - * that may make page_freeze_refs()/page_unfreeze_refs() mismatch. + * that may make page_ref_freeze()/page_ref_unfreeze() mismatch. */ if (!(flags & MF_COUNT_INCREASED) && !get_hwpoison_page(p)) { if (is_free_buddy_page(p)) { diff --git a/mm/vmscan.c b/mm/vmscan.c index 3c6e2bfee427..7e7d25504651 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -903,7 +903,7 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, refcount = 2; if (!page_ref_freeze(page, refcount)) goto cannot_free; - /* note: atomic_cmpxchg in page_freeze_refs provides the smp_rmb */ + /* note: atomic_cmpxchg in page_ref_freeze provides the smp_rmb */ if (unlikely(PageDirty(page))) { page_ref_unfreeze(page, refcount); goto cannot_free; -- cgit v1.2.3 From 8de7ecc6483bf94cef9f48c4c3511ef3e9ee6d40 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Tue, 21 Aug 2018 21:53:17 -0700 Subject: memcg: reduce memcg tree traversals for stats collection Currently cgroup-v1's memcg_stat_show traverses the memcg tree ~17 times to collect the stats while cgroup-v2's memory_stat_show traverses the memcg tree thrice. On a large machine, a couple thousand memcgs is very normal and if the churn is high and memcgs stick around during to several reasons, tens of thousands of nodes in memcg tree can exist. This patch has refactored and shared the stat collection code between cgroup-v1 and cgroup-v2 and has reduced the tree traversal to just one. I ran a simple benchmark which reads the root_mem_cgroup's stat file 1000 times in the presense of 2500 memcgs on cgroup-v1. The results are: Without the patch: $ time ./read-root-stat-1000-times real 0m1.663s user 0m0.000s sys 0m1.660s With the patch: $ time ./read-root-stat-1000-times real 0m0.468s user 0m0.000s sys 0m0.467s Link: http://lkml.kernel.org/r/20180724224635.143944-1-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Vladimir Davydov Cc: Greg Thelen Cc: Bruce Merry Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 150 +++++++++++++++++++++++++++----------------------------- 1 file changed, 73 insertions(+), 77 deletions(-) (limited to 'mm') diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6a921890739f..59c14c988143 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2899,29 +2899,34 @@ static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, return retval; } -static void tree_stat(struct mem_cgroup *memcg, unsigned long *stat) -{ - struct mem_cgroup *iter; - int i; - - memset(stat, 0, sizeof(*stat) * MEMCG_NR_STAT); - - for_each_mem_cgroup_tree(iter, memcg) { - for (i = 0; i < MEMCG_NR_STAT; i++) - stat[i] += memcg_page_state(iter, i); - } -} +struct accumulated_stats { + unsigned long stat[MEMCG_NR_STAT]; + unsigned long events[NR_VM_EVENT_ITEMS]; + unsigned long lru_pages[NR_LRU_LISTS]; + const unsigned int *stats_array; + const unsigned int *events_array; + int stats_size; + int events_size; +}; -static void tree_events(struct mem_cgroup *memcg, unsigned long *events) +static void accumulate_memcg_tree(struct mem_cgroup *memcg, + struct accumulated_stats *acc) { - struct mem_cgroup *iter; + struct mem_cgroup *mi; int i; - memset(events, 0, sizeof(*events) * NR_VM_EVENT_ITEMS); + for_each_mem_cgroup_tree(mi, memcg) { + for (i = 0; i < acc->stats_size; i++) + acc->stat[i] += memcg_page_state(mi, + acc->stats_array ? acc->stats_array[i] : i); - for_each_mem_cgroup_tree(iter, memcg) { - for (i = 0; i < NR_VM_EVENT_ITEMS; i++) - events[i] += memcg_sum_events(iter, i); + for (i = 0; i < acc->events_size; i++) + acc->events[i] += memcg_sum_events(mi, + acc->events_array ? acc->events_array[i] : i); + + for (i = 0; i < NR_LRU_LISTS; i++) + acc->lru_pages[i] += + mem_cgroup_nr_lru_pages(mi, BIT(i)); } } @@ -3332,6 +3337,7 @@ static int memcg_stat_show(struct seq_file *m, void *v) unsigned long memory, memsw; struct mem_cgroup *mi; unsigned int i; + struct accumulated_stats acc; BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats)); BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); @@ -3364,32 +3370,27 @@ static int memcg_stat_show(struct seq_file *m, void *v) seq_printf(m, "hierarchical_memsw_limit %llu\n", (u64)memsw * PAGE_SIZE); - for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { - unsigned long long val = 0; + memset(&acc, 0, sizeof(acc)); + acc.stats_size = ARRAY_SIZE(memcg1_stats); + acc.stats_array = memcg1_stats; + acc.events_size = ARRAY_SIZE(memcg1_events); + acc.events_array = memcg1_events; + accumulate_memcg_tree(memcg, &acc); + for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) { if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) continue; - for_each_mem_cgroup_tree(mi, memcg) - val += memcg_page_state(mi, memcg1_stats[i]) * - PAGE_SIZE; - seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], val); + seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], + (u64)acc.stat[i] * PAGE_SIZE); } - for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) { - unsigned long long val = 0; - - for_each_mem_cgroup_tree(mi, memcg) - val += memcg_sum_events(mi, memcg1_events[i]); - seq_printf(m, "total_%s %llu\n", memcg1_event_names[i], val); - } - - for (i = 0; i < NR_LRU_LISTS; i++) { - unsigned long long val = 0; + for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) + seq_printf(m, "total_%s %llu\n", memcg1_event_names[i], + (u64)acc.events[i]); - for_each_mem_cgroup_tree(mi, memcg) - val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE; - seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val); - } + for (i = 0; i < NR_LRU_LISTS; i++) + seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], + (u64)acc.lru_pages[i] * PAGE_SIZE); #ifdef CONFIG_DEBUG_VM { @@ -5486,8 +5487,7 @@ static int memory_events_show(struct seq_file *m, void *v) static int memory_stat_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - unsigned long stat[MEMCG_NR_STAT]; - unsigned long events[NR_VM_EVENT_ITEMS]; + struct accumulated_stats acc; int i; /* @@ -5501,66 +5501,62 @@ static int memory_stat_show(struct seq_file *m, void *v) * Current memory state: */ - tree_stat(memcg, stat); - tree_events(memcg, events); + memset(&acc, 0, sizeof(acc)); + acc.stats_size = MEMCG_NR_STAT; + acc.events_size = NR_VM_EVENT_ITEMS; + accumulate_memcg_tree(memcg, &acc); seq_printf(m, "anon %llu\n", - (u64)stat[MEMCG_RSS] * PAGE_SIZE); + (u64)acc.stat[MEMCG_RSS] * PAGE_SIZE); seq_printf(m, "file %llu\n", - (u64)stat[MEMCG_CACHE] * PAGE_SIZE); + (u64)acc.stat[MEMCG_CACHE] * PAGE_SIZE); seq_printf(m, "kernel_stack %llu\n", - (u64)stat[MEMCG_KERNEL_STACK_KB] * 1024); + (u64)acc.stat[MEMCG_KERNEL_STACK_KB] * 1024); seq_printf(m, "slab %llu\n", - (u64)(stat[NR_SLAB_RECLAIMABLE] + - stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE); + (u64)(acc.stat[NR_SLAB_RECLAIMABLE] + + acc.stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE); seq_printf(m, "sock %llu\n", - (u64)stat[MEMCG_SOCK] * PAGE_SIZE); + (u64)acc.stat[MEMCG_SOCK] * PAGE_SIZE); seq_printf(m, "shmem %llu\n", - (u64)stat[NR_SHMEM] * PAGE_SIZE); + (u64)acc.stat[NR_SHMEM] * PAGE_SIZE); seq_printf(m, "file_mapped %llu\n", - (u64)stat[NR_FILE_MAPPED] * PAGE_SIZE); + (u64)acc.stat[NR_FILE_MAPPED] * PAGE_SIZE); seq_printf(m, "file_dirty %llu\n", - (u64)stat[NR_FILE_DIRTY] * PAGE_SIZE); + (u64)acc.stat[NR_FILE_DIRTY] * PAGE_SIZE); seq_printf(m, "file_writeback %llu\n", - (u64)stat[NR_WRITEBACK] * PAGE_SIZE); + (u64)acc.stat[NR_WRITEBACK] * PAGE_SIZE); - for (i = 0; i < NR_LRU_LISTS; i++) { - struct mem_cgroup *mi; - unsigned long val = 0; - - for_each_mem_cgroup_tree(mi, memcg) - val += mem_cgroup_nr_lru_pages(mi, BIT(i)); - seq_printf(m, "%s %llu\n", - mem_cgroup_lru_names[i], (u64)val * PAGE_SIZE); - } + for (i = 0; i < NR_LRU_LISTS; i++) + seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i], + (u64)acc.lru_pages[i] * PAGE_SIZE); seq_printf(m, "slab_reclaimable %llu\n", - (u64)stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE); + (u64)acc.stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE); seq_printf(m, "slab_unreclaimable %llu\n", - (u64)stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE); + (u64)acc.stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE); /* Accumulated memory events */ - seq_printf(m, "pgfault %lu\n", events[PGFAULT]); - seq_printf(m, "pgmajfault %lu\n", events[PGMAJFAULT]); + seq_printf(m, "pgfault %lu\n", acc.events[PGFAULT]); + seq_printf(m, "pgmajfault %lu\n", acc.events[PGMAJFAULT]); - seq_printf(m, "pgrefill %lu\n", events[PGREFILL]); - seq_printf(m, "pgscan %lu\n", events[PGSCAN_KSWAPD] + - events[PGSCAN_DIRECT]); - seq_printf(m, "pgsteal %lu\n", events[PGSTEAL_KSWAPD] + - events[PGSTEAL_DIRECT]); - seq_printf(m, "pgactivate %lu\n", events[PGACTIVATE]); - seq_printf(m, "pgdeactivate %lu\n", events[PGDEACTIVATE]); - seq_printf(m, "pglazyfree %lu\n", events[PGLAZYFREE]); - seq_printf(m, "pglazyfreed %lu\n", events[PGLAZYFREED]); + seq_printf(m, "pgrefill %lu\n", acc.events[PGREFILL]); + seq_printf(m, "pgscan %lu\n", acc.events[PGSCAN_KSWAPD] + + acc.events[PGSCAN_DIRECT]); + seq_printf(m, "pgsteal %lu\n", acc.events[PGSTEAL_KSWAPD] + + acc.events[PGSTEAL_DIRECT]); + seq_printf(m, "pgactivate %lu\n", acc.events[PGACTIVATE]); + seq_printf(m, "pgdeactivate %lu\n", acc.events[PGDEACTIVATE]); + seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]); + seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]); seq_printf(m, "workingset_refault %lu\n", - stat[WORKINGSET_REFAULT]); + acc.stat[WORKINGSET_REFAULT]); seq_printf(m, "workingset_activate %lu\n", - stat[WORKINGSET_ACTIVATE]); + acc.stat[WORKINGSET_ACTIVATE]); seq_printf(m, "workingset_nodereclaim %lu\n", - stat[WORKINGSET_NODERECLAIM]); + acc.stat[WORKINGSET_NODERECLAIM]); return 0; } -- cgit v1.2.3 From ace1db39768cdc58f6b157d99ae5958ad34ffff8 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 21 Aug 2018 21:53:29 -0700 Subject: mm/page_alloc.c: move ifdefery out of free_area_init_core Patch series "Refactor free_area_init_core and add free_area_init_core_hotplug", v6. This patchset does three things: 1) Clean up/refactor free_area_init_core/free_area_init_node by moving the ifdefery out of the functions. 2) Move the pgdat/zone initialization in free_area_init_core to its own function. 3) Introduce free_area_init_core_hotplug, a small subset of free_area_init_core, which is only called from memhotlug code path. In this way, we have: free_area_init_core: called during early initialization free_area_init_core_hotplug: called whenever a new node is allocated/re-used (memhotplug path) This patch (of 5): Moving the #ifdefs out of the function makes it easier to follow. Link: http://lkml.kernel.org/r/20180730101757.28058-2-osalvador@techadventures.net Signed-off-by: Oscar Salvador Acked-by: Michal Hocko Reviewed-by: Pavel Tatashin Acked-by: Vlastimil Babka Cc: Pasha Tatashin Cc: Mel Gorman Cc: Aaron Lu Cc: Joonsoo Kim Cc: Dan Williams Cc: David Hildenbrand Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 50 +++++++++++++++++++++++++++++++++++++------------- 1 file changed, 37 insertions(+), 13 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 15ea511fb41c..455cc3bfae99 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6194,6 +6194,37 @@ static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages, return PAGE_ALIGN(pages * sizeof(struct page)) >> PAGE_SHIFT; } +#ifdef CONFIG_NUMA_BALANCING +static void pgdat_init_numabalancing(struct pglist_data *pgdat) +{ + spin_lock_init(&pgdat->numabalancing_migrate_lock); + pgdat->numabalancing_migrate_nr_pages = 0; + pgdat->numabalancing_migrate_next_window = jiffies; +} +#else +static void pgdat_init_numabalancing(struct pglist_data *pgdat) {} +#endif + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static void pgdat_init_split_queue(struct pglist_data *pgdat) +{ + spin_lock_init(&pgdat->split_queue_lock); + INIT_LIST_HEAD(&pgdat->split_queue); + pgdat->split_queue_len = 0; +} +#else +static void pgdat_init_split_queue(struct pglist_data *pgdat) {} +#endif + +#ifdef CONFIG_COMPACTION +static void pgdat_init_kcompactd(struct pglist_data *pgdat) +{ + init_waitqueue_head(&pgdat->kcompactd_wait); +} +#else +static void pgdat_init_kcompactd(struct pglist_data *pgdat) {} +#endif + /* * Set up the zone data structures: * - mark all pages reserved @@ -6208,21 +6239,14 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) int nid = pgdat->node_id; pgdat_resize_init(pgdat); -#ifdef CONFIG_NUMA_BALANCING - spin_lock_init(&pgdat->numabalancing_migrate_lock); - pgdat->numabalancing_migrate_nr_pages = 0; - pgdat->numabalancing_migrate_next_window = jiffies; -#endif -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - spin_lock_init(&pgdat->split_queue_lock); - INIT_LIST_HEAD(&pgdat->split_queue); - pgdat->split_queue_len = 0; -#endif + + pgdat_init_numabalancing(pgdat); + pgdat_init_split_queue(pgdat); + pgdat_init_kcompactd(pgdat); + init_waitqueue_head(&pgdat->kswapd_wait); init_waitqueue_head(&pgdat->pfmemalloc_wait); -#ifdef CONFIG_COMPACTION - init_waitqueue_head(&pgdat->kcompactd_wait); -#endif + pgdat_page_ext_init(pgdat); spin_lock_init(&pgdat->lru_lock); lruvec_init(node_lruvec(pgdat)); -- cgit v1.2.3 From c1093b746c0576ed81c4d568d1e39cab651d37e6 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Tue, 21 Aug 2018 21:53:32 -0700 Subject: mm: access zone->node via zone_to_nid() and zone_set_nid() zone->node is configured only when CONFIG_NUMA=y, so it is a good idea to have inline functions to access this field in order to avoid ifdef's in c files. Link: http://lkml.kernel.org/r/20180730101757.28058-3-osalvador@techadventures.net Signed-off-by: Pavel Tatashin Signed-off-by: Oscar Salvador Reviewed-by: Oscar Salvador Acked-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Aaron Lu Cc: Dan Williams Cc: David Hildenbrand Cc: Joonsoo Kim Cc: Mel Gorman Cc: Pasha Tatashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 9 --------- include/linux/mmzone.h | 26 ++++++++++++++++++++------ mm/mempolicy.c | 4 ++-- mm/mm_init.c | 9 ++------- mm/page_alloc.c | 10 ++++------ 5 files changed, 28 insertions(+), 30 deletions(-) (limited to 'mm') diff --git a/include/linux/mm.h b/include/linux/mm.h index 3a4b87d1a59a..a55f5389c491 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -960,15 +960,6 @@ static inline int page_zone_id(struct page *page) return (page->flags >> ZONEID_PGSHIFT) & ZONEID_MASK; } -static inline int zone_to_nid(struct zone *zone) -{ -#ifdef CONFIG_NUMA - return zone->node; -#else - return 0; -#endif -} - #ifdef NODE_NOT_IN_PAGE_FLAGS extern int page_to_nid(const struct page *page); #else diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 3d4577a3ae7e..1e22d96734e0 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -834,6 +834,25 @@ static inline bool populated_zone(struct zone *zone) return zone->present_pages; } +#ifdef CONFIG_NUMA +static inline int zone_to_nid(struct zone *zone) +{ + return zone->node; +} + +static inline void zone_set_nid(struct zone *zone, int nid) +{ + zone->node = nid; +} +#else +static inline int zone_to_nid(struct zone *zone) +{ + return 0; +} + +static inline void zone_set_nid(struct zone *zone, int nid) {} +#endif + extern int movable_zone; #ifdef CONFIG_HIGHMEM @@ -949,12 +968,7 @@ static inline int zonelist_zone_idx(struct zoneref *zoneref) static inline int zonelist_node_idx(struct zoneref *zoneref) { -#ifdef CONFIG_NUMA - /* zone_to_nid not available in this context */ - return zoneref->zone->node; -#else - return 0; -#endif /* CONFIG_NUMA */ + return zone_to_nid(zoneref->zone); } struct zoneref *__next_zones_zonelist(struct zoneref *z, diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4861ba738d6f..da858f794eb6 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1784,7 +1784,7 @@ unsigned int mempolicy_slab_node(void) zonelist = &NODE_DATA(node)->node_zonelists[ZONELIST_FALLBACK]; z = first_zones_zonelist(zonelist, highest_zoneidx, &policy->v.nodes); - return z->zone ? z->zone->node : node; + return z->zone ? zone_to_nid(z->zone) : node; } default: @@ -2326,7 +2326,7 @@ int mpol_misplaced(struct page *page, struct vm_area_struct *vma, unsigned long node_zonelist(numa_node_id(), GFP_HIGHUSER), gfp_zone(GFP_HIGHUSER), &pol->v.nodes); - polnid = z->zone->node; + polnid = zone_to_nid(z->zone); break; default: diff --git a/mm/mm_init.c b/mm/mm_init.c index 5b72266b4b03..6838a530789b 100644 --- a/mm/mm_init.c +++ b/mm/mm_init.c @@ -53,13 +53,8 @@ void __init mminit_verify_zonelist(void) zone->name); /* Iterate the zonelist */ - for_each_zone_zonelist(zone, z, zonelist, zoneid) { -#ifdef CONFIG_NUMA - pr_cont("%d:%s ", zone->node, zone->name); -#else - pr_cont("0:%s ", zone->name); -#endif /* CONFIG_NUMA */ - } + for_each_zone_zonelist(zone, z, zonelist, zoneid) + pr_cont("%d:%s ", zone_to_nid(zone), zone->name); pr_cont("\n"); } } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 455cc3bfae99..6b2324bc61db 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2909,10 +2909,10 @@ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z) if (!static_branch_likely(&vm_numa_stat_key)) return; - if (z->node != numa_node_id()) + if (zone_to_nid(z) != numa_node_id()) local_stat = NUMA_OTHER; - if (z->node == preferred_zone->node) + if (zone_to_nid(z) == zone_to_nid(preferred_zone)) __inc_numa_state(z, NUMA_HIT); else { __inc_numa_state(z, NUMA_MISS); @@ -5278,7 +5278,7 @@ int local_memory_node(int node) z = first_zones_zonelist(node_zonelist(node, GFP_KERNEL), gfp_zone(GFP_KERNEL), NULL); - return z->zone->node; + return zone_to_nid(z->zone); } #endif @@ -6299,9 +6299,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) * And all highmem pages will be managed by the buddy system. */ zone->managed_pages = freesize; -#ifdef CONFIG_NUMA - zone->node = nid; -#endif + zone_set_nid(zone, nid); zone->name = zone_names[j]; zone->zone_pgdat = pgdat; spin_lock_init(&zone->lock); -- cgit v1.2.3 From 7cc2a9596d77e598a476ec7819046a453378d4a6 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Tue, 21 Aug 2018 21:53:36 -0700 Subject: mm: remove __paginginit __paginginit is the same thing as __meminit except for platforms without sparsemem, there it is defined as __init. Remove __paginginit and use __meminit. Use __ref in one single function that merges __meminit and __init sections: setup_usemap(). Link: http://lkml.kernel.org/r/20180801122348.21588-4-osalvador@techadventures.net Signed-off-by: Pavel Tatashin Signed-off-by: Oscar Salvador Reviewed-by: Oscar Salvador Cc: Pasha Tatashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/internal.h | 12 ------------ mm/page_alloc.c | 19 ++++++++++--------- 2 files changed, 10 insertions(+), 21 deletions(-) (limited to 'mm') diff --git a/mm/internal.h b/mm/internal.h index 9e3654d70289..dab088cb6937 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -389,18 +389,6 @@ static inline struct page *mem_map_next(struct page *iter, return iter + 1; } -/* - * FLATMEM and DISCONTIGMEM configurations use alloc_bootmem_node, - * so all functions starting at paging_init should be marked __init - * in those cases. SPARSEMEM, however, allows for memory hotplug, - * and alloc_bootmem_node is not used. - */ -#ifdef CONFIG_SPARSEMEM -#define __paginginit __meminit -#else -#define __paginginit __init -#endif - /* Memory initialisation debug and verification */ enum mminit_level { MMINIT_WARNING, diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 6b2324bc61db..98fe3b68f209 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6120,7 +6120,7 @@ static unsigned long __init usemap_size(unsigned long zone_start_pfn, unsigned l return usemapsize / 8; } -static void __init setup_usemap(struct pglist_data *pgdat, +static void __ref setup_usemap(struct pglist_data *pgdat, struct zone *zone, unsigned long zone_start_pfn, unsigned long zonesize) @@ -6140,7 +6140,7 @@ static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ -void __paginginit set_pageblock_order(void) +void __meminit set_pageblock_order(void) { unsigned int order; @@ -6168,14 +6168,14 @@ void __paginginit set_pageblock_order(void) * include/linux/pageblock-flags.h for the values of pageblock_order based on * the kernel config */ -void __paginginit set_pageblock_order(void) +void __meminit set_pageblock_order(void) { } #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ -static unsigned long __paginginit calc_memmap_size(unsigned long spanned_pages, - unsigned long present_pages) +static unsigned long __meminit calc_memmap_size(unsigned long spanned_pages, + unsigned long present_pages) { unsigned long pages = spanned_pages; @@ -6233,7 +6233,7 @@ static void pgdat_init_kcompactd(struct pglist_data *pgdat) {} * * NOTE: pgdat should get zeroed by caller. */ -static void __paginginit free_area_init_core(struct pglist_data *pgdat) +static void __meminit free_area_init_core(struct pglist_data *pgdat) { enum zone_type j; int nid = pgdat->node_id; @@ -6364,8 +6364,9 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { } #endif /* CONFIG_FLAT_NODE_MEM_MAP */ -void __paginginit free_area_init_node(int nid, unsigned long *zones_size, - unsigned long node_start_pfn, unsigned long *zholes_size) +void __meminit free_area_init_node(int nid, unsigned long *zones_size, + unsigned long node_start_pfn, + unsigned long *zholes_size) { pg_data_t *pgdat = NODE_DATA(nid); unsigned long start_pfn = 0; @@ -6410,7 +6411,7 @@ void __paginginit free_area_init_node(int nid, unsigned long *zones_size, * may be accessed (for example page_to_pfn() on some configuration accesses * flags). We must explicitly zero those struct pages. */ -void __paginginit zero_resv_unavail(void) +void __meminit zero_resv_unavail(void) { phys_addr_t start, end; unsigned long pfn; -- cgit v1.2.3 From 0188dc98ad5c7c361d46175623471d4be0fb8610 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 21 Aug 2018 21:53:39 -0700 Subject: mm/page_alloc: inline function to handle CONFIG_DEFERRED_STRUCT_PAGE_INIT Let us move the code between CONFIG_DEFERRED_STRUCT_PAGE_INIT to an inline function. Not having an ifdef in the function makes the code more readable. Link: http://lkml.kernel.org/r/20180730101757.28058-4-osalvador@techadventures.net Signed-off-by: Oscar Salvador Acked-by: Michal Hocko Reviewed-by: Pavel Tatashin Acked-by: Vlastimil Babka Cc: Aaron Lu Cc: Dan Williams Cc: David Hildenbrand Cc: Joonsoo Kim Cc: Mel Gorman Cc: Pasha Tatashin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 98fe3b68f209..5b939bd1bff9 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6364,6 +6364,21 @@ static void __ref alloc_node_mem_map(struct pglist_data *pgdat) static void __ref alloc_node_mem_map(struct pglist_data *pgdat) { } #endif /* CONFIG_FLAT_NODE_MEM_MAP */ +#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT +static inline void pgdat_set_deferred_range(pg_data_t *pgdat) +{ + /* + * We start only with one section of pages, more pages are added as + * needed until the rest of deferred pages are initialized. + */ + pgdat->static_init_pgcnt = min_t(unsigned long, PAGES_PER_SECTION, + pgdat->node_spanned_pages); + pgdat->first_deferred_pfn = ULONG_MAX; +} +#else +static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {} +#endif + void __meminit free_area_init_node(int nid, unsigned long *zones_size, unsigned long node_start_pfn, unsigned long *zholes_size) @@ -6390,16 +6405,8 @@ void __meminit free_area_init_node(int nid, unsigned long *zones_size, zones_size, zholes_size); alloc_node_mem_map(pgdat); + pgdat_set_deferred_range(pgdat); -#ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT - /* - * We start only with one section of pages, more pages are added as - * needed until the rest of deferred pages are initialized. - */ - pgdat->static_init_pgcnt = min_t(unsigned long, PAGES_PER_SECTION, - pgdat->node_spanned_pages); - pgdat->first_deferred_pfn = ULONG_MAX; -#endif free_area_init_core(pgdat); } -- cgit v1.2.3 From 03e85f9d5f1f8c74f127c5f7a87575d74a78d248 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Tue, 21 Aug 2018 21:53:43 -0700 Subject: mm/page_alloc: Introduce free_area_init_core_hotplug Currently, whenever a new node is created/re-used from the memhotplug path, we call free_area_init_node()->free_area_init_core(). But there is some code that we do not really need to run when we are coming from such path. free_area_init_core() performs the following actions: 1) Initializes pgdat internals, such as spinlock, waitqueues and more. 2) Account # nr_all_pages and # nr_kernel_pages. These values are used later on when creating hash tables. 3) Account number of managed_pages per zone, substracting dma_reserved and memmap pages. 4) Initializes some fields of the zone structure data 5) Calls init_currently_empty_zone to initialize all the freelists 6) Calls memmap_init to initialize all pages belonging to certain zone When called from memhotplug path, free_area_init_core() only performs actions #1 and #4. Action #2 is pointless as the zones do not have any pages since either the node was freed, or we are re-using it, eitherway all zones belonging to this node should have 0 pages. For the same reason, action #3 results always in manages_pages being 0. Action #5 and #6 are performed later on when onlining the pages: online_pages()->move_pfn_range_to_zone()->init_currently_empty_zone() online_pages()->move_pfn_range_to_zone()->memmap_init_zone() This patch does two things: First, moves the node/zone initializtion to their own function, so it allows us to create a small version of free_area_init_core, where we only perform: 1) Initialization of pgdat internals, such as spinlock, waitqueues and more 4) Initialization of some fields of the zone structure data These two functions are: pgdat_init_internals() and zone_init_internals(). The second thing this patch does, is to introduce free_area_init_core_hotplug(), the memhotplug version of free_area_init_core(): Currently, we call free_area_init_node() from the memhotplug path. In there, we set some pgdat's fields, and call calculate_node_totalpages(). calculate_node_totalpages() calculates the # of pages the node has. Since the node is either new, or we are re-using it, the zones belonging to this node should not have any pages, so there is no point to calculate this now. Actually, we re-set these values to 0 later on with the calls to: reset_node_managed_pages() reset_node_present_pages() The # of pages per node and the # of pages per zone will be calculated when onlining the pages: online_pages()->move_pfn_range()->move_pfn_range_to_zone()->resize_zone_range() online_pages()->move_pfn_range()->move_pfn_range_to_zone()->resize_pgdat_range() Also, since free_area_init_core/free_area_init_node will now only get called during early init, let us replace __paginginit with __init, so their code gets freed up. [osalvador@techadventures.net: fix section usage] Link: http://lkml.kernel.org/r/20180731101752.GA473@techadventures.net [osalvador@suse.de: v6] Link: http://lkml.kernel.org/r/20180801122348.21588-6-osalvador@techadventures.net Link: http://lkml.kernel.org/r/20180730101757.28058-5-osalvador@techadventures.net Signed-off-by: Oscar Salvador Reviewed-by: Pavel Tatashin Acked-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Pasha Tatashin Cc: Aaron Lu Cc: Dan Williams Cc: David Hildenbrand Cc: Joonsoo Kim Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 1 + include/linux/mm.h | 2 +- mm/memory_hotplug.c | 16 +++------ mm/page_alloc.c | 78 +++++++++++++++++++++++++++++------------- 4 files changed, 61 insertions(+), 36 deletions(-) (limited to 'mm') diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 4e9828cda7a2..34a28227068d 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -319,6 +319,7 @@ static inline int offline_pages(unsigned long start_pfn, unsigned long nr_pages) static inline void remove_memory(int nid, u64 start, u64 size) {} #endif /* CONFIG_MEMORY_HOTREMOVE */ +extern void __ref free_area_init_core_hotplug(int nid); extern int walk_memory_range(unsigned long start_pfn, unsigned long end_pfn, void *arg, int (*func)(struct memory_block *, void *)); extern int add_memory(int nid, u64 start, u64 size); diff --git a/include/linux/mm.h b/include/linux/mm.h index a55f5389c491..a9e733b5fb76 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2015,7 +2015,7 @@ static inline spinlock_t *pud_lock(struct mm_struct *mm, pud_t *pud) extern void __init pagecache_init(void); extern void free_area_init(unsigned long * zones_size); -extern void free_area_init_node(int nid, unsigned long * zones_size, +extern void __init free_area_init_node(int nid, unsigned long * zones_size, unsigned long zone_start_pfn, unsigned long *zholes_size); extern void free_initmem(void); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 4eb6e824a80c..9eea6e809a4e 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -982,8 +982,6 @@ static void reset_node_present_pages(pg_data_t *pgdat) static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) { struct pglist_data *pgdat; - unsigned long zones_size[MAX_NR_ZONES] = {0}; - unsigned long zholes_size[MAX_NR_ZONES] = {0}; unsigned long start_pfn = PFN_DOWN(start); pgdat = NODE_DATA(nid); @@ -1006,8 +1004,11 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) /* we can use NODE_DATA(nid) from here */ + pgdat->node_id = nid; + pgdat->node_start_pfn = start_pfn; + /* init node's zones as empty zones, we don't have any present pages.*/ - free_area_init_node(nid, zones_size, start_pfn, zholes_size); + free_area_init_core_hotplug(nid); pgdat->per_cpu_nodestats = alloc_percpu(struct per_cpu_nodestat); /* @@ -1016,19 +1017,12 @@ static pg_data_t __ref *hotadd_new_pgdat(int nid, u64 start) */ build_all_zonelists(pgdat); - /* - * zone->managed_pages is set to an approximate value in - * free_area_init_core(), which will cause - * /sys/device/system/node/nodeX/meminfo has wrong data. - * So reset it to 0 before any memory is onlined. - */ - reset_node_managed_pages(pgdat); - /* * When memory is hot-added, all the memory is in offline state. So * clear all zones' present_pages because they will be updated in * online_pages() and offline_pages(). */ + reset_node_managed_pages(pgdat); reset_node_present_pages(pgdat); return pgdat; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5b939bd1bff9..c677c1506d73 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6140,7 +6140,7 @@ static inline void setup_usemap(struct pglist_data *pgdat, struct zone *zone, #ifdef CONFIG_HUGETLB_PAGE_SIZE_VARIABLE /* Initialise the number of pages represented by NR_PAGEBLOCK_BITS */ -void __meminit set_pageblock_order(void) +void __init set_pageblock_order(void) { unsigned int order; @@ -6168,13 +6168,13 @@ void __meminit set_pageblock_order(void) * include/linux/pageblock-flags.h for the values of pageblock_order based on * the kernel config */ -void __meminit set_pageblock_order(void) +void __init set_pageblock_order(void) { } #endif /* CONFIG_HUGETLB_PAGE_SIZE_VARIABLE */ -static unsigned long __meminit calc_memmap_size(unsigned long spanned_pages, +static unsigned long __init calc_memmap_size(unsigned long spanned_pages, unsigned long present_pages) { unsigned long pages = spanned_pages; @@ -6225,19 +6225,8 @@ static void pgdat_init_kcompactd(struct pglist_data *pgdat) static void pgdat_init_kcompactd(struct pglist_data *pgdat) {} #endif -/* - * Set up the zone data structures: - * - mark all pages reserved - * - mark all memory queues empty - * - clear the memory bitmaps - * - * NOTE: pgdat should get zeroed by caller. - */ -static void __meminit free_area_init_core(struct pglist_data *pgdat) +static void __meminit pgdat_init_internals(struct pglist_data *pgdat) { - enum zone_type j; - int nid = pgdat->node_id; - pgdat_resize_init(pgdat); pgdat_init_numabalancing(pgdat); @@ -6250,7 +6239,54 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat) pgdat_page_ext_init(pgdat); spin_lock_init(&pgdat->lru_lock); lruvec_init(node_lruvec(pgdat)); +} + +static void __meminit zone_init_internals(struct zone *zone, enum zone_type idx, int nid, + unsigned long remaining_pages) +{ + zone->managed_pages = remaining_pages; + zone_set_nid(zone, nid); + zone->name = zone_names[idx]; + zone->zone_pgdat = NODE_DATA(nid); + spin_lock_init(&zone->lock); + zone_seqlock_init(zone); + zone_pcp_init(zone); +} + +/* + * Set up the zone data structures + * - init pgdat internals + * - init all zones belonging to this node + * + * NOTE: this function is only called during memory hotplug + */ +#ifdef CONFIG_MEMORY_HOTPLUG +void __ref free_area_init_core_hotplug(int nid) +{ + enum zone_type z; + pg_data_t *pgdat = NODE_DATA(nid); + + pgdat_init_internals(pgdat); + for (z = 0; z < MAX_NR_ZONES; z++) + zone_init_internals(&pgdat->node_zones[z], z, nid, 0); +} +#endif + +/* + * Set up the zone data structures: + * - mark all pages reserved + * - mark all memory queues empty + * - clear the memory bitmaps + * + * NOTE: pgdat should get zeroed by caller. + * NOTE: this function is only called during early init. + */ +static void __init free_area_init_core(struct pglist_data *pgdat) +{ + enum zone_type j; + int nid = pgdat->node_id; + pgdat_init_internals(pgdat); pgdat->per_cpu_nodestats = &boot_nodestats; for (j = 0; j < MAX_NR_ZONES; j++) { @@ -6298,13 +6334,7 @@ static void __meminit free_area_init_core(struct pglist_data *pgdat) * when the bootmem allocator frees pages into the buddy system. * And all highmem pages will be managed by the buddy system. */ - zone->managed_pages = freesize; - zone_set_nid(zone, nid); - zone->name = zone_names[j]; - zone->zone_pgdat = pgdat; - spin_lock_init(&zone->lock); - zone_seqlock_init(zone); - zone_pcp_init(zone); + zone_init_internals(zone, j, nid, freesize); if (!size) continue; @@ -6379,7 +6409,7 @@ static inline void pgdat_set_deferred_range(pg_data_t *pgdat) static inline void pgdat_set_deferred_range(pg_data_t *pgdat) {} #endif -void __meminit free_area_init_node(int nid, unsigned long *zones_size, +void __init free_area_init_node(int nid, unsigned long *zones_size, unsigned long node_start_pfn, unsigned long *zholes_size) { @@ -6418,7 +6448,7 @@ void __meminit free_area_init_node(int nid, unsigned long *zones_size, * may be accessed (for example page_to_pfn() on some configuration accesses * flags). We must explicitly zero those struct pages. */ -void __meminit zero_resv_unavail(void) +void __init zero_resv_unavail(void) { phys_addr_t start, end; unsigned long pfn; -- cgit v1.2.3 From 5989ad7b5ede38d605c588981f634c08252abfc3 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 21 Aug 2018 21:53:50 -0700 Subject: mm, oom: refactor oom_kill_process() Patch series "introduce memory.oom.group", v2. This is a tiny implementation of cgroup-aware OOM killer, which adds an ability to kill a cgroup as a single unit and so guarantee the integrity of the workload. Although it has only a limited functionality in comparison to what now resides in the mm tree (it doesn't change the victim task selection algorithm, doesn't look at memory stas on cgroup level, etc), it's also much simpler and more straightforward. So, hopefully, we can avoid having long debates here, as we had with the full implementation. As it doesn't prevent any futher development, and implements an useful and complete feature, it looks as a sane way forward. This patch (of 2): oom_kill_process() consists of two logical parts: the first one is responsible for considering task's children as a potential victim and printing the debug information. The second half is responsible for sending SIGKILL to all tasks sharing the mm struct with the given victim. This commit splits oom_kill_process() with an intention to re-use the the second half: __oom_kill_process(). The cgroup-aware OOM killer will kill multiple tasks belonging to the victim cgroup. We don't need to print the debug information for the each task, as well as play with task selection (considering task's children), so we can't use the existing oom_kill_process(). Link: http://lkml.kernel.org/r/20171130152824.1591-2-guro@fb.com Link: http://lkml.kernel.org/r/20180802003201.817-3-guro@fb.com Signed-off-by: Roman Gushchin Acked-by: Michal Hocko Acked-by: Johannes Weiner Acked-by: David Rientjes Cc: Vladimir Davydov Cc: Tetsuo Handa Cc: David Rientjes Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 123 +++++++++++++++++++++++++++++++--------------------------- 1 file changed, 65 insertions(+), 58 deletions(-) (limited to 'mm') diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 20600779f5db..330416c67ce5 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -829,68 +829,12 @@ static bool task_will_free_mem(struct task_struct *task) return ret; } -static void oom_kill_process(struct oom_control *oc, const char *message) +static void __oom_kill_process(struct task_struct *victim) { - struct task_struct *p = oc->chosen; - unsigned int points = oc->chosen_points; - struct task_struct *victim = p; - struct task_struct *child; - struct task_struct *t; + struct task_struct *p; struct mm_struct *mm; - unsigned int victim_points = 0; - static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, - DEFAULT_RATELIMIT_BURST); bool can_oom_reap = true; - /* - * If the task is already exiting, don't alarm the sysadmin or kill - * its children or threads, just give it access to memory reserves - * so it can die quickly - */ - task_lock(p); - if (task_will_free_mem(p)) { - mark_oom_victim(p); - wake_oom_reaper(p); - task_unlock(p); - put_task_struct(p); - return; - } - task_unlock(p); - - if (__ratelimit(&oom_rs)) - dump_header(oc, p); - - pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n", - message, task_pid_nr(p), p->comm, points); - - /* - * If any of p's children has a different mm and is eligible for kill, - * the one with the highest oom_badness() score is sacrificed for its - * parent. This attempts to lose the minimal amount of work done while - * still freeing memory. - */ - read_lock(&tasklist_lock); - for_each_thread(p, t) { - list_for_each_entry(child, &t->children, sibling) { - unsigned int child_points; - - if (process_shares_mm(child, p->mm)) - continue; - /* - * oom_badness() returns 0 if the thread is unkillable - */ - child_points = oom_badness(child, - oc->memcg, oc->nodemask, oc->totalpages); - if (child_points > victim_points) { - put_task_struct(victim); - victim = child; - victim_points = child_points; - get_task_struct(victim); - } - } - } - read_unlock(&tasklist_lock); - p = find_lock_task_mm(victim); if (!p) { put_task_struct(victim); @@ -964,6 +908,69 @@ static void oom_kill_process(struct oom_control *oc, const char *message) } #undef K +static void oom_kill_process(struct oom_control *oc, const char *message) +{ + struct task_struct *p = oc->chosen; + unsigned int points = oc->chosen_points; + struct task_struct *victim = p; + struct task_struct *child; + struct task_struct *t; + unsigned int victim_points = 0; + static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + + /* + * If the task is already exiting, don't alarm the sysadmin or kill + * its children or threads, just give it access to memory reserves + * so it can die quickly + */ + task_lock(p); + if (task_will_free_mem(p)) { + mark_oom_victim(p); + wake_oom_reaper(p); + task_unlock(p); + put_task_struct(p); + return; + } + task_unlock(p); + + if (__ratelimit(&oom_rs)) + dump_header(oc, p); + + pr_err("%s: Kill process %d (%s) score %u or sacrifice child\n", + message, task_pid_nr(p), p->comm, points); + + /* + * If any of p's children has a different mm and is eligible for kill, + * the one with the highest oom_badness() score is sacrificed for its + * parent. This attempts to lose the minimal amount of work done while + * still freeing memory. + */ + read_lock(&tasklist_lock); + for_each_thread(p, t) { + list_for_each_entry(child, &t->children, sibling) { + unsigned int child_points; + + if (process_shares_mm(child, p->mm)) + continue; + /* + * oom_badness() returns 0 if the thread is unkillable + */ + child_points = oom_badness(child, + oc->memcg, oc->nodemask, oc->totalpages); + if (child_points > victim_points) { + put_task_struct(victim); + victim = child; + victim_points = child_points; + get_task_struct(victim); + } + } + } + read_unlock(&tasklist_lock); + + __oom_kill_process(victim); +} + /* * Determines whether the kernel must panic because of the panic_on_oom sysctl. */ -- cgit v1.2.3 From 3d8b38eb81cac81395f6a823f6bf401b327268e6 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Tue, 21 Aug 2018 21:53:54 -0700 Subject: mm, oom: introduce memory.oom.group For some workloads an intervention from the OOM killer can be painful. Killing a random task can bring the workload into an inconsistent state. Historically, there are two common solutions for this problem: 1) enabling panic_on_oom, 2) using a userspace daemon to monitor OOMs and kill all outstanding processes. Both approaches have their downsides: rebooting on each OOM is an obvious waste of capacity, and handling all in userspace is tricky and requires a userspace agent, which will monitor all cgroups for OOMs. In most cases an in-kernel after-OOM cleaning-up mechanism can eliminate the necessity of enabling panic_on_oom. Also, it can simplify the cgroup management for userspace applications. This commit introduces a new knob for cgroup v2 memory controller: memory.oom.group. The knob determines whether the cgroup should be treated as an indivisible workload by the OOM killer. If set, all tasks belonging to the cgroup or to its descendants (if the memory cgroup is not a leaf cgroup) are killed together or not at all. To determine which cgroup has to be killed, we do traverse the cgroup hierarchy from the victim task's cgroup up to the OOMing cgroup (or root) and looking for the highest-level cgroup with memory.oom.group set. Tasks with the OOM protection (oom_score_adj set to -1000) are treated as an exception and are never killed. This patch doesn't change the OOM victim selection algorithm. Link: http://lkml.kernel.org/r/20180802003201.817-4-guro@fb.com Signed-off-by: Roman Gushchin Acked-by: Michal Hocko Acked-by: Johannes Weiner Cc: David Rientjes Cc: Tetsuo Handa Cc: Tejun Heo Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/cgroup-v2.rst | 18 +++++++ include/linux/memcontrol.h | 18 +++++++ mm/memcontrol.c | 93 +++++++++++++++++++++++++++++++++ mm/oom_kill.c | 30 +++++++++++ 4 files changed, 159 insertions(+) (limited to 'mm') diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 1746131bc9cb..184193bcb262 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1072,6 +1072,24 @@ PAGE_SIZE multiple when read back. high limit is used and monitored properly, this limit's utility is limited to providing the final safety net. + memory.oom.group + A read-write single value file which exists on non-root + cgroups. The default value is "0". + + Determines whether the cgroup should be treated as + an indivisible workload by the OOM killer. If set, + all tasks belonging to the cgroup or to its descendants + (if the memory cgroup is not a leaf cgroup) are killed + together or not at all. This can be used to avoid + partial kills to guarantee workload integrity. + + Tasks with the OOM protection (oom_score_adj set to -1000) + are treated as an exception and are never killed. + + If the OOM killer is invoked in a cgroup, it's not going + to kill any tasks outside of this cgroup, regardless + memory.oom.group values of ancestor cgroups. + memory.events A read-only flat-keyed file which exists on non-root cgroups. The following entries are defined. Unless specified diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 0e6c515fb698..652f602167df 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -225,6 +225,11 @@ struct mem_cgroup { */ bool use_hierarchy; + /* + * Should the OOM killer kill all belonging tasks, had it kill one? + */ + bool oom_group; + /* protected by memcg_oom_lock */ bool oom_lock; int under_oom; @@ -542,6 +547,9 @@ static inline bool task_in_memcg_oom(struct task_struct *p) } bool mem_cgroup_oom_synchronize(bool wait); +struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim, + struct mem_cgroup *oom_domain); +void mem_cgroup_print_oom_group(struct mem_cgroup *memcg); #ifdef CONFIG_MEMCG_SWAP extern int do_swap_account; @@ -1001,6 +1009,16 @@ static inline bool mem_cgroup_oom_synchronize(bool wait) return false; } +static inline struct mem_cgroup *mem_cgroup_get_oom_group( + struct task_struct *victim, struct mem_cgroup *oom_domain) +{ + return NULL; +} + +static inline void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) +{ +} + static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 59c14c988143..4ead5a4817de 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1776,6 +1776,62 @@ cleanup: return true; } +/** + * mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM + * @victim: task to be killed by the OOM killer + * @oom_domain: memcg in case of memcg OOM, NULL in case of system-wide OOM + * + * Returns a pointer to a memory cgroup, which has to be cleaned up + * by killing all belonging OOM-killable tasks. + * + * Caller has to call mem_cgroup_put() on the returned non-NULL memcg. + */ +struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim, + struct mem_cgroup *oom_domain) +{ + struct mem_cgroup *oom_group = NULL; + struct mem_cgroup *memcg; + + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) + return NULL; + + if (!oom_domain) + oom_domain = root_mem_cgroup; + + rcu_read_lock(); + + memcg = mem_cgroup_from_task(victim); + if (memcg == root_mem_cgroup) + goto out; + + /* + * Traverse the memory cgroup hierarchy from the victim task's + * cgroup up to the OOMing cgroup (or root) to find the + * highest-level memory cgroup with oom.group set. + */ + for (; memcg; memcg = parent_mem_cgroup(memcg)) { + if (memcg->oom_group) + oom_group = memcg; + + if (memcg == oom_domain) + break; + } + + if (oom_group) + css_get(&oom_group->css); +out: + rcu_read_unlock(); + + return oom_group; +} + +void mem_cgroup_print_oom_group(struct mem_cgroup *memcg) +{ + pr_info("Tasks in "); + pr_cont_cgroup_path(memcg->css.cgroup); + pr_cont(" are going to be killed due to memory.oom.group set\n"); +} + /** * lock_page_memcg - lock a page->mem_cgroup binding * @page: the page @@ -5561,6 +5617,37 @@ static int memory_stat_show(struct seq_file *m, void *v) return 0; } +static int memory_oom_group_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + + seq_printf(m, "%d\n", memcg->oom_group); + + return 0; +} + +static ssize_t memory_oom_group_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + int ret, oom_group; + + buf = strstrip(buf); + if (!buf) + return -EINVAL; + + ret = kstrtoint(buf, 0, &oom_group); + if (ret) + return ret; + + if (oom_group != 0 && oom_group != 1) + return -EINVAL; + + memcg->oom_group = oom_group; + + return nbytes; +} + static struct cftype memory_files[] = { { .name = "current", @@ -5602,6 +5689,12 @@ static struct cftype memory_files[] = { .flags = CFTYPE_NOT_ON_ROOT, .seq_show = memory_stat_show, }, + { + .name = "oom.group", + .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE, + .seq_show = memory_oom_group_show, + .write = memory_oom_group_write, + }, { } /* terminate */ }; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 330416c67ce5..0e10b864e074 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -908,6 +908,19 @@ static void __oom_kill_process(struct task_struct *victim) } #undef K +/* + * Kill provided task unless it's secured by setting + * oom_score_adj to OOM_SCORE_ADJ_MIN. + */ +static int oom_kill_memcg_member(struct task_struct *task, void *unused) +{ + if (task->signal->oom_score_adj != OOM_SCORE_ADJ_MIN) { + get_task_struct(task); + __oom_kill_process(task); + } + return 0; +} + static void oom_kill_process(struct oom_control *oc, const char *message) { struct task_struct *p = oc->chosen; @@ -915,6 +928,7 @@ static void oom_kill_process(struct oom_control *oc, const char *message) struct task_struct *victim = p; struct task_struct *child; struct task_struct *t; + struct mem_cgroup *oom_group; unsigned int victim_points = 0; static DEFINE_RATELIMIT_STATE(oom_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); @@ -968,7 +982,23 @@ static void oom_kill_process(struct oom_control *oc, const char *message) } read_unlock(&tasklist_lock); + /* + * Do we need to kill the entire memory cgroup? + * Or even one of the ancestor memory cgroups? + * Check this out before killing the victim task. + */ + oom_group = mem_cgroup_get_oom_group(victim, oc->memcg); + __oom_kill_process(victim); + + /* + * If necessary, kill all tasks in the selected memory cgroup. + */ + if (oom_group) { + mem_cgroup_print_oom_group(oom_group); + mem_cgroup_scan_tasks(oom_group, oom_kill_memcg_member, NULL); + mem_cgroup_put(oom_group); + } } /* -- cgit v1.2.3 From 7e8a6304d5419cbf056a59de92939e5eef039c57 Mon Sep 17 00:00:00 2001 From: "Dennis Zhou (Facebook)" Date: Tue, 21 Aug 2018 21:53:58 -0700 Subject: /proc/meminfo: add percpu populated pages count Currently, percpu memory only exposes allocation and utilization information via debugfs. This more or less is only really useful for understanding the fragmentation and allocation information at a per-chunk level with a few global counters. This is also gated behind a config. BPF and cgroup, for example, have seen an increase in use causing increased use of percpu memory. Let's make it easier for someone to identify how much memory is being used. This patch adds the "Percpu" stat to meminfo to more easily look up how much percpu memory is in use. This number includes the cost for all allocated backing pages and not just insight at the per a unit, per chunk level. Metadata is excluded. I think excluding metadata is fair because the backing memory scales with the numbere of cpus and can quickly outweigh the metadata. It also makes this calculation light. Link: http://lkml.kernel.org/r/20180807184723.74919-1-dennisszhou@gmail.com Signed-off-by: Dennis Zhou Acked-by: Tejun Heo Acked-by: Roman Gushchin Reviewed-by: Andrew Morton Acked-by: David Rientjes Acked-by: Vlastimil Babka Cc: Johannes Weiner Cc: Christoph Lameter Cc: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 3 +++ fs/proc/meminfo.c | 2 ++ include/linux/percpu.h | 2 ++ mm/percpu.c | 29 +++++++++++++++++++++++++++++ 4 files changed, 36 insertions(+) (limited to 'mm') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index 1605acbb23b6..22b4b00dee31 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -870,6 +870,7 @@ Committed_AS: 100056 kB VmallocTotal: 112216 kB VmallocUsed: 428 kB VmallocChunk: 111088 kB +Percpu: 62080 kB HardwareCorrupted: 0 kB AnonHugePages: 49152 kB ShmemHugePages: 0 kB @@ -962,6 +963,8 @@ Committed_AS: The amount of memory presently allocated on the system. VmallocTotal: total size of vmalloc memory area VmallocUsed: amount of vmalloc area which is used VmallocChunk: largest contiguous block of vmalloc area which is free + Percpu: Memory allocated to the percpu allocator used to back percpu + allocations. This stat excludes the cost of metadata. .............................................................................. diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 2fb04846ed11..edda898714eb 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -121,6 +122,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) (unsigned long)VMALLOC_TOTAL >> 10); show_val_kb(m, "VmallocUsed: ", 0ul); show_val_kb(m, "VmallocChunk: ", 0ul); + show_val_kb(m, "Percpu: ", pcpu_nr_pages()); #ifdef CONFIG_MEMORY_FAILURE seq_printf(m, "HardwareCorrupted: %5lu kB\n", diff --git a/include/linux/percpu.h b/include/linux/percpu.h index 296bbe49d5d1..70b7123f38c7 100644 --- a/include/linux/percpu.h +++ b/include/linux/percpu.h @@ -149,4 +149,6 @@ extern phys_addr_t per_cpu_ptr_to_phys(void *addr); (typeof(type) __percpu *)__alloc_percpu(sizeof(type), \ __alignof__(type)) +extern unsigned long pcpu_nr_pages(void); + #endif /* __LINUX_PERCPU_H */ diff --git a/mm/percpu.c b/mm/percpu.c index 0b6480979ac7..a749d4d96e3e 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -169,6 +169,14 @@ static LIST_HEAD(pcpu_map_extend_chunks); */ int pcpu_nr_empty_pop_pages; +/* + * The number of populated pages in use by the allocator, protected by + * pcpu_lock. This number is kept per a unit per chunk (i.e. when a page gets + * allocated/deallocated, it is allocated/deallocated in all units of a chunk + * and increments/decrements this count by 1). + */ +static unsigned long pcpu_nr_populated; + /* * Balance work is used to populate or destroy chunks asynchronously. We * try to keep the number of populated free pages between @@ -1232,6 +1240,7 @@ static void pcpu_chunk_populated(struct pcpu_chunk *chunk, int page_start, bitmap_set(chunk->populated, page_start, nr); chunk->nr_populated += nr; + pcpu_nr_populated += nr; if (!for_alloc) { chunk->nr_empty_pop_pages += nr; @@ -1260,6 +1269,7 @@ static void pcpu_chunk_depopulated(struct pcpu_chunk *chunk, chunk->nr_populated -= nr; chunk->nr_empty_pop_pages -= nr; pcpu_nr_empty_pop_pages -= nr; + pcpu_nr_populated -= nr; } /* @@ -2176,6 +2186,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai, pcpu_nr_empty_pop_pages = pcpu_first_chunk->nr_empty_pop_pages; pcpu_chunk_relocate(pcpu_first_chunk, -1); + /* include all regions of the first chunk */ + pcpu_nr_populated += PFN_DOWN(size_sum); + pcpu_stats_chunk_alloc(); trace_percpu_create_chunk(base_addr); @@ -2745,6 +2758,22 @@ void __init setup_per_cpu_areas(void) #endif /* CONFIG_SMP */ +/* + * pcpu_nr_pages - calculate total number of populated backing pages + * + * This reflects the number of pages populated to back chunks. Metadata is + * excluded in the number exposed in meminfo as the number of backing pages + * scales with the number of cpus and can quickly outweigh the memory used for + * metadata. It also keeps this calculation nice and simple. + * + * RETURNS: + * Total number of populated backing pages in use by the allocator. + */ +unsigned long pcpu_nr_pages(void) +{ + return pcpu_nr_populated * pcpu_nr_units; +} + /* * Percpu allocator is initialized early during boot when neither slab or * workqueue is available. Plug async management until everything is up -- cgit v1.2.3 From e58dd0de5eadf145895b13451a1fef8ef03946eb Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Tue, 21 Aug 2018 21:55:31 -0700 Subject: bdi: use refcount_t for reference counting instead atomic_t refcount_t type and corresponding API should be used instead of atomic_t when the variable is used as a reference counter. This permits avoiding accidental refcounter overflows that might lead to use-after-free situations. Link: http://lkml.kernel.org/r/20180703200141.28415-4-bigeasy@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Andrew Morton Acked-by: Peter Zijlstra (Intel) Suggested-by: Peter Zijlstra Cc: Jens Axboe Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/backing-dev-defs.h | 3 ++- include/linux/backing-dev.h | 4 ++-- mm/backing-dev.c | 12 ++++++------ 3 files changed, 10 insertions(+), 9 deletions(-) (limited to 'mm') diff --git a/include/linux/backing-dev-defs.h b/include/linux/backing-dev-defs.h index 24251762c20c..9a6bc0951cfa 100644 --- a/include/linux/backing-dev-defs.h +++ b/include/linux/backing-dev-defs.h @@ -12,6 +12,7 @@ #include #include #include +#include struct page; struct device; @@ -75,7 +76,7 @@ enum wb_reason { */ struct bdi_writeback_congested { unsigned long state; /* WB_[a]sync_congested flags */ - atomic_t refcnt; /* nr of attached wb's and blkg */ + refcount_t refcnt; /* nr of attached wb's and blkg */ #ifdef CONFIG_CGROUP_WRITEBACK struct backing_dev_info *__bdi; /* the associated bdi, set to NULL diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index 72ca0f3d39f3..c28a47cbe355 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -404,13 +404,13 @@ static inline bool inode_cgwb_enabled(struct inode *inode) static inline struct bdi_writeback_congested * wb_congested_get_create(struct backing_dev_info *bdi, int blkcg_id, gfp_t gfp) { - atomic_inc(&bdi->wb_congested->refcnt); + refcount_inc(&bdi->wb_congested->refcnt); return bdi->wb_congested; } static inline void wb_congested_put(struct bdi_writeback_congested *congested) { - if (atomic_dec_and_test(&congested->refcnt)) + if (refcount_dec_and_test(&congested->refcnt)) kfree(congested); } diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 2e5d3df0853d..55a233d75f39 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -438,10 +438,10 @@ retry: if (new_congested) { /* !found and storage for new one already allocated, insert */ congested = new_congested; - new_congested = NULL; rb_link_node(&congested->rb_node, parent, node); rb_insert_color(&congested->rb_node, &bdi->cgwb_congested_tree); - goto found; + spin_unlock_irqrestore(&cgwb_lock, flags); + return congested; } spin_unlock_irqrestore(&cgwb_lock, flags); @@ -451,13 +451,13 @@ retry: if (!new_congested) return NULL; - atomic_set(&new_congested->refcnt, 0); + refcount_set(&new_congested->refcnt, 1); new_congested->__bdi = bdi; new_congested->blkcg_id = blkcg_id; goto retry; found: - atomic_inc(&congested->refcnt); + refcount_inc(&congested->refcnt); spin_unlock_irqrestore(&cgwb_lock, flags); kfree(new_congested); return congested; @@ -474,7 +474,7 @@ void wb_congested_put(struct bdi_writeback_congested *congested) unsigned long flags; local_irq_save(flags); - if (!atomic_dec_and_lock(&congested->refcnt, &cgwb_lock)) { + if (!refcount_dec_and_lock(&congested->refcnt, &cgwb_lock)) { local_irq_restore(flags); return; } @@ -804,7 +804,7 @@ static int cgwb_bdi_init(struct backing_dev_info *bdi) if (!bdi->wb_congested) return -ENOMEM; - atomic_set(&bdi->wb_congested->refcnt, 1); + refcount_set(&bdi->wb_congested->refcnt, 1); err = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL); if (err) { -- cgit v1.2.3 From 060288a7320b2837a8ff2af1b3643bdbd5e568f6 Mon Sep 17 00:00:00 2001 From: Anna-Maria Gleixner Date: Tue, 21 Aug 2018 21:55:35 -0700 Subject: bdi: use irqsave variant of refcount_dec_and_lock() The irqsave variant of refcount_dec_and_lock handles irqsave/restore when taking/releasing the spin lock. With this variant the call of local_irq_save/restore is no longer required. [bigeasy@linutronix.de: s@atomic_dec_and_lock@refcount_dec_and_lock@g] Link: http://lkml.kernel.org/r/20180703200141.28415-5-bigeasy@linutronix.de Signed-off-by: Anna-Maria Gleixner Signed-off-by: Sebastian Andrzej Siewior Acked-by: Peter Zijlstra (Intel) Cc: Jens Axboe Cc: Ingo Molnar Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/backing-dev.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) (limited to 'mm') diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 55a233d75f39..f5981e9d6ae2 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -473,11 +473,8 @@ void wb_congested_put(struct bdi_writeback_congested *congested) { unsigned long flags; - local_irq_save(flags); - if (!refcount_dec_and_lock(&congested->refcnt, &cgwb_lock)) { - local_irq_restore(flags); + if (!refcount_dec_and_lock_irqsave(&congested->refcnt, &cgwb_lock, &flags)) return; - } /* bdi might already have been destroyed leaving @congested unlinked */ if (congested->__bdi) { -- cgit v1.2.3