From a3aea839e42ef8d76bb58091ab7f5a45a85ea299 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 6 Sep 2017 16:22:12 -0700 Subject: mm, THP, swap: support to clear swap cache flag for THP swapped out Patch series "mm, THP, swap: Delay splitting THP after swapped out", v3. This is the second step of THP (Transparent Huge Page) swap optimization. In the first step, the splitting huge page is delayed from almost the first step of swapping out to after allocating the swap space for the THP and adding the THP into the swap cache. In the second step, the splitting is delayed further to after the swapping out finished. The plan is to delay splitting THP step by step, finally avoid splitting THP for the THP swapping out and swap out/in the THP as a whole. In the patchset, more operations for the anonymous THP reclaiming, such as TLB flushing, writing the THP to the swap device, removing the THP from the swap cache are batched. So that the performance of anonymous THP swapping out are improved. During the development, the following scenarios/code paths have been checked, - swap out/in - swap off - write protect page fault - madvise_free - process exit - split huge page With the patchset, the swap out throughput improves 42% (from about 5.81GB/s to about 8.25GB/s) in the vm-scalability swap-w-seq test case with 16 processes. At the same time, the IPI (reflect TLB flushing) reduced about 78.9%. The test is done on a Xeon E5 v3 system. The swap device used is a RAM simulated PMEM (persistent memory) device. To test the sequential swapping out, the test case creates 8 processes, which sequentially allocate and write to the anonymous pages until the RAM and part of the swap device is used up. Below is the part of the cover letter for the first step patchset of THP swap optimization which applies to all steps. ========================= Recently, the performance of the storage devices improved so fast that we cannot saturate the disk bandwidth with single logical CPU when do page swap out even on a high-end server machine. Because the performance of the storage device improved faster than that of single logical CPU. And it seems that the trend will not change in the near future. On the other hand, the THP becomes more and more popular because of increased memory size. So it becomes necessary to optimize THP swap performance. The advantages of the THP swap support include: - Batch the swap operations for the THP to reduce TLB flushing and lock acquiring/releasing, including allocating/freeing the swap space, adding/deleting to/from the swap cache, and writing/reading the swap space, etc. This will help improve the performance of the THP swap. - The THP swap space read/write will be 2M sequential IO. It is particularly helpful for the swap read, which are usually 4k random IO. This will improve the performance of the THP swap too. - It will help the memory fragmentation, especially when the THP is heavily used by the applications. The 2M continuous pages will be free up after THP swapping out. - It will improve the THP utilization on the system with the swap turned on. Because the speed for khugepaged to collapse the normal pages into the THP is quite slow. After the THP is split during the swapping out, it will take quite long time for the normal pages to collapse back into the THP after being swapped in. The high THP utilization helps the efficiency of the page based memory management too. There are some concerns regarding THP swap in, mainly because possible enlarged read/write IO size (for swap in/out) may put more overhead on the storage device. To deal with that, the THP swap in should be turned on only when necessary. For example, it can be selected via "always/never/madvise" logic, to be turned on globally, turned off globally, or turned on only for VMA with MADV_HUGEPAGE, etc. This patch (of 12): Previously, swapcache_free_cluster() is used only in the error path of shrink_page_list() to free the swap cluster just allocated if the THP (Transparent Huge Page) is failed to be split. In this patch, it is enhanced to clear the swap cache flag (SWAP_HAS_CACHE) for the swap cluster that holds the contents of THP swapped out. This will be used in delaying splitting THP after swapping out support. Because there is no THP swapping in as a whole support yet, after clearing the swap cache flag, the swap cluster backing the THP swapped out will be split. So that the swap slots in the swap cluster can be swapped in as normal pages later. Link: http://lkml.kernel.org/r/20170724051840.2309-2-ying.huang@intel.com Signed-off-by: "Huang, Ying" Acked-by: Rik van Riel Cc: Johannes Weiner Cc: Minchan Kim Cc: Hugh Dickins Cc: Shaohua Li Cc: "Kirill A . Shutemov" Cc: Andrea Arcangeli Cc: Dan Williams Cc: Jens Axboe Cc: Michal Hocko Cc: Ross Zwisler [for brd.c, zram_drv.c, pmem.c] Cc: Vishal L Verma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) (limited to 'mm/swapfile.c') diff --git a/mm/swapfile.c b/mm/swapfile.c index 6ba4aab2db0b..c32e9b23d642 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1168,22 +1168,40 @@ static void swapcache_free_cluster(swp_entry_t entry) struct swap_cluster_info *ci; struct swap_info_struct *si; unsigned char *map; - unsigned int i; + unsigned int i, free_entries = 0; + unsigned char val; - si = swap_info_get(entry); + si = _swap_info_get(entry); if (!si) return; ci = lock_cluster(si, offset); map = si->swap_map + offset; for (i = 0; i < SWAPFILE_CLUSTER; i++) { - VM_BUG_ON(map[i] != SWAP_HAS_CACHE); - map[i] = 0; + val = map[i]; + VM_BUG_ON(!(val & SWAP_HAS_CACHE)); + if (val == SWAP_HAS_CACHE) + free_entries++; + } + if (!free_entries) { + for (i = 0; i < SWAPFILE_CLUSTER; i++) + map[i] &= ~SWAP_HAS_CACHE; } unlock_cluster(ci); - mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER); - swap_free_cluster(si, idx); - spin_unlock(&si->lock); + if (free_entries == SWAPFILE_CLUSTER) { + spin_lock(&si->lock); + ci = lock_cluster(si, offset); + memset(map, 0, SWAPFILE_CLUSTER); + unlock_cluster(ci); + mem_cgroup_uncharge_swap(entry, SWAPFILE_CLUSTER); + swap_free_cluster(si, idx); + spin_unlock(&si->lock); + } else if (free_entries) { + for (i = 0; i < SWAPFILE_CLUSTER; i++, entry.val++) { + if (!__swap_entry_free(si, entry, SWAP_HAS_CACHE)) + free_swap_slot(entry); + } + } } #else static inline void swapcache_free_cluster(swp_entry_t entry) -- cgit v1.2.3 From e07098294adfd03d582af7626752255e3d170393 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 6 Sep 2017 16:22:16 -0700 Subject: mm, THP, swap: support to reclaim swap space for THP swapped out The normal swap slot reclaiming can be done when the swap count reaches SWAP_HAS_CACHE. But for the swap slot which is backing a THP, all swap slots backing one THP must be reclaimed together, because the swap slot may be used again when the THP is swapped out again later. So the swap slots backing one THP can be reclaimed together when the swap count for all swap slots for the THP reached SWAP_HAS_CACHE. In the patch, the functions to check whether the swap count for all swap slots backing one THP reached SWAP_HAS_CACHE are implemented and used when checking whether a swap slot can be reclaimed. To make it easier to determine whether a swap slot is backing a THP, a new swap cluster flag named CLUSTER_FLAG_HUGE is added to mark a swap cluster which is backing a THP (Transparent Huge Page). Because THP swap in as a whole isn't supported now. After deleting the THP from the swap cache (for example, swapping out finished), the CLUSTER_FLAG_HUGE flag will be cleared. So that, the normal pages inside THP can be swapped in individually. [ying.huang@intel.com: fix swap_page_trans_huge_swapped on HDD] Link: http://lkml.kernel.org/r/874ltsm0bi.fsf@yhuang-dev.intel.com Link: http://lkml.kernel.org/r/20170724051840.2309-3-ying.huang@intel.com Signed-off-by: "Huang, Ying" Acked-by: Rik van Riel Cc: Johannes Weiner Cc: Minchan Kim Cc: Hugh Dickins Cc: Shaohua Li Cc: "Kirill A . Shutemov" Cc: Andrea Arcangeli Cc: Dan Williams Cc: Jens Axboe Cc: Michal Hocko Cc: Ross Zwisler [for brd.c, zram_drv.c, pmem.c] Cc: Vishal L Verma Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 1 + mm/swapfile.c | 78 +++++++++++++++++++++++++++++++++++++++++++++++----- 2 files changed, 72 insertions(+), 7 deletions(-) (limited to 'mm/swapfile.c') diff --git a/include/linux/swap.h b/include/linux/swap.h index d83d28e53e62..964b4f1fba4a 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -188,6 +188,7 @@ struct swap_cluster_info { }; #define CLUSTER_FLAG_FREE 1 /* This cluster is free */ #define CLUSTER_FLAG_NEXT_NULL 2 /* This cluster has no next cluster */ +#define CLUSTER_FLAG_HUGE 4 /* This cluster is backing a transparent huge page */ /* * We assign a cluster to each CPU, so each CPU can allocate swap entry from diff --git a/mm/swapfile.c b/mm/swapfile.c index c32e9b23d642..164d9624d7d2 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -265,6 +265,16 @@ static inline void cluster_set_null(struct swap_cluster_info *info) info->data = 0; } +static inline bool cluster_is_huge(struct swap_cluster_info *info) +{ + return info->flags & CLUSTER_FLAG_HUGE; +} + +static inline void cluster_clear_huge(struct swap_cluster_info *info) +{ + info->flags &= ~CLUSTER_FLAG_HUGE; +} + static inline struct swap_cluster_info *lock_cluster(struct swap_info_struct *si, unsigned long offset) { @@ -846,7 +856,7 @@ static int swap_alloc_cluster(struct swap_info_struct *si, swp_entry_t *slot) offset = idx * SWAPFILE_CLUSTER; ci = lock_cluster(si, offset); alloc_cluster(si, idx); - cluster_set_count_flag(ci, SWAPFILE_CLUSTER, 0); + cluster_set_count_flag(ci, SWAPFILE_CLUSTER, CLUSTER_FLAG_HUGE); map = si->swap_map + offset; for (i = 0; i < SWAPFILE_CLUSTER; i++) @@ -1176,6 +1186,7 @@ static void swapcache_free_cluster(swp_entry_t entry) return; ci = lock_cluster(si, offset); + VM_BUG_ON(!cluster_is_huge(ci)); map = si->swap_map + offset; for (i = 0; i < SWAPFILE_CLUSTER; i++) { val = map[i]; @@ -1187,6 +1198,7 @@ static void swapcache_free_cluster(swp_entry_t entry) for (i = 0; i < SWAPFILE_CLUSTER; i++) map[i] &= ~SWAP_HAS_CACHE; } + cluster_clear_huge(ci); unlock_cluster(ci); if (free_entries == SWAPFILE_CLUSTER) { spin_lock(&si->lock); @@ -1350,6 +1362,54 @@ out: return count; } +#ifdef CONFIG_THP_SWAP +static bool swap_page_trans_huge_swapped(struct swap_info_struct *si, + swp_entry_t entry) +{ + struct swap_cluster_info *ci; + unsigned char *map = si->swap_map; + unsigned long roffset = swp_offset(entry); + unsigned long offset = round_down(roffset, SWAPFILE_CLUSTER); + int i; + bool ret = false; + + ci = lock_cluster_or_swap_info(si, offset); + if (!ci || !cluster_is_huge(ci)) { + if (map[roffset] != SWAP_HAS_CACHE) + ret = true; + goto unlock_out; + } + for (i = 0; i < SWAPFILE_CLUSTER; i++) { + if (map[offset + i] != SWAP_HAS_CACHE) { + ret = true; + break; + } + } +unlock_out: + unlock_cluster_or_swap_info(si, ci); + return ret; +} + +static bool page_swapped(struct page *page) +{ + swp_entry_t entry; + struct swap_info_struct *si; + + if (likely(!PageTransCompound(page))) + return page_swapcount(page) != 0; + + page = compound_head(page); + entry.val = page_private(page); + si = _swap_info_get(entry); + if (si) + return swap_page_trans_huge_swapped(si, entry); + return false; +} +#else +#define swap_page_trans_huge_swapped(si, entry) swap_swapcount(si, entry) +#define page_swapped(page) (page_swapcount(page) != 0) +#endif + /* * We can write to an anon page without COW if there are no other references * to it. And as a side-effect, free up its swap: because the old content @@ -1404,7 +1464,7 @@ int try_to_free_swap(struct page *page) return 0; if (PageWriteback(page)) return 0; - if (page_swapcount(page)) + if (page_swapped(page)) return 0; /* @@ -1425,6 +1485,7 @@ int try_to_free_swap(struct page *page) if (pm_suspended_storage()) return 0; + page = compound_head(page); delete_from_swap_cache(page); SetPageDirty(page); return 1; @@ -1446,7 +1507,8 @@ int free_swap_and_cache(swp_entry_t entry) p = _swap_info_get(entry); if (p) { count = __swap_entry_free(p, entry, 1); - if (count == SWAP_HAS_CACHE) { + if (count == SWAP_HAS_CACHE && + !swap_page_trans_huge_swapped(p, entry)) { page = find_get_page(swap_address_space(entry), swp_offset(entry)); if (page && !trylock_page(page)) { @@ -1463,7 +1525,8 @@ int free_swap_and_cache(swp_entry_t entry) */ if (PageSwapCache(page) && !PageWriteback(page) && (!page_mapped(page) || mem_cgroup_swap_full(page)) && - !swap_swapcount(p, entry)) { + !swap_page_trans_huge_swapped(p, entry)) { + page = compound_head(page); delete_from_swap_cache(page); SetPageDirty(page); } @@ -2017,7 +2080,7 @@ int try_to_unuse(unsigned int type, bool frontswap, .sync_mode = WB_SYNC_NONE, }; - swap_writepage(page, &wbc); + swap_writepage(compound_head(page), &wbc); lock_page(page); wait_on_page_writeback(page); } @@ -2030,8 +2093,9 @@ int try_to_unuse(unsigned int type, bool frontswap, * delete, since it may not have been written out to swap yet. */ if (PageSwapCache(page) && - likely(page_private(page) == entry.val)) - delete_from_swap_cache(page); + likely(page_private(page) == entry.val) && + !page_swapped(page)) + delete_from_swap_cache(compound_head(page)); /* * So we could skip searching mms once swap count went -- cgit v1.2.3 From ba3c4ce6def4915093be80585ff69f780630f32f Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 6 Sep 2017 16:22:19 -0700 Subject: mm, THP, swap: make reuse_swap_page() works for THP swapped out After supporting to delay THP (Transparent Huge Page) splitting after swapped out, it is possible that some page table mappings of the THP are turned into swap entries. So reuse_swap_page() need to check the swap count in addition to the map count as before. This patch done that. In the huge PMD write protect fault handler, in addition to the page map count, the swap count need to be checked too, so the page lock need to be acquired too when calling reuse_swap_page() in addition to the page table lock. [ying.huang@intel.com: silence a compiler warning] Link: http://lkml.kernel.org/r/87bmnzizjy.fsf@yhuang-dev.intel.com Link: http://lkml.kernel.org/r/20170724051840.2309-4-ying.huang@intel.com Signed-off-by: "Huang, Ying" Cc: Johannes Weiner Cc: Minchan Kim Cc: Hugh Dickins Cc: Shaohua Li Cc: Rik van Riel Cc: Andrea Arcangeli Cc: "Kirill A . Shutemov" Cc: Dan Williams Cc: Jens Axboe Cc: Michal Hocko Cc: Ross Zwisler [for brd.c, zram_drv.c, pmem.c] Cc: Vishal L Verma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 4 +- mm/huge_memory.c | 16 +++++++- mm/memory.c | 6 +-- mm/swapfile.c | 102 ++++++++++++++++++++++++++++++++++++++++++++++----- 4 files changed, 113 insertions(+), 15 deletions(-) (limited to 'mm/swapfile.c') diff --git a/include/linux/swap.h b/include/linux/swap.h index 964b4f1fba4a..7176ba780e83 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -510,8 +510,8 @@ static inline int swp_swapcount(swp_entry_t entry) return 0; } -#define reuse_swap_page(page, total_mapcount) \ - (page_trans_huge_mapcount(page, total_mapcount) == 1) +#define reuse_swap_page(page, total_map_swapcount) \ + (page_trans_huge_mapcount(page, total_map_swapcount) == 1) static inline int try_to_free_swap(struct page *page) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 7f0b3f6db923..02dfe635c9fe 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1240,15 +1240,29 @@ int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) * We can only reuse the page if nobody else maps the huge page or it's * part. */ - if (page_trans_huge_mapcount(page, NULL) == 1) { + if (!trylock_page(page)) { + get_page(page); + spin_unlock(vmf->ptl); + lock_page(page); + spin_lock(vmf->ptl); + if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) { + unlock_page(page); + put_page(page); + goto out_unlock; + } + put_page(page); + } + if (reuse_swap_page(page, NULL)) { pmd_t entry; entry = pmd_mkyoung(orig_pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1)) update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); ret |= VM_FAULT_WRITE; + unlock_page(page); goto out_unlock; } + unlock_page(page); get_page(page); spin_unlock(vmf->ptl); alloc: diff --git a/mm/memory.c b/mm/memory.c index 1416485e278c..3dd8bb46391b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2619,7 +2619,7 @@ static int do_wp_page(struct vm_fault *vmf) * not dirty accountable. */ if (PageAnon(vmf->page) && !PageKsm(vmf->page)) { - int total_mapcount; + int total_map_swapcount; if (!trylock_page(vmf->page)) { get_page(vmf->page); pte_unmap_unlock(vmf->pte, vmf->ptl); @@ -2634,8 +2634,8 @@ static int do_wp_page(struct vm_fault *vmf) } put_page(vmf->page); } - if (reuse_swap_page(vmf->page, &total_mapcount)) { - if (total_mapcount == 1) { + if (reuse_swap_page(vmf->page, &total_map_swapcount)) { + if (total_map_swapcount == 1) { /* * The page is all ours. Move it to * our anon_vma so the rmap code will diff --git a/mm/swapfile.c b/mm/swapfile.c index 164d9624d7d2..2bfbfb87123a 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1405,9 +1405,89 @@ static bool page_swapped(struct page *page) return swap_page_trans_huge_swapped(si, entry); return false; } + +static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount, + int *total_swapcount) +{ + int i, map_swapcount, _total_mapcount, _total_swapcount; + unsigned long offset = 0; + struct swap_info_struct *si; + struct swap_cluster_info *ci = NULL; + unsigned char *map = NULL; + int mapcount, swapcount = 0; + + /* hugetlbfs shouldn't call it */ + VM_BUG_ON_PAGE(PageHuge(page), page); + + if (likely(!PageTransCompound(page))) { + mapcount = atomic_read(&page->_mapcount) + 1; + if (total_mapcount) + *total_mapcount = mapcount; + if (PageSwapCache(page)) + swapcount = page_swapcount(page); + if (total_swapcount) + *total_swapcount = swapcount; + return mapcount + swapcount; + } + + page = compound_head(page); + + _total_mapcount = _total_swapcount = map_swapcount = 0; + if (PageSwapCache(page)) { + swp_entry_t entry; + + entry.val = page_private(page); + si = _swap_info_get(entry); + if (si) { + map = si->swap_map; + offset = swp_offset(entry); + } + } + if (map) + ci = lock_cluster(si, offset); + for (i = 0; i < HPAGE_PMD_NR; i++) { + mapcount = atomic_read(&page[i]._mapcount) + 1; + _total_mapcount += mapcount; + if (map) { + swapcount = swap_count(map[offset + i]); + _total_swapcount += swapcount; + } + map_swapcount = max(map_swapcount, mapcount + swapcount); + } + unlock_cluster(ci); + if (PageDoubleMap(page)) { + map_swapcount -= 1; + _total_mapcount -= HPAGE_PMD_NR; + } + mapcount = compound_mapcount(page); + map_swapcount += mapcount; + _total_mapcount += mapcount; + if (total_mapcount) + *total_mapcount = _total_mapcount; + if (total_swapcount) + *total_swapcount = _total_swapcount; + + return map_swapcount; +} #else #define swap_page_trans_huge_swapped(si, entry) swap_swapcount(si, entry) #define page_swapped(page) (page_swapcount(page) != 0) + +static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount, + int *total_swapcount) +{ + int mapcount, swapcount = 0; + + /* hugetlbfs shouldn't call it */ + VM_BUG_ON_PAGE(PageHuge(page), page); + + mapcount = page_trans_huge_mapcount(page, total_mapcount); + if (PageSwapCache(page)) + swapcount = page_swapcount(page); + if (total_swapcount) + *total_swapcount = swapcount; + return mapcount + swapcount; +} #endif /* @@ -1416,23 +1496,27 @@ static bool page_swapped(struct page *page) * on disk will never be read, and seeking back there to write new content * later would only waste time away from clustering. * - * NOTE: total_mapcount should not be relied upon by the caller if + * NOTE: total_map_swapcount should not be relied upon by the caller if * reuse_swap_page() returns false, but it may be always overwritten * (see the other implementation for CONFIG_SWAP=n). */ -bool reuse_swap_page(struct page *page, int *total_mapcount) +bool reuse_swap_page(struct page *page, int *total_map_swapcount) { - int count; + int count, total_mapcount, total_swapcount; VM_BUG_ON_PAGE(!PageLocked(page), page); if (unlikely(PageKsm(page))) return false; - count = page_trans_huge_mapcount(page, total_mapcount); - if (count <= 1 && PageSwapCache(page)) { - count += page_swapcount(page); - if (count != 1) - goto out; + count = page_trans_huge_map_swapcount(page, &total_mapcount, + &total_swapcount); + if (total_map_swapcount) + *total_map_swapcount = total_mapcount + total_swapcount; + if (count == 1 && PageSwapCache(page) && + (likely(!PageTransCompound(page)) || + /* The remaining swap count will be freed soon */ + total_swapcount == page_swapcount(page))) { if (!PageWriteback(page)) { + page = compound_head(page); delete_from_swap_cache(page); SetPageDirty(page); } else { @@ -1448,7 +1532,7 @@ bool reuse_swap_page(struct page *page, int *total_mapcount) spin_unlock(&p->lock); } } -out: + return count <= 1; } -- cgit v1.2.3 From f0eea189e8e969b66e03bac8a7d92888ba267854 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 6 Sep 2017 16:22:23 -0700 Subject: mm, THP, swap: don't allocate huge cluster for file backed swap device It's hard to write a whole transparent huge page (THP) to a file backed swap device during swapping out and the file backed swap device isn't very popular. So the huge cluster allocation for the file backed swap device is disabled. Link: http://lkml.kernel.org/r/20170724051840.2309-5-ying.huang@intel.com Signed-off-by: "Huang, Ying" Cc: Johannes Weiner Cc: Minchan Kim Cc: Hugh Dickins Cc: Shaohua Li Cc: Rik van Riel Cc: "Kirill A . Shutemov" Cc: Andrea Arcangeli Cc: Dan Williams Cc: Jens Axboe Cc: Michal Hocko Cc: Ross Zwisler [for brd.c, zram_drv.c, pmem.c] Cc: Vishal L Verma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swapfile.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'mm/swapfile.c') diff --git a/mm/swapfile.c b/mm/swapfile.c index 2bfbfb87123a..267b1fe41844 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -948,9 +948,10 @@ start_over: spin_unlock(&si->lock); goto nextsi; } - if (cluster) - n_ret = swap_alloc_cluster(si, swp_entries); - else + if (cluster) { + if (!(si->flags & SWP_FILE)) + n_ret = swap_alloc_cluster(si, swp_entries); + } else n_ret = scan_swap_map_slots(si, SWAP_HAS_CACHE, n_goal, swp_entries); spin_unlock(&si->lock); -- cgit v1.2.3 From 59807685a7e77e8c8fe5925613968841538d53d7 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 6 Sep 2017 16:22:34 -0700 Subject: mm, THP, swap: support splitting THP for THP swap out After adding swapping out support for THP (Transparent Huge Page), it is possible that a THP in swap cache (partly swapped out) need to be split. To split such a THP, the swap cluster backing the THP need to be split too, that is, the CLUSTER_FLAG_HUGE flag need to be cleared for the swap cluster. The patch implemented this. And because the THP swap writing needs the THP keeps as huge page during writing. The PageWriteback flag is checked before splitting. Link: http://lkml.kernel.org/r/20170724051840.2309-8-ying.huang@intel.com Signed-off-by: "Huang, Ying" Cc: Johannes Weiner Cc: Minchan Kim Cc: Hugh Dickins Cc: Shaohua Li Cc: Rik van Riel Cc: Andrea Arcangeli Cc: "Kirill A . Shutemov" Cc: Dan Williams Cc: Jens Axboe Cc: Michal Hocko Cc: Ross Zwisler [for brd.c, zram_drv.c, pmem.c] Cc: Vishal L Verma Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 9 +++++++++ mm/huge_memory.c | 10 +++++++++- mm/swapfile.c | 15 +++++++++++++++ 3 files changed, 33 insertions(+), 1 deletion(-) (limited to 'mm/swapfile.c') diff --git a/include/linux/swap.h b/include/linux/swap.h index 7176ba780e83..461cf107ad52 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -527,6 +527,15 @@ static inline swp_entry_t get_swap_page(struct page *page) #endif /* CONFIG_SWAP */ +#ifdef CONFIG_THP_SWAP +extern int split_swap_cluster(swp_entry_t entry); +#else +static inline int split_swap_cluster(swp_entry_t entry) +{ + return 0; +} +#endif + #ifdef CONFIG_MEMCG static inline int mem_cgroup_swappiness(struct mem_cgroup *memcg) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 02dfe635c9fe..772048c233d1 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2481,6 +2481,9 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) VM_BUG_ON_PAGE(!PageLocked(page), page); VM_BUG_ON_PAGE(!PageCompound(page), page); + if (PageWriteback(page)) + return -EBUSY; + if (PageAnon(head)) { /* * The caller does not necessarily hold an mmap_sem that would @@ -2558,7 +2561,12 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) __dec_node_page_state(page, NR_SHMEM_THPS); spin_unlock(&pgdata->split_queue_lock); __split_huge_page(page, list, flags); - ret = 0; + if (PageSwapCache(head)) { + swp_entry_t entry = { .val = page_private(head) }; + + ret = split_swap_cluster(entry); + } else + ret = 0; } else { if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) { pr_alert("total_mapcount: %u, page_count(): %u\n", diff --git a/mm/swapfile.c b/mm/swapfile.c index 267b1fe41844..42eff9e4e972 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1216,6 +1216,21 @@ static void swapcache_free_cluster(swp_entry_t entry) } } } + +int split_swap_cluster(swp_entry_t entry) +{ + struct swap_info_struct *si; + struct swap_cluster_info *ci; + unsigned long offset = swp_offset(entry); + + si = _swap_info_get(entry); + if (!si) + return -EBUSY; + ci = lock_cluster(si, offset); + cluster_clear_huge(ci); + unlock_cluster(ci); + return 0; +} #else static inline void swapcache_free_cluster(swp_entry_t entry) { -- cgit v1.2.3 From 81a0298bdfab0203d360df7c9bf690d1d457f999 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 6 Sep 2017 16:24:43 -0700 Subject: mm, swap: don't use VMA based swap readahead if HDD is used as swap VMA based swap readahead will readahead the virtual pages that is continuous in the virtual address space. While the original swap readahead will readahead the swap slots that is continuous in the swap device. Although VMA based swap readahead is more correct for the swap slots to be readahead, it will trigger more small random readings, which may cause the performance of HDD (hard disk) to degrade heavily, and may finally exceed the benefit. To avoid the issue, in this patch, if the HDD is used as swap, the VMA based swap readahead will be disabled, and the original swap readahead will be used instead. Link: http://lkml.kernel.org/r/20170807054038.1843-6-ying.huang@intel.com Signed-off-by: "Huang, Ying" Cc: Johannes Weiner Cc: Minchan Kim Cc: Rik van Riel Cc: Shaohua Li Cc: Hugh Dickins Cc: Fengguang Wu Cc: Tim Chen Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 11 ++++++----- mm/swapfile.c | 8 +++++++- 2 files changed, 13 insertions(+), 6 deletions(-) (limited to 'mm/swapfile.c') diff --git a/include/linux/swap.h b/include/linux/swap.h index 61d63379e956..9c4ae6f14eea 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -400,16 +400,17 @@ extern struct page *do_swap_page_readahead(swp_entry_t fentry, gfp_t gfp_mask, struct vm_fault *vmf, struct vma_swap_readahead *swap_ra); -static inline bool swap_use_vma_readahead(void) -{ - return READ_ONCE(swap_vma_readahead); -} - /* linux/mm/swapfile.c */ extern atomic_long_t nr_swap_pages; extern long total_swap_pages; +extern atomic_t nr_rotate_swap; extern bool has_usable_swap(void); +static inline bool swap_use_vma_readahead(void) +{ + return READ_ONCE(swap_vma_readahead) && !atomic_read(&nr_rotate_swap); +} + /* Swap 50% full? Release swapcache more aggressively.. */ static inline bool vm_swap_full(void) { diff --git a/mm/swapfile.c b/mm/swapfile.c index 42eff9e4e972..4f8b3e08a547 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -96,6 +96,8 @@ static DECLARE_WAIT_QUEUE_HEAD(proc_poll_wait); /* Activity counter to indicate that a swapon or swapoff has occurred */ static atomic_t proc_poll_event = ATOMIC_INIT(0); +atomic_t nr_rotate_swap = ATOMIC_INIT(0); + static inline unsigned char swap_count(unsigned char ent) { return ent & ~SWAP_HAS_CACHE; /* may include SWAP_HAS_CONT flag */ @@ -2569,6 +2571,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) if (p->flags & SWP_CONTINUED) free_swap_count_continuations(p); + if (!p->bdev || !blk_queue_nonrot(bdev_get_queue(p->bdev))) + atomic_dec(&nr_rotate_swap); + mutex_lock(&swapon_mutex); spin_lock(&swap_lock); spin_lock(&p->lock); @@ -3145,7 +3150,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) cluster = per_cpu_ptr(p->percpu_cluster, cpu); cluster_set_null(&cluster->index); } - } + } else + atomic_inc(&nr_rotate_swap); error = swap_cgroup_swapon(p->type, maxpages); if (error) -- cgit v1.2.3 From a2468cc9bfdff6139f59ca896671e5819ff5f94a Mon Sep 17 00:00:00 2001 From: Aaron Lu Date: Wed, 6 Sep 2017 16:24:57 -0700 Subject: swap: choose swap device according to numa node If the system has more than one swap device and swap device has the node information, we can make use of this information to decide which swap device to use in get_swap_pages() to get better performance. The current code uses a priority based list, swap_avail_list, to decide which swap device to use and if multiple swap devices share the same priority, they are used round robin. This patch changes the previous single global swap_avail_list into a per-numa-node list, i.e. for each numa node, it sees its own priority based list of available swap devices. Swap device's priority can be promoted on its matching node's swap_avail_list. The current swap device's priority is set as: user can set a >=0 value, or the system will pick one starting from -1 then downwards. The priority value in the swap_avail_list is the negated value of the swap device's due to plist being sorted from low to high. The new policy doesn't change the semantics for priority >=0 cases, the previous starting from -1 then downwards now becomes starting from -2 then downwards and -1 is reserved as the promoted value. Take 4-node EX machine as an example, suppose 4 swap devices are available, each sit on a different node: swapA on node 0 swapB on node 1 swapC on node 2 swapD on node 3 After they are all swapped on in the sequence of ABCD. Current behaviour: their priorities will be: swapA: -1 swapB: -2 swapC: -3 swapD: -4 And their position in the global swap_avail_list will be: swapA -> swapB -> swapC -> swapD prio:1 prio:2 prio:3 prio:4 New behaviour: their priorities will be(note that -1 is skipped): swapA: -2 swapB: -3 swapC: -4 swapD: -5 And their positions in the 4 swap_avail_lists[nid] will be: swap_avail_lists[0]: /* node 0's available swap device list */ swapA -> swapB -> swapC -> swapD prio:1 prio:3 prio:4 prio:5 swap_avali_lists[1]: /* node 1's available swap device list */ swapB -> swapA -> swapC -> swapD prio:1 prio:2 prio:4 prio:5 swap_avail_lists[2]: /* node 2's available swap device list */ swapC -> swapA -> swapB -> swapD prio:1 prio:2 prio:3 prio:5 swap_avail_lists[3]: /* node 3's available swap device list */ swapD -> swapA -> swapB -> swapC prio:1 prio:2 prio:3 prio:4 To see the effect of the patch, a test that starts N process, each mmap a region of anonymous memory and then continually write to it at random position to trigger both swap in and out is used. On a 2 node Skylake EP machine with 64GiB memory, two 170GB SSD drives are used as swap devices with each attached to a different node, the result is: runtime=30m/processes=32/total test size=128G/each process mmap region=4G kernel throughput vanilla 13306 auto-binding 15169 +14% runtime=30m/processes=64/total test size=128G/each process mmap region=2G kernel throughput vanilla 11885 auto-binding 14879 +25% [aaron.lu@intel.com: v2] Link: http://lkml.kernel.org/r/20170814053130.GD2369@aaronlu.sh.intel.com Link: http://lkml.kernel.org/r/20170816024439.GA10925@aaronlu.sh.intel.com [akpm@linux-foundation.org: use kmalloc_array()] Link: http://lkml.kernel.org/r/20170814053130.GD2369@aaronlu.sh.intel.com Link: http://lkml.kernel.org/r/20170816024439.GA10925@aaronlu.sh.intel.com Signed-off-by: Aaron Lu Cc: "Chen, Tim C" Cc: Huang Ying Cc: Andi Kleen Cc: Michal Hocko Cc: Minchan Kim Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/swap_numa.txt | 69 ++++++++++++++++++++++++ include/linux/swap.h | 2 +- mm/swapfile.c | 120 ++++++++++++++++++++++++++++++++--------- 3 files changed, 164 insertions(+), 27 deletions(-) create mode 100644 Documentation/vm/swap_numa.txt (limited to 'mm/swapfile.c') diff --git a/Documentation/vm/swap_numa.txt b/Documentation/vm/swap_numa.txt new file mode 100644 index 000000000000..d5960c9124f5 --- /dev/null +++ b/Documentation/vm/swap_numa.txt @@ -0,0 +1,69 @@ +Automatically bind swap device to numa node +------------------------------------------- + +If the system has more than one swap device and swap device has the node +information, we can make use of this information to decide which swap +device to use in get_swap_pages() to get better performance. + + +How to use this feature +----------------------- + +Swap device has priority and that decides the order of it to be used. To make +use of automatically binding, there is no need to manipulate priority settings +for swap devices. e.g. on a 2 node machine, assume 2 swap devices swapA and +swapB, with swapA attached to node 0 and swapB attached to node 1, are going +to be swapped on. Simply swapping them on by doing: +# swapon /dev/swapA +# swapon /dev/swapB + +Then node 0 will use the two swap devices in the order of swapA then swapB and +node 1 will use the two swap devices in the order of swapB then swapA. Note +that the order of them being swapped on doesn't matter. + +A more complex example on a 4 node machine. Assume 6 swap devices are going to +be swapped on: swapA and swapB are attached to node 0, swapC is attached to +node 1, swapD and swapE are attached to node 2 and swapF is attached to node3. +The way to swap them on is the same as above: +# swapon /dev/swapA +# swapon /dev/swapB +# swapon /dev/swapC +# swapon /dev/swapD +# swapon /dev/swapE +# swapon /dev/swapF + +Then node 0 will use them in the order of: +swapA/swapB -> swapC -> swapD -> swapE -> swapF +swapA and swapB will be used in a round robin mode before any other swap device. + +node 1 will use them in the order of: +swapC -> swapA -> swapB -> swapD -> swapE -> swapF + +node 2 will use them in the order of: +swapD/swapE -> swapA -> swapB -> swapC -> swapF +Similaly, swapD and swapE will be used in a round robin mode before any +other swap devices. + +node 3 will use them in the order of: +swapF -> swapA -> swapB -> swapC -> swapD -> swapE + + +Implementation details +---------------------- + +The current code uses a priority based list, swap_avail_list, to decide +which swap device to use and if multiple swap devices share the same +priority, they are used round robin. This change here replaces the single +global swap_avail_list with a per-numa-node list, i.e. for each numa node, +it sees its own priority based list of available swap devices. Swap +device's priority can be promoted on its matching node's swap_avail_list. + +The current swap device's priority is set as: user can set a >=0 value, +or the system will pick one starting from -1 then downwards. The priority +value in the swap_avail_list is the negated value of the swap device's +due to plist being sorted from low to high. The new policy doesn't change +the semantics for priority >=0 cases, the previous starting from -1 then +downwards now becomes starting from -2 then downwards and -1 is reserved +as the promoted value. So if multiple swap devices are attached to the same +node, they will all be promoted to priority -1 on that node's plist and will +be used round robin before any other swap devices. diff --git a/include/linux/swap.h b/include/linux/swap.h index 9c4ae6f14eea..8bf3487fb204 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -212,7 +212,7 @@ struct swap_info_struct { unsigned long flags; /* SWP_USED etc: see above */ signed short prio; /* swap priority of this type */ struct plist_node list; /* entry in swap_active_head */ - struct plist_node avail_list; /* entry in swap_avail_head */ + struct plist_node avail_lists[MAX_NUMNODES];/* entry in swap_avail_heads */ signed char type; /* strange name for an index */ unsigned int max; /* extent of the swap_map */ unsigned char *swap_map; /* vmalloc'ed array of usage counts */ diff --git a/mm/swapfile.c b/mm/swapfile.c index 4f8b3e08a547..d483278ee35b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -60,7 +60,7 @@ atomic_long_t nr_swap_pages; EXPORT_SYMBOL_GPL(nr_swap_pages); /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */ long total_swap_pages; -static int least_priority; +static int least_priority = -1; static const char Bad_file[] = "Bad swap file entry "; static const char Unused_file[] = "Unused swap file entry "; @@ -85,7 +85,7 @@ PLIST_HEAD(swap_active_head); * is held and the locking order requires swap_lock to be taken * before any swap_info_struct->lock. */ -static PLIST_HEAD(swap_avail_head); +struct plist_head *swap_avail_heads; static DEFINE_SPINLOCK(swap_avail_lock); struct swap_info_struct *swap_info[MAX_SWAPFILES]; @@ -592,6 +592,21 @@ new_cluster: return found_free; } +static void __del_from_avail_list(struct swap_info_struct *p) +{ + int nid; + + for_each_node(nid) + plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]); +} + +static void del_from_avail_list(struct swap_info_struct *p) +{ + spin_lock(&swap_avail_lock); + __del_from_avail_list(p); + spin_unlock(&swap_avail_lock); +} + static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, unsigned int nr_entries) { @@ -605,12 +620,22 @@ static void swap_range_alloc(struct swap_info_struct *si, unsigned long offset, if (si->inuse_pages == si->pages) { si->lowest_bit = si->max; si->highest_bit = 0; - spin_lock(&swap_avail_lock); - plist_del(&si->avail_list, &swap_avail_head); - spin_unlock(&swap_avail_lock); + del_from_avail_list(si); } } +static void add_to_avail_list(struct swap_info_struct *p) +{ + int nid; + + spin_lock(&swap_avail_lock); + for_each_node(nid) { + WARN_ON(!plist_node_empty(&p->avail_lists[nid])); + plist_add(&p->avail_lists[nid], &swap_avail_heads[nid]); + } + spin_unlock(&swap_avail_lock); +} + static void swap_range_free(struct swap_info_struct *si, unsigned long offset, unsigned int nr_entries) { @@ -623,13 +648,8 @@ static void swap_range_free(struct swap_info_struct *si, unsigned long offset, bool was_full = !si->highest_bit; si->highest_bit = end; - if (was_full && (si->flags & SWP_WRITEOK)) { - spin_lock(&swap_avail_lock); - WARN_ON(!plist_node_empty(&si->avail_list)); - if (plist_node_empty(&si->avail_list)) - plist_add(&si->avail_list, &swap_avail_head); - spin_unlock(&swap_avail_lock); - } + if (was_full && (si->flags & SWP_WRITEOK)) + add_to_avail_list(si); } atomic_long_add(nr_entries, &nr_swap_pages); si->inuse_pages -= nr_entries; @@ -910,6 +930,7 @@ int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[]) struct swap_info_struct *si, *next; long avail_pgs; int n_ret = 0; + int node; /* Only single cluster request supported */ WARN_ON_ONCE(n_goal > 1 && cluster); @@ -929,14 +950,15 @@ int get_swap_pages(int n_goal, bool cluster, swp_entry_t swp_entries[]) spin_lock(&swap_avail_lock); start_over: - plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) { + node = numa_node_id(); + plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) { /* requeue si to after same-priority siblings */ - plist_requeue(&si->avail_list, &swap_avail_head); + plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]); spin_unlock(&swap_avail_lock); spin_lock(&si->lock); if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) { spin_lock(&swap_avail_lock); - if (plist_node_empty(&si->avail_list)) { + if (plist_node_empty(&si->avail_lists[node])) { spin_unlock(&si->lock); goto nextsi; } @@ -946,7 +968,7 @@ start_over: WARN(!(si->flags & SWP_WRITEOK), "swap_info %d in list but !SWP_WRITEOK\n", si->type); - plist_del(&si->avail_list, &swap_avail_head); + __del_from_avail_list(si); spin_unlock(&si->lock); goto nextsi; } @@ -975,7 +997,7 @@ nextsi: * swap_avail_head list then try it, otherwise start over * if we have not gotten any slots. */ - if (plist_node_empty(&next->avail_list)) + if (plist_node_empty(&next->avail_lists[node])) goto start_over; } @@ -2410,10 +2432,24 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span) return generic_swapfile_activate(sis, swap_file, span); } +static int swap_node(struct swap_info_struct *p) +{ + struct block_device *bdev; + + if (p->bdev) + bdev = p->bdev; + else + bdev = p->swap_file->f_inode->i_sb->s_bdev; + + return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE; +} + static void _enable_swap_info(struct swap_info_struct *p, int prio, unsigned char *swap_map, struct swap_cluster_info *cluster_info) { + int i; + if (prio >= 0) p->prio = prio; else @@ -2423,7 +2459,16 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, * low-to-high, while swap ordering is high-to-low */ p->list.prio = -p->prio; - p->avail_list.prio = -p->prio; + for_each_node(i) { + if (p->prio >= 0) + p->avail_lists[i].prio = -p->prio; + else { + if (swap_node(p) == i) + p->avail_lists[i].prio = 1; + else + p->avail_lists[i].prio = -p->prio; + } + } p->swap_map = swap_map; p->cluster_info = cluster_info; p->flags |= SWP_WRITEOK; @@ -2442,9 +2487,7 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio, * swap_info_struct. */ plist_add(&p->list, &swap_active_head); - spin_lock(&swap_avail_lock); - plist_add(&p->avail_list, &swap_avail_head); - spin_unlock(&swap_avail_lock); + add_to_avail_list(p); } static void enable_swap_info(struct swap_info_struct *p, int prio, @@ -2529,17 +2572,19 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) spin_unlock(&swap_lock); goto out_dput; } - spin_lock(&swap_avail_lock); - plist_del(&p->avail_list, &swap_avail_head); - spin_unlock(&swap_avail_lock); + del_from_avail_list(p); spin_lock(&p->lock); if (p->prio < 0) { struct swap_info_struct *si = p; + int nid; plist_for_each_entry_continue(si, &swap_active_head, list) { si->prio++; si->list.prio--; - si->avail_list.prio--; + for_each_node(nid) { + if (si->avail_lists[nid].prio != 1) + si->avail_lists[nid].prio--; + } } least_priority++; } @@ -2783,6 +2828,7 @@ static struct swap_info_struct *alloc_swap_info(void) { struct swap_info_struct *p; unsigned int type; + int i; p = kzalloc(sizeof(*p), GFP_KERNEL); if (!p) @@ -2818,7 +2864,8 @@ static struct swap_info_struct *alloc_swap_info(void) } INIT_LIST_HEAD(&p->first_swap_extent.list); plist_node_init(&p->list, 0); - plist_node_init(&p->avail_list, 0); + for_each_node(i) + plist_node_init(&p->avail_lists[i], 0); p->flags = SWP_USED; spin_unlock(&swap_lock); spin_lock_init(&p->lock); @@ -3060,6 +3107,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) if (!capable(CAP_SYS_ADMIN)) return -EPERM; + if (!swap_avail_heads) + return -ENOMEM; + p = alloc_swap_info(); if (IS_ERR(p)) return PTR_ERR(p); @@ -3645,3 +3695,21 @@ static void free_swap_count_continuations(struct swap_info_struct *si) } } } + +static int __init swapfile_init(void) +{ + int nid; + + swap_avail_heads = kmalloc_array(nr_node_ids, sizeof(struct plist_head), + GFP_KERNEL); + if (!swap_avail_heads) { + pr_emerg("Not enough memory for swap heads, swap is disabled\n"); + return -ENOMEM; + } + + for_each_node(nid) + plist_head_init(&swap_avail_heads[nid]); + + return 0; +} +subsys_initcall(swapfile_init); -- cgit v1.2.3