From 2a681cfa5bb41e78e7bfafbb748b581374ce9b1d Mon Sep 17 00:00:00 2001
From: Joerg Roedel
Date: Thu, 6 Aug 2020 23:22:55 -0700
Subject: mm: move p?d_alloc_track to separate header file

The functions are only used in two source files, so there is no need
for them to be in the global header. Move them to the new header and
include it only where needed.

Signed-off-by: Joerg Roedel
Signed-off-by: Andrew Morton
Reviewed-by: Pekka Enberg
Cc: Peter Zijlstra (Intel)
Cc: Andy Lutomirski
Cc: Abdul Haleem
Cc: Satheesh Rajendran
Cc: Stephen Rothwell
Cc: Steven Rostedt (VMware)
Cc: Mike Rapoport
Cc: Christophe Leroy
Cc: Arnd Bergmann
Cc: Max Filippov
Cc: Stafford Horne
Cc: Geert Uytterhoeven
Cc: Matthew Wilcox
Link: http://lkml.kernel.org/r/20200609120533.25867-1-joro@8bytes.org
Signed-off-by: Linus Torvalds
---
 mm/vmalloc.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'mm/vmalloc.c')

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 5a2b55c8dd9a..5be3cf3b59de 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -41,6 +41,7 @@
 #include

 #include "internal.h"
+#include "pgalloc-track.h"

 bool is_vmalloc_addr(const void *x)
 {
--
cgit v1.2.3

From 0f14599c607d32512a1d37e6d2a2d1a867f16177 Mon Sep 17 00:00:00 2001
From: "Matthew Wilcox (Oracle)"
Date: Thu, 6 Aug 2020 23:24:05 -0700
Subject: vmalloc: convert to XArray

The radix tree of vmap blocks is simpler to express as an XArray. This
reduces both the text and data sizes of the object file and eliminates
a user of the radix tree preload API.

Signed-off-by: Matthew Wilcox (Oracle)
Signed-off-by: Andrew Morton
Reviewed-by: William Kucharski
Link: http://lkml.kernel.org/r/20200603171448.5894-1-willy@infradead.org
Signed-off-by: Linus Torvalds
---
 mm/vmalloc.c | 40 +++++++++++-----------------------------
 1 file changed, 11 insertions(+), 29 deletions(-)

(limited to 'mm/vmalloc.c')

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 5be3cf3b59de..d496856bd408 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -25,7 +25,7 @@
 #include
 #include
 #include
-#include <linux/radix-tree.h>
+#include <linux/xarray.h>
 #include
 #include
 #include
@@ -1514,12 +1514,11 @@ struct vmap_block {
 static DEFINE_PER_CPU(struct vmap_block_queue, vmap_block_queue);

 /*
- * Radix tree of vmap blocks, indexed by address, to quickly find a vmap block
+ * XArray of vmap blocks, indexed by address, to quickly find a vmap block
  * in the free path. Could get rid of this if we change the API to return a
  * "cookie" from alloc, to be passed to free. But no big deal yet.
  */
-static DEFINE_SPINLOCK(vmap_block_tree_lock);
-static RADIX_TREE(vmap_block_tree, GFP_ATOMIC);
+static DEFINE_XARRAY(vmap_blocks);

 /*
  * We should probably have a fallback mechanism to allocate virtual memory
@@ -1576,13 +1575,6 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
                return ERR_CAST(va);
        }

-       err = radix_tree_preload(gfp_mask);
-       if (unlikely(err)) {
-               kfree(vb);
-               free_vmap_area(va);
-               return ERR_PTR(err);
-       }
-
        vaddr = vmap_block_vaddr(va->va_start, 0);
        spin_lock_init(&vb->lock);
        vb->va = va;
@@ -1595,11 +1587,12 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
        INIT_LIST_HEAD(&vb->free_list);

        vb_idx = addr_to_vb_idx(va->va_start);
-       spin_lock(&vmap_block_tree_lock);
-       err = radix_tree_insert(&vmap_block_tree, vb_idx, vb);
-       spin_unlock(&vmap_block_tree_lock);
-       BUG_ON(err);
-       radix_tree_preload_end();
+       err = xa_insert(&vmap_blocks, vb_idx, vb, gfp_mask);
+       if (err) {
+               kfree(vb);
+               free_vmap_area(va);
+               return ERR_PTR(err);
+       }

        vbq = &get_cpu_var(vmap_block_queue);
        spin_lock(&vbq->lock);
@@ -1613,12 +1606,8 @@ static void *new_vmap_block(unsigned int order, gfp_t gfp_mask)
 static void free_vmap_block(struct vmap_block *vb)
 {
        struct vmap_block *tmp;
-       unsigned long vb_idx;

-       vb_idx = addr_to_vb_idx(vb->va->va_start);
-       spin_lock(&vmap_block_tree_lock);
-       tmp = radix_tree_delete(&vmap_block_tree, vb_idx);
-       spin_unlock(&vmap_block_tree_lock);
+       tmp = xa_erase(&vmap_blocks, addr_to_vb_idx(vb->va->va_start));
        BUG_ON(tmp != vb);

        free_vmap_area_noflush(vb->va);
@@ -1724,7 +1713,6 @@ static void *vb_alloc(unsigned long size, gfp_t gfp_mask)
 static void vb_free(unsigned long addr, unsigned long size)
 {
        unsigned long offset;
-       unsigned long vb_idx;
        unsigned int order;
        struct vmap_block *vb;

@@ -1734,14 +1722,8 @@ static void vb_free(unsigned long addr, unsigned long size)
        flush_cache_vunmap(addr, addr + size);

        order = get_order(size);
-
        offset = (addr & (VMAP_BLOCK_SIZE - 1)) >> PAGE_SHIFT;
-
-       vb_idx = addr_to_vb_idx(addr);
-
-       rcu_read_lock();
-       vb = radix_tree_lookup(&vmap_block_tree, vb_idx);
-       rcu_read_unlock();
-       BUG_ON(!vb);
+       vb = xa_load(&vmap_blocks, addr_to_vb_idx(addr));

        unmap_kernel_range_noflush(addr, size);
--
cgit v1.2.3
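The XArray calls used in this conversion are easy to study in isolation.
Below is a minimal, hypothetical sketch - the names demo_blocks,
demo_store(), demo_find() and demo_remove() are illustrative and not part
of the patch - of the insert/load/erase pattern that replaces the radix
tree together with its spinlock and preload step:

#include <linux/xarray.h>

/* The XArray does its own internal locking; no separate spinlock. */
static DEFINE_XARRAY(demo_blocks);

static int demo_store(unsigned long idx, void *block, gfp_t gfp)
{
        /* xa_insert() fails with -EBUSY if the index is occupied. */
        return xa_insert(&demo_blocks, idx, block, gfp);
}

static void *demo_find(unsigned long idx)
{
        /* Lock-free on the reader side, like the old RCU lookup. */
        return xa_load(&demo_blocks, idx);
}

static void *demo_remove(unsigned long idx)
{
        /* Returns the entry that was stored at idx, or NULL. */
        return xa_erase(&demo_blocks, idx);
}

Note how the preload dance disappears: xa_insert() takes the gfp mask
directly and performs any needed allocation internally, which is why
new_vmap_block() above can simply check its return value.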
From 5dd7864094033a281aeffccaf9703468cbcfccfc Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)"
Date: Thu, 6 Aug 2020 23:24:09 -0700
Subject: mm/vmalloc: simplify merge_or_add_vmap_area()

Currently, when a VA is deallocated and is about to be placed back into
the tree, it is either merged with its next/prev neighbors or, if no
coalescing is possible, inserted as a new node. During those steps the
tree can be propagated several times, for example when both neighbors
are merged. In fact this can be avoided and simplified: propagate the
tree only once, after all manipulations (merging/removing/inserting)
are done and the VA points to the final merged area.

Signed-off-by: Uladzislau Rezki (Sony)
Signed-off-by: Andrew Morton
Link: http://lkml.kernel.org/r/20200527205054.1696-1-urezki@gmail.com
Signed-off-by: Linus Torvalds
---
 mm/vmalloc.c | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

(limited to 'mm/vmalloc.c')

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index d496856bd408..d0ec02308d7c 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -797,9 +797,6 @@ merge_or_add_vmap_area(struct vmap_area *va,
                if (sibling->va_start == va->va_end) {
                        sibling->va_start = va->va_start;

-                       /* Check and update the tree if needed. */
-                       augment_tree_propagate_from(sibling);
-
                        /* Free vmap_area object. */
                        kmem_cache_free(vmap_area_cachep, va);
@@ -819,14 +816,18 @@ merge_or_add_vmap_area(struct vmap_area *va,
        if (next->prev != head) {
                sibling = list_entry(next->prev, struct vmap_area, list);
                if (sibling->va_end == va->va_start) {
-                       sibling->va_end = va->va_end;
-
-                       /* Check and update the tree if needed. */
-                       augment_tree_propagate_from(sibling);
-
+                       /*
+                        * If both neighbors are coalesced, it is important
+                        * to unlink the "next" node first, followed by merging
+                        * with "previous" one. Otherwise the tree might not be
+                        * fully populated if a sibling's augmented value is
+                        * "normalized" because of rotation operations.
+                        */
                        if (merged)
                                unlink_va(va, root);

+                       sibling->va_end = va->va_end;
+
                        /* Free vmap_area object. */
                        kmem_cache_free(vmap_area_cachep, va);
@@ -837,11 +838,13 @@ merge_or_add_vmap_area(struct vmap_area *va,
        }

 insert:
-       if (!merged) {
+       if (!merged)
                link_va(va, root, parent, link, head);
-               augment_tree_propagate_from(va);
-       }

+       /*
+        * Last step is to check and update the tree.
+        */
+       augment_tree_propagate_from(va);
        return va;
 }
--
cgit v1.2.3
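The effect of the patch is easiest to see in a toy model. The standalone
userspace sketch below (all names hypothetical, kernel details elided)
frees a middle range, coalesces it with both neighbours, and recomputes
the derived metadata exactly once at the end - the analogue of calling
augment_tree_propagate_from() a single time after all merging:

#include <stdbool.h>
#include <stdio.h>

struct range { unsigned long start, end; };

/* Derived metadata, playing the role of subtree_max_size. */
static unsigned long largest;

/* Stands in for augment_tree_propagate_from(): one pass at the end. */
static void recompute_largest(const struct range *r)
{
        if (r->end - r->start > largest)
                largest = r->end - r->start;
}

/* Return a freed range to the "allocator", coalescing with neighbours. */
static void free_range(struct range *prev, struct range *next,
                       struct range *r)
{
        bool merged = false;

        if (next && next->start == r->end) {    /* merge with next */
                next->start = r->start;
                r = next;
                merged = true;
        }
        if (prev && prev->end == r->start) {    /* merge with prev */
                prev->end = r->end;
                r = prev;
                merged = true;
        }
        if (!merged) {
                /* A real allocator would insert r as a new node here. */
        }
        recompute_largest(r);   /* propagate exactly once, at the end */
}

int main(void)
{
        struct range prev = {0, 10}, next = {20, 30}, mid = {10, 20};

        free_range(&prev, &next, &mid);
        printf("largest free chunk: %lu\n", largest);   /* prints 30 */
        return 0;
}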
From da27c9ed17794b4741fe2858ce727ffac671877e Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)"
Date: Thu, 6 Aug 2020 23:24:12 -0700
Subject: mm/vmalloc: simplify augment_tree_propagate_check()

This function is for debugging purposes only. Currently it traverses
the tree recursively, checking the augmented value of each node for
validity. The recursion can overflow the stack, because the tree can
be huge when synthetic tests are applied. To prevent that, walk the
nodes through the regular list they are also linked on. This is faster
and needs no recursion.

Signed-off-by: Uladzislau Rezki (Sony)
Signed-off-by: Andrew Morton
Link: http://lkml.kernel.org/r/20200527205054.1696-2-urezki@gmail.com
Signed-off-by: Linus Torvalds
---
 mm/vmalloc.c | 42 ++++++++----------------------------------
 1 file changed, 8 insertions(+), 34 deletions(-)

(limited to 'mm/vmalloc.c')

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index d0ec02308d7c..740b7e4cac03 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -633,43 +633,17 @@ unlink_va(struct vmap_area *va, struct rb_root *root)

 #if DEBUG_AUGMENT_PROPAGATE_CHECK
 static void
-augment_tree_propagate_check(struct rb_node *n)
+augment_tree_propagate_check(void)
 {
        struct vmap_area *va;
-       struct rb_node *node;
-       unsigned long size;
-       bool found = false;
-
-       if (n == NULL)
-               return;
-
-       va = rb_entry(n, struct vmap_area, rb_node);
-       size = va->subtree_max_size;
-       node = n;
-
-       while (node) {
-               va = rb_entry(node, struct vmap_area, rb_node);
-
-               if (get_subtree_max_size(node->rb_left) == size) {
-                       node = node->rb_left;
-               } else {
-                       if (va_size(va) == size) {
-                               found = true;
-                               break;
-                       }
+       unsigned long computed_size;

-                       node = node->rb_right;
-               }
-       }
-
-       if (!found) {
-               va = rb_entry(n, struct vmap_area, rb_node);
-               pr_emerg("tree is corrupted: %lu, %lu\n",
-                       va_size(va), va->subtree_max_size);
+       list_for_each_entry(va, &free_vmap_area_list, list) {
+               computed_size = compute_subtree_max_size(va);
+               if (computed_size != va->subtree_max_size)
+                       pr_emerg("tree is corrupted: %lu, %lu\n",
+                               va_size(va), va->subtree_max_size);
        }
-
-       augment_tree_propagate_check(n->rb_left);
-       augment_tree_propagate_check(n->rb_right);
 }
 #endif
@@ -724,7 +698,7 @@ augment_tree_propagate_from(struct vmap_area *va)
        }

 #if DEBUG_AUGMENT_PROPAGATE_CHECK
-       augment_tree_propagate_check(free_vmap_area_root.rb_node);
+       augment_tree_propagate_check();
 #endif
 }
--
cgit v1.2.3
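The same recursion-to-iteration idea can be demonstrated outside the
kernel. In the standalone sketch below (hypothetical names throughout),
every node is reachable through a plain linked list, so validating a
cached per-node value needs only constant stack space no matter how
large the structure grows:

#include <stdio.h>

struct node {
        unsigned long size;             /* stands in for va_size(va) */
        unsigned long cached_max;       /* stands in for subtree_max_size */
        struct node *next;              /* stands in for the ->list linkage */
};

/* In the kernel this would be derived from the rb-tree children. */
static unsigned long compute_max(const struct node *n)
{
        return n->size;
}

/* O(n) walk with O(1) stack: safe even for a huge tree. */
static void check_all(const struct node *head)
{
        for (const struct node *n = head; n; n = n->next) {
                if (compute_max(n) != n->cached_max)
                        fprintf(stderr, "corrupted: %lu, %lu\n",
                                n->size, n->cached_max);
        }
}

int main(void)
{
        struct node b = { 8, 8, NULL };
        struct node a = { 4, 5, &b };   /* deliberately corrupted */

        check_all(&a);
        return 0;
}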
From 15ae144f77027db35fad168f24a1b9469dcf70a7 Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)"
Date: Thu, 6 Aug 2020 23:24:15 -0700
Subject: mm/vmalloc: switch to "propagate()" callback

The augment_tree_propagate_from() function uses its own implementation
to propagate the tree from the specified node toward the root. However,
the RB_DECLARE_CALLBACKS_MAX macro already provides a "propagate()"
callback that does exactly the same thing. Having two similar
implementations is redundant, so reuse the macro's built-in
functionality. This also reduces code size.

Signed-off-by: Uladzislau Rezki (Sony)
Signed-off-by: Andrew Morton
Link: http://lkml.kernel.org/r/20200527205054.1696-3-urezki@gmail.com
Signed-off-by: Linus Torvalds
---
 mm/vmalloc.c | 25 ++++++-------------------
 1 file changed, 6 insertions(+), 19 deletions(-)

(limited to 'mm/vmalloc.c')

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 740b7e4cac03..ce17dbbd4bec 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -677,25 +677,12 @@ augment_tree_propagate_check(void)
 static __always_inline void
 augment_tree_propagate_from(struct vmap_area *va)
 {
-       struct rb_node *node = &va->rb_node;
-       unsigned long new_va_sub_max_size;
-
-       while (node) {
-               va = rb_entry(node, struct vmap_area, rb_node);
-               new_va_sub_max_size = compute_subtree_max_size(va);
-
-               /*
-                * If the newly calculated maximum available size of the
-                * subtree is equal to the current one, then it means that
-                * the tree is propagated correctly. So we have to stop at
-                * this point to save cycles.
-                */
-               if (va->subtree_max_size == new_va_sub_max_size)
-                       break;
-
-               va->subtree_max_size = new_va_sub_max_size;
-               node = rb_parent(&va->rb_node);
-       }
+       /*
+        * Populate the tree from bottom towards the root until
+        * the calculated maximum available size of checked node
+        * is equal to its current one.
+        */
+       free_vmap_area_rb_augment_cb_propagate(&va->rb_node, NULL);

 #if DEBUG_AUGMENT_PROPAGATE_CHECK
        augment_tree_propagate_check();
--
cgit v1.2.3
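For context, the reused callback is generated by the
RB_DECLARE_CALLBACKS_MAX macro from <linux/rbtree_augmented.h>. The
sketch below paraphrases how vmalloc.c is expected to declare it (shown
for illustration only; consult the tree for the authoritative form):

#include <linux/rbtree_augmented.h>

/*
 * Arguments: linkage, name prefix, containing struct, rb_node member,
 * type of the augmented value, augmented member, compute function.
 * This emits free_vmap_area_rb_augment_cb_propagate(), _copy() and
 * _rotate() helpers plus a struct rb_augment_callbacks instance.
 */
RB_DECLARE_CALLBACKS_MAX(static, free_vmap_area_rb_augment_cb,
                         struct vmap_area, rb_node, unsigned long,
                         subtree_max_size, va_size)

The generated _propagate() walks from the given node toward the root (or
an optional stop node), recomputing subtree_max_size and breaking out
early once a recomputed value already matches - the same early-exit
behaviour as the hand-rolled loop deleted above.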
From d758ffe6b9073ac7a21405cfdf2f7685aef8bacd Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)"
Date: Thu, 6 Aug 2020 23:24:18 -0700
Subject: mm/vmalloc: update the header about KVA rework

Record the author and the date of the KVA allocator rework in the file
header comment.

Signed-off-by: Uladzislau Rezki (Sony)
Signed-off-by: Andrew Morton
Reviewed-by: Andrew Morton
Link: http://lkml.kernel.org/r/20200622195821.4796-1-urezki@gmail.com
Signed-off-by: Linus Torvalds
---
 mm/vmalloc.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'mm/vmalloc.c')

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index ce17dbbd4bec..970cbc453bcd 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -7,6 +7,7 @@
  * SMP-safe vmalloc/vfree/ioremap, Tigran Aivazian , May 2000
  * Major rework to support vmap/vunmap, Christoph Hellwig, SGI, August 2002
  * Numa awareness, Christoph Lameter, SGI, June 2005
+ * Improving global KVA allocator, Uladzislau Rezki, Sony, May 2019
  */

 #include
--
cgit v1.2.3

From 1a69a623d98397e3968a3bd6e9f650e8a5e8fdce Mon Sep 17 00:00:00 2001
From: Mike Rapoport
Date: Thu, 6 Aug 2020 23:24:21 -0700
Subject: mm: vmalloc: remove redundant assignment in unmap_kernel_range_noflush()

'addr' is set to 'start' and then a few lines afterwards 'start' is set
to 'addr'. Remove the second assignment.

Fixes: 2ba3e6947aed ("mm/vmalloc: track which page-table levels were modified")
Signed-off-by: Mike Rapoport
Signed-off-by: Andrew Morton
Reviewed-by: David Hildenbrand
Cc: Joerg Roedel
Link: http://lkml.kernel.org/r/20200707163226.374685-1-rppt@kernel.org
Signed-off-by: Linus Torvalds
---
 mm/vmalloc.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'mm/vmalloc.c')

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 970cbc453bcd..3d7b94eb0ac0 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -175,7 +175,6 @@ void unmap_kernel_range_noflush(unsigned long start, unsigned long size)
        pgtbl_mod_mask mask = 0;

        BUG_ON(addr >= end);
-       start = addr;
        pgd = pgd_offset_k(addr);
        do {
                next = pgd_addr_end(addr, end);
--
cgit v1.2.3
From 9c801f61d0e9e19b685ceb6e5f7107ad29b7837e Mon Sep 17 00:00:00 2001
From: "Uladzislau Rezki (Sony)"
Date: Thu, 6 Aug 2020 23:24:24 -0700
Subject: mm/vmalloc.c: remove BUG() from find_va_links()

Get rid of the BUG() macro, which should be used only when a critical
situation occurs and the system is no longer able to function. Replace
it with WARN() and dump the start/end addresses of both overlapping
VAs. Such overlap data helps to figure out what happened and makes
further analysis easier; for example, if both areas are identical it
could mean a double free.

Recovery consists of declining all further steps regarding insertion of
the conflicting range. Consequently, find_va_links() can now return
NULL, so its return value has to be checked by callers.

A side effect is that memory can be leaked, but that is better than
killing the machine for no good reason, and debugging can be done on a
live system.

Signed-off-by: Uladzislau Rezki (Sony)
Signed-off-by: Andrew Morton
Cc: Hillf Danton
Cc: Michal Hocko
Cc: Matthew Wilcox
Cc: Oleksiy Avramchenko
Cc: Steven Rostedt
Link: http://lkml.kernel.org/r/20200711104531.12242-1-urezki@gmail.com
Signed-off-by: Linus Torvalds
---
 mm/vmalloc.c | 41 ++++++++++++++++++++++++++++++++---------
 1 file changed, 32 insertions(+), 9 deletions(-)

(limited to 'mm/vmalloc.c')

diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 3d7b94eb0ac0..b482d240f9a2 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -512,6 +512,10 @@ static struct vmap_area *__find_vmap_area(unsigned long addr)
 /*
  * This function returns back addresses of parent node
  * and its left or right link for further processing.
+ *
+ * Otherwise NULL is returned. In that case all further
+ * steps regarding inserting of conflicting overlap range
+ * have to be declined and actually considered as a bug.
  */
 static __always_inline struct rb_node **
 find_va_links(struct vmap_area *va,
@@ -550,8 +554,12 @@ find_va_links(struct vmap_area *va,
                else if (va->va_end > tmp_va->va_start &&
                                va->va_start >= tmp_va->va_end)
                        link = &(*link)->rb_right;
-               else
-                       BUG();
+               else {
+                       WARN(1, "vmalloc bug: 0x%lx-0x%lx overlaps with 0x%lx-0x%lx\n",
+                               va->va_start, va->va_end, tmp_va->va_start, tmp_va->va_end);
+
+                       return NULL;
+               }
        } while (*link);

        *parent = &tmp_va->rb_node;
@@ -697,7 +705,8 @@ insert_vmap_area(struct vmap_area *va,
        struct rb_node *parent;

        link = find_va_links(va, root, NULL, &parent);
-       link_va(va, root, parent, link, head);
+       if (link)
+               link_va(va, root, parent, link, head);
 }

 static void
@@ -713,8 +722,10 @@ insert_vmap_area_augment(struct vmap_area *va,
        else
                link = find_va_links(va, root, NULL, &parent);

-       link_va(va, root, parent, link, head);
-       augment_tree_propagate_from(va);
+       if (link) {
+               link_va(va, root, parent, link, head);
+               augment_tree_propagate_from(va);
+       }
 }

 /*
@@ -722,6 +733,11 @@ insert_vmap_area_augment(struct vmap_area *va,
  * and next free blocks. If coalesce is not done a new
  * free area is inserted. If VA has been merged, it is
  * freed.
+ *
+ * Please note, it can return NULL in case of overlap
+ * ranges, followed by WARN() report. Despite it is a
+ * buggy behaviour, a system can be alive and keep
+ * ongoing.
  */
 static __always_inline struct vmap_area *
 merge_or_add_vmap_area(struct vmap_area *va,
@@ -738,6 +754,8 @@ merge_or_add_vmap_area(struct vmap_area *va,
         * inserted, unless it is merged with its sibling/siblings.
         */
        link = find_va_links(va, root, NULL, &parent);
+       if (!link)
+               return NULL;

        /*
         * Get next node of VA to check if merging can be done.
@@ -1346,6 +1364,9 @@ static bool __purge_vmap_area_lazy(unsigned long start, unsigned long end)

                va = merge_or_add_vmap_area(va, &free_vmap_area_root,
                                            &free_vmap_area_list);
+               if (!va)
+                       continue;
+
                if (is_vmalloc_or_module_addr((void *)orig_start))
                        kasan_release_vmalloc(orig_start, orig_end,
                                              va->va_start, va->va_end);
@@ -3330,8 +3351,9 @@ recovery:
                orig_end = vas[area]->va_end;
                va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root,
                                            &free_vmap_area_list);
-               kasan_release_vmalloc(orig_start, orig_end,
-                                     va->va_start, va->va_end);
+               if (va)
+                       kasan_release_vmalloc(orig_start, orig_end,
+                                             va->va_start, va->va_end);
                vas[area] = NULL;
        }

@@ -3379,8 +3401,9 @@ err_free_shadow:
                orig_end = vas[area]->va_end;
                va = merge_or_add_vmap_area(vas[area], &free_vmap_area_root,
                                            &free_vmap_area_list);
-               kasan_release_vmalloc(orig_start, orig_end,
-                                     va->va_start, va->va_end);
+               if (va)
+                       kasan_release_vmalloc(orig_start, orig_end,
+                                             va->va_start, va->va_end);
                vas[area] = NULL;
                kfree(vms[area]);
        }
--
cgit v1.2.3
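As a footnote on the WARN()-and-decline pattern adopted above: the sketch
below (demo_claim_range() is hypothetical, not a kernel function) shows
the general shape of reporting a broken invariant without halting the
machine:

#include <linux/bug.h>
#include <linux/errno.h>

static int demo_claim_range(unsigned long start, unsigned long end,
                            unsigned long busy_start, unsigned long busy_end)
{
        /* Half-open ranges [start, end) overlap under this condition. */
        if (start < busy_end && end > busy_start) {
                /*
                 * WARN() prints the message plus a backtrace and taints
                 * the kernel, but execution continues. The caller sees
                 * the error and declines the operation, possibly leaking
                 * an object - cheaper than a dead machine, and the live
                 * system can still be inspected.
                 */
                WARN(1, "0x%lx-0x%lx overlaps with 0x%lx-0x%lx\n",
                     start, end, busy_start, busy_end);
                return -EBUSY;
        }
        return 0;
}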