From 1e3054b98c5415d5cb5f8824fc33b548ae5644c3 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Fri, 18 May 2018 16:08:44 -0700 Subject: lib/test_bitmap.c: fix bitmap optimisation tests to report errors correctly I had neglected to increment the error counter when the tests failed, which made the tests noisy when they fail, but not actually return an error code. Link: http://lkml.kernel.org/r/20180509114328.9887-1-mpe@ellerman.id.au Fixes: 3cc78125a081 ("lib/test_bitmap.c: add optimisation tests") Signed-off-by: Matthew Wilcox Signed-off-by: Michael Ellerman Reported-by: Michael Ellerman Tested-by: Michael Ellerman Reviewed-by: Kees Cook Cc: Yury Norov Cc: Andy Shevchenko Cc: Geert Uytterhoeven Cc: [4.13+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/test_bitmap.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/lib/test_bitmap.c b/lib/test_bitmap.c index de16f7869fb1..6cd7d0740005 100644 --- a/lib/test_bitmap.c +++ b/lib/test_bitmap.c @@ -331,23 +331,32 @@ static void noinline __init test_mem_optimisations(void) unsigned int start, nbits; for (start = 0; start < 1024; start += 8) { - memset(bmap1, 0x5a, sizeof(bmap1)); - memset(bmap2, 0x5a, sizeof(bmap2)); for (nbits = 0; nbits < 1024 - start; nbits += 8) { + memset(bmap1, 0x5a, sizeof(bmap1)); + memset(bmap2, 0x5a, sizeof(bmap2)); + bitmap_set(bmap1, start, nbits); __bitmap_set(bmap2, start, nbits); - if (!bitmap_equal(bmap1, bmap2, 1024)) + if (!bitmap_equal(bmap1, bmap2, 1024)) { printk("set not equal %d %d\n", start, nbits); - if (!__bitmap_equal(bmap1, bmap2, 1024)) + failed_tests++; + } + if (!__bitmap_equal(bmap1, bmap2, 1024)) { printk("set not __equal %d %d\n", start, nbits); + failed_tests++; + } bitmap_clear(bmap1, start, nbits); __bitmap_clear(bmap2, start, nbits); - if (!bitmap_equal(bmap1, bmap2, 1024)) + if (!bitmap_equal(bmap1, bmap2, 1024)) { printk("clear not equal %d %d\n", start, nbits); - if (!__bitmap_equal(bmap1, bmap2, 1024)) + failed_tests++; + } + if (!__bitmap_equal(bmap1, bmap2, 1024)) { printk("clear not __equal %d %d\n", start, nbits); + failed_tests++; + } } } } -- cgit v1.2.3 From d97baf9470b0668904aa96865abe7db4000dc3ba Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Fri, 18 May 2018 16:08:47 -0700 Subject: include/linux/mm.h: add new inline function vmf_error() Many places in drivers/ file systems, error was handled in a common way like below: ret = (ret == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS; vmf_error() will replace this and return vm_fault_t type err. A lot of drivers and filesystems currently have a rather complex mapping of errno-to-VM_FAULT code. We have been able to eliminate a lot of it by just returning VM_FAULT codes directly from functions which are called exclusively from the fault handling path. Some functions can be called both from the fault handler and other context which are expecting an errno, so they have to continue to return an errno. Some users still need to choose different behaviour for different errnos, but vmf_error() captures the essential error translation that's common to all users, and those that need to handle additional errors can handle them first. 
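As an illustration of the intended use (this example is not part of the patch; example_fault() and example_do_fault() are hypothetical names), a driver fault handler that used to open-code the translation:

	static vm_fault_t example_fault(struct vm_fault *vmf)
	{
		int err;

		/* example_do_fault() is assumed to return 0 or a -errno value */
		err = example_do_fault(vmf);
		if (err)
			return (err == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS;
		return VM_FAULT_NOPAGE;
	}

can instead end with:

		if (err)
			return vmf_error(err);
		return VM_FAULT_NOPAGE;
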
Link: http://lkml.kernel.org/r/20180510174826.GA14268@jordon-HP-15-Notebook-PC Signed-off-by: Souptick Joarder Reviewed-by: Matthew Wilcox Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/include/linux/mm.h b/include/linux/mm.h index c080af584ddd..c6fa9a255dbf 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2466,6 +2466,13 @@ static inline vm_fault_t vmf_insert_pfn(struct vm_area_struct *vma, return VM_FAULT_NOPAGE; } +static inline vm_fault_t vmf_error(int err) +{ + if (err == -ENOMEM) + return VM_FAULT_OOM; + return VM_FAULT_SIGBUS; +} + struct page *follow_page_mask(struct vm_area_struct *vma, unsigned long address, unsigned int foll_flags, unsigned int *page_mask); -- cgit v1.2.3 From 8d9fa88edd5e360b71765feeadb915d4066c9684 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Fri, 18 May 2018 16:08:51 -0700 Subject: radix tree test suite: fix mapshift build target Commit c6ce3e2fe3da ("radix tree test suite: Add config option for map shift") introduced a phony makefile target called 'mapshift' that ends up generating the file generated/map-shift.h. This phony target was then added as a dependency of the top level 'targets' build target, which is what is run when you go to tools/testing/radix-tree and just type 'make'. Unfortunately, this phony target doesn't actually work as a dependency, so you end up getting: $ make make: *** No rule to make target 'generated/map-shift.h', needed by 'main.o'. Stop. make: *** Waiting for unfinished jobs.... Fix this by making the file generated/map-shift.h our real makefile target, and add this a dependency of the top level build target. Link: http://lkml.kernel.org/r/20180503192430.7582-2-ross.zwisler@linux.intel.com Signed-off-by: Ross Zwisler Cc: Christoph Hellwig Cc: CR, Sapthagirish Cc: Dan Williams Cc: Dave Chinner Cc: Jan Kara Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/radix-tree/Makefile | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/tools/testing/radix-tree/Makefile b/tools/testing/radix-tree/Makefile index fa7ee369b3c9..db66f8a0d4be 100644 --- a/tools/testing/radix-tree/Makefile +++ b/tools/testing/radix-tree/Makefile @@ -17,7 +17,7 @@ ifeq ($(BUILD), 32) LDFLAGS += -m32 endif -targets: mapshift $(TARGETS) +targets: generated/map-shift.h $(TARGETS) main: $(OFILES) @@ -42,9 +42,7 @@ radix-tree.c: ../../../lib/radix-tree.c idr.c: ../../../lib/idr.c sed -e 's/^static //' -e 's/__always_inline //' -e 's/inline //' < $< > $@ -.PHONY: mapshift - -mapshift: +generated/map-shift.h: @if ! 
grep -qws $(SHIFT) generated/map-shift.h; then \ echo "#define RADIX_TREE_MAP_SHIFT $(SHIFT)" > \ generated/map-shift.h; \ -- cgit v1.2.3 From dcbbf25adb31410c95ce844f80d372ed38b68b24 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Fri, 18 May 2018 16:08:54 -0700 Subject: radix tree test suite: fix compilation issue Pulled from a patch from Matthew Wilcox entitled "xarray: Add definition of struct xarray": > From: Matthew Wilcox > Signed-off-by: Matthew Wilcox https://patchwork.kernel.org/patch/10341249/ These defines fix this compilation error: In file included from ./linux/radix-tree.h:6:0, from ./linux/../../../../include/linux/idr.h:15, from ./linux/idr.h:1, from idr.c:4: ./linux/../../../../include/linux/idr.h: In function `idr_init_base': ./linux/../../../../include/linux/radix-tree.h:129:2: warning: implicit declaration of function `spin_lock_init'; did you mean `spinlock_t'? [-Wimplicit-function-declaration] spin_lock_init(&(root)->xa_lock); \ ^ ./linux/../../../../include/linux/idr.h:126:2: note: in expansion of macro `INIT_RADIX_TREE' INIT_RADIX_TREE(&idr->idr_rt, IDR_RT_MARKER); ^~~~~~~~~~~~~~~ by providing a spin_lock_init() wrapper for the v4.17-rc* version of the radix tree test suite. Link: http://lkml.kernel.org/r/20180503192430.7582-3-ross.zwisler@linux.intel.com Signed-off-by: Ross Zwisler Cc: Christoph Hellwig Cc: CR, Sapthagirish Cc: Dan Williams Cc: Dave Chinner Cc: Jan Kara Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/include/linux/spinlock.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/include/linux/spinlock.h b/tools/include/linux/spinlock.h index b21b586b9854..1738c0391da4 100644 --- a/tools/include/linux/spinlock.h +++ b/tools/include/linux/spinlock.h @@ -6,8 +6,9 @@ #include #define spinlock_t pthread_mutex_t -#define DEFINE_SPINLOCK(x) pthread_mutex_t x = PTHREAD_MUTEX_INITIALIZER; +#define DEFINE_SPINLOCK(x) pthread_mutex_t x = PTHREAD_MUTEX_INITIALIZER #define __SPIN_LOCK_UNLOCKED(x) (pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER +#define spin_lock_init(x) pthread_mutex_init(x, NULL) #define spin_lock_irqsave(x, f) (void)f, pthread_mutex_lock(x) #define spin_unlock_irqrestore(x, f) (void)f, pthread_mutex_unlock(x) -- cgit v1.2.3 From 3e252fa7d4f711798e7a3f5ff2d7b62f0e2987ce Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Fri, 18 May 2018 16:08:58 -0700 Subject: radix tree test suite: add item_delete_rcu() Currently the lifetime of "struct item" entries in the radix tree are not controlled by RCU, but are instead deleted inline as they are removed from the tree. In the following patches we add a test which has threads iterating over items pulled from the tree and verifying them in an rcu_read_lock()/rcu_read_unlock() section. This means that though an item has been removed from the tree it could still be being worked on by other threads until the RCU grace period expires. So, we need to actually free the "struct item" structures at the end of the grace period, just as we do with "struct radix_tree_node" items. 
Link: http://lkml.kernel.org/r/20180503192430.7582-4-ross.zwisler@linux.intel.com Signed-off-by: Ross Zwisler Cc: Christoph Hellwig Cc: CR, Sapthagirish Cc: Dan Williams Cc: Dave Chinner Cc: Jan Kara Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/radix-tree/test.c | 19 +++++++++++++++++++ tools/testing/radix-tree/test.h | 2 ++ 2 files changed, 21 insertions(+) diff --git a/tools/testing/radix-tree/test.c b/tools/testing/radix-tree/test.c index 5978ab1f403d..def6015570b2 100644 --- a/tools/testing/radix-tree/test.c +++ b/tools/testing/radix-tree/test.c @@ -75,6 +75,25 @@ int item_delete(struct radix_tree_root *root, unsigned long index) return 0; } +static void item_free_rcu(struct rcu_head *head) +{ + struct item *item = container_of(head, struct item, rcu_head); + + free(item); +} + +int item_delete_rcu(struct radix_tree_root *root, unsigned long index) +{ + struct item *item = radix_tree_delete(root, index); + + if (item) { + item_sanity(item, index); + call_rcu(&item->rcu_head, item_free_rcu); + return 1; + } + return 0; +} + void item_check_present(struct radix_tree_root *root, unsigned long index) { struct item *item; diff --git a/tools/testing/radix-tree/test.h b/tools/testing/radix-tree/test.h index d9c031dbeb1a..8cf4a7a7f94c 100644 --- a/tools/testing/radix-tree/test.h +++ b/tools/testing/radix-tree/test.h @@ -5,6 +5,7 @@ #include struct item { + struct rcu_head rcu_head; unsigned long index; unsigned int order; }; @@ -15,6 +16,7 @@ int item_insert(struct radix_tree_root *root, unsigned long index); int item_insert_order(struct radix_tree_root *root, unsigned long index, unsigned order); int item_delete(struct radix_tree_root *root, unsigned long index); +int item_delete_rcu(struct radix_tree_root *root, unsigned long index); struct item *item_lookup(struct radix_tree_root *root, unsigned long index); void item_check_present(struct radix_tree_root *root, unsigned long index); -- cgit v1.2.3 From fd8f58c40b703e47697c9f12bc16c31f14c161f1 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Fri, 18 May 2018 16:09:01 -0700 Subject: radix tree test suite: multi-order iteration race Add a test which shows a race in the multi-order iteration code. This test reliably hits the race in under a second on my machine, and is the result of a real bug report against kernel a production v4.15 based kernel (4.15.6-300.fc27.x86_64). With a real kernel this issue is hit when using order 9 PMD DAX radix tree entries. The race has to do with how we tear down multi-order sibling entries when we are removing an item from the tree. Remember that an order 2 entry looks like this: struct radix_tree_node.slots[] = [entry][sibling][sibling][sibling] where 'entry' is in some slot in the struct radix_tree_node, and the three slots following 'entry' contain sibling pointers which point back to 'entry.' When we delete 'entry' from the tree, we call : radix_tree_delete() radix_tree_delete_item() __radix_tree_delete() replace_slot() replace_slot() first removes the siblings in order from the first to the last, then at then replaces 'entry' with NULL. This means that for a brief period of time we end up with one or more of the siblings removed, so: struct radix_tree_node.slots[] = [entry][NULL][sibling][sibling] This causes an issue if you have a reader iterating over the slots in the tree via radix_tree_for_each_slot() while only under rcu_read_lock()/rcu_read_unlock() protection. This is a common case in mm/filemap.c. 
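For reference, the lockless reader pattern in question looks roughly like this (a minimal sketch, not copied from mm/filemap.c; 'root' stands for whatever radix tree the reader is walking, and it mirrors the iterator_func added by this patch):

	struct radix_tree_iter iter;
	void **slot;

	rcu_read_lock();
	radix_tree_for_each_slot(slot, root, &iter, 0) {
		void *entry = radix_tree_deref_slot(slot);

		if (!entry)
			continue;
		if (radix_tree_deref_retry(entry)) {
			/* the slot changed under us; restart from this index */
			slot = radix_tree_iter_retry(&iter);
			continue;
		}
		/* use 'entry'; it must stay valid until rcu_read_unlock() */
	}
	rcu_read_unlock();

Nothing in this loop takes the tree lock, so it can observe the intermediate states of replace_slot() described above.
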
The issue is that when __radix_tree_next_slot() => skip_siblings() tries to skip over the sibling entries in the slots, it currently does so with an exact match on the slot directly preceding our current slot. Normally this works: V preceding slot struct radix_tree_node.slots[] = [entry][sibling][sibling][sibling] ^ current slot This lets you find the first sibling, and you skip them all in order. But in the case where one of the siblings is NULL, that slot is skipped and then our sibling detection is interrupted: V preceding slot struct radix_tree_node.slots[] = [entry][NULL][sibling][sibling] ^ current slot This means that the sibling pointers aren't recognized since they point all the way back to 'entry', so we think that they are normal internal radix tree pointers. This causes us to think we need to walk down to a struct radix_tree_node starting at the address of 'entry'. In a real running kernel this will crash the thread with a GP fault when you try and dereference the slots in your broken node starting at 'entry'. In the radix tree test suite this will be caught by the address sanitizer: ==27063==ERROR: AddressSanitizer: heap-buffer-overflow on address 0x60c0008ae400 at pc 0x00000040ce4f bp 0x7fa89b8fcad0 sp 0x7fa89b8fcac0 READ of size 8 at 0x60c0008ae400 thread T3 #0 0x40ce4e in __radix_tree_next_slot /home/rzwisler/project/linux/tools/testing/radix-tree/radix-tree.c:1660 #1 0x4022cc in radix_tree_next_slot linux/../../../../include/linux/radix-tree.h:567 #2 0x4022cc in iterator_func /home/rzwisler/project/linux/tools/testing/radix-tree/multiorder.c:655 #3 0x7fa8a088d50a in start_thread (/lib64/libpthread.so.0+0x750a) #4 0x7fa8a03bd16e in clone (/lib64/libc.so.6+0xf516e) Link: http://lkml.kernel.org/r/20180503192430.7582-5-ross.zwisler@linux.intel.com Signed-off-by: Ross Zwisler Cc: Christoph Hellwig Cc: CR, Sapthagirish Cc: Dan Williams Cc: Dave Chinner Cc: Jan Kara Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/radix-tree/multiorder.c | 63 +++++++++++++++++++++++++++++++++++ tools/testing/radix-tree/test.h | 1 + 2 files changed, 64 insertions(+) diff --git a/tools/testing/radix-tree/multiorder.c b/tools/testing/radix-tree/multiorder.c index 59245b3d587c..7bf405638b0b 100644 --- a/tools/testing/radix-tree/multiorder.c +++ b/tools/testing/radix-tree/multiorder.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "test.h" @@ -624,6 +625,67 @@ static void multiorder_account(void) item_kill_tree(&tree); } +bool stop_iteration = false; + +static void *creator_func(void *ptr) +{ + /* 'order' is set up to ensure we have sibling entries */ + unsigned int order = RADIX_TREE_MAP_SHIFT - 1; + struct radix_tree_root *tree = ptr; + int i; + + for (i = 0; i < 10000; i++) { + item_insert_order(tree, 0, order); + item_delete_rcu(tree, 0); + } + + stop_iteration = true; + return NULL; +} + +static void *iterator_func(void *ptr) +{ + struct radix_tree_root *tree = ptr; + struct radix_tree_iter iter; + struct item *item; + void **slot; + + while (!stop_iteration) { + rcu_read_lock(); + radix_tree_for_each_slot(slot, tree, &iter, 0) { + item = radix_tree_deref_slot(slot); + + if (!item) + continue; + if (radix_tree_deref_retry(item)) { + slot = radix_tree_iter_retry(&iter); + continue; + } + + item_sanity(item, iter.index); + } + rcu_read_unlock(); + } + return NULL; +} + +static void multiorder_iteration_race(void) +{ + const int num_threads = sysconf(_SC_NPROCESSORS_ONLN); + pthread_t worker_thread[num_threads]; + RADIX_TREE(tree, 
GFP_KERNEL); + int i; + + pthread_create(&worker_thread[0], NULL, &creator_func, &tree); + for (i = 1; i < num_threads; i++) + pthread_create(&worker_thread[i], NULL, &iterator_func, &tree); + + for (i = 0; i < num_threads; i++) + pthread_join(worker_thread[i], NULL); + + item_kill_tree(&tree); +} + void multiorder_checks(void) { int i; @@ -644,6 +706,7 @@ void multiorder_checks(void) multiorder_join(); multiorder_split(); multiorder_account(); + multiorder_iteration_race(); radix_tree_cpu_dead(0); } diff --git a/tools/testing/radix-tree/test.h b/tools/testing/radix-tree/test.h index 8cf4a7a7f94c..31f1d9b6f506 100644 --- a/tools/testing/radix-tree/test.h +++ b/tools/testing/radix-tree/test.h @@ -13,6 +13,7 @@ struct item { struct item *item_create(unsigned long index, unsigned int order); int __item_insert(struct radix_tree_root *root, struct item *item); int item_insert(struct radix_tree_root *root, unsigned long index); +void item_sanity(struct item *item, unsigned long index); int item_insert_order(struct radix_tree_root *root, unsigned long index, unsigned order); int item_delete(struct radix_tree_root *root, unsigned long index); -- cgit v1.2.3 From 9f418224e8114156d995b98fa4e0f4fd21f685fe Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Fri, 18 May 2018 16:09:06 -0700 Subject: radix tree: fix multi-order iteration race Fix a race in the multi-order iteration code which causes the kernel to hit a GP fault. This was first seen with a production v4.15 based kernel (4.15.6-300.fc27.x86_64) utilizing a DAX workload which used order 9 PMD DAX entries. The race has to do with how we tear down multi-order sibling entries when we are removing an item from the tree. Remember for example that an order 2 entry looks like this: struct radix_tree_node.slots[] = [entry][sibling][sibling][sibling] where 'entry' is in some slot in the struct radix_tree_node, and the three slots following 'entry' contain sibling pointers which point back to 'entry.' When we delete 'entry' from the tree, we call : radix_tree_delete() radix_tree_delete_item() __radix_tree_delete() replace_slot() replace_slot() first removes the siblings in order from the first to the last, then at then replaces 'entry' with NULL. This means that for a brief period of time we end up with one or more of the siblings removed, so: struct radix_tree_node.slots[] = [entry][NULL][sibling][sibling] This causes an issue if you have a reader iterating over the slots in the tree via radix_tree_for_each_slot() while only under rcu_read_lock()/rcu_read_unlock() protection. This is a common case in mm/filemap.c. The issue is that when __radix_tree_next_slot() => skip_siblings() tries to skip over the sibling entries in the slots, it currently does so with an exact match on the slot directly preceding our current slot. Normally this works: V preceding slot struct radix_tree_node.slots[] = [entry][sibling][sibling][sibling] ^ current slot This lets you find the first sibling, and you skip them all in order. But in the case where one of the siblings is NULL, that slot is skipped and then our sibling detection is interrupted: V preceding slot struct radix_tree_node.slots[] = [entry][NULL][sibling][sibling] ^ current slot This means that the sibling pointers aren't recognized since they point all the way back to 'entry', so we think that they are normal internal radix tree pointers. This causes us to think we need to walk down to a struct radix_tree_node starting at the address of 'entry'. 
In a real running kernel this will crash the thread with a GP fault when you try and dereference the slots in your broken node starting at 'entry'. We fix this race by fixing the way that skip_siblings() detects sibling nodes. Instead of testing against the preceding slot we instead look for siblings via is_sibling_entry() which compares against the position of the struct radix_tree_node.slots[] array. This ensures that sibling entries are properly identified, even if they are no longer contiguous with the 'entry' they point to. Link: http://lkml.kernel.org/r/20180503192430.7582-6-ross.zwisler@linux.intel.com Fixes: 148deab223b2 ("radix-tree: improve multiorder iterators") Signed-off-by: Ross Zwisler Reported-by: CR, Sapthagirish Reviewed-by: Jan Kara Cc: Matthew Wilcox Cc: Christoph Hellwig Cc: Dan Williams Cc: Dave Chinner Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/radix-tree.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/lib/radix-tree.c b/lib/radix-tree.c index da9e10c827df..43e0cbedc3a0 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -1612,11 +1612,9 @@ static void set_iter_tags(struct radix_tree_iter *iter, static void __rcu **skip_siblings(struct radix_tree_node **nodep, void __rcu **slot, struct radix_tree_iter *iter) { - void *sib = node_to_entry(slot - 1); - while (iter->index < iter->next_index) { *nodep = rcu_dereference_raw(*slot); - if (*nodep && *nodep != sib) + if (*nodep && !is_sibling_entry(iter->node, *nodep)) return slot; slot++; iter->index = __radix_tree_iter_add(iter, 1); @@ -1631,7 +1629,7 @@ void __rcu **__radix_tree_next_slot(void __rcu **slot, struct radix_tree_iter *iter, unsigned flags) { unsigned tag = flags & RADIX_TREE_ITER_TAG_MASK; - struct radix_tree_node *node = rcu_dereference_raw(*slot); + struct radix_tree_node *node; slot = skip_siblings(&node, slot, iter); -- cgit v1.2.3 From f3d8d3cfc1c00d1ebdb53ee96aa53aa822209003 Mon Sep 17 00:00:00 2001 From: "Shuah Khan (Samsung OSG)" Date: Fri, 18 May 2018 16:09:09 -0700 Subject: MAINTAINERS: add Q: entry to kselftest for patchwork project A new patchwork project is created to track kselftest patches. Update the kselftest entry in the MAINTAINERS file adding 'Q:' entry: https://patchwork.kernel.org/project/linux-kselftest/list/ Link: http://lkml.kernel.org/r/20180515164427.12201-1-shuah@kernel.org Signed-off-by: Shuah Khan (Samsung OSG) Cc: David S. Miller Cc: Mauro Carvalho Chehab Cc: Greg Kroah-Hartman Cc: Linus Walleij Cc: Randy Dunlap Cc: Joe Perches Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 92e47b5b0480..9d1bc95dfe3c 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7698,6 +7698,7 @@ KERNEL SELFTEST FRAMEWORK M: Shuah Khan L: linux-kselftest@vger.kernel.org T: git git://git.kernel.org/pub/scm/linux/kernel/git/shuah/linux-kselftest.git +Q: https://patchwork.kernel.org/project/linux-kselftest/list/ S: Maintained F: tools/testing/selftests/ F: Documentation/dev-tools/kselftest* -- cgit v1.2.3 From ab1e8d8960b68f54af42b6484b5950bd13a4054b Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Fri, 18 May 2018 16:09:13 -0700 Subject: mm: don't allow deferred pages with NEED_PER_CPU_KM It is unsafe to do virtual to physical translations before mm_init() is called if struct page is needed in order to determine the memory section number (see SECTION_IN_PAGE_FLAGS). 
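For reference, here is a simplified paraphrase of why the translation depends on an initialized struct page when SECTION_IN_PAGE_FLAGS is defined, i.e. with CONFIG_SPARSEMEM set and CONFIG_SPARSEMEM_VMEMMAP not set (illustrative only, condensed from include/linux/mm.h and include/asm-generic/memory_model.h rather than copied verbatim):

	/* the section number of a page is stored in page->flags ... */
	static inline unsigned long page_to_section(const struct page *page)
	{
		return (page->flags >> SECTIONS_PGSHIFT) & SECTIONS_MASK;
	}

	/* ... and page_to_pfn()/page_to_phys() read it back out */
	#define __page_to_pfn(pg)					\
	({	const struct page *__pg = (pg);				\
		int __sec = page_to_section(__pg);			\
		(unsigned long)(__pg -					\
			__section_mem_map_addr(__nr_to_section(__sec))); \
	})

If page->flags has not been initialized yet, the section number read here is garbage and the resulting physical address is bogus.
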
This is because only in mm_init() we initialize struct pages for all the allocated memory when deferred struct pages are used. My recent fix in commit c9e97a1997 ("mm: initialize pages on demand during boot") exposed this problem, because it greatly reduced number of pages that are initialized before mm_init(), but the problem existed even before my fix, as Fengguang Wu found. Below is a more detailed explanation of the problem. We initialize struct pages in four places: 1. Early in boot a small set of struct pages is initialized to fill the first section, and lower zones. 2. During mm_init() we initialize "struct pages" for all the memory that is allocated, i.e reserved in memblock. 3. Using on-demand logic when pages are allocated after mm_init call (when memblock is finished) 4. After smp_init() when the rest free deferred pages are initialized. The problem occurs if we try to do va to phys translation of a memory between steps 1 and 2. Because we have not yet initialized struct pages for all the reserved pages, it is inherently unsafe to do va to phys if the translation itself requires access of "struct page" as in case of this combination: CONFIG_SPARSE && !CONFIG_SPARSE_VMEMMAP The following path exposes the problem: start_kernel() trap_init() setup_cpu_entry_areas() setup_cpu_entry_area(cpu) get_cpu_gdt_paddr(cpu) per_cpu_ptr_to_phys(addr) pcpu_addr_to_page(addr) virt_to_page(addr) pfn_to_page(__pa(addr) >> PAGE_SHIFT) We disable this path by not allowing NEED_PER_CPU_KM with deferred struct pages feature. The problems are discussed in these threads: http://lkml.kernel.org/r/20180418135300.inazvpxjxowogyge@wfg-t540p.sh.intel.com http://lkml.kernel.org/r/20180419013128.iurzouiqxvcnpbvz@wfg-t540p.sh.intel.com http://lkml.kernel.org/r/20180426202619.2768-1-pasha.tatashin@oracle.com Link: http://lkml.kernel.org/r/20180515175124.1770-1-pasha.tatashin@oracle.com Fixes: 3a80a7fa7989 ("mm: meminit: initialise a subset of struct pages if CONFIG_DEFERRED_STRUCT_PAGE_INIT is set") Signed-off-by: Pavel Tatashin Acked-by: Michal Hocko Reviewed-by: Andrew Morton Cc: Steven Sistare Cc: Daniel Jordan Cc: Mel Gorman Cc: Fengguang Wu Cc: Dennis Zhou Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/Kconfig b/mm/Kconfig index d5004d82a1d6..e14c01513bfd 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -636,6 +636,7 @@ config DEFERRED_STRUCT_PAGE_INIT default n depends on NO_BOOTMEM depends on !FLATMEM + depends on !NEED_PER_CPU_KM help Ordinarily all struct pages are initialised during early boot in a single thread. On very large machines this can take a considerable -- cgit v1.2.3 From 66072c29328717072fd84aaff3e070e3f008ba77 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Fri, 18 May 2018 16:09:16 -0700 Subject: hfsplus: stop workqueue when fill_super() failed syzbot is reporting ODEBUG messages at hfsplus_fill_super() [1]. This is because hfsplus_fill_super() forgot to call cancel_delayed_work_sync(). As far as I can see, it is hfsplus_mark_mdb_dirty() from hfsplus_new_inode() in hfsplus_fill_super() that calls queue_delayed_work(). Therefore, I assume that hfsplus_new_inode() does not fail if queue_delayed_work() was called, and the out_put_hidden_dir label is the appropriate location to call cancel_delayed_work_sync(). 
[1] https://syzkaller.appspot.com/bug?id=a66f45e96fdbeb76b796bf46eb25ea878c42a6c9

Link: http://lkml.kernel.org/r/964a8b27-cd69-357c-fe78-76b066056201@I-love.SAKURA.ne.jp
Signed-off-by: Tetsuo Handa
Reported-by: syzbot
Cc: Al Viro
Cc: David Howells
Cc: Ernesto A. Fernandez
Cc: Vyacheslav Dubeyko
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/hfsplus/super.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/fs/hfsplus/super.c b/fs/hfsplus/super.c
index 513c357c734b..a6c0f54c48c3 100644
--- a/fs/hfsplus/super.c
+++ b/fs/hfsplus/super.c
@@ -588,6 +588,7 @@ static int hfsplus_fill_super(struct super_block *sb, void *data, int silent)
 	return 0;

 out_put_hidden_dir:
+	cancel_delayed_work_sync(&sbi->sync_work);
 	iput(sbi->hidden_dir);
 out_put_root:
 	dput(sb->s_root);
-- cgit v1.2.3