From c7be96af89d4b53211862d8599b2430e8900ed92 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Wed, 14 Dec 2016 15:04:10 -0800 Subject: signals: avoid unnecessary taking of sighand->siglock When running certain database workload on a high-end system with many CPUs, it was found that spinlock contention in the sigprocmask syscalls became a significant portion of the overall CPU cycles as shown below. 9.30% 9.30% 905387 dataserver /proc/kcore 0x7fff8163f4d2 [k] _raw_spin_lock_irq | ---_raw_spin_lock_irq | |--99.34%-- __set_current_blocked | sigprocmask | sys_rt_sigprocmask | system_call_fastpath | | | |--50.63%-- __swapcontext | | | | | |--99.91%-- upsleepgeneric | | | |--49.36%-- __setcontext | | ktskRun Looking further into the swapcontext function in glibc, it was found that the function always call sigprocmask() without checking if there are changes in the signal mask. A check was added to the __set_current_blocked() function to avoid taking the sighand->siglock spinlock if there is no change in the signal mask. This will prevent unneeded spinlock contention when many threads are trying to call sigprocmask(). With this patch applied, the spinlock contention in sigprocmask() was gone. Link: http://lkml.kernel.org/r/1474979209-11867-1-git-send-email-Waiman.Long@hpe.com Signed-off-by: Waiman Long Acked-by: Oleg Nesterov Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Stas Sergeev Cc: Scott J Norton Cc: Douglas Hatch Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/signal.h | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'include') diff --git a/include/linux/signal.h b/include/linux/signal.h index b63f63eaa39c..5308304993be 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -97,6 +97,23 @@ static inline int sigisemptyset(sigset_t *set) } } +static inline int sigequalsets(const sigset_t *set1, const sigset_t *set2) +{ + switch (_NSIG_WORDS) { + case 4: + return (set1->sig[3] == set2->sig[3]) && + (set1->sig[2] == set2->sig[2]) && + (set1->sig[1] == set2->sig[1]) && + (set1->sig[0] == set2->sig[0]); + case 2: + return (set1->sig[1] == set2->sig[1]) && + (set1->sig[0] == set2->sig[0]); + case 1: + return set1->sig[0] == set2->sig[0]; + } + return 0; +} + #define sigmask(sig) (1UL << ((sig) - 1)) #ifndef __HAVE_ARCH_SIG_SETOPS -- cgit v1.2.3 From 69f58384791ac6da4165ce8e6defd6f408f4afdf Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 14 Dec 2016 15:04:16 -0800 Subject: Revert "kdump, vmcoreinfo: report memory sections virtual addresses" This reverts commit 0549a3c02efb ("kdump, vmcoreinfo: report memory sections virtual addresses"). Commit 0549a3c02efb tells the userspace utility makedumpfile the randomized base address of these memmory sections when mm kaslr is enabled. However the following patch "kexec: export the value of phys_base instead of symbol address" makes makedumpfile not need these addresses any more. Besides we should use VMCOREINFO_NUMBER to export the value of the variable so that we can use the existing number_table mechanism of Makedumpfile to fetch it. So revert it now. If needed we can add it later. http://lists.infradead.org/pipermail/kexec/2016-October/017540.html Link: http://lkml.kernel.org/r/1478568596-30060-1-git-send-email-bhe@redhat.com Signed-off-by: Baoquan He Cc: Thomas Garnier Cc: Baoquan He Cc: Thomas Gleixner Cc: Ingo Molnar Cc: "H . Peter Anvin" Cc: Eric Biederman Cc: Xunlei Pang Cc: HATAYAMA Daisuke Cc: Kees Cook Cc: Eugene Surovegin Cc: Dave Young Cc: AKASHI Takahiro Cc: Atsushi Kumagai Cc: Dave Anderson Cc: Pratyush Anand Cc: Vivek Goyal Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/machine_kexec_64.c | 3 --- include/linux/kexec.h | 6 ------ 2 files changed, 9 deletions(-) (limited to 'include') diff --git a/arch/x86/kernel/machine_kexec_64.c b/arch/x86/kernel/machine_kexec_64.c index 8c1f218926d7..5a294e48b185 100644 --- a/arch/x86/kernel/machine_kexec_64.c +++ b/arch/x86/kernel/machine_kexec_64.c @@ -337,9 +337,6 @@ void arch_crash_save_vmcoreinfo(void) #endif vmcoreinfo_append_str("KERNELOFFSET=%lx\n", kaslr_offset()); - VMCOREINFO_PAGE_OFFSET(PAGE_OFFSET); - VMCOREINFO_VMALLOC_START(VMALLOC_START); - VMCOREINFO_VMEMMAP_START(VMEMMAP_START); } /* arch-dependent functionality related to kexec file-based syscall */ diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 406c33dcae13..d7437777baaa 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -259,12 +259,6 @@ phys_addr_t paddr_vmcoreinfo_note(void); vmcoreinfo_append_str("NUMBER(%s)=%ld\n", #name, (long)name) #define VMCOREINFO_CONFIG(name) \ vmcoreinfo_append_str("CONFIG_%s=y\n", #name) -#define VMCOREINFO_PAGE_OFFSET(value) \ - vmcoreinfo_append_str("PAGE_OFFSET=%lx\n", (unsigned long)value) -#define VMCOREINFO_VMALLOC_START(value) \ - vmcoreinfo_append_str("VMALLOC_START=%lx\n", (unsigned long)value) -#define VMCOREINFO_VMEMMAP_START(value) \ - vmcoreinfo_append_str("VMEMMAP_START=%lx\n", (unsigned long)value) extern struct kimage *kexec_image; extern struct kimage *kexec_crash_image; -- cgit v1.2.3 From 0495c3d367944e4af053983ff3cdf256b567b053 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 14 Dec 2016 15:05:23 -0800 Subject: dma: add calls for dma_map_page_attrs and dma_unmap_page_attrs Add support for mapping and unmapping a page with attributes. The primary use for this is currently to allow for us to pass the DMA_ATTR_SKIP_CPU_SYNC attribute when mapping and unmapping a page. On some architectures such as ARM the synchronization has significant overhead and if we are already taking care of the sync_for_cpu and sync_for_device from the driver there isn't much need to handle this in the map/unmap calls as well. Link: http://lkml.kernel.org/r/20161110113601.76501.46095.stgit@ahduyck-blue-test.jf.intel.com Signed-off-by: Alexander Duyck Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/dma-mapping.h | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index 08528afdf58b..10c5a17b1f51 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -243,29 +243,33 @@ static inline void dma_unmap_sg_attrs(struct device *dev, struct scatterlist *sg ops->unmap_sg(dev, sg, nents, dir, attrs); } -static inline dma_addr_t dma_map_page(struct device *dev, struct page *page, - size_t offset, size_t size, - enum dma_data_direction dir) +static inline dma_addr_t dma_map_page_attrs(struct device *dev, + struct page *page, + size_t offset, size_t size, + enum dma_data_direction dir, + unsigned long attrs) { struct dma_map_ops *ops = get_dma_ops(dev); dma_addr_t addr; kmemcheck_mark_initialized(page_address(page) + offset, size); BUG_ON(!valid_dma_direction(dir)); - addr = ops->map_page(dev, page, offset, size, dir, 0); + addr = ops->map_page(dev, page, offset, size, dir, attrs); debug_dma_map_page(dev, page, offset, size, dir, addr, false); return addr; } -static inline void dma_unmap_page(struct device *dev, dma_addr_t addr, - size_t size, enum dma_data_direction dir) +static inline void dma_unmap_page_attrs(struct device *dev, + dma_addr_t addr, size_t size, + enum dma_data_direction dir, + unsigned long attrs) { struct dma_map_ops *ops = get_dma_ops(dev); BUG_ON(!valid_dma_direction(dir)); if (ops->unmap_page) - ops->unmap_page(dev, addr, size, dir, 0); + ops->unmap_page(dev, addr, size, dir, attrs); debug_dma_unmap_page(dev, addr, size, dir, false); } @@ -385,6 +389,8 @@ dma_sync_sg_for_device(struct device *dev, struct scatterlist *sg, #define dma_unmap_single(d, a, s, r) dma_unmap_single_attrs(d, a, s, r, 0) #define dma_map_sg(d, s, n, r) dma_map_sg_attrs(d, s, n, r, 0) #define dma_unmap_sg(d, s, n, r) dma_unmap_sg_attrs(d, s, n, r, 0) +#define dma_map_page(d, p, o, s, r) dma_map_page_attrs(d, p, o, s, r, 0) +#define dma_unmap_page(d, a, s, r) dma_unmap_page_attrs(d, a, s, r, 0) extern int dma_common_mmap(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, dma_addr_t dma_addr, size_t size); -- cgit v1.2.3 From 44fdffd70504c15b617686753dfdf9eb0ddf3729 Mon Sep 17 00:00:00 2001 From: Alexander Duyck Date: Wed, 14 Dec 2016 15:05:26 -0800 Subject: mm: add support for releasing multiple instances of a page Add a function that allows us to batch free a page that has multiple references outstanding. Specifically this function can be used to drop a page being used in the page frag alloc cache. With this drivers can make use of functionality similar to the page frag alloc cache without having to do any workarounds for the fact that there is no function that frees multiple references. Link: http://lkml.kernel.org/r/20161110113606.76501.70752.stgit@ahduyck-blue-test.jf.intel.com Signed-off-by: Alexander Duyck Cc: "David S. Miller" Cc: "James E.J. Bottomley" Cc: Chris Metcalf Cc: David Howells Cc: Geert Uytterhoeven Cc: Hans-Christian Noren Egtvedt Cc: Helge Deller Cc: James Hogan Cc: Jeff Kirsher Cc: Jonas Bonn Cc: Keguang Zhang Cc: Ley Foon Tan Cc: Mark Salter Cc: Max Filippov Cc: Michael Ellerman Cc: Michal Simek Cc: Ralf Baechle Cc: Rich Felker Cc: Richard Kuo Cc: Russell King Cc: Steven Miao Cc: Tobias Klauser Cc: Vineet Gupta Cc: Yoshinori Sato Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 2 ++ mm/page_alloc.c | 14 ++++++++++++++ 2 files changed, 16 insertions(+) (limited to 'include') diff --git a/include/linux/gfp.h b/include/linux/gfp.h index f8041f9de31e..4175dca4ac39 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -506,6 +506,8 @@ extern void free_hot_cold_page(struct page *page, bool cold); extern void free_hot_cold_page_list(struct list_head *list, bool cold); struct page_frag_cache; +extern void __page_frag_drain(struct page *page, unsigned int order, + unsigned int count); extern void *__alloc_page_frag(struct page_frag_cache *nc, unsigned int fragsz, gfp_t gfp_mask); extern void __free_page_frag(void *addr); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f64e7bcb43b7..2c6d5f64feca 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3925,6 +3925,20 @@ static struct page *__page_frag_refill(struct page_frag_cache *nc, return page; } +void __page_frag_drain(struct page *page, unsigned int order, + unsigned int count) +{ + VM_BUG_ON_PAGE(page_ref_count(page) == 0, page); + + if (page_ref_sub_and_test(page, count)) { + if (order == 0) + free_hot_cold_page(page, false); + else + __free_pages_ok(page, order); + } +} +EXPORT_SYMBOL(__page_frag_drain); + void *__alloc_page_frag(struct page_frag_cache *nc, unsigned int fragsz, gfp_t gfp_mask) { -- cgit v1.2.3 From d1bd8ead126668a2d6c42d97cc3664e95b3fa1dc Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 14 Dec 2016 15:05:52 -0800 Subject: kdb: remove unused kdb_event handling kdb_event state variable is only set but never checked in the kernel code. http://www.spinics.net/lists/kdb/msg01733.html suggests that this variable affected WARN_CONSOLE_UNLOCKED() in the original implementation. But this check never went upstream. The semantic is unclear and racy. The value is updated after the kdb_printf_lock is acquired and after it is released. It should be symmetric at minimum. The value should be manipulated either inside or outside the locked area. Fortunately, it seems that the original function is gone and we could simply remove the state variable. Link: http://lkml.kernel.org/r/1480412276-16690-2-git-send-email-pmladek@suse.com Signed-off-by: Petr Mladek Suggested-by: Daniel Thompson Cc: Jason Wessel Cc: Peter Zijlstra Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kdb.h | 1 - kernel/debug/kdb/kdb_io.c | 2 -- kernel/debug/kdb/kdb_main.c | 1 - 3 files changed, 4 deletions(-) (limited to 'include') diff --git a/include/linux/kdb.h b/include/linux/kdb.h index 410decacff8f..eb706188dc23 100644 --- a/include/linux/kdb.h +++ b/include/linux/kdb.h @@ -77,7 +77,6 @@ extern int kdb_poll_idx; * number whenever the kernel debugger is entered. */ extern int kdb_initial_cpu; -extern atomic_t kdb_event; /* Types and messages used for dynamically added kdb shell commands */ diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index 98c9011eac78..46f477bebe0c 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -576,7 +576,6 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap) KDB_STATE_SET(PRINTF_LOCK); spin_lock_irqsave(&kdb_printf_lock, flags); got_printf_lock = 1; - atomic_inc(&kdb_event); } else { __acquire(kdb_printf_lock); } @@ -851,7 +850,6 @@ kdb_print_out: got_printf_lock = 0; spin_unlock_irqrestore(&kdb_printf_lock, flags); KDB_STATE_CLEAR(PRINTF_LOCK); - atomic_dec(&kdb_event); } else { __release(kdb_printf_lock); } diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 2a20c0dfdafc..ca183919d302 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -60,7 +60,6 @@ int kdb_grep_trailing; * Kernel debugger state flags */ int kdb_flags; -atomic_t kdb_event; /* * kdb_lock protects updates to kdb_initial_cpu. Used to -- cgit v1.2.3 From 34aaff40b42148b23dcde40152480e25c7d2d759 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 14 Dec 2016 15:05:58 -0800 Subject: kdb: call vkdb_printf() from vprintk_default() only when wanted kdb_trap_printk allows to pass normal printk() messages to kdb via vkdb_printk(). For example, it is used to get backtrace using the classic show_stack(), see kdb_show_stack(). vkdb_printf() tries to avoid a potential infinite loop by disabling the trap. But this approach is racy, for example: CPU1 CPU2 vkdb_printf() // assume that kdb_trap_printk == 0 saved_trap_printk = kdb_trap_printk; kdb_trap_printk = 0; kdb_show_stack() kdb_trap_printk++; Problem1: Now, a nested printk() on CPU0 calls vkdb_printf() even when it should have been disabled. It will not cause a deadlock but... // using the outdated saved value: 0 kdb_trap_printk = saved_trap_printk; kdb_trap_printk--; Problem2: Now, kdb_trap_printk == -1 and will stay like this. It means that all messages will get passed to kdb from now on. This patch removes the racy saved_trap_printk handling. Instead, the recursion is prevented by a check for the locked CPU. The solution is still kind of racy. A non-related printk(), from another process, might get trapped by vkdb_printf(). And the wanted printk() might not get trapped because kdb_printf_cpu is assigned. But this problem existed even with the original code. A proper solution would be to get_cpu() before setting kdb_trap_printk and trap messages only from this CPU. I am not sure if it is worth the effort, though. In fact, the race is very theoretical. When kdb is running any of the commands that use kdb_trap_printk there is a single active CPU and the other CPUs should be in a holding pen inside kgdb_cpu_enter(). The only time this is violated is when there is a timeout waiting for the other CPUs to report to the holding pen. Finally, note that the situation is a bit schizophrenic. vkdb_printf() explicitly allows recursion but only from KDB code that calls kdb_printf() directly. On the other hand, the generic printk() recursion is not allowed because it might cause an infinite loop. This is why we could not hide the decision inside vkdb_printf() easily. Link: http://lkml.kernel.org/r/1480412276-16690-4-git-send-email-pmladek@suse.com Signed-off-by: Petr Mladek Cc: Daniel Thompson Cc: Jason Wessel Cc: Peter Zijlstra Cc: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kdb.h | 1 + kernel/debug/kdb/kdb_io.c | 9 ++------- kernel/printk/printk.c | 3 ++- 3 files changed, 5 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/kdb.h b/include/linux/kdb.h index eb706188dc23..68bd88223417 100644 --- a/include/linux/kdb.h +++ b/include/linux/kdb.h @@ -161,6 +161,7 @@ enum kdb_msgsrc { }; extern int kdb_trap_printk; +extern int kdb_printf_cpu; extern __printf(2, 0) int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list args); extern __printf(1, 2) int kdb_printf(const char *, ...); diff --git a/kernel/debug/kdb/kdb_io.c b/kernel/debug/kdb/kdb_io.c index daa76154fb1b..e74be38245ad 100644 --- a/kernel/debug/kdb/kdb_io.c +++ b/kernel/debug/kdb/kdb_io.c @@ -30,6 +30,7 @@ char kdb_prompt_str[CMD_BUFLEN]; int kdb_trap_printk; +int kdb_printf_cpu = -1; static int kgdb_transition_check(char *buffer) { @@ -554,24 +555,19 @@ int vkdb_printf(enum kdb_msgsrc src, const char *fmt, va_list ap) int linecount; int colcount; int logging, saved_loglevel = 0; - int saved_trap_printk; int retlen = 0; int fnd, len; int this_cpu, old_cpu; - static int kdb_printf_cpu = -1; char *cp, *cp2, *cphold = NULL, replaced_byte = ' '; char *moreprompt = "more> "; struct console *c = console_drivers; unsigned long uninitialized_var(flags); - local_irq_save(flags); - saved_trap_printk = kdb_trap_printk; - kdb_trap_printk = 0; - /* Serialize kdb_printf if multiple cpus try to write at once. * But if any cpu goes recursive in kdb, just print the output, * even if it is interleaved with any other text. */ + local_irq_save(flags); this_cpu = smp_processor_id(); for (;;) { old_cpu = cmpxchg(&kdb_printf_cpu, -1, this_cpu); @@ -849,7 +845,6 @@ kdb_print_out: console_loglevel = saved_loglevel; /* kdb_printf_cpu locked the code above. */ smp_store_release(&kdb_printf_cpu, old_cpu); - kdb_trap_printk = saved_trap_printk; local_irq_restore(flags); return retlen; } diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 577f2288d19f..a3ce35e0fa1e 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1926,7 +1926,8 @@ int vprintk_default(const char *fmt, va_list args) int r; #ifdef CONFIG_KGDB_KDB - if (unlikely(kdb_trap_printk)) { + /* Allow to pass printk() to kdb but avoid a recursion. */ + if (unlikely(kdb_trap_printk && kdb_printf_cpu < 0)) { r = vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args); return r; } -- cgit v1.2.3 From 249e52e35580fcfe5dad53a7dcd7c1252788749c Mon Sep 17 00:00:00 2001 From: Babu Moger Date: Wed, 14 Dec 2016 15:06:21 -0800 Subject: kernel/watchdog.c: move shared definitions to nmi.h Patch series "Clean up watchdog handlers", v2. This is an attempt to cleanup watchdog handlers. Right now, kernel/watchdog.c implements both softlockup and hardlockup detectors. Softlockup code is generic. Hardlockup code is arch specific. Some architectures don't use hardlockup detectors. They use their own watchdog detectors. To make both these combination work, we have numerous #ifdefs in kernel/watchdog.c. We are trying here to make these handlers independent of each other. Also provide an interface for architectures to implement their own handlers. watchdog_nmi_enable and watchdog_nmi_disable will be defined as weak such that architectures can override its definitions. Thanks to Don Zickus for his suggestions. Here are our previous discussions http://www.spinics.net/lists/sparclinux/msg16543.html http://www.spinics.net/lists/sparclinux/msg16441.html This patch (of 3): Move shared macros and definitions to nmi.h so that watchdog.c, new file watchdog_hld.c or any other architecture specific handler can use those definitions. Link: http://lkml.kernel.org/r/1478034826-43888-2-git-send-email-babu.moger@oracle.com Signed-off-by: Babu Moger Acked-by: Don Zickus Cc: Ingo Molnar Cc: Jiri Kosina Cc: Andi Kleen Cc: Yaowei Bai Cc: Aaron Tomlin Cc: Ulrich Obergfell Cc: Tejun Heo Cc: Hidehiro Kawai Cc: Josh Hunt Cc: "David S. Miller" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/nmi.h | 24 ++++++++++++++++++++++++ kernel/watchdog.c | 28 ++++------------------------ 2 files changed, 28 insertions(+), 24 deletions(-) (limited to 'include') diff --git a/include/linux/nmi.h b/include/linux/nmi.h index a78c35cff1ae..aacca824a6ae 100644 --- a/include/linux/nmi.h +++ b/include/linux/nmi.h @@ -7,6 +7,23 @@ #include #include +/* + * The run state of the lockup detectors is controlled by the content of the + * 'watchdog_enabled' variable. Each lockup detector has its dedicated bit - + * bit 0 for the hard lockup detector and bit 1 for the soft lockup detector. + * + * 'watchdog_user_enabled', 'nmi_watchdog_enabled' and 'soft_watchdog_enabled' + * are variables that are only used as an 'interface' between the parameters + * in /proc/sys/kernel and the internal state bits in 'watchdog_enabled'. The + * 'watchdog_thresh' variable is handled differently because its value is not + * boolean, and the lockup detectors are 'suspended' while 'watchdog_thresh' + * is equal zero. + */ +#define NMI_WATCHDOG_ENABLED_BIT 0 +#define SOFT_WATCHDOG_ENABLED_BIT 1 +#define NMI_WATCHDOG_ENABLED (1 << NMI_WATCHDOG_ENABLED_BIT) +#define SOFT_WATCHDOG_ENABLED (1 << SOFT_WATCHDOG_ENABLED_BIT) + /** * touch_nmi_watchdog - restart NMI watchdog timeout. * @@ -91,9 +108,16 @@ extern int nmi_watchdog_enabled; extern int soft_watchdog_enabled; extern int watchdog_user_enabled; extern int watchdog_thresh; +extern unsigned long watchdog_enabled; extern unsigned long *watchdog_cpumask_bits; +#ifdef CONFIG_SMP extern int sysctl_softlockup_all_cpu_backtrace; extern int sysctl_hardlockup_all_cpu_backtrace; +#else +#define sysctl_softlockup_all_cpu_backtrace 0 +#define sysctl_hardlockup_all_cpu_backtrace 0 +#endif +extern bool is_hardlockup(void); struct ctl_table; extern int proc_watchdog(struct ctl_table *, int , void __user *, size_t *, loff_t *); diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 6d1020c03d41..d21049496e26 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -27,29 +27,12 @@ #include #include -/* - * The run state of the lockup detectors is controlled by the content of the - * 'watchdog_enabled' variable. Each lockup detector has its dedicated bit - - * bit 0 for the hard lockup detector and bit 1 for the soft lockup detector. - * - * 'watchdog_user_enabled', 'nmi_watchdog_enabled' and 'soft_watchdog_enabled' - * are variables that are only used as an 'interface' between the parameters - * in /proc/sys/kernel and the internal state bits in 'watchdog_enabled'. The - * 'watchdog_thresh' variable is handled differently because its value is not - * boolean, and the lockup detectors are 'suspended' while 'watchdog_thresh' - * is equal zero. - */ -#define NMI_WATCHDOG_ENABLED_BIT 0 -#define SOFT_WATCHDOG_ENABLED_BIT 1 -#define NMI_WATCHDOG_ENABLED (1 << NMI_WATCHDOG_ENABLED_BIT) -#define SOFT_WATCHDOG_ENABLED (1 << SOFT_WATCHDOG_ENABLED_BIT) - static DEFINE_MUTEX(watchdog_proc_mutex); -#ifdef CONFIG_HARDLOCKUP_DETECTOR -static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED; +#if defined(CONFIG_HAVE_NMI_WATCHDOG) || defined(CONFIG_HARDLOCKUP_DETECTOR) +unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED|NMI_WATCHDOG_ENABLED; #else -static unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED; +unsigned long __read_mostly watchdog_enabled = SOFT_WATCHDOG_ENABLED; #endif int __read_mostly nmi_watchdog_enabled; int __read_mostly soft_watchdog_enabled; @@ -59,9 +42,6 @@ int __read_mostly watchdog_thresh = 10; #ifdef CONFIG_SMP int __read_mostly sysctl_softlockup_all_cpu_backtrace; int __read_mostly sysctl_hardlockup_all_cpu_backtrace; -#else -#define sysctl_softlockup_all_cpu_backtrace 0 -#define sysctl_hardlockup_all_cpu_backtrace 0 #endif static struct cpumask watchdog_cpumask __read_mostly; unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); @@ -289,7 +269,7 @@ void touch_softlockup_watchdog_sync(void) #ifdef CONFIG_HARDLOCKUP_DETECTOR /* watchdog detector functions */ -static bool is_hardlockup(void) +bool is_hardlockup(void) { unsigned long hrint = __this_cpu_read(hrtimer_interrupts); -- cgit v1.2.3 From 5b56d49fc31dbb0487e14ead790fc81ca9fb2c99 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 14 Dec 2016 15:06:52 -0800 Subject: mm: add locked parameter to get_user_pages_remote() Patch series "mm: unexport __get_user_pages_unlocked()". This patch series continues the cleanup of get_user_pages*() functions taking advantage of the fact we can now pass gup_flags as we please. It firstly adds an additional 'locked' parameter to get_user_pages_remote() to allow for its callers to utilise VM_FAULT_RETRY functionality. This is necessary as the invocation of __get_user_pages_unlocked() in process_vm_rw_single_vec() makes use of this and no other existing higher level function would allow it to do so. Secondly existing callers of __get_user_pages_unlocked() are replaced with the appropriate higher-level replacement - get_user_pages_unlocked() if the current task and memory descriptor are referenced, or get_user_pages_remote() if other task/memory descriptors are referenced (having acquiring mmap_sem.) This patch (of 2): Add a int *locked parameter to get_user_pages_remote() to allow VM_FAULT_RETRY faulting behaviour similar to get_user_pages_[un]locked(). Taking into account the previous adjustments to get_user_pages*() functions allowing for the passing of gup_flags, we are now in a position where __get_user_pages_unlocked() need only be exported for his ability to allow VM_FAULT_RETRY behaviour, this adjustment allows us to subsequently unexport __get_user_pages_unlocked() as well as allowing for future flexibility in the use of get_user_pages_remote(). [sfr@canb.auug.org.au: merge fix for get_user_pages_remote API change] Link: http://lkml.kernel.org/r/20161122210511.024ec341@canb.auug.org.au Link: http://lkml.kernel.org/r/20161027095141.2569-2-lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Acked-by: Michal Hocko Cc: Jan Kara Cc: Hugh Dickins Cc: Dave Hansen Cc: Rik van Riel Cc: Mel Gorman Cc: Paolo Bonzini Cc: Radim Krcmar Signed-off-by: Stephen Rothwell Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/gpu/drm/etnaviv/etnaviv_gem.c | 2 +- drivers/gpu/drm/i915/i915_gem_userptr.c | 2 +- drivers/infiniband/core/umem_odp.c | 2 +- drivers/vfio/vfio_iommu_type1.c | 2 +- fs/exec.c | 2 +- include/linux/mm.h | 2 +- kernel/events/uprobes.c | 4 ++-- mm/gup.c | 12 ++++++++---- mm/memory.c | 2 +- security/tomoyo/domain.c | 2 +- 10 files changed, 18 insertions(+), 14 deletions(-) (limited to 'include') diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.c b/drivers/gpu/drm/etnaviv/etnaviv_gem.c index 7d066a91d778..6f5dfabb3096 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gem.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.c @@ -759,7 +759,7 @@ static struct page **etnaviv_gem_userptr_do_get_pages( down_read(&mm->mmap_sem); while (pinned < npages) { ret = get_user_pages_remote(task, mm, ptr, npages - pinned, - flags, pvec + pinned, NULL); + flags, pvec + pinned, NULL, NULL); if (ret < 0) break; diff --git a/drivers/gpu/drm/i915/i915_gem_userptr.c b/drivers/gpu/drm/i915/i915_gem_userptr.c index 107ddf51065e..d068af2ec3a3 100644 --- a/drivers/gpu/drm/i915/i915_gem_userptr.c +++ b/drivers/gpu/drm/i915/i915_gem_userptr.c @@ -515,7 +515,7 @@ __i915_gem_userptr_get_pages_worker(struct work_struct *_work) obj->userptr.ptr + pinned * PAGE_SIZE, npages - pinned, flags, - pvec + pinned, NULL); + pvec + pinned, NULL, NULL); if (ret < 0) break; diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 1f0fe3217f23..6b079a31dced 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -578,7 +578,7 @@ int ib_umem_odp_map_dma_pages(struct ib_umem *umem, u64 user_virt, u64 bcnt, */ npages = get_user_pages_remote(owning_process, owning_mm, user_virt, gup_num_pages, - flags, local_page_list, NULL); + flags, local_page_list, NULL, NULL); up_read(&owning_mm->mmap_sem); if (npages < 0) diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c index 9815e45c23c4..f3726ba12aa6 100644 --- a/drivers/vfio/vfio_iommu_type1.c +++ b/drivers/vfio/vfio_iommu_type1.c @@ -362,7 +362,7 @@ static int vaddr_get_pfn(struct mm_struct *mm, unsigned long vaddr, down_read(&mm->mmap_sem); ret = get_user_pages_remote(NULL, mm, vaddr, 1, flags, page, - NULL); + NULL, NULL); up_read(&mm->mmap_sem); } diff --git a/fs/exec.c b/fs/exec.c index 923c57d96899..eac60886b0ab 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -209,7 +209,7 @@ static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos, * doing the exec and bprm->mm is the new process's mm. */ ret = get_user_pages_remote(current, bprm->mm, pos, 1, gup_flags, - &page, NULL); + &page, NULL, NULL); if (ret <= 0) return NULL; diff --git a/include/linux/mm.h b/include/linux/mm.h index a92c8d73aeaf..cc154454675a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1274,7 +1274,7 @@ extern int access_remote_vm(struct mm_struct *mm, unsigned long addr, long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas); + struct vm_area_struct **vmas, int *locked); long get_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, struct vm_area_struct **vmas); diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index f9ec9add2164..215871bda3a2 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -301,7 +301,7 @@ int uprobe_write_opcode(struct mm_struct *mm, unsigned long vaddr, retry: /* Read the page with vaddr into memory */ ret = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &old_page, - &vma); + &vma, NULL); if (ret <= 0) return ret; @@ -1712,7 +1712,7 @@ static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr) * essentially a kernel access to the memory. */ result = get_user_pages_remote(NULL, mm, vaddr, 1, FOLL_FORCE, &page, - NULL); + NULL, NULL); if (result < 0) return result; diff --git a/mm/gup.c b/mm/gup.c index e50178c58b97..b64c907aa4f0 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -917,6 +917,9 @@ EXPORT_SYMBOL(get_user_pages_unlocked); * only intends to ensure the pages are faulted in. * @vmas: array of pointers to vmas corresponding to each page. * Or NULL if the caller does not require them. + * @locked: pointer to lock flag indicating whether lock is held and + * subsequently whether VM_FAULT_RETRY functionality can be + * utilised. Lock must initially be held. * * Returns number of pages pinned. This may be fewer than the number * requested. If nr_pages is 0 or negative, returns 0. If no pages @@ -960,10 +963,10 @@ EXPORT_SYMBOL(get_user_pages_unlocked); long get_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, - struct vm_area_struct **vmas) + struct vm_area_struct **vmas, int *locked) { return __get_user_pages_locked(tsk, mm, start, nr_pages, pages, vmas, - NULL, false, + locked, true, gup_flags | FOLL_TOUCH | FOLL_REMOTE); } EXPORT_SYMBOL(get_user_pages_remote); @@ -971,8 +974,9 @@ EXPORT_SYMBOL(get_user_pages_remote); /* * This is the same as get_user_pages_remote(), just with a * less-flexible calling convention where we assume that the task - * and mm being operated on are the current task's. We also - * obviously don't pass FOLL_REMOTE in here. + * and mm being operated on are the current task's and don't allow + * passing of a locked parameter. We also obviously don't pass + * FOLL_REMOTE in here. */ long get_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, diff --git a/mm/memory.c b/mm/memory.c index c264f7cd3e47..3a6a1239c42b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3919,7 +3919,7 @@ static int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, struct page *page = NULL; ret = get_user_pages_remote(tsk, mm, addr, 1, - gup_flags, &page, &vma); + gup_flags, &page, &vma, NULL); if (ret <= 0) { #ifndef CONFIG_HAVE_IOREMAP_PROT break; diff --git a/security/tomoyo/domain.c b/security/tomoyo/domain.c index 682b73af7766..838ffa78cfda 100644 --- a/security/tomoyo/domain.c +++ b/security/tomoyo/domain.c @@ -881,7 +881,7 @@ bool tomoyo_dump_page(struct linux_binprm *bprm, unsigned long pos, * the execve(). */ if (get_user_pages_remote(current, bprm->mm, pos, 1, - FOLL_FORCE, &page, NULL) <= 0) + FOLL_FORCE, &page, NULL, NULL) <= 0) return false; #else page = bprm->page[pos / PAGE_SIZE]; -- cgit v1.2.3 From 8b7457ef9a9eb46cd1675d40d8e1fd3c47a38395 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Wed, 14 Dec 2016 15:06:55 -0800 Subject: mm: unexport __get_user_pages_unlocked() Unexport the low-level __get_user_pages_unlocked() function and replaces invocations with calls to more appropriate higher-level functions. In hva_to_pfn_slow() we are able to replace __get_user_pages_unlocked() with get_user_pages_unlocked() since we can now pass gup_flags. In async_pf_execute() and process_vm_rw_single_vec() we need to pass different tsk, mm arguments so get_user_pages_remote() is the sane replacement in these cases (having added manual acquisition and release of mmap_sem.) Additionally get_user_pages_remote() reintroduces use of the FOLL_TOUCH flag. However, this flag was originally silently dropped by commit 1e9877902dc7 ("mm/gup: Introduce get_user_pages_remote()"), so this appears to have been unintentional and reintroducing it is therefore not an issue. [akpm@linux-foundation.org: coding-style fixes] Link: http://lkml.kernel.org/r/20161027095141.2569-3-lstoakes@gmail.com Signed-off-by: Lorenzo Stoakes Acked-by: Michal Hocko Cc: Jan Kara Cc: Hugh Dickins Cc: Dave Hansen Cc: Rik van Riel Cc: Mel Gorman Cc: Paolo Bonzini Cc: Radim Krcmar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 3 --- mm/gup.c | 8 ++++---- mm/nommu.c | 8 ++++---- mm/process_vm_access.c | 12 ++++++++---- virt/kvm/async_pf.c | 10 +++++++--- virt/kvm/kvm_main.c | 5 ++--- 6 files changed, 25 insertions(+), 21 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index cc154454675a..7b2d14ed3815 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1280,9 +1280,6 @@ long get_user_pages(unsigned long start, unsigned long nr_pages, struct vm_area_struct **vmas); long get_user_pages_locked(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked); -long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, - struct page **pages, unsigned int gup_flags); long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); int get_user_pages_fast(unsigned long start, int nr_pages, int write, diff --git a/mm/gup.c b/mm/gup.c index b64c907aa4f0..55315555489d 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -865,9 +865,10 @@ EXPORT_SYMBOL(get_user_pages_locked); * caller if required (just like with __get_user_pages). "FOLL_GET" * is set implicitly if "pages" is non-NULL. */ -__always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, - struct page **pages, unsigned int gup_flags) +static __always_inline long __get_user_pages_unlocked(struct task_struct *tsk, + struct mm_struct *mm, unsigned long start, + unsigned long nr_pages, struct page **pages, + unsigned int gup_flags) { long ret; int locked = 1; @@ -879,7 +880,6 @@ __always_inline long __get_user_pages_unlocked(struct task_struct *tsk, struct m up_read(&mm->mmap_sem); return ret; } -EXPORT_SYMBOL(__get_user_pages_unlocked); /* * get_user_pages_unlocked() is suitable to replace the form: diff --git a/mm/nommu.c b/mm/nommu.c index 9720e0bab029..c299a7fbca70 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -176,9 +176,10 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, } EXPORT_SYMBOL(get_user_pages_locked); -long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, - unsigned long start, unsigned long nr_pages, - struct page **pages, unsigned int gup_flags) +static long __get_user_pages_unlocked(struct task_struct *tsk, + struct mm_struct *mm, unsigned long start, + unsigned long nr_pages, struct page **pages, + unsigned int gup_flags) { long ret; down_read(&mm->mmap_sem); @@ -187,7 +188,6 @@ long __get_user_pages_unlocked(struct task_struct *tsk, struct mm_struct *mm, up_read(&mm->mmap_sem); return ret; } -EXPORT_SYMBOL(__get_user_pages_unlocked); long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags) diff --git a/mm/process_vm_access.c b/mm/process_vm_access.c index be8dc8d1edb9..84d0c7eada2b 100644 --- a/mm/process_vm_access.c +++ b/mm/process_vm_access.c @@ -88,7 +88,7 @@ static int process_vm_rw_single_vec(unsigned long addr, ssize_t rc = 0; unsigned long max_pages_per_loop = PVM_MAX_KMALLOC_PAGES / sizeof(struct pages *); - unsigned int flags = FOLL_REMOTE; + unsigned int flags = 0; /* Work out address and page range required */ if (len == 0) @@ -100,15 +100,19 @@ static int process_vm_rw_single_vec(unsigned long addr, while (!rc && nr_pages && iov_iter_count(iter)) { int pages = min(nr_pages, max_pages_per_loop); + int locked = 1; size_t bytes; /* * Get the pages we're interested in. We must - * add FOLL_REMOTE because task/mm might not + * access remotely because task/mm might not * current/current->mm */ - pages = __get_user_pages_unlocked(task, mm, pa, pages, - process_pages, flags); + down_read(&mm->mmap_sem); + pages = get_user_pages_remote(task, mm, pa, pages, flags, + process_pages, NULL, &locked); + if (locked) + up_read(&mm->mmap_sem); if (pages <= 0) return -EFAULT; diff --git a/virt/kvm/async_pf.c b/virt/kvm/async_pf.c index efeceb0a222d..3815e940fbea 100644 --- a/virt/kvm/async_pf.c +++ b/virt/kvm/async_pf.c @@ -76,16 +76,20 @@ static void async_pf_execute(struct work_struct *work) struct kvm_vcpu *vcpu = apf->vcpu; unsigned long addr = apf->addr; gva_t gva = apf->gva; + int locked = 1; might_sleep(); /* * This work is run asynchromously to the task which owns * mm and might be done in another context, so we must - * use FOLL_REMOTE. + * access remotely. */ - __get_user_pages_unlocked(NULL, mm, addr, 1, NULL, - FOLL_WRITE | FOLL_REMOTE); + down_read(&mm->mmap_sem); + get_user_pages_remote(NULL, mm, addr, 1, FOLL_WRITE, NULL, NULL, + &locked); + if (locked) + up_read(&mm->mmap_sem); kvm_async_page_present_sync(vcpu, apf); diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 823544c166be..de102cae7125 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1418,13 +1418,12 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, npages = get_user_page_nowait(addr, write_fault, page); up_read(¤t->mm->mmap_sem); } else { - unsigned int flags = FOLL_TOUCH | FOLL_HWPOISON; + unsigned int flags = FOLL_HWPOISON; if (write_fault) flags |= FOLL_WRITE; - npages = __get_user_pages_unlocked(current, current->mm, addr, 1, - page, flags); + npages = get_user_pages_unlocked(addr, 1, page, flags); } if (npages != 1) return npages; -- cgit v1.2.3 From 82b0f8c39a3869b6fd2a10e180a862248736ec6f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 14 Dec 2016 15:06:58 -0800 Subject: mm: join struct fault_env and vm_fault Currently we have two different structures for passing fault information around - struct vm_fault and struct fault_env. DAX will need more information in struct vm_fault to handle its faults so the content of that structure would become event closer to fault_env. Furthermore it would need to generate struct fault_env to be able to call some of the generic functions. So at this point I don't think there's much use in keeping these two structures separate. Just embed into struct vm_fault all that is needed to use it for both purposes. Link: http://lkml.kernel.org/r/1479460644-25076-2-git-send-email-jack@suse.cz Signed-off-by: Jan Kara Acked-by: Kirill A. Shutemov Cc: Ross Zwisler Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/Locking | 2 +- fs/userfaultfd.c | 22 +- include/linux/huge_mm.h | 10 +- include/linux/mm.h | 28 +- include/linux/userfaultfd_k.h | 4 +- mm/filemap.c | 14 +- mm/huge_memory.c | 173 ++++++------ mm/internal.h | 2 +- mm/khugepaged.c | 20 +- mm/memory.c | 568 +++++++++++++++++++------------------- mm/nommu.c | 2 +- 11 files changed, 423 insertions(+), 422 deletions(-) (limited to 'include') diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 1b5f15653b1b..69e2387ca278 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -556,7 +556,7 @@ till "end_pgoff". ->map_pages() is called with page table locked and must not block. If it's not possible to reach a page without blocking, filesystem should skip it. Filesystem should use do_set_pte() to setup page table entry. Pointer to entry associated with the page is passed in -"pte" field in fault_env structure. Pointers to entries for other offsets +"pte" field in vm_fault structure. Pointers to entries for other offsets should be calculated relative to "pte". ->page_mkwrite() is called when a previously read-only pte is diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 85959d8324df..d96e2f30084b 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -257,9 +257,9 @@ out: * fatal_signal_pending()s, and the mmap_sem must be released before * returning it. */ -int handle_userfault(struct fault_env *fe, unsigned long reason) +int handle_userfault(struct vm_fault *vmf, unsigned long reason) { - struct mm_struct *mm = fe->vma->vm_mm; + struct mm_struct *mm = vmf->vma->vm_mm; struct userfaultfd_ctx *ctx; struct userfaultfd_wait_queue uwq; int ret; @@ -268,7 +268,7 @@ int handle_userfault(struct fault_env *fe, unsigned long reason) BUG_ON(!rwsem_is_locked(&mm->mmap_sem)); ret = VM_FAULT_SIGBUS; - ctx = fe->vma->vm_userfaultfd_ctx.ctx; + ctx = vmf->vma->vm_userfaultfd_ctx.ctx; if (!ctx) goto out; @@ -301,17 +301,18 @@ int handle_userfault(struct fault_env *fe, unsigned long reason) * without first stopping userland access to the memory. For * VM_UFFD_MISSING userfaults this is enough for now. */ - if (unlikely(!(fe->flags & FAULT_FLAG_ALLOW_RETRY))) { + if (unlikely(!(vmf->flags & FAULT_FLAG_ALLOW_RETRY))) { /* * Validate the invariant that nowait must allow retry * to be sure not to return SIGBUS erroneously on * nowait invocations. */ - BUG_ON(fe->flags & FAULT_FLAG_RETRY_NOWAIT); + BUG_ON(vmf->flags & FAULT_FLAG_RETRY_NOWAIT); #ifdef CONFIG_DEBUG_VM if (printk_ratelimit()) { printk(KERN_WARNING - "FAULT_FLAG_ALLOW_RETRY missing %x\n", fe->flags); + "FAULT_FLAG_ALLOW_RETRY missing %x\n", + vmf->flags); dump_stack(); } #endif @@ -323,7 +324,7 @@ int handle_userfault(struct fault_env *fe, unsigned long reason) * and wait. */ ret = VM_FAULT_RETRY; - if (fe->flags & FAULT_FLAG_RETRY_NOWAIT) + if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) goto out; /* take the reference before dropping the mmap_sem */ @@ -331,11 +332,11 @@ int handle_userfault(struct fault_env *fe, unsigned long reason) init_waitqueue_func_entry(&uwq.wq, userfaultfd_wake_function); uwq.wq.private = current; - uwq.msg = userfault_msg(fe->address, fe->flags, reason); + uwq.msg = userfault_msg(vmf->address, vmf->flags, reason); uwq.ctx = ctx; return_to_userland = - (fe->flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) == + (vmf->flags & (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE)) == (FAULT_FLAG_USER|FAULT_FLAG_KILLABLE); spin_lock(&ctx->fault_pending_wqh.lock); @@ -353,7 +354,8 @@ int handle_userfault(struct fault_env *fe, unsigned long reason) TASK_KILLABLE); spin_unlock(&ctx->fault_pending_wqh.lock); - must_wait = userfaultfd_must_wait(ctx, fe->address, fe->flags, reason); + must_wait = userfaultfd_must_wait(ctx, vmf->address, vmf->flags, + reason); up_read(&mm->mmap_sem); if (likely(must_wait && !ACCESS_ONCE(ctx->released) && diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 1f782aa1d8e6..97e478d6b690 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -1,12 +1,12 @@ #ifndef _LINUX_HUGE_MM_H #define _LINUX_HUGE_MM_H -extern int do_huge_pmd_anonymous_page(struct fault_env *fe); +extern int do_huge_pmd_anonymous_page(struct vm_fault *vmf); extern int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm, pmd_t *dst_pmd, pmd_t *src_pmd, unsigned long addr, struct vm_area_struct *vma); -extern void huge_pmd_set_accessed(struct fault_env *fe, pmd_t orig_pmd); -extern int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd); +extern void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd); +extern int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd); extern struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd, @@ -142,7 +142,7 @@ static inline int hpage_nr_pages(struct page *page) return 1; } -extern int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t orig_pmd); +extern int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd); extern struct page *huge_zero_page; @@ -212,7 +212,7 @@ static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, return NULL; } -static inline int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t orig_pmd) +static inline int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t orig_pmd) { return 0; } diff --git a/include/linux/mm.h b/include/linux/mm.h index 7b2d14ed3815..de5bcead2511 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -292,10 +292,16 @@ extern pgprot_t protection_map[16]; * pgoff should be used in favour of virtual_address, if possible. */ struct vm_fault { + struct vm_area_struct *vma; /* Target VMA */ unsigned int flags; /* FAULT_FLAG_xxx flags */ gfp_t gfp_mask; /* gfp mask to be used for allocations */ pgoff_t pgoff; /* Logical page offset based on vma */ - void __user *virtual_address; /* Faulting virtual address */ + unsigned long address; /* Faulting virtual address */ + void __user *virtual_address; /* Faulting virtual address masked by + * PAGE_MASK */ + pmd_t *pmd; /* Pointer to pmd entry matching + * the 'address' + */ struct page *cow_page; /* Handler may choose to COW */ struct page *page; /* ->fault handlers should return a @@ -309,19 +315,7 @@ struct vm_fault { * VM_FAULT_DAX_LOCKED and fill in * entry here. */ -}; - -/* - * Page fault context: passes though page fault handler instead of endless list - * of function arguments. - */ -struct fault_env { - struct vm_area_struct *vma; /* Target VMA */ - unsigned long address; /* Faulting virtual address */ - unsigned int flags; /* FAULT_FLAG_xxx flags */ - pmd_t *pmd; /* Pointer to pmd entry matching - * the 'address' - */ + /* These three entries are valid only while holding ptl lock */ pte_t *pte; /* Pointer to pte entry matching * the 'address'. NULL if the page * table hasn't been allocated. @@ -351,7 +345,7 @@ struct vm_operations_struct { int (*fault)(struct vm_area_struct *vma, struct vm_fault *vmf); int (*pmd_fault)(struct vm_area_struct *, unsigned long address, pmd_t *, unsigned int flags); - void (*map_pages)(struct fault_env *fe, + void (*map_pages)(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); /* notification that a previously read-only page is about to become @@ -625,7 +619,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) return pte; } -int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg, +int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, struct page *page); #endif @@ -2094,7 +2088,7 @@ extern void truncate_inode_pages_final(struct address_space *); /* generic vm_area_ops exported for stackable file systems */ extern int filemap_fault(struct vm_area_struct *, struct vm_fault *); -extern void filemap_map_pages(struct fault_env *fe, +extern void filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); extern int filemap_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index dd66a952e8cd..11b92b047a1e 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -27,7 +27,7 @@ #define UFFD_SHARED_FCNTL_FLAGS (O_CLOEXEC | O_NONBLOCK) #define UFFD_FLAGS_SET (EFD_SHARED_FCNTL_FLAGS) -extern int handle_userfault(struct fault_env *fe, unsigned long reason); +extern int handle_userfault(struct vm_fault *vmf, unsigned long reason); extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len); @@ -55,7 +55,7 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma) #else /* CONFIG_USERFAULTFD */ /* mm helpers */ -static inline int handle_userfault(struct fault_env *fe, unsigned long reason) +static inline int handle_userfault(struct vm_fault *vmf, unsigned long reason) { return VM_FAULT_SIGBUS; } diff --git a/mm/filemap.c b/mm/filemap.c index 69568388c699..235021e361eb 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2164,12 +2164,12 @@ page_not_uptodate: } EXPORT_SYMBOL(filemap_fault); -void filemap_map_pages(struct fault_env *fe, +void filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff) { struct radix_tree_iter iter; void **slot; - struct file *file = fe->vma->vm_file; + struct file *file = vmf->vma->vm_file; struct address_space *mapping = file->f_mapping; pgoff_t last_pgoff = start_pgoff; loff_t size; @@ -2225,11 +2225,11 @@ repeat: if (file->f_ra.mmap_miss > 0) file->f_ra.mmap_miss--; - fe->address += (iter.index - last_pgoff) << PAGE_SHIFT; - if (fe->pte) - fe->pte += iter.index - last_pgoff; + vmf->address += (iter.index - last_pgoff) << PAGE_SHIFT; + if (vmf->pte) + vmf->pte += iter.index - last_pgoff; last_pgoff = iter.index; - if (alloc_set_pte(fe, NULL, page)) + if (alloc_set_pte(vmf, NULL, page)) goto unlock; unlock_page(page); goto next; @@ -2239,7 +2239,7 @@ skip: put_page(page); next: /* Huge page is mapped? No need to proceed. */ - if (pmd_trans_huge(*fe->pmd)) + if (pmd_trans_huge(*vmf->pmd)) break; if (iter.index == end_pgoff) break; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index cee42cf05477..10eedbf14421 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -542,13 +542,13 @@ unsigned long thp_get_unmapped_area(struct file *filp, unsigned long addr, } EXPORT_SYMBOL_GPL(thp_get_unmapped_area); -static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page, +static int __do_huge_pmd_anonymous_page(struct vm_fault *vmf, struct page *page, gfp_t gfp) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; struct mem_cgroup *memcg; pgtable_t pgtable; - unsigned long haddr = fe->address & HPAGE_PMD_MASK; + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; VM_BUG_ON_PAGE(!PageCompound(page), page); @@ -573,9 +573,9 @@ static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page, */ __SetPageUptodate(page); - fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); - if (unlikely(!pmd_none(*fe->pmd))) { - spin_unlock(fe->ptl); + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + if (unlikely(!pmd_none(*vmf->pmd))) { + spin_unlock(vmf->ptl); mem_cgroup_cancel_charge(page, memcg, true); put_page(page); pte_free(vma->vm_mm, pgtable); @@ -586,11 +586,11 @@ static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page, if (userfaultfd_missing(vma)) { int ret; - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); mem_cgroup_cancel_charge(page, memcg, true); put_page(page); pte_free(vma->vm_mm, pgtable); - ret = handle_userfault(fe, VM_UFFD_MISSING); + ret = handle_userfault(vmf, VM_UFFD_MISSING); VM_BUG_ON(ret & VM_FAULT_FALLBACK); return ret; } @@ -600,11 +600,11 @@ static int __do_huge_pmd_anonymous_page(struct fault_env *fe, struct page *page, page_add_new_anon_rmap(page, vma, haddr, true); mem_cgroup_commit_charge(page, memcg, false, true); lru_cache_add_active_or_unevictable(page, vma); - pgtable_trans_huge_deposit(vma->vm_mm, fe->pmd, pgtable); - set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry); + pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); + set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); atomic_long_inc(&vma->vm_mm->nr_ptes); - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); count_vm_event(THP_FAULT_ALLOC); } @@ -651,12 +651,12 @@ static bool set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, return true; } -int do_huge_pmd_anonymous_page(struct fault_env *fe) +int do_huge_pmd_anonymous_page(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; gfp_t gfp; struct page *page; - unsigned long haddr = fe->address & HPAGE_PMD_MASK; + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; if (haddr < vma->vm_start || haddr + HPAGE_PMD_SIZE > vma->vm_end) return VM_FAULT_FALLBACK; @@ -664,7 +664,7 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe) return VM_FAULT_OOM; if (unlikely(khugepaged_enter(vma, vma->vm_flags))) return VM_FAULT_OOM; - if (!(fe->flags & FAULT_FLAG_WRITE) && + if (!(vmf->flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(vma->vm_mm) && transparent_hugepage_use_zero_page()) { pgtable_t pgtable; @@ -680,22 +680,22 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe) count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; } - fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); ret = 0; set = false; - if (pmd_none(*fe->pmd)) { + if (pmd_none(*vmf->pmd)) { if (userfaultfd_missing(vma)) { - spin_unlock(fe->ptl); - ret = handle_userfault(fe, VM_UFFD_MISSING); + spin_unlock(vmf->ptl); + ret = handle_userfault(vmf, VM_UFFD_MISSING); VM_BUG_ON(ret & VM_FAULT_FALLBACK); } else { set_huge_zero_page(pgtable, vma->vm_mm, vma, - haddr, fe->pmd, zero_page); - spin_unlock(fe->ptl); + haddr, vmf->pmd, zero_page); + spin_unlock(vmf->ptl); set = true; } } else - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); if (!set) pte_free(vma->vm_mm, pgtable); return ret; @@ -707,7 +707,7 @@ int do_huge_pmd_anonymous_page(struct fault_env *fe) return VM_FAULT_FALLBACK; } prep_transhuge_page(page); - return __do_huge_pmd_anonymous_page(fe, page, gfp); + return __do_huge_pmd_anonymous_page(vmf, page, gfp); } static void insert_pfn_pmd(struct vm_area_struct *vma, unsigned long addr, @@ -879,30 +879,30 @@ out: return ret; } -void huge_pmd_set_accessed(struct fault_env *fe, pmd_t orig_pmd) +void huge_pmd_set_accessed(struct vm_fault *vmf, pmd_t orig_pmd) { pmd_t entry; unsigned long haddr; - fe->ptl = pmd_lock(fe->vma->vm_mm, fe->pmd); - if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) + vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd); + if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) goto unlock; entry = pmd_mkyoung(orig_pmd); - haddr = fe->address & HPAGE_PMD_MASK; - if (pmdp_set_access_flags(fe->vma, haddr, fe->pmd, entry, - fe->flags & FAULT_FLAG_WRITE)) - update_mmu_cache_pmd(fe->vma, fe->address, fe->pmd); + haddr = vmf->address & HPAGE_PMD_MASK; + if (pmdp_set_access_flags(vmf->vma, haddr, vmf->pmd, entry, + vmf->flags & FAULT_FLAG_WRITE)) + update_mmu_cache_pmd(vmf->vma, vmf->address, vmf->pmd); unlock: - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); } -static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd, +static int do_huge_pmd_wp_page_fallback(struct vm_fault *vmf, pmd_t orig_pmd, struct page *page) { - struct vm_area_struct *vma = fe->vma; - unsigned long haddr = fe->address & HPAGE_PMD_MASK; + struct vm_area_struct *vma = vmf->vma; + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; struct mem_cgroup *memcg; pgtable_t pgtable; pmd_t _pmd; @@ -921,7 +921,7 @@ static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd, for (i = 0; i < HPAGE_PMD_NR; i++) { pages[i] = alloc_page_vma_node(GFP_HIGHUSER_MOVABLE | __GFP_OTHER_NODE, vma, - fe->address, page_to_nid(page)); + vmf->address, page_to_nid(page)); if (unlikely(!pages[i] || mem_cgroup_try_charge(pages[i], vma->vm_mm, GFP_KERNEL, &memcg, false))) { @@ -952,15 +952,15 @@ static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd, mmun_end = haddr + HPAGE_PMD_SIZE; mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); - fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); - if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) goto out_free_pages; VM_BUG_ON_PAGE(!PageHead(page), page); - pmdp_huge_clear_flush_notify(vma, haddr, fe->pmd); + pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd); /* leave pmd empty until pte is filled */ - pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, fe->pmd); + pgtable = pgtable_trans_huge_withdraw(vma->vm_mm, vmf->pmd); pmd_populate(vma->vm_mm, &_pmd, pgtable); for (i = 0; i < HPAGE_PMD_NR; i++, haddr += PAGE_SIZE) { @@ -969,20 +969,20 @@ static int do_huge_pmd_wp_page_fallback(struct fault_env *fe, pmd_t orig_pmd, entry = maybe_mkwrite(pte_mkdirty(entry), vma); memcg = (void *)page_private(pages[i]); set_page_private(pages[i], 0); - page_add_new_anon_rmap(pages[i], fe->vma, haddr, false); + page_add_new_anon_rmap(pages[i], vmf->vma, haddr, false); mem_cgroup_commit_charge(pages[i], memcg, false, false); lru_cache_add_active_or_unevictable(pages[i], vma); - fe->pte = pte_offset_map(&_pmd, haddr); - VM_BUG_ON(!pte_none(*fe->pte)); - set_pte_at(vma->vm_mm, haddr, fe->pte, entry); - pte_unmap(fe->pte); + vmf->pte = pte_offset_map(&_pmd, haddr); + VM_BUG_ON(!pte_none(*vmf->pte)); + set_pte_at(vma->vm_mm, haddr, vmf->pte, entry); + pte_unmap(vmf->pte); } kfree(pages); smp_wmb(); /* make pte visible before pmd */ - pmd_populate(vma->vm_mm, fe->pmd, pgtable); + pmd_populate(vma->vm_mm, vmf->pmd, pgtable); page_remove_rmap(page, true); - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); @@ -993,7 +993,7 @@ out: return ret; out_free_pages: - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); for (i = 0; i < HPAGE_PMD_NR; i++) { memcg = (void *)page_private(pages[i]); @@ -1005,23 +1005,23 @@ out_free_pages: goto out; } -int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd) +int do_huge_pmd_wp_page(struct vm_fault *vmf, pmd_t orig_pmd) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; struct page *page = NULL, *new_page; struct mem_cgroup *memcg; - unsigned long haddr = fe->address & HPAGE_PMD_MASK; + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; unsigned long mmun_start; /* For mmu_notifiers */ unsigned long mmun_end; /* For mmu_notifiers */ gfp_t huge_gfp; /* for allocation and charge */ int ret = 0; - fe->ptl = pmd_lockptr(vma->vm_mm, fe->pmd); + vmf->ptl = pmd_lockptr(vma->vm_mm, vmf->pmd); VM_BUG_ON_VMA(!vma->anon_vma, vma); if (is_huge_zero_pmd(orig_pmd)) goto alloc; - spin_lock(fe->ptl); - if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) + spin_lock(vmf->ptl); + if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) goto out_unlock; page = pmd_page(orig_pmd); @@ -1034,13 +1034,13 @@ int do_huge_pmd_wp_page(struct fault_env *fe, pmd_t orig_pmd) pmd_t entry; entry = pmd_mkyoung(orig_pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - if (pmdp_set_access_flags(vma, haddr, fe->pmd, entry, 1)) - update_mmu_cache_pmd(vma, fe->address, fe->pmd); + if (pmdp_set_access_flags(vma, haddr, vmf->pmd, entry, 1)) + update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); ret |= VM_FAULT_WRITE; goto out_unlock; } get_page(page); - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); alloc: if (transparent_hugepage_enabled(vma) && !transparent_hugepage_debug_cow()) { @@ -1053,12 +1053,12 @@ alloc: prep_transhuge_page(new_page); } else { if (!page) { - split_huge_pmd(vma, fe->pmd, fe->address); + split_huge_pmd(vma, vmf->pmd, vmf->address); ret |= VM_FAULT_FALLBACK; } else { - ret = do_huge_pmd_wp_page_fallback(fe, orig_pmd, page); + ret = do_huge_pmd_wp_page_fallback(vmf, orig_pmd, page); if (ret & VM_FAULT_OOM) { - split_huge_pmd(vma, fe->pmd, fe->address); + split_huge_pmd(vma, vmf->pmd, vmf->address); ret |= VM_FAULT_FALLBACK; } put_page(page); @@ -1070,7 +1070,7 @@ alloc: if (unlikely(mem_cgroup_try_charge(new_page, vma->vm_mm, huge_gfp, &memcg, true))) { put_page(new_page); - split_huge_pmd(vma, fe->pmd, fe->address); + split_huge_pmd(vma, vmf->pmd, vmf->address); if (page) put_page(page); ret |= VM_FAULT_FALLBACK; @@ -1090,11 +1090,11 @@ alloc: mmun_end = haddr + HPAGE_PMD_SIZE; mmu_notifier_invalidate_range_start(vma->vm_mm, mmun_start, mmun_end); - spin_lock(fe->ptl); + spin_lock(vmf->ptl); if (page) put_page(page); - if (unlikely(!pmd_same(*fe->pmd, orig_pmd))) { - spin_unlock(fe->ptl); + if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) { + spin_unlock(vmf->ptl); mem_cgroup_cancel_charge(new_page, memcg, true); put_page(new_page); goto out_mn; @@ -1102,12 +1102,12 @@ alloc: pmd_t entry; entry = mk_huge_pmd(new_page, vma->vm_page_prot); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); - pmdp_huge_clear_flush_notify(vma, haddr, fe->pmd); + pmdp_huge_clear_flush_notify(vma, haddr, vmf->pmd); page_add_new_anon_rmap(new_page, vma, haddr, true); mem_cgroup_commit_charge(new_page, memcg, false, true); lru_cache_add_active_or_unevictable(new_page, vma); - set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry); - update_mmu_cache_pmd(vma, fe->address, fe->pmd); + set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); + update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); if (!page) { add_mm_counter(vma->vm_mm, MM_ANONPAGES, HPAGE_PMD_NR); } else { @@ -1117,13 +1117,13 @@ alloc: } ret |= VM_FAULT_WRITE; } - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); out_mn: mmu_notifier_invalidate_range_end(vma->vm_mm, mmun_start, mmun_end); out: return ret; out_unlock: - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); return ret; } @@ -1196,12 +1196,12 @@ out: } /* NUMA hinting page fault entry point for trans huge pmds */ -int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) +int do_huge_pmd_numa_page(struct vm_fault *vmf, pmd_t pmd) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; struct anon_vma *anon_vma = NULL; struct page *page; - unsigned long haddr = fe->address & HPAGE_PMD_MASK; + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; int page_nid = -1, this_nid = numa_node_id(); int target_nid, last_cpupid = -1; bool page_locked; @@ -1209,8 +1209,8 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) bool was_writable; int flags = 0; - fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); - if (unlikely(!pmd_same(pmd, *fe->pmd))) + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + if (unlikely(!pmd_same(pmd, *vmf->pmd))) goto out_unlock; /* @@ -1218,9 +1218,9 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) * without disrupting NUMA hinting information. Do not relock and * check_same as the page may no longer be mapped. */ - if (unlikely(pmd_trans_migrating(*fe->pmd))) { - page = pmd_page(*fe->pmd); - spin_unlock(fe->ptl); + if (unlikely(pmd_trans_migrating(*vmf->pmd))) { + page = pmd_page(*vmf->pmd); + spin_unlock(vmf->ptl); wait_on_page_locked(page); goto out; } @@ -1253,7 +1253,7 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) /* Migration could have started since the pmd_trans_migrating check */ if (!page_locked) { - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); wait_on_page_locked(page); page_nid = -1; goto out; @@ -1264,12 +1264,12 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) * to serialises splits */ get_page(page); - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); anon_vma = page_lock_anon_vma_read(page); /* Confirm the PMD did not change while page_table_lock was released */ - spin_lock(fe->ptl); - if (unlikely(!pmd_same(pmd, *fe->pmd))) { + spin_lock(vmf->ptl); + if (unlikely(!pmd_same(pmd, *vmf->pmd))) { unlock_page(page); put_page(page); page_nid = -1; @@ -1287,9 +1287,9 @@ int do_huge_pmd_numa_page(struct fault_env *fe, pmd_t pmd) * Migrate the THP to the requested node, returns with page unlocked * and access rights restored. */ - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); migrated = migrate_misplaced_transhuge_page(vma->vm_mm, vma, - fe->pmd, pmd, fe->address, page, target_nid); + vmf->pmd, pmd, vmf->address, page, target_nid); if (migrated) { flags |= TNF_MIGRATED; page_nid = target_nid; @@ -1304,18 +1304,19 @@ clear_pmdnuma: pmd = pmd_mkyoung(pmd); if (was_writable) pmd = pmd_mkwrite(pmd); - set_pmd_at(vma->vm_mm, haddr, fe->pmd, pmd); - update_mmu_cache_pmd(vma, fe->address, fe->pmd); + set_pmd_at(vma->vm_mm, haddr, vmf->pmd, pmd); + update_mmu_cache_pmd(vma, vmf->address, vmf->pmd); unlock_page(page); out_unlock: - spin_unlock(fe->ptl); + spin_unlock(vmf->ptl); out: if (anon_vma) page_unlock_anon_vma_read(anon_vma); if (page_nid != -1) - task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, fe->flags); + task_numa_fault(last_cpupid, page_nid, HPAGE_PMD_NR, + vmf->flags); return 0; } diff --git a/mm/internal.h b/mm/internal.h index 537ac9951f5f..093b1eacc91b 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -36,7 +36,7 @@ /* Do not use these with a slab allocator */ #define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK) -int do_swap_page(struct fault_env *fe, pte_t orig_pte); +int do_swap_page(struct vm_fault *vmf, pte_t orig_pte); void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 09460955e818..d950c2509161 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -877,7 +877,7 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, { pte_t pteval; int swapped_in = 0, ret = 0; - struct fault_env fe = { + struct vm_fault vmf = { .vma = vma, .address = address, .flags = FAULT_FLAG_ALLOW_RETRY, @@ -889,19 +889,19 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); return false; } - fe.pte = pte_offset_map(pmd, address); - for (; fe.address < address + HPAGE_PMD_NR*PAGE_SIZE; - fe.pte++, fe.address += PAGE_SIZE) { - pteval = *fe.pte; + vmf.pte = pte_offset_map(pmd, address); + for (; vmf.address < address + HPAGE_PMD_NR*PAGE_SIZE; + vmf.pte++, vmf.address += PAGE_SIZE) { + pteval = *vmf.pte; if (!is_swap_pte(pteval)) continue; swapped_in++; - ret = do_swap_page(&fe, pteval); + ret = do_swap_page(&vmf, pteval); /* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */ if (ret & VM_FAULT_RETRY) { down_read(&mm->mmap_sem); - if (hugepage_vma_revalidate(mm, address, &fe.vma)) { + if (hugepage_vma_revalidate(mm, address, &vmf.vma)) { /* vma is no longer available, don't continue to swapin */ trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); return false; @@ -915,10 +915,10 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, return false; } /* pte is unmapped now, we need to map it */ - fe.pte = pte_offset_map(pmd, fe.address); + vmf.pte = pte_offset_map(pmd, vmf.address); } - fe.pte--; - pte_unmap(fe.pte); + vmf.pte--; + pte_unmap(vmf.pte); trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1); return true; } diff --git a/mm/memory.c b/mm/memory.c index 3a6a1239c42b..512e1c359193 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2070,11 +2070,11 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, * case, all we need to do here is to mark the page as writable and update * any related book-keeping. */ -static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte, +static inline int wp_page_reuse(struct vm_fault *vmf, pte_t orig_pte, struct page *page, int page_mkwrite, int dirty_shared) - __releases(fe->ptl) + __releases(vmf->ptl) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; pte_t entry; /* * Clear the pages cpupid information as the existing @@ -2084,12 +2084,12 @@ static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte, if (page) page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1); - flush_cache_page(vma, fe->address, pte_pfn(orig_pte)); + flush_cache_page(vma, vmf->address, pte_pfn(orig_pte)); entry = pte_mkyoung(orig_pte); entry = maybe_mkwrite(pte_mkdirty(entry), vma); - if (ptep_set_access_flags(vma, fe->address, fe->pte, entry, 1)) - update_mmu_cache(vma, fe->address, fe->pte); - pte_unmap_unlock(fe->pte, fe->ptl); + if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1)) + update_mmu_cache(vma, vmf->address, vmf->pte); + pte_unmap_unlock(vmf->pte, vmf->ptl); if (dirty_shared) { struct address_space *mapping; @@ -2135,15 +2135,15 @@ static inline int wp_page_reuse(struct fault_env *fe, pte_t orig_pte, * held to the old page, as well as updating the rmap. * - In any case, unlock the PTL and drop the reference we took to the old page. */ -static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, +static int wp_page_copy(struct vm_fault *vmf, pte_t orig_pte, struct page *old_page) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; struct mm_struct *mm = vma->vm_mm; struct page *new_page = NULL; pte_t entry; int page_copied = 0; - const unsigned long mmun_start = fe->address & PAGE_MASK; + const unsigned long mmun_start = vmf->address & PAGE_MASK; const unsigned long mmun_end = mmun_start + PAGE_SIZE; struct mem_cgroup *memcg; @@ -2151,15 +2151,16 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, goto oom; if (is_zero_pfn(pte_pfn(orig_pte))) { - new_page = alloc_zeroed_user_highpage_movable(vma, fe->address); + new_page = alloc_zeroed_user_highpage_movable(vma, + vmf->address); if (!new_page) goto oom; } else { new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, - fe->address); + vmf->address); if (!new_page) goto oom; - cow_user_page(new_page, old_page, fe->address, vma); + cow_user_page(new_page, old_page, vmf->address, vma); } if (mem_cgroup_try_charge(new_page, mm, GFP_KERNEL, &memcg, false)) @@ -2172,8 +2173,8 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, /* * Re-check the pte - we dropped the lock */ - fe->pte = pte_offset_map_lock(mm, fe->pmd, fe->address, &fe->ptl); - if (likely(pte_same(*fe->pte, orig_pte))) { + vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl); + if (likely(pte_same(*vmf->pte, orig_pte))) { if (old_page) { if (!PageAnon(old_page)) { dec_mm_counter_fast(mm, @@ -2183,7 +2184,7 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, } else { inc_mm_counter_fast(mm, MM_ANONPAGES); } - flush_cache_page(vma, fe->address, pte_pfn(orig_pte)); + flush_cache_page(vma, vmf->address, pte_pfn(orig_pte)); entry = mk_pte(new_page, vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); /* @@ -2192,8 +2193,8 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, * seen in the presence of one thread doing SMC and another * thread doing COW. */ - ptep_clear_flush_notify(vma, fe->address, fe->pte); - page_add_new_anon_rmap(new_page, vma, fe->address, false); + ptep_clear_flush_notify(vma, vmf->address, vmf->pte); + page_add_new_anon_rmap(new_page, vma, vmf->address, false); mem_cgroup_commit_charge(new_page, memcg, false, false); lru_cache_add_active_or_unevictable(new_page, vma); /* @@ -2201,8 +2202,8 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, * mmu page tables (such as kvm shadow page tables), we want the * new page to be mapped directly into the secondary page table. */ - set_pte_at_notify(mm, fe->address, fe->pte, entry); - update_mmu_cache(vma, fe->address, fe->pte); + set_pte_at_notify(mm, vmf->address, vmf->pte, entry); + update_mmu_cache(vma, vmf->address, vmf->pte); if (old_page) { /* * Only after switching the pte to the new page may @@ -2239,7 +2240,7 @@ static int wp_page_copy(struct fault_env *fe, pte_t orig_pte, if (new_page) put_page(new_page); - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); if (old_page) { /* @@ -2267,43 +2268,43 @@ oom: * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED * mapping */ -static int wp_pfn_shared(struct fault_env *fe, pte_t orig_pte) +static int wp_pfn_shared(struct vm_fault *vmf, pte_t orig_pte) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; if (vma->vm_ops && vma->vm_ops->pfn_mkwrite) { - struct vm_fault vmf = { + struct vm_fault vmf2 = { .page = NULL, - .pgoff = linear_page_index(vma, fe->address), + .pgoff = linear_page_index(vma, vmf->address), .virtual_address = - (void __user *)(fe->address & PAGE_MASK), + (void __user *)(vmf->address & PAGE_MASK), .flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE, }; int ret; - pte_unmap_unlock(fe->pte, fe->ptl); - ret = vma->vm_ops->pfn_mkwrite(vma, &vmf); + pte_unmap_unlock(vmf->pte, vmf->ptl); + ret = vma->vm_ops->pfn_mkwrite(vma, &vmf2); if (ret & VM_FAULT_ERROR) return ret; - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, - &fe->ptl); + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, + vmf->address, &vmf->ptl); /* * We might have raced with another page fault while we * released the pte_offset_map_lock. */ - if (!pte_same(*fe->pte, orig_pte)) { - pte_unmap_unlock(fe->pte, fe->ptl); + if (!pte_same(*vmf->pte, orig_pte)) { + pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; } } - return wp_page_reuse(fe, orig_pte, NULL, 0, 0); + return wp_page_reuse(vmf, orig_pte, NULL, 0, 0); } -static int wp_page_shared(struct fault_env *fe, pte_t orig_pte, +static int wp_page_shared(struct vm_fault *vmf, pte_t orig_pte, struct page *old_page) - __releases(fe->ptl) + __releases(vmf->ptl) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; int page_mkwrite = 0; get_page(old_page); @@ -2311,8 +2312,8 @@ static int wp_page_shared(struct fault_env *fe, pte_t orig_pte, if (vma->vm_ops && vma->vm_ops->page_mkwrite) { int tmp; - pte_unmap_unlock(fe->pte, fe->ptl); - tmp = do_page_mkwrite(vma, old_page, fe->address); + pte_unmap_unlock(vmf->pte, vmf->ptl); + tmp = do_page_mkwrite(vma, old_page, vmf->address); if (unlikely(!tmp || (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { put_page(old_page); @@ -2324,18 +2325,18 @@ static int wp_page_shared(struct fault_env *fe, pte_t orig_pte, * they did, we just return, as we can count on the * MMU to tell us if they didn't also make it writable. */ - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, - &fe->ptl); - if (!pte_same(*fe->pte, orig_pte)) { + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, + vmf->address, &vmf->ptl); + if (!pte_same(*vmf->pte, orig_pte)) { unlock_page(old_page); - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); put_page(old_page); return 0; } page_mkwrite = 1; } - return wp_page_reuse(fe, orig_pte, old_page, page_mkwrite, 1); + return wp_page_reuse(vmf, orig_pte, old_page, page_mkwrite, 1); } /* @@ -2356,13 +2357,13 @@ static int wp_page_shared(struct fault_env *fe, pte_t orig_pte, * but allow concurrent faults), with pte both mapped and locked. * We return with mmap_sem still held, but pte unmapped and unlocked. */ -static int do_wp_page(struct fault_env *fe, pte_t orig_pte) - __releases(fe->ptl) +static int do_wp_page(struct vm_fault *vmf, pte_t orig_pte) + __releases(vmf->ptl) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; struct page *old_page; - old_page = vm_normal_page(vma, fe->address, orig_pte); + old_page = vm_normal_page(vma, vmf->address, orig_pte); if (!old_page) { /* * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a @@ -2373,10 +2374,10 @@ static int do_wp_page(struct fault_env *fe, pte_t orig_pte) */ if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED)) - return wp_pfn_shared(fe, orig_pte); + return wp_pfn_shared(vmf, orig_pte); - pte_unmap_unlock(fe->pte, fe->ptl); - return wp_page_copy(fe, orig_pte, old_page); + pte_unmap_unlock(vmf->pte, vmf->ptl); + return wp_page_copy(vmf, orig_pte, old_page); } /* @@ -2387,13 +2388,13 @@ static int do_wp_page(struct fault_env *fe, pte_t orig_pte) int total_mapcount; if (!trylock_page(old_page)) { get_page(old_page); - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); lock_page(old_page); - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, - fe->address, &fe->ptl); - if (!pte_same(*fe->pte, orig_pte)) { + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, + vmf->address, &vmf->ptl); + if (!pte_same(*vmf->pte, orig_pte)) { unlock_page(old_page); - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); put_page(old_page); return 0; } @@ -2411,12 +2412,12 @@ static int do_wp_page(struct fault_env *fe, pte_t orig_pte) page_move_anon_rmap(old_page, vma); } unlock_page(old_page); - return wp_page_reuse(fe, orig_pte, old_page, 0, 0); + return wp_page_reuse(vmf, orig_pte, old_page, 0, 0); } unlock_page(old_page); } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED))) { - return wp_page_shared(fe, orig_pte, old_page); + return wp_page_shared(vmf, orig_pte, old_page); } /* @@ -2424,8 +2425,8 @@ static int do_wp_page(struct fault_env *fe, pte_t orig_pte) */ get_page(old_page); - pte_unmap_unlock(fe->pte, fe->ptl); - return wp_page_copy(fe, orig_pte, old_page); + pte_unmap_unlock(vmf->pte, vmf->ptl); + return wp_page_copy(vmf, orig_pte, old_page); } static void unmap_mapping_range_vma(struct vm_area_struct *vma, @@ -2513,9 +2514,9 @@ EXPORT_SYMBOL(unmap_mapping_range); * We return with the mmap_sem locked or unlocked in the same cases * as does filemap_fault(). */ -int do_swap_page(struct fault_env *fe, pte_t orig_pte) +int do_swap_page(struct vm_fault *vmf, pte_t orig_pte) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; struct page *page, *swapcache; struct mem_cgroup *memcg; swp_entry_t entry; @@ -2524,17 +2525,18 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) int exclusive = 0; int ret = 0; - if (!pte_unmap_same(vma->vm_mm, fe->pmd, fe->pte, orig_pte)) + if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, orig_pte)) goto out; entry = pte_to_swp_entry(orig_pte); if (unlikely(non_swap_entry(entry))) { if (is_migration_entry(entry)) { - migration_entry_wait(vma->vm_mm, fe->pmd, fe->address); + migration_entry_wait(vma->vm_mm, vmf->pmd, + vmf->address); } else if (is_hwpoison_entry(entry)) { ret = VM_FAULT_HWPOISON; } else { - print_bad_pte(vma, fe->address, orig_pte, NULL); + print_bad_pte(vma, vmf->address, orig_pte, NULL); ret = VM_FAULT_SIGBUS; } goto out; @@ -2542,16 +2544,16 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) delayacct_set_flag(DELAYACCT_PF_SWAPIN); page = lookup_swap_cache(entry); if (!page) { - page = swapin_readahead(entry, - GFP_HIGHUSER_MOVABLE, vma, fe->address); + page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vma, + vmf->address); if (!page) { /* * Back out if somebody else faulted in this pte * while we released the pte lock. */ - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, - fe->address, &fe->ptl); - if (likely(pte_same(*fe->pte, orig_pte))) + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, + vmf->address, &vmf->ptl); + if (likely(pte_same(*vmf->pte, orig_pte))) ret = VM_FAULT_OOM; delayacct_clear_flag(DELAYACCT_PF_SWAPIN); goto unlock; @@ -2573,7 +2575,7 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) } swapcache = page; - locked = lock_page_or_retry(page, vma->vm_mm, fe->flags); + locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags); delayacct_clear_flag(DELAYACCT_PF_SWAPIN); if (!locked) { @@ -2590,7 +2592,7 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) if (unlikely(!PageSwapCache(page) || page_private(page) != entry.val)) goto out_page; - page = ksm_might_need_to_copy(page, vma, fe->address); + page = ksm_might_need_to_copy(page, vma, vmf->address); if (unlikely(!page)) { ret = VM_FAULT_OOM; page = swapcache; @@ -2606,9 +2608,9 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) /* * Back out if somebody else already faulted in this pte. */ - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, - &fe->ptl); - if (unlikely(!pte_same(*fe->pte, orig_pte))) + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, + &vmf->ptl); + if (unlikely(!pte_same(*vmf->pte, orig_pte))) goto out_nomap; if (unlikely(!PageUptodate(page))) { @@ -2629,22 +2631,22 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS); pte = mk_pte(page, vma->vm_page_prot); - if ((fe->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) { + if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); - fe->flags &= ~FAULT_FLAG_WRITE; + vmf->flags &= ~FAULT_FLAG_WRITE; ret |= VM_FAULT_WRITE; exclusive = RMAP_EXCLUSIVE; } flush_icache_page(vma, page); if (pte_swp_soft_dirty(orig_pte)) pte = pte_mksoft_dirty(pte); - set_pte_at(vma->vm_mm, fe->address, fe->pte, pte); + set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); if (page == swapcache) { - do_page_add_anon_rmap(page, vma, fe->address, exclusive); + do_page_add_anon_rmap(page, vma, vmf->address, exclusive); mem_cgroup_commit_charge(page, memcg, true, false); activate_page(page); } else { /* ksm created a completely new copy */ - page_add_new_anon_rmap(page, vma, fe->address, false); + page_add_new_anon_rmap(page, vma, vmf->address, false); mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); } @@ -2667,22 +2669,22 @@ int do_swap_page(struct fault_env *fe, pte_t orig_pte) put_page(swapcache); } - if (fe->flags & FAULT_FLAG_WRITE) { - ret |= do_wp_page(fe, pte); + if (vmf->flags & FAULT_FLAG_WRITE) { + ret |= do_wp_page(vmf, pte); if (ret & VM_FAULT_ERROR) ret &= VM_FAULT_ERROR; goto out; } /* No need to invalidate - it was non-present before */ - update_mmu_cache(vma, fe->address, fe->pte); + update_mmu_cache(vma, vmf->address, vmf->pte); unlock: - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); out: return ret; out_nomap: mem_cgroup_cancel_charge(page, memcg, false); - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); out_page: unlock_page(page); out_release: @@ -2733,9 +2735,9 @@ static inline int check_stack_guard_page(struct vm_area_struct *vma, unsigned lo * but allow concurrent faults), and pte mapped but not yet locked. * We return with mmap_sem still held, but pte unmapped and unlocked. */ -static int do_anonymous_page(struct fault_env *fe) +static int do_anonymous_page(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; struct mem_cgroup *memcg; struct page *page; pte_t entry; @@ -2745,7 +2747,7 @@ static int do_anonymous_page(struct fault_env *fe) return VM_FAULT_SIGBUS; /* Check if we need to add a guard page to the stack */ - if (check_stack_guard_page(vma, fe->address) < 0) + if (check_stack_guard_page(vma, vmf->address) < 0) return VM_FAULT_SIGSEGV; /* @@ -2758,26 +2760,26 @@ static int do_anonymous_page(struct fault_env *fe) * * Here we only have down_read(mmap_sem). */ - if (pte_alloc(vma->vm_mm, fe->pmd, fe->address)) + if (pte_alloc(vma->vm_mm, vmf->pmd, vmf->address)) return VM_FAULT_OOM; /* See the comment in pte_alloc_one_map() */ - if (unlikely(pmd_trans_unstable(fe->pmd))) + if (unlikely(pmd_trans_unstable(vmf->pmd))) return 0; /* Use the zero-page for reads */ - if (!(fe->flags & FAULT_FLAG_WRITE) && + if (!(vmf->flags & FAULT_FLAG_WRITE) && !mm_forbids_zeropage(vma->vm_mm)) { - entry = pte_mkspecial(pfn_pte(my_zero_pfn(fe->address), + entry = pte_mkspecial(pfn_pte(my_zero_pfn(vmf->address), vma->vm_page_prot)); - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, - &fe->ptl); - if (!pte_none(*fe->pte)) + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, + vmf->address, &vmf->ptl); + if (!pte_none(*vmf->pte)) goto unlock; /* Deliver the page fault to userland, check inside PT lock */ if (userfaultfd_missing(vma)) { - pte_unmap_unlock(fe->pte, fe->ptl); - return handle_userfault(fe, VM_UFFD_MISSING); + pte_unmap_unlock(vmf->pte, vmf->ptl); + return handle_userfault(vmf, VM_UFFD_MISSING); } goto setpte; } @@ -2785,7 +2787,7 @@ static int do_anonymous_page(struct fault_env *fe) /* Allocate our own private page. */ if (unlikely(anon_vma_prepare(vma))) goto oom; - page = alloc_zeroed_user_highpage_movable(vma, fe->address); + page = alloc_zeroed_user_highpage_movable(vma, vmf->address); if (!page) goto oom; @@ -2803,30 +2805,30 @@ static int do_anonymous_page(struct fault_env *fe) if (vma->vm_flags & VM_WRITE) entry = pte_mkwrite(pte_mkdirty(entry)); - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, - &fe->ptl); - if (!pte_none(*fe->pte)) + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, + &vmf->ptl); + if (!pte_none(*vmf->pte)) goto release; /* Deliver the page fault to userland, check inside PT lock */ if (userfaultfd_missing(vma)) { - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); mem_cgroup_cancel_charge(page, memcg, false); put_page(page); - return handle_userfault(fe, VM_UFFD_MISSING); + return handle_userfault(vmf, VM_UFFD_MISSING); } inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, fe->address, false); + page_add_new_anon_rmap(page, vma, vmf->address, false); mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); setpte: - set_pte_at(vma->vm_mm, fe->address, fe->pte, entry); + set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); /* No need to invalidate - it was non-present before */ - update_mmu_cache(vma, fe->address, fe->pte); + update_mmu_cache(vma, vmf->address, vmf->pte); unlock: - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; release: mem_cgroup_cancel_charge(page, memcg, false); @@ -2843,62 +2845,62 @@ oom: * released depending on flags and vma->vm_ops->fault() return value. * See filemap_fault() and __lock_page_retry(). */ -static int __do_fault(struct fault_env *fe, pgoff_t pgoff, +static int __do_fault(struct vm_fault *vmf, pgoff_t pgoff, struct page *cow_page, struct page **page, void **entry) { - struct vm_area_struct *vma = fe->vma; - struct vm_fault vmf; + struct vm_area_struct *vma = vmf->vma; + struct vm_fault vmf2; int ret; - vmf.virtual_address = (void __user *)(fe->address & PAGE_MASK); - vmf.pgoff = pgoff; - vmf.flags = fe->flags; - vmf.page = NULL; - vmf.gfp_mask = __get_fault_gfp_mask(vma); - vmf.cow_page = cow_page; + vmf2.virtual_address = (void __user *)(vmf->address & PAGE_MASK); + vmf2.pgoff = pgoff; + vmf2.flags = vmf->flags; + vmf2.page = NULL; + vmf2.gfp_mask = __get_fault_gfp_mask(vma); + vmf2.cow_page = cow_page; - ret = vma->vm_ops->fault(vma, &vmf); + ret = vma->vm_ops->fault(vma, &vmf2); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; if (ret & VM_FAULT_DAX_LOCKED) { - *entry = vmf.entry; + *entry = vmf2.entry; return ret; } - if (unlikely(PageHWPoison(vmf.page))) { + if (unlikely(PageHWPoison(vmf2.page))) { if (ret & VM_FAULT_LOCKED) - unlock_page(vmf.page); - put_page(vmf.page); + unlock_page(vmf2.page); + put_page(vmf2.page); return VM_FAULT_HWPOISON; } if (unlikely(!(ret & VM_FAULT_LOCKED))) - lock_page(vmf.page); + lock_page(vmf2.page); else - VM_BUG_ON_PAGE(!PageLocked(vmf.page), vmf.page); + VM_BUG_ON_PAGE(!PageLocked(vmf2.page), vmf2.page); - *page = vmf.page; + *page = vmf2.page; return ret; } -static int pte_alloc_one_map(struct fault_env *fe) +static int pte_alloc_one_map(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; - if (!pmd_none(*fe->pmd)) + if (!pmd_none(*vmf->pmd)) goto map_pte; - if (fe->prealloc_pte) { - fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); - if (unlikely(!pmd_none(*fe->pmd))) { - spin_unlock(fe->ptl); + if (vmf->prealloc_pte) { + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + if (unlikely(!pmd_none(*vmf->pmd))) { + spin_unlock(vmf->ptl); goto map_pte; } atomic_long_inc(&vma->vm_mm->nr_ptes); - pmd_populate(vma->vm_mm, fe->pmd, fe->prealloc_pte); - spin_unlock(fe->ptl); - fe->prealloc_pte = 0; - } else if (unlikely(pte_alloc(vma->vm_mm, fe->pmd, fe->address))) { + pmd_populate(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); + spin_unlock(vmf->ptl); + vmf->prealloc_pte = 0; + } else if (unlikely(pte_alloc(vma->vm_mm, vmf->pmd, vmf->address))) { return VM_FAULT_OOM; } map_pte: @@ -2913,11 +2915,11 @@ map_pte: * through an atomic read in C, which is what pmd_trans_unstable() * provides. */ - if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd)) + if (pmd_trans_unstable(vmf->pmd) || pmd_devmap(*vmf->pmd)) return VM_FAULT_NOPAGE; - fe->pte = pte_offset_map_lock(vma->vm_mm, fe->pmd, fe->address, - &fe->ptl); + vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, + &vmf->ptl); return 0; } @@ -2935,24 +2937,24 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, return true; } -static void deposit_prealloc_pte(struct fault_env *fe) +static void deposit_prealloc_pte(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; - pgtable_trans_huge_deposit(vma->vm_mm, fe->pmd, fe->prealloc_pte); + pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, vmf->prealloc_pte); /* * We are going to consume the prealloc table, * count that as nr_ptes. */ atomic_long_inc(&vma->vm_mm->nr_ptes); - fe->prealloc_pte = 0; + vmf->prealloc_pte = 0; } -static int do_set_pmd(struct fault_env *fe, struct page *page) +static int do_set_pmd(struct vm_fault *vmf, struct page *page) { - struct vm_area_struct *vma = fe->vma; - bool write = fe->flags & FAULT_FLAG_WRITE; - unsigned long haddr = fe->address & HPAGE_PMD_MASK; + struct vm_area_struct *vma = vmf->vma; + bool write = vmf->flags & FAULT_FLAG_WRITE; + unsigned long haddr = vmf->address & HPAGE_PMD_MASK; pmd_t entry; int i, ret; @@ -2966,15 +2968,15 @@ static int do_set_pmd(struct fault_env *fe, struct page *page) * Archs like ppc64 need additonal space to store information * related to pte entry. Use the preallocated table for that. */ - if (arch_needs_pgtable_deposit() && !fe->prealloc_pte) { - fe->prealloc_pte = pte_alloc_one(vma->vm_mm, fe->address); - if (!fe->prealloc_pte) + if (arch_needs_pgtable_deposit() && !vmf->prealloc_pte) { + vmf->prealloc_pte = pte_alloc_one(vma->vm_mm, vmf->address); + if (!vmf->prealloc_pte) return VM_FAULT_OOM; smp_wmb(); /* See comment in __pte_alloc() */ } - fe->ptl = pmd_lock(vma->vm_mm, fe->pmd); - if (unlikely(!pmd_none(*fe->pmd))) + vmf->ptl = pmd_lock(vma->vm_mm, vmf->pmd); + if (unlikely(!pmd_none(*vmf->pmd))) goto out; for (i = 0; i < HPAGE_PMD_NR; i++) @@ -2990,11 +2992,11 @@ static int do_set_pmd(struct fault_env *fe, struct page *page) * deposit and withdraw with pmd lock held */ if (arch_needs_pgtable_deposit()) - deposit_prealloc_pte(fe); + deposit_prealloc_pte(vmf); - set_pmd_at(vma->vm_mm, haddr, fe->pmd, entry); + set_pmd_at(vma->vm_mm, haddr, vmf->pmd, entry); - update_mmu_cache_pmd(vma, haddr, fe->pmd); + update_mmu_cache_pmd(vma, haddr, vmf->pmd); /* fault is handled */ ret = 0; @@ -3005,13 +3007,13 @@ out: * withdraw with pmd lock held. */ if (arch_needs_pgtable_deposit() && ret == VM_FAULT_FALLBACK) - fe->prealloc_pte = pgtable_trans_huge_withdraw(vma->vm_mm, - fe->pmd); - spin_unlock(fe->ptl); + vmf->prealloc_pte = pgtable_trans_huge_withdraw(vma->vm_mm, + vmf->pmd); + spin_unlock(vmf->ptl); return ret; } #else -static int do_set_pmd(struct fault_env *fe, struct page *page) +static int do_set_pmd(struct vm_fault *vmf, struct page *page) { BUILD_BUG(); return 0; @@ -3022,41 +3024,42 @@ static int do_set_pmd(struct fault_env *fe, struct page *page) * alloc_set_pte - setup new PTE entry for given page and add reverse page * mapping. If needed, the fucntion allocates page table or use pre-allocated. * - * @fe: fault environment + * @vmf: fault environment * @memcg: memcg to charge page (only for private mappings) * @page: page to map * - * Caller must take care of unlocking fe->ptl, if fe->pte is non-NULL on return. + * Caller must take care of unlocking vmf->ptl, if vmf->pte is non-NULL on + * return. * * Target users are page handler itself and implementations of * vm_ops->map_pages. */ -int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg, +int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, struct page *page) { - struct vm_area_struct *vma = fe->vma; - bool write = fe->flags & FAULT_FLAG_WRITE; + struct vm_area_struct *vma = vmf->vma; + bool write = vmf->flags & FAULT_FLAG_WRITE; pte_t entry; int ret; - if (pmd_none(*fe->pmd) && PageTransCompound(page) && + if (pmd_none(*vmf->pmd) && PageTransCompound(page) && IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE)) { /* THP on COW? */ VM_BUG_ON_PAGE(memcg, page); - ret = do_set_pmd(fe, page); + ret = do_set_pmd(vmf, page); if (ret != VM_FAULT_FALLBACK) goto fault_handled; } - if (!fe->pte) { - ret = pte_alloc_one_map(fe); + if (!vmf->pte) { + ret = pte_alloc_one_map(vmf); if (ret) goto fault_handled; } /* Re-check under ptl */ - if (unlikely(!pte_none(*fe->pte))) { + if (unlikely(!pte_none(*vmf->pte))) { ret = VM_FAULT_NOPAGE; goto fault_handled; } @@ -3068,24 +3071,24 @@ int alloc_set_pte(struct fault_env *fe, struct mem_cgroup *memcg, /* copy-on-write page */ if (write && !(vma->vm_flags & VM_SHARED)) { inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); - page_add_new_anon_rmap(page, vma, fe->address, false); + page_add_new_anon_rmap(page, vma, vmf->address, false); mem_cgroup_commit_charge(page, memcg, false, false); lru_cache_add_active_or_unevictable(page, vma); } else { inc_mm_counter_fast(vma->vm_mm, mm_counter_file(page)); page_add_file_rmap(page, false); } - set_pte_at(vma->vm_mm, fe->address, fe->pte, entry); + set_pte_at(vma->vm_mm, vmf->address, vmf->pte, entry); /* no need to invalidate: a not-present page won't be cached */ - update_mmu_cache(vma, fe->address, fe->pte); + update_mmu_cache(vma, vmf->address, vmf->pte); ret = 0; fault_handled: /* preallocated pagetable is unused: free it */ - if (fe->prealloc_pte) { - pte_free(fe->vma->vm_mm, fe->prealloc_pte); - fe->prealloc_pte = 0; + if (vmf->prealloc_pte) { + pte_free(vmf->vma->vm_mm, vmf->prealloc_pte); + vmf->prealloc_pte = 0; } return ret; } @@ -3154,17 +3157,17 @@ late_initcall(fault_around_debugfs); * fault_around_pages() value (and therefore to page order). This way it's * easier to guarantee that we don't cross page table boundaries. */ -static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff) +static int do_fault_around(struct vm_fault *vmf, pgoff_t start_pgoff) { - unsigned long address = fe->address, nr_pages, mask; + unsigned long address = vmf->address, nr_pages, mask; pgoff_t end_pgoff; int off, ret = 0; nr_pages = READ_ONCE(fault_around_bytes) >> PAGE_SHIFT; mask = ~(nr_pages * PAGE_SIZE - 1) & PAGE_MASK; - fe->address = max(address & mask, fe->vma->vm_start); - off = ((address - fe->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); + vmf->address = max(address & mask, vmf->vma->vm_start); + off = ((address - vmf->address) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1); start_pgoff -= off; /* @@ -3172,44 +3175,45 @@ static int do_fault_around(struct fault_env *fe, pgoff_t start_pgoff) * or fault_around_pages() from start_pgoff, depending what is nearest. */ end_pgoff = start_pgoff - - ((fe->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + + ((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + PTRS_PER_PTE - 1; - end_pgoff = min3(end_pgoff, vma_pages(fe->vma) + fe->vma->vm_pgoff - 1, + end_pgoff = min3(end_pgoff, vma_pages(vmf->vma) + vmf->vma->vm_pgoff - 1, start_pgoff + nr_pages - 1); - if (pmd_none(*fe->pmd)) { - fe->prealloc_pte = pte_alloc_one(fe->vma->vm_mm, fe->address); - if (!fe->prealloc_pte) + if (pmd_none(*vmf->pmd)) { + vmf->prealloc_pte = pte_alloc_one(vmf->vma->vm_mm, + vmf->address); + if (!vmf->prealloc_pte) goto out; smp_wmb(); /* See comment in __pte_alloc() */ } - fe->vma->vm_ops->map_pages(fe, start_pgoff, end_pgoff); + vmf->vma->vm_ops->map_pages(vmf, start_pgoff, end_pgoff); /* Huge page is mapped? Page fault is solved */ - if (pmd_trans_huge(*fe->pmd)) { + if (pmd_trans_huge(*vmf->pmd)) { ret = VM_FAULT_NOPAGE; goto out; } /* ->map_pages() haven't done anything useful. Cold page cache? */ - if (!fe->pte) + if (!vmf->pte) goto out; /* check if the page fault is solved */ - fe->pte -= (fe->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT); - if (!pte_none(*fe->pte)) + vmf->pte -= (vmf->address >> PAGE_SHIFT) - (address >> PAGE_SHIFT); + if (!pte_none(*vmf->pte)) ret = VM_FAULT_NOPAGE; - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); out: - fe->address = address; - fe->pte = NULL; + vmf->address = address; + vmf->pte = NULL; return ret; } -static int do_read_fault(struct fault_env *fe, pgoff_t pgoff) +static int do_read_fault(struct vm_fault *vmf, pgoff_t pgoff) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; struct page *fault_page; int ret = 0; @@ -3219,27 +3223,27 @@ static int do_read_fault(struct fault_env *fe, pgoff_t pgoff) * something). */ if (vma->vm_ops->map_pages && fault_around_bytes >> PAGE_SHIFT > 1) { - ret = do_fault_around(fe, pgoff); + ret = do_fault_around(vmf, pgoff); if (ret) return ret; } - ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL); + ret = __do_fault(vmf, pgoff, NULL, &fault_page, NULL); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; - ret |= alloc_set_pte(fe, NULL, fault_page); - if (fe->pte) - pte_unmap_unlock(fe->pte, fe->ptl); + ret |= alloc_set_pte(vmf, NULL, fault_page); + if (vmf->pte) + pte_unmap_unlock(vmf->pte, vmf->ptl); unlock_page(fault_page); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) put_page(fault_page); return ret; } -static int do_cow_fault(struct fault_env *fe, pgoff_t pgoff) +static int do_cow_fault(struct vm_fault *vmf, pgoff_t pgoff) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; struct page *fault_page, *new_page; void *fault_entry; struct mem_cgroup *memcg; @@ -3248,7 +3252,7 @@ static int do_cow_fault(struct fault_env *fe, pgoff_t pgoff) if (unlikely(anon_vma_prepare(vma))) return VM_FAULT_OOM; - new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, fe->address); + new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, vmf->address); if (!new_page) return VM_FAULT_OOM; @@ -3258,17 +3262,17 @@ static int do_cow_fault(struct fault_env *fe, pgoff_t pgoff) return VM_FAULT_OOM; } - ret = __do_fault(fe, pgoff, new_page, &fault_page, &fault_entry); + ret = __do_fault(vmf, pgoff, new_page, &fault_page, &fault_entry); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) goto uncharge_out; if (!(ret & VM_FAULT_DAX_LOCKED)) - copy_user_highpage(new_page, fault_page, fe->address, vma); + copy_user_highpage(new_page, fault_page, vmf->address, vma); __SetPageUptodate(new_page); - ret |= alloc_set_pte(fe, memcg, new_page); - if (fe->pte) - pte_unmap_unlock(fe->pte, fe->ptl); + ret |= alloc_set_pte(vmf, memcg, new_page); + if (vmf->pte) + pte_unmap_unlock(vmf->pte, vmf->ptl); if (!(ret & VM_FAULT_DAX_LOCKED)) { unlock_page(fault_page); put_page(fault_page); @@ -3284,15 +3288,15 @@ uncharge_out: return ret; } -static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff) +static int do_shared_fault(struct vm_fault *vmf, pgoff_t pgoff) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; struct page *fault_page; struct address_space *mapping; int dirtied = 0; int ret, tmp; - ret = __do_fault(fe, pgoff, NULL, &fault_page, NULL); + ret = __do_fault(vmf, pgoff, NULL, &fault_page, NULL); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; @@ -3302,7 +3306,7 @@ static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff) */ if (vma->vm_ops->page_mkwrite) { unlock_page(fault_page); - tmp = do_page_mkwrite(vma, fault_page, fe->address); + tmp = do_page_mkwrite(vma, fault_page, vmf->address); if (unlikely(!tmp || (tmp & (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { put_page(fault_page); @@ -3310,9 +3314,9 @@ static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff) } } - ret |= alloc_set_pte(fe, NULL, fault_page); - if (fe->pte) - pte_unmap_unlock(fe->pte, fe->ptl); + ret |= alloc_set_pte(vmf, NULL, fault_page); + if (vmf->pte) + pte_unmap_unlock(vmf->pte, vmf->ptl); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) { unlock_page(fault_page); @@ -3350,19 +3354,19 @@ static int do_shared_fault(struct fault_env *fe, pgoff_t pgoff) * The mmap_sem may have been released depending on flags and our * return value. See filemap_fault() and __lock_page_or_retry(). */ -static int do_fault(struct fault_env *fe) +static int do_fault(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; - pgoff_t pgoff = linear_page_index(vma, fe->address); + struct vm_area_struct *vma = vmf->vma; + pgoff_t pgoff = linear_page_index(vma, vmf->address); /* The VMA was not fully populated on mmap() or missing VM_DONTEXPAND */ if (!vma->vm_ops->fault) return VM_FAULT_SIGBUS; - if (!(fe->flags & FAULT_FLAG_WRITE)) - return do_read_fault(fe, pgoff); + if (!(vmf->flags & FAULT_FLAG_WRITE)) + return do_read_fault(vmf, pgoff); if (!(vma->vm_flags & VM_SHARED)) - return do_cow_fault(fe, pgoff); - return do_shared_fault(fe, pgoff); + return do_cow_fault(vmf, pgoff); + return do_shared_fault(vmf, pgoff); } static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, @@ -3380,9 +3384,9 @@ static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, return mpol_misplaced(page, vma, addr); } -static int do_numa_page(struct fault_env *fe, pte_t pte) +static int do_numa_page(struct vm_fault *vmf, pte_t pte) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; struct page *page = NULL; int page_nid = -1; int last_cpupid; @@ -3400,10 +3404,10 @@ static int do_numa_page(struct fault_env *fe, pte_t pte) * page table entry is not accessible, so there would be no * concurrent hardware modifications to the PTE. */ - fe->ptl = pte_lockptr(vma->vm_mm, fe->pmd); - spin_lock(fe->ptl); - if (unlikely(!pte_same(*fe->pte, pte))) { - pte_unmap_unlock(fe->pte, fe->ptl); + vmf->ptl = pte_lockptr(vma->vm_mm, vmf->pmd); + spin_lock(vmf->ptl); + if (unlikely(!pte_same(*vmf->pte, pte))) { + pte_unmap_unlock(vmf->pte, vmf->ptl); goto out; } @@ -3412,18 +3416,18 @@ static int do_numa_page(struct fault_env *fe, pte_t pte) pte = pte_mkyoung(pte); if (was_writable) pte = pte_mkwrite(pte); - set_pte_at(vma->vm_mm, fe->address, fe->pte, pte); - update_mmu_cache(vma, fe->address, fe->pte); + set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); + update_mmu_cache(vma, vmf->address, vmf->pte); - page = vm_normal_page(vma, fe->address, pte); + page = vm_normal_page(vma, vmf->address, pte); if (!page) { - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; } /* TODO: handle PTE-mapped THP */ if (PageCompound(page)) { - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; } @@ -3447,9 +3451,9 @@ static int do_numa_page(struct fault_env *fe, pte_t pte) last_cpupid = page_cpupid_last(page); page_nid = page_to_nid(page); - target_nid = numa_migrate_prep(page, vma, fe->address, page_nid, + target_nid = numa_migrate_prep(page, vma, vmf->address, page_nid, &flags); - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); if (target_nid == -1) { put_page(page); goto out; @@ -3469,28 +3473,28 @@ out: return 0; } -static int create_huge_pmd(struct fault_env *fe) +static int create_huge_pmd(struct vm_fault *vmf) { - struct vm_area_struct *vma = fe->vma; + struct vm_area_struct *vma = vmf->vma; if (vma_is_anonymous(vma)) - return do_huge_pmd_anonymous_page(fe); + return do_huge_pmd_anonymous_page(vmf); if (vma->vm_ops->pmd_fault) - return vma->vm_ops->pmd_fault(vma, fe->address, fe->pmd, - fe->flags); + return vma->vm_ops->pmd_fault(vma, vmf->address, vmf->pmd, + vmf->flags); return VM_FAULT_FALLBACK; } -static int wp_huge_pmd(struct fault_env *fe, pmd_t orig_pmd) +static int wp_huge_pmd(struct vm_fault *vmf, pmd_t orig_pmd) { - if (vma_is_anonymous(fe->vma)) - return do_huge_pmd_wp_page(fe, orig_pmd); - if (fe->vma->vm_ops->pmd_fault) - return fe->vma->vm_ops->pmd_fault(fe->vma, fe->address, fe->pmd, - fe->flags); + if (vma_is_anonymous(vmf->vma)) + return do_huge_pmd_wp_page(vmf, orig_pmd); + if (vmf->vma->vm_ops->pmd_fault) + return vmf->vma->vm_ops->pmd_fault(vmf->vma, vmf->address, + vmf->pmd, vmf->flags); /* COW handled on pte level: split pmd */ - VM_BUG_ON_VMA(fe->vma->vm_flags & VM_SHARED, fe->vma); - __split_huge_pmd(fe->vma, fe->pmd, fe->address, false, NULL); + VM_BUG_ON_VMA(vmf->vma->vm_flags & VM_SHARED, vmf->vma); + __split_huge_pmd(vmf->vma, vmf->pmd, vmf->address, false, NULL); return VM_FAULT_FALLBACK; } @@ -3515,21 +3519,21 @@ static inline bool vma_is_accessible(struct vm_area_struct *vma) * The mmap_sem may have been released depending on flags and our return value. * See filemap_fault() and __lock_page_or_retry(). */ -static int handle_pte_fault(struct fault_env *fe) +static int handle_pte_fault(struct vm_fault *vmf) { pte_t entry; - if (unlikely(pmd_none(*fe->pmd))) { + if (unlikely(pmd_none(*vmf->pmd))) { /* * Leave __pte_alloc() until later: because vm_ops->fault may * want to allocate huge page, and if we expose page table * for an instant, it will be difficult to retract from * concurrent faults and from rmap lookups. */ - fe->pte = NULL; + vmf->pte = NULL; } else { /* See comment in pte_alloc_one_map() */ - if (pmd_trans_unstable(fe->pmd) || pmd_devmap(*fe->pmd)) + if (pmd_trans_unstable(vmf->pmd) || pmd_devmap(*vmf->pmd)) return 0; /* * A regular pmd is established and it can't morph into a huge @@ -3537,9 +3541,9 @@ static int handle_pte_fault(struct fault_env *fe) * mmap_sem read mode and khugepaged takes it in write mode. * So now it's safe to run pte_offset_map(). */ - fe->pte = pte_offset_map(fe->pmd, fe->address); + vmf->pte = pte_offset_map(vmf->pmd, vmf->address); - entry = *fe->pte; + entry = *vmf->pte; /* * some architectures can have larger ptes than wordsize, @@ -3551,37 +3555,37 @@ static int handle_pte_fault(struct fault_env *fe) */ barrier(); if (pte_none(entry)) { - pte_unmap(fe->pte); - fe->pte = NULL; + pte_unmap(vmf->pte); + vmf->pte = NULL; } } - if (!fe->pte) { - if (vma_is_anonymous(fe->vma)) - return do_anonymous_page(fe); + if (!vmf->pte) { + if (vma_is_anonymous(vmf->vma)) + return do_anonymous_page(vmf); else - return do_fault(fe); + return do_fault(vmf); } if (!pte_present(entry)) - return do_swap_page(fe, entry); + return do_swap_page(vmf, entry); - if (pte_protnone(entry) && vma_is_accessible(fe->vma)) - return do_numa_page(fe, entry); + if (pte_protnone(entry) && vma_is_accessible(vmf->vma)) + return do_numa_page(vmf, entry); - fe->ptl = pte_lockptr(fe->vma->vm_mm, fe->pmd); - spin_lock(fe->ptl); - if (unlikely(!pte_same(*fe->pte, entry))) + vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd); + spin_lock(vmf->ptl); + if (unlikely(!pte_same(*vmf->pte, entry))) goto unlock; - if (fe->flags & FAULT_FLAG_WRITE) { + if (vmf->flags & FAULT_FLAG_WRITE) { if (!pte_write(entry)) - return do_wp_page(fe, entry); + return do_wp_page(vmf, entry); entry = pte_mkdirty(entry); } entry = pte_mkyoung(entry); - if (ptep_set_access_flags(fe->vma, fe->address, fe->pte, entry, - fe->flags & FAULT_FLAG_WRITE)) { - update_mmu_cache(fe->vma, fe->address, fe->pte); + if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry, + vmf->flags & FAULT_FLAG_WRITE)) { + update_mmu_cache(vmf->vma, vmf->address, vmf->pte); } else { /* * This is needed only for protection faults but the arch code @@ -3589,11 +3593,11 @@ static int handle_pte_fault(struct fault_env *fe) * This still avoids useless tlb flushes for .text page faults * with threads. */ - if (fe->flags & FAULT_FLAG_WRITE) - flush_tlb_fix_spurious_fault(fe->vma, fe->address); + if (vmf->flags & FAULT_FLAG_WRITE) + flush_tlb_fix_spurious_fault(vmf->vma, vmf->address); } unlock: - pte_unmap_unlock(fe->pte, fe->ptl); + pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; } @@ -3606,7 +3610,7 @@ unlock: static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags) { - struct fault_env fe = { + struct vm_fault vmf = { .vma = vma, .address = address, .flags = flags, @@ -3619,35 +3623,35 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, pud = pud_alloc(mm, pgd, address); if (!pud) return VM_FAULT_OOM; - fe.pmd = pmd_alloc(mm, pud, address); - if (!fe.pmd) + vmf.pmd = pmd_alloc(mm, pud, address); + if (!vmf.pmd) return VM_FAULT_OOM; - if (pmd_none(*fe.pmd) && transparent_hugepage_enabled(vma)) { - int ret = create_huge_pmd(&fe); + if (pmd_none(*vmf.pmd) && transparent_hugepage_enabled(vma)) { + int ret = create_huge_pmd(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; } else { - pmd_t orig_pmd = *fe.pmd; + pmd_t orig_pmd = *vmf.pmd; int ret; barrier(); if (pmd_trans_huge(orig_pmd) || pmd_devmap(orig_pmd)) { if (pmd_protnone(orig_pmd) && vma_is_accessible(vma)) - return do_huge_pmd_numa_page(&fe, orig_pmd); + return do_huge_pmd_numa_page(&vmf, orig_pmd); - if ((fe.flags & FAULT_FLAG_WRITE) && + if ((vmf.flags & FAULT_FLAG_WRITE) && !pmd_write(orig_pmd)) { - ret = wp_huge_pmd(&fe, orig_pmd); + ret = wp_huge_pmd(&vmf, orig_pmd); if (!(ret & VM_FAULT_FALLBACK)) return ret; } else { - huge_pmd_set_accessed(&fe, orig_pmd); + huge_pmd_set_accessed(&vmf, orig_pmd); return 0; } } } - return handle_pte_fault(&fe); + return handle_pte_fault(&vmf); } /* diff --git a/mm/nommu.c b/mm/nommu.c index c299a7fbca70..9902d811ff4f 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1801,7 +1801,7 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } EXPORT_SYMBOL(filemap_fault); -void filemap_map_pages(struct fault_env *fe, +void filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff) { BUG(); -- cgit v1.2.3 From 1a29d85eb0f19b7d8271923d8917d7b4f5540b3e Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 14 Dec 2016 15:07:01 -0800 Subject: mm: use vmf->address instead of of vmf->virtual_address Every single user of vmf->virtual_address typed that entry to unsigned long before doing anything with it so the type of virtual_address does not really provide us any additional safety. Just use masked vmf->address which already has the appropriate type. Link: http://lkml.kernel.org/r/1479460644-25076-3-git-send-email-jack@suse.cz Signed-off-by: Jan Kara Acked-by: Kirill A. Shutemov Cc: Dan Williams Cc: Ross Zwisler Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/platforms/cell/spufs/file.c | 8 +++----- arch/x86/entry/vdso/vma.c | 4 ++-- drivers/char/agp/alpha-agp.c | 3 +-- drivers/char/mspec.c | 2 +- drivers/dax/dax.c | 3 +-- drivers/gpu/drm/armada/armada_gem.c | 5 ++--- drivers/gpu/drm/drm_vm.c | 10 +++++----- drivers/gpu/drm/etnaviv/etnaviv_gem.c | 7 +++---- drivers/gpu/drm/exynos/exynos_drm_gem.c | 6 ++---- drivers/gpu/drm/gma500/framebuffer.c | 2 +- drivers/gpu/drm/gma500/gem.c | 5 ++--- drivers/gpu/drm/i915/i915_gem.c | 3 +-- drivers/gpu/drm/msm/msm_gem.c | 8 +++----- drivers/gpu/drm/omapdrm/omap_gem.c | 20 ++++++++------------ drivers/gpu/drm/tegra/gem.c | 4 ++-- drivers/gpu/drm/ttm/ttm_bo_vm.c | 2 +- drivers/gpu/drm/udl/udl_gem.c | 5 ++--- drivers/gpu/drm/vgem/vgem_drv.c | 2 +- drivers/media/v4l2-core/videobuf-dma-sg.c | 5 ++--- drivers/misc/cxl/context.c | 5 ++--- drivers/misc/sgi-gru/grumain.c | 2 +- drivers/staging/android/ion/ion.c | 2 +- drivers/staging/lustre/lustre/llite/vvp_io.c | 6 +++--- drivers/xen/privcmd.c | 2 +- fs/dax.c | 4 ++-- include/linux/mm.h | 2 -- mm/memory.c | 9 ++++----- 27 files changed, 57 insertions(+), 79 deletions(-) (limited to 'include') diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c index 06254467e4dd..3a147122bc98 100644 --- a/arch/powerpc/platforms/cell/spufs/file.c +++ b/arch/powerpc/platforms/cell/spufs/file.c @@ -236,7 +236,6 @@ static int spufs_mem_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { struct spu_context *ctx = vma->vm_file->private_data; - unsigned long address = (unsigned long)vmf->virtual_address; unsigned long pfn, offset; offset = vmf->pgoff << PAGE_SHIFT; @@ -244,7 +243,7 @@ spufs_mem_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return VM_FAULT_SIGBUS; pr_debug("spufs_mem_mmap_fault address=0x%lx, offset=0x%lx\n", - address, offset); + vmf->address, offset); if (spu_acquire(ctx)) return VM_FAULT_NOPAGE; @@ -256,7 +255,7 @@ spufs_mem_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) vma->vm_page_prot = pgprot_noncached_wc(vma->vm_page_prot); pfn = (ctx->spu->local_store_phys + offset) >> PAGE_SHIFT; } - vm_insert_pfn(vma, address, pfn); + vm_insert_pfn(vma, vmf->address, pfn); spu_release(ctx); @@ -355,8 +354,7 @@ static int spufs_ps_fault(struct vm_area_struct *vma, down_read(¤t->mm->mmap_sem); } else { area = ctx->spu->problem_phys + ps_offs; - vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, - (area + offset) >> PAGE_SHIFT); + vm_insert_pfn(vma, vmf->address, (area + offset) >> PAGE_SHIFT); spu_context_trace(spufs_ps_fault__insert, ctx, ctx->spu); } diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index e739002427ed..40121d14d34d 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -109,7 +109,7 @@ static int vvar_fault(const struct vm_special_mapping *sm, return VM_FAULT_SIGBUS; if (sym_offset == image->sym_vvar_page) { - ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, + ret = vm_insert_pfn(vma, vmf->address, __pa_symbol(&__vvar_page) >> PAGE_SHIFT); } else if (sym_offset == image->sym_pvclock_page) { struct pvclock_vsyscall_time_info *pvti = @@ -117,7 +117,7 @@ static int vvar_fault(const struct vm_special_mapping *sm, if (pvti && vclock_was_used(VCLOCK_PVCLOCK)) { ret = vm_insert_pfn( vma, - (unsigned long)vmf->virtual_address, + vmf->address, __pa(pvti) >> PAGE_SHIFT); } } diff --git a/drivers/char/agp/alpha-agp.c b/drivers/char/agp/alpha-agp.c index 199b8e99f7d7..737187865269 100644 --- a/drivers/char/agp/alpha-agp.c +++ b/drivers/char/agp/alpha-agp.c @@ -19,8 +19,7 @@ static int alpha_core_agp_vm_fault(struct vm_area_struct *vma, unsigned long pa; struct page *page; - dma_addr = (unsigned long)vmf->virtual_address - vma->vm_start - + agp->aperture.bus_base; + dma_addr = vmf->address - vma->vm_start + agp->aperture.bus_base; pa = agp->ops->translate(agp, dma_addr); if (pa == (unsigned long)-EINVAL) diff --git a/drivers/char/mspec.c b/drivers/char/mspec.c index f3f92d5fcda0..a697ca0cab1e 100644 --- a/drivers/char/mspec.c +++ b/drivers/char/mspec.c @@ -227,7 +227,7 @@ mspec_fault(struct vm_area_struct *vma, struct vm_fault *vmf) * be because another thread has installed the pte first, so it * is no problem. */ - vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, pfn); + vm_insert_pfn(vma, vmf->address, pfn); return VM_FAULT_NOPAGE; } diff --git a/drivers/dax/dax.c b/drivers/dax/dax.c index 286447a83dab..26ec39ddf21f 100644 --- a/drivers/dax/dax.c +++ b/drivers/dax/dax.c @@ -328,7 +328,6 @@ static phys_addr_t pgoff_to_phys(struct dax_dev *dax_dev, pgoff_t pgoff, static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_area_struct *vma, struct vm_fault *vmf) { - unsigned long vaddr = (unsigned long) vmf->virtual_address; struct device *dev = &dax_dev->dev; struct dax_region *dax_region; int rc = VM_FAULT_SIGBUS; @@ -353,7 +352,7 @@ static int __dax_dev_fault(struct dax_dev *dax_dev, struct vm_area_struct *vma, pfn = phys_to_pfn_t(phys, dax_region->pfn_flags); - rc = vm_insert_mixed(vma, vaddr, pfn); + rc = vm_insert_mixed(vma, vmf->address, pfn); if (rc == -ENOMEM) return VM_FAULT_OOM; diff --git a/drivers/gpu/drm/armada/armada_gem.c b/drivers/gpu/drm/armada/armada_gem.c index 768087ddb046..a293c8be232c 100644 --- a/drivers/gpu/drm/armada/armada_gem.c +++ b/drivers/gpu/drm/armada/armada_gem.c @@ -17,12 +17,11 @@ static int armada_gem_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { struct armada_gem_object *obj = drm_to_armada_gem(vma->vm_private_data); - unsigned long addr = (unsigned long)vmf->virtual_address; unsigned long pfn = obj->phys_addr >> PAGE_SHIFT; int ret; - pfn += (addr - vma->vm_start) >> PAGE_SHIFT; - ret = vm_insert_pfn(vma, addr, pfn); + pfn += (vmf->address - vma->vm_start) >> PAGE_SHIFT; + ret = vm_insert_pfn(vma, vmf->address, pfn); switch (ret) { case 0: diff --git a/drivers/gpu/drm/drm_vm.c b/drivers/gpu/drm/drm_vm.c index caa4e4ca616d..bd311c77c254 100644 --- a/drivers/gpu/drm/drm_vm.c +++ b/drivers/gpu/drm/drm_vm.c @@ -124,8 +124,7 @@ static int drm_do_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) * Using vm_pgoff as a selector forces us to use this unusual * addressing scheme. */ - resource_size_t offset = (unsigned long)vmf->virtual_address - - vma->vm_start; + resource_size_t offset = vmf->address - vma->vm_start; resource_size_t baddr = map->offset + offset; struct drm_agp_mem *agpmem; struct page *page; @@ -195,7 +194,7 @@ static int drm_do_vm_shm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) if (!map) return VM_FAULT_SIGBUS; /* Nothing allocated */ - offset = (unsigned long)vmf->virtual_address - vma->vm_start; + offset = vmf->address - vma->vm_start; i = (unsigned long)map->handle + offset; page = vmalloc_to_page((void *)i); if (!page) @@ -301,7 +300,8 @@ static int drm_do_vm_dma_fault(struct vm_area_struct *vma, struct vm_fault *vmf) if (!dma->pagelist) return VM_FAULT_SIGBUS; /* Nothing allocated */ - offset = (unsigned long)vmf->virtual_address - vma->vm_start; /* vm_[pg]off[set] should be 0 */ + offset = vmf->address - vma->vm_start; + /* vm_[pg]off[set] should be 0 */ page_nr = offset >> PAGE_SHIFT; /* page_nr could just be vmf->pgoff */ page = virt_to_page((void *)dma->pagelist[page_nr]); @@ -337,7 +337,7 @@ static int drm_do_vm_sg_fault(struct vm_area_struct *vma, struct vm_fault *vmf) if (!entry->pagelist) return VM_FAULT_SIGBUS; /* Nothing allocated */ - offset = (unsigned long)vmf->virtual_address - vma->vm_start; + offset = vmf->address - vma->vm_start; map_offset = map->offset - (unsigned long)dev->sg->virtual; page_offset = (offset >> PAGE_SHIFT) + (map_offset >> PAGE_SHIFT); page = entry->pagelist[page_offset]; diff --git a/drivers/gpu/drm/etnaviv/etnaviv_gem.c b/drivers/gpu/drm/etnaviv/etnaviv_gem.c index 6f5dfabb3096..114dddbd297b 100644 --- a/drivers/gpu/drm/etnaviv/etnaviv_gem.c +++ b/drivers/gpu/drm/etnaviv/etnaviv_gem.c @@ -202,15 +202,14 @@ int etnaviv_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } /* We don't use vmf->pgoff since that has the fake offset: */ - pgoff = ((unsigned long)vmf->virtual_address - - vma->vm_start) >> PAGE_SHIFT; + pgoff = (vmf->address - vma->vm_start) >> PAGE_SHIFT; page = pages[pgoff]; - VERB("Inserting %p pfn %lx, pa %lx", vmf->virtual_address, + VERB("Inserting %p pfn %lx, pa %lx", (void *)vmf->address, page_to_pfn(page), page_to_pfn(page) << PAGE_SHIFT); - ret = vm_insert_page(vma, (unsigned long)vmf->virtual_address, page); + ret = vm_insert_page(vma, vmf->address, page); out: switch (ret) { diff --git a/drivers/gpu/drm/exynos/exynos_drm_gem.c b/drivers/gpu/drm/exynos/exynos_drm_gem.c index ea7a18230888..57b81460fec8 100644 --- a/drivers/gpu/drm/exynos/exynos_drm_gem.c +++ b/drivers/gpu/drm/exynos/exynos_drm_gem.c @@ -455,8 +455,7 @@ int exynos_drm_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) pgoff_t page_offset; int ret; - page_offset = ((unsigned long)vmf->virtual_address - - vma->vm_start) >> PAGE_SHIFT; + page_offset = (vmf->address - vma->vm_start) >> PAGE_SHIFT; if (page_offset >= (exynos_gem->size >> PAGE_SHIFT)) { DRM_ERROR("invalid page offset\n"); @@ -465,8 +464,7 @@ int exynos_drm_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } pfn = page_to_pfn(exynos_gem->pages[page_offset]); - ret = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address, - __pfn_to_pfn_t(pfn, PFN_DEV)); + ret = vm_insert_mixed(vma, vmf->address, __pfn_to_pfn_t(pfn, PFN_DEV)); out: switch (ret) { diff --git a/drivers/gpu/drm/gma500/framebuffer.c b/drivers/gpu/drm/gma500/framebuffer.c index 4071b2d1e8cf..8b44fa542562 100644 --- a/drivers/gpu/drm/gma500/framebuffer.c +++ b/drivers/gpu/drm/gma500/framebuffer.c @@ -125,7 +125,7 @@ static int psbfb_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) psbfb->gtt->offset; page_num = vma_pages(vma); - address = (unsigned long)vmf->virtual_address - (vmf->pgoff << PAGE_SHIFT); + address = vmf->address - (vmf->pgoff << PAGE_SHIFT); vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); diff --git a/drivers/gpu/drm/gma500/gem.c b/drivers/gpu/drm/gma500/gem.c index 6d1cb6b370b1..527c62917660 100644 --- a/drivers/gpu/drm/gma500/gem.c +++ b/drivers/gpu/drm/gma500/gem.c @@ -197,15 +197,14 @@ int psb_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) /* Page relative to the VMA start - we must calculate this ourselves because vmf->pgoff is the fake GEM offset */ - page_offset = ((unsigned long) vmf->virtual_address - vma->vm_start) - >> PAGE_SHIFT; + page_offset = (vmf->address - vma->vm_start) >> PAGE_SHIFT; /* CPU view of the page, don't go via the GART for CPU writes */ if (r->stolen) pfn = (dev_priv->stolen_base + r->offset) >> PAGE_SHIFT; else pfn = page_to_pfn(r->pages[page_offset]); - ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, pfn); + ret = vm_insert_pfn(vma, vmf->address, pfn); fail: mutex_unlock(&dev_priv->mmap_mutex); diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index d0dcaf35b429..412f3513f269 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -1796,8 +1796,7 @@ int i915_gem_fault(struct vm_area_struct *area, struct vm_fault *vmf) int ret; /* We don't use vmf->pgoff since that has the fake offset */ - page_offset = ((unsigned long)vmf->virtual_address - area->vm_start) >> - PAGE_SHIFT; + page_offset = (vmf->address - area->vm_start) >> PAGE_SHIFT; trace_i915_gem_object_fault(obj, page_offset, true, write); diff --git a/drivers/gpu/drm/msm/msm_gem.c b/drivers/gpu/drm/msm/msm_gem.c index cd06cfd94687..d8bc59c7e261 100644 --- a/drivers/gpu/drm/msm/msm_gem.c +++ b/drivers/gpu/drm/msm/msm_gem.c @@ -225,16 +225,14 @@ int msm_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) } /* We don't use vmf->pgoff since that has the fake offset: */ - pgoff = ((unsigned long)vmf->virtual_address - - vma->vm_start) >> PAGE_SHIFT; + pgoff = (vmf->address - vma->vm_start) >> PAGE_SHIFT; pfn = page_to_pfn(pages[pgoff]); - VERB("Inserting %p pfn %lx, pa %lx", vmf->virtual_address, + VERB("Inserting %p pfn %lx, pa %lx", (void *)vmf->address, pfn, pfn << PAGE_SHIFT); - ret = vm_insert_mixed(vma, (unsigned long)vmf->virtual_address, - __pfn_to_pfn_t(pfn, PFN_DEV)); + ret = vm_insert_mixed(vma, vmf->address, __pfn_to_pfn_t(pfn, PFN_DEV)); out_unlock: mutex_unlock(&dev->struct_mutex); diff --git a/drivers/gpu/drm/omapdrm/omap_gem.c b/drivers/gpu/drm/omapdrm/omap_gem.c index d4e1e11466f8..4a90c690f09e 100644 --- a/drivers/gpu/drm/omapdrm/omap_gem.c +++ b/drivers/gpu/drm/omapdrm/omap_gem.c @@ -398,8 +398,7 @@ static int fault_1d(struct drm_gem_object *obj, pgoff_t pgoff; /* We don't use vmf->pgoff since that has the fake offset: */ - pgoff = ((unsigned long)vmf->virtual_address - - vma->vm_start) >> PAGE_SHIFT; + pgoff = (vmf->address - vma->vm_start) >> PAGE_SHIFT; if (omap_obj->pages) { omap_gem_cpu_sync(obj, pgoff); @@ -409,11 +408,10 @@ static int fault_1d(struct drm_gem_object *obj, pfn = (omap_obj->paddr >> PAGE_SHIFT) + pgoff; } - VERB("Inserting %p pfn %lx, pa %lx", vmf->virtual_address, + VERB("Inserting %p pfn %lx, pa %lx", (void *)vmf->address, pfn, pfn << PAGE_SHIFT); - return vm_insert_mixed(vma, (unsigned long)vmf->virtual_address, - __pfn_to_pfn_t(pfn, PFN_DEV)); + return vm_insert_mixed(vma, vmf->address, __pfn_to_pfn_t(pfn, PFN_DEV)); } /* Special handling for the case of faulting in 2d tiled buffers */ @@ -427,7 +425,7 @@ static int fault_2d(struct drm_gem_object *obj, struct page *pages[64]; /* XXX is this too much to have on stack? */ unsigned long pfn; pgoff_t pgoff, base_pgoff; - void __user *vaddr; + unsigned long vaddr; int i, ret, slots; /* @@ -447,8 +445,7 @@ static int fault_2d(struct drm_gem_object *obj, const int m = 1 + ((omap_obj->width << fmt) / PAGE_SIZE); /* We don't use vmf->pgoff since that has the fake offset: */ - pgoff = ((unsigned long)vmf->virtual_address - - vma->vm_start) >> PAGE_SHIFT; + pgoff = (vmf->address - vma->vm_start) >> PAGE_SHIFT; /* * Actual address we start mapping at is rounded down to previous slot @@ -459,7 +456,7 @@ static int fault_2d(struct drm_gem_object *obj, /* figure out buffer width in slots */ slots = omap_obj->width >> priv->usergart[fmt].slot_shift; - vaddr = vmf->virtual_address - ((pgoff - base_pgoff) << PAGE_SHIFT); + vaddr = vmf->address - ((pgoff - base_pgoff) << PAGE_SHIFT); entry = &priv->usergart[fmt].entry[priv->usergart[fmt].last]; @@ -503,12 +500,11 @@ static int fault_2d(struct drm_gem_object *obj, pfn = entry->paddr >> PAGE_SHIFT; - VERB("Inserting %p pfn %lx, pa %lx", vmf->virtual_address, + VERB("Inserting %p pfn %lx, pa %lx", (void *)vmf->address, pfn, pfn << PAGE_SHIFT); for (i = n; i > 0; i--) { - vm_insert_mixed(vma, (unsigned long)vaddr, - __pfn_to_pfn_t(pfn, PFN_DEV)); + vm_insert_mixed(vma, vaddr, __pfn_to_pfn_t(pfn, PFN_DEV)); pfn += priv->usergart[fmt].stride_pfn; vaddr += PAGE_SIZE * m; } diff --git a/drivers/gpu/drm/tegra/gem.c b/drivers/gpu/drm/tegra/gem.c index c08e5279eeac..7d853e6b5ff0 100644 --- a/drivers/gpu/drm/tegra/gem.c +++ b/drivers/gpu/drm/tegra/gem.c @@ -452,10 +452,10 @@ static int tegra_bo_fault(struct vm_area_struct *vma, struct vm_fault *vmf) if (!bo->pages) return VM_FAULT_SIGBUS; - offset = ((unsigned long)vmf->virtual_address - vma->vm_start) >> PAGE_SHIFT; + offset = (vmf->address - vma->vm_start) >> PAGE_SHIFT; page = bo->pages[offset]; - err = vm_insert_page(vma, (unsigned long)vmf->virtual_address, page); + err = vm_insert_page(vma, vmf->address, page); switch (err) { case -EAGAIN: case 0: diff --git a/drivers/gpu/drm/ttm/ttm_bo_vm.c b/drivers/gpu/drm/ttm/ttm_bo_vm.c index 4748aedc933a..68ef993ab431 100644 --- a/drivers/gpu/drm/ttm/ttm_bo_vm.c +++ b/drivers/gpu/drm/ttm/ttm_bo_vm.c @@ -101,7 +101,7 @@ static int ttm_bo_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct page *page; int ret; int i; - unsigned long address = (unsigned long)vmf->virtual_address; + unsigned long address = vmf->address; int retval = VM_FAULT_NOPAGE; struct ttm_mem_type_manager *man = &bdev->man[bo->mem.mem_type]; diff --git a/drivers/gpu/drm/udl/udl_gem.c b/drivers/gpu/drm/udl/udl_gem.c index 818e70712b18..3c0c4bd3f750 100644 --- a/drivers/gpu/drm/udl/udl_gem.c +++ b/drivers/gpu/drm/udl/udl_gem.c @@ -107,14 +107,13 @@ int udl_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) unsigned int page_offset; int ret = 0; - page_offset = ((unsigned long)vmf->virtual_address - vma->vm_start) >> - PAGE_SHIFT; + page_offset = (vmf->address - vma->vm_start) >> PAGE_SHIFT; if (!obj->pages) return VM_FAULT_SIGBUS; page = obj->pages[page_offset]; - ret = vm_insert_page(vma, (unsigned long)vmf->virtual_address, page); + ret = vm_insert_page(vma, vmf->address, page); switch (ret) { case -EAGAIN: case 0: diff --git a/drivers/gpu/drm/vgem/vgem_drv.c b/drivers/gpu/drm/vgem/vgem_drv.c index f36c14729b55..477e07f0ecb6 100644 --- a/drivers/gpu/drm/vgem/vgem_drv.c +++ b/drivers/gpu/drm/vgem/vgem_drv.c @@ -54,7 +54,7 @@ static int vgem_gem_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { struct drm_vgem_gem_object *obj = vma->vm_private_data; /* We don't use vmf->pgoff since that has the fake offset */ - unsigned long vaddr = (unsigned long)vmf->virtual_address; + unsigned long vaddr = vmf->address; struct page *page; page = shmem_read_mapping_page(file_inode(obj->base.filp)->i_mapping, diff --git a/drivers/media/v4l2-core/videobuf-dma-sg.c b/drivers/media/v4l2-core/videobuf-dma-sg.c index 1db0af6c7f94..ba63ca57ed7e 100644 --- a/drivers/media/v4l2-core/videobuf-dma-sg.c +++ b/drivers/media/v4l2-core/videobuf-dma-sg.c @@ -439,13 +439,12 @@ static int videobuf_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) struct page *page; dprintk(3, "fault: fault @ %08lx [vma %08lx-%08lx]\n", - (unsigned long)vmf->virtual_address, - vma->vm_start, vma->vm_end); + vmf->address, vma->vm_start, vma->vm_end); page = alloc_page(GFP_USER | __GFP_DMA32); if (!page) return VM_FAULT_OOM; - clear_user_highpage(page, (unsigned long)vmf->virtual_address); + clear_user_highpage(page, vmf->address); vmf->page = page; return 0; diff --git a/drivers/misc/cxl/context.c b/drivers/misc/cxl/context.c index 5e506c19108a..5d36dcc7f47e 100644 --- a/drivers/misc/cxl/context.c +++ b/drivers/misc/cxl/context.c @@ -117,13 +117,12 @@ int cxl_context_init(struct cxl_context *ctx, struct cxl_afu *afu, bool master, static int cxl_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { struct cxl_context *ctx = vma->vm_file->private_data; - unsigned long address = (unsigned long)vmf->virtual_address; u64 area, offset; offset = vmf->pgoff << PAGE_SHIFT; pr_devel("%s: pe: %i address: 0x%lx offset: 0x%llx\n", - __func__, ctx->pe, address, offset); + __func__, ctx->pe, vmf->address, offset); if (ctx->afu->current_mode == CXL_MODE_DEDICATED) { area = ctx->afu->psn_phys; @@ -155,7 +154,7 @@ static int cxl_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) return VM_FAULT_SIGBUS; } - vm_insert_pfn(vma, address, (area + offset) >> PAGE_SHIFT); + vm_insert_pfn(vma, vmf->address, (area + offset) >> PAGE_SHIFT); mutex_unlock(&ctx->status_mutex); diff --git a/drivers/misc/sgi-gru/grumain.c b/drivers/misc/sgi-gru/grumain.c index 33741ad4a74a..af2e077da4b8 100644 --- a/drivers/misc/sgi-gru/grumain.c +++ b/drivers/misc/sgi-gru/grumain.c @@ -932,7 +932,7 @@ int gru_fault(struct vm_area_struct *vma, struct vm_fault *vmf) unsigned long paddr, vaddr; unsigned long expires; - vaddr = (unsigned long)vmf->virtual_address; + vaddr = vmf->address; gru_dbg(grudev, "vma %p, vaddr 0x%lx (0x%lx)\n", vma, vaddr, GSEG_BASE(vaddr)); STAT(nopfn); diff --git a/drivers/staging/android/ion/ion.c b/drivers/staging/android/ion/ion.c index d5cc3070e83f..b653451843c8 100644 --- a/drivers/staging/android/ion/ion.c +++ b/drivers/staging/android/ion/ion.c @@ -882,7 +882,7 @@ static int ion_vm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) BUG_ON(!buffer->pages || !buffer->pages[vmf->pgoff]); pfn = page_to_pfn(ion_buffer_page(buffer->pages[vmf->pgoff])); - ret = vm_insert_pfn(vma, (unsigned long)vmf->virtual_address, pfn); + ret = vm_insert_pfn(vma, vmf->address, pfn); mutex_unlock(&buffer->lock); if (ret) return VM_FAULT_ERROR; diff --git a/drivers/staging/lustre/lustre/llite/vvp_io.c b/drivers/staging/lustre/lustre/llite/vvp_io.c index 0b6d388d8aa4..697cbfbe9374 100644 --- a/drivers/staging/lustre/lustre/llite/vvp_io.c +++ b/drivers/staging/lustre/lustre/llite/vvp_io.c @@ -1014,7 +1014,7 @@ static int vvp_io_kernel_fault(struct vvp_fault_io *cfio) "page %p map %p index %lu flags %lx count %u priv %0lx: got addr %p type NOPAGE\n", vmf->page, vmf->page->mapping, vmf->page->index, (long)vmf->page->flags, page_count(vmf->page), - page_private(vmf->page), vmf->virtual_address); + page_private(vmf->page), (void *)vmf->address); if (unlikely(!(cfio->ft_flags & VM_FAULT_LOCKED))) { lock_page(vmf->page); cfio->ft_flags |= VM_FAULT_LOCKED; @@ -1025,12 +1025,12 @@ static int vvp_io_kernel_fault(struct vvp_fault_io *cfio) } if (cfio->ft_flags & (VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV)) { - CDEBUG(D_PAGE, "got addr %p - SIGBUS\n", vmf->virtual_address); + CDEBUG(D_PAGE, "got addr %p - SIGBUS\n", (void *)vmf->address); return -EFAULT; } if (cfio->ft_flags & VM_FAULT_OOM) { - CDEBUG(D_PAGE, "got addr %p - OOM\n", vmf->virtual_address); + CDEBUG(D_PAGE, "got addr %p - OOM\n", (void *)vmf->address); return -ENOMEM; } diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index 702040fe2001..6e3306f4a525 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -602,7 +602,7 @@ static int privcmd_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { printk(KERN_DEBUG "privcmd_fault: vma=%p %lx-%lx, pgoff=%lx, uv=%p\n", vma, vma->vm_start, vma->vm_end, - vmf->pgoff, vmf->virtual_address); + vmf->pgoff, (void *)vmf->address); return VM_FAULT_SIGBUS; } diff --git a/fs/dax.c b/fs/dax.c index 5ae8e11ad786..f1dfae64b11a 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -738,7 +738,7 @@ static int dax_insert_mapping(struct address_space *mapping, struct block_device *bdev, sector_t sector, size_t size, void **entryp, struct vm_area_struct *vma, struct vm_fault *vmf) { - unsigned long vaddr = (unsigned long)vmf->virtual_address; + unsigned long vaddr = vmf->address; struct blk_dax_ctl dax = { .sector = sector, .size = size, @@ -948,7 +948,7 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, { struct address_space *mapping = vma->vm_file->f_mapping; struct inode *inode = mapping->host; - unsigned long vaddr = (unsigned long)vmf->virtual_address; + unsigned long vaddr = vmf->address; loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; sector_t sector; struct iomap iomap = { 0 }; diff --git a/include/linux/mm.h b/include/linux/mm.h index de5bcead2511..75fda0de64bf 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -297,8 +297,6 @@ struct vm_fault { gfp_t gfp_mask; /* gfp mask to be used for allocations */ pgoff_t pgoff; /* Logical page offset based on vma */ unsigned long address; /* Faulting virtual address */ - void __user *virtual_address; /* Faulting virtual address masked by - * PAGE_MASK */ pmd_t *pmd; /* Pointer to pmd entry matching * the 'address' */ diff --git a/mm/memory.c b/mm/memory.c index 512e1c359193..379836261d4a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2040,7 +2040,7 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, struct vm_fault vmf; int ret; - vmf.virtual_address = (void __user *)(address & PAGE_MASK); + vmf.address = address & PAGE_MASK; vmf.pgoff = page->index; vmf.flags = FAULT_FLAG_WRITE|FAULT_FLAG_MKWRITE; vmf.gfp_mask = __get_fault_gfp_mask(vma); @@ -2276,8 +2276,7 @@ static int wp_pfn_shared(struct vm_fault *vmf, pte_t orig_pte) struct vm_fault vmf2 = { .page = NULL, .pgoff = linear_page_index(vma, vmf->address), - .virtual_address = - (void __user *)(vmf->address & PAGE_MASK), + .address = vmf->address, .flags = FAULT_FLAG_WRITE | FAULT_FLAG_MKWRITE, }; int ret; @@ -2852,7 +2851,7 @@ static int __do_fault(struct vm_fault *vmf, pgoff_t pgoff, struct vm_fault vmf2; int ret; - vmf2.virtual_address = (void __user *)(vmf->address & PAGE_MASK); + vmf2.address = vmf->address; vmf2.pgoff = pgoff; vmf2.flags = vmf->flags; vmf2.page = NULL; @@ -3612,7 +3611,7 @@ static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address, { struct vm_fault vmf = { .vma = vma, - .address = address, + .address = address & PAGE_MASK, .flags = flags, }; struct mm_struct *mm = vma->vm_mm; -- cgit v1.2.3 From 2994302bc8a17180788fac66a47102d338d5d0ec Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 14 Dec 2016 15:07:16 -0800 Subject: mm: add orig_pte field into vm_fault Add orig_pte field to vm_fault structure to allow ->page_mkwrite handlers to fully handle the fault. This also allows us to save some passing of extra arguments around. Link: http://lkml.kernel.org/r/1479460644-25076-8-git-send-email-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Ross Zwisler Acked-by: Kirill A. Shutemov Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 +-- mm/internal.h | 2 +- mm/khugepaged.c | 7 ++--- mm/memory.c | 82 +++++++++++++++++++++++++++--------------------------- 4 files changed, 47 insertions(+), 48 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 75fda0de64bf..39c17a2efcea 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -298,8 +298,8 @@ struct vm_fault { pgoff_t pgoff; /* Logical page offset based on vma */ unsigned long address; /* Faulting virtual address */ pmd_t *pmd; /* Pointer to pmd entry matching - * the 'address' - */ + * the 'address' */ + pte_t orig_pte; /* Value of PTE at the time of fault */ struct page *cow_page; /* Handler may choose to COW */ struct page *page; /* ->fault handlers should return a diff --git a/mm/internal.h b/mm/internal.h index 093b1eacc91b..44d68895a9b9 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -36,7 +36,7 @@ /* Do not use these with a slab allocator */ #define GFP_SLAB_BUG_MASK (__GFP_DMA32|__GFP_HIGHMEM|~__GFP_BITS_MASK) -int do_swap_page(struct vm_fault *vmf, pte_t orig_pte); +int do_swap_page(struct vm_fault *vmf); void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, unsigned long floor, unsigned long ceiling); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index f062d54c9683..7434a63cac94 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -875,7 +875,6 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, unsigned long address, pmd_t *pmd, int referenced) { - pte_t pteval; int swapped_in = 0, ret = 0; struct vm_fault vmf = { .vma = vma, @@ -893,11 +892,11 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, vmf.pte = pte_offset_map(pmd, address); for (; vmf.address < address + HPAGE_PMD_NR*PAGE_SIZE; vmf.pte++, vmf.address += PAGE_SIZE) { - pteval = *vmf.pte; - if (!is_swap_pte(pteval)) + vmf.orig_pte = *vmf.pte; + if (!is_swap_pte(vmf.orig_pte)) continue; swapped_in++; - ret = do_swap_page(&vmf, pteval); + ret = do_swap_page(&vmf); /* do_swap_page returns VM_FAULT_RETRY with released mmap_sem */ if (ret & VM_FAULT_RETRY) { diff --git a/mm/memory.c b/mm/memory.c index 7ba9cc58dddd..cf74f7ca911b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2070,8 +2070,8 @@ static int do_page_mkwrite(struct vm_area_struct *vma, struct page *page, * case, all we need to do here is to mark the page as writable and update * any related book-keeping. */ -static inline int wp_page_reuse(struct vm_fault *vmf, pte_t orig_pte, - struct page *page, int page_mkwrite, int dirty_shared) +static inline int wp_page_reuse(struct vm_fault *vmf, struct page *page, + int page_mkwrite, int dirty_shared) __releases(vmf->ptl) { struct vm_area_struct *vma = vmf->vma; @@ -2084,8 +2084,8 @@ static inline int wp_page_reuse(struct vm_fault *vmf, pte_t orig_pte, if (page) page_cpupid_xchg_last(page, (1 << LAST_CPUPID_SHIFT) - 1); - flush_cache_page(vma, vmf->address, pte_pfn(orig_pte)); - entry = pte_mkyoung(orig_pte); + flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); + entry = pte_mkyoung(vmf->orig_pte); entry = maybe_mkwrite(pte_mkdirty(entry), vma); if (ptep_set_access_flags(vma, vmf->address, vmf->pte, entry, 1)) update_mmu_cache(vma, vmf->address, vmf->pte); @@ -2135,8 +2135,7 @@ static inline int wp_page_reuse(struct vm_fault *vmf, pte_t orig_pte, * held to the old page, as well as updating the rmap. * - In any case, unlock the PTL and drop the reference we took to the old page. */ -static int wp_page_copy(struct vm_fault *vmf, pte_t orig_pte, - struct page *old_page) +static int wp_page_copy(struct vm_fault *vmf, struct page *old_page) { struct vm_area_struct *vma = vmf->vma; struct mm_struct *mm = vma->vm_mm; @@ -2150,7 +2149,7 @@ static int wp_page_copy(struct vm_fault *vmf, pte_t orig_pte, if (unlikely(anon_vma_prepare(vma))) goto oom; - if (is_zero_pfn(pte_pfn(orig_pte))) { + if (is_zero_pfn(pte_pfn(vmf->orig_pte))) { new_page = alloc_zeroed_user_highpage_movable(vma, vmf->address); if (!new_page) @@ -2174,7 +2173,7 @@ static int wp_page_copy(struct vm_fault *vmf, pte_t orig_pte, * Re-check the pte - we dropped the lock */ vmf->pte = pte_offset_map_lock(mm, vmf->pmd, vmf->address, &vmf->ptl); - if (likely(pte_same(*vmf->pte, orig_pte))) { + if (likely(pte_same(*vmf->pte, vmf->orig_pte))) { if (old_page) { if (!PageAnon(old_page)) { dec_mm_counter_fast(mm, @@ -2184,7 +2183,7 @@ static int wp_page_copy(struct vm_fault *vmf, pte_t orig_pte, } else { inc_mm_counter_fast(mm, MM_ANONPAGES); } - flush_cache_page(vma, vmf->address, pte_pfn(orig_pte)); + flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); entry = mk_pte(new_page, vma->vm_page_prot); entry = maybe_mkwrite(pte_mkdirty(entry), vma); /* @@ -2268,7 +2267,7 @@ oom: * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED * mapping */ -static int wp_pfn_shared(struct vm_fault *vmf, pte_t orig_pte) +static int wp_pfn_shared(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; @@ -2286,16 +2285,15 @@ static int wp_pfn_shared(struct vm_fault *vmf, pte_t orig_pte) * We might have raced with another page fault while we * released the pte_offset_map_lock. */ - if (!pte_same(*vmf->pte, orig_pte)) { + if (!pte_same(*vmf->pte, vmf->orig_pte)) { pte_unmap_unlock(vmf->pte, vmf->ptl); return 0; } } - return wp_page_reuse(vmf, orig_pte, NULL, 0, 0); + return wp_page_reuse(vmf, NULL, 0, 0); } -static int wp_page_shared(struct vm_fault *vmf, pte_t orig_pte, - struct page *old_page) +static int wp_page_shared(struct vm_fault *vmf, struct page *old_page) __releases(vmf->ptl) { struct vm_area_struct *vma = vmf->vma; @@ -2321,7 +2319,7 @@ static int wp_page_shared(struct vm_fault *vmf, pte_t orig_pte, */ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); - if (!pte_same(*vmf->pte, orig_pte)) { + if (!pte_same(*vmf->pte, vmf->orig_pte)) { unlock_page(old_page); pte_unmap_unlock(vmf->pte, vmf->ptl); put_page(old_page); @@ -2330,7 +2328,7 @@ static int wp_page_shared(struct vm_fault *vmf, pte_t orig_pte, page_mkwrite = 1; } - return wp_page_reuse(vmf, orig_pte, old_page, page_mkwrite, 1); + return wp_page_reuse(vmf, old_page, page_mkwrite, 1); } /* @@ -2351,13 +2349,13 @@ static int wp_page_shared(struct vm_fault *vmf, pte_t orig_pte, * but allow concurrent faults), with pte both mapped and locked. * We return with mmap_sem still held, but pte unmapped and unlocked. */ -static int do_wp_page(struct vm_fault *vmf, pte_t orig_pte) +static int do_wp_page(struct vm_fault *vmf) __releases(vmf->ptl) { struct vm_area_struct *vma = vmf->vma; struct page *old_page; - old_page = vm_normal_page(vma, vmf->address, orig_pte); + old_page = vm_normal_page(vma, vmf->address, vmf->orig_pte); if (!old_page) { /* * VM_MIXEDMAP !pfn_valid() case, or VM_SOFTDIRTY clear on a @@ -2368,10 +2366,10 @@ static int do_wp_page(struct vm_fault *vmf, pte_t orig_pte) */ if ((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED)) - return wp_pfn_shared(vmf, orig_pte); + return wp_pfn_shared(vmf); pte_unmap_unlock(vmf->pte, vmf->ptl); - return wp_page_copy(vmf, orig_pte, old_page); + return wp_page_copy(vmf, old_page); } /* @@ -2386,7 +2384,7 @@ static int do_wp_page(struct vm_fault *vmf, pte_t orig_pte) lock_page(old_page); vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); - if (!pte_same(*vmf->pte, orig_pte)) { + if (!pte_same(*vmf->pte, vmf->orig_pte)) { unlock_page(old_page); pte_unmap_unlock(vmf->pte, vmf->ptl); put_page(old_page); @@ -2406,12 +2404,12 @@ static int do_wp_page(struct vm_fault *vmf, pte_t orig_pte) page_move_anon_rmap(old_page, vma); } unlock_page(old_page); - return wp_page_reuse(vmf, orig_pte, old_page, 0, 0); + return wp_page_reuse(vmf, old_page, 0, 0); } unlock_page(old_page); } else if (unlikely((vma->vm_flags & (VM_WRITE|VM_SHARED)) == (VM_WRITE|VM_SHARED))) { - return wp_page_shared(vmf, orig_pte, old_page); + return wp_page_shared(vmf, old_page); } /* @@ -2420,7 +2418,7 @@ static int do_wp_page(struct vm_fault *vmf, pte_t orig_pte) get_page(old_page); pte_unmap_unlock(vmf->pte, vmf->ptl); - return wp_page_copy(vmf, orig_pte, old_page); + return wp_page_copy(vmf, old_page); } static void unmap_mapping_range_vma(struct vm_area_struct *vma, @@ -2508,7 +2506,7 @@ EXPORT_SYMBOL(unmap_mapping_range); * We return with the mmap_sem locked or unlocked in the same cases * as does filemap_fault(). */ -int do_swap_page(struct vm_fault *vmf, pte_t orig_pte) +int do_swap_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct page *page, *swapcache; @@ -2519,10 +2517,10 @@ int do_swap_page(struct vm_fault *vmf, pte_t orig_pte) int exclusive = 0; int ret = 0; - if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, orig_pte)) + if (!pte_unmap_same(vma->vm_mm, vmf->pmd, vmf->pte, vmf->orig_pte)) goto out; - entry = pte_to_swp_entry(orig_pte); + entry = pte_to_swp_entry(vmf->orig_pte); if (unlikely(non_swap_entry(entry))) { if (is_migration_entry(entry)) { migration_entry_wait(vma->vm_mm, vmf->pmd, @@ -2530,7 +2528,7 @@ int do_swap_page(struct vm_fault *vmf, pte_t orig_pte) } else if (is_hwpoison_entry(entry)) { ret = VM_FAULT_HWPOISON; } else { - print_bad_pte(vma, vmf->address, orig_pte, NULL); + print_bad_pte(vma, vmf->address, vmf->orig_pte, NULL); ret = VM_FAULT_SIGBUS; } goto out; @@ -2547,7 +2545,7 @@ int do_swap_page(struct vm_fault *vmf, pte_t orig_pte) */ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); - if (likely(pte_same(*vmf->pte, orig_pte))) + if (likely(pte_same(*vmf->pte, vmf->orig_pte))) ret = VM_FAULT_OOM; delayacct_clear_flag(DELAYACCT_PF_SWAPIN); goto unlock; @@ -2604,7 +2602,7 @@ int do_swap_page(struct vm_fault *vmf, pte_t orig_pte) */ vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); - if (unlikely(!pte_same(*vmf->pte, orig_pte))) + if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) goto out_nomap; if (unlikely(!PageUptodate(page))) { @@ -2632,9 +2630,10 @@ int do_swap_page(struct vm_fault *vmf, pte_t orig_pte) exclusive = RMAP_EXCLUSIVE; } flush_icache_page(vma, page); - if (pte_swp_soft_dirty(orig_pte)) + if (pte_swp_soft_dirty(vmf->orig_pte)) pte = pte_mksoft_dirty(pte); set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); + vmf->orig_pte = pte; if (page == swapcache) { do_page_add_anon_rmap(page, vma, vmf->address, exclusive); mem_cgroup_commit_charge(page, memcg, true, false); @@ -2664,7 +2663,7 @@ int do_swap_page(struct vm_fault *vmf, pte_t orig_pte) } if (vmf->flags & FAULT_FLAG_WRITE) { - ret |= do_wp_page(vmf, pte); + ret |= do_wp_page(vmf); if (ret & VM_FAULT_ERROR) ret &= VM_FAULT_ERROR; goto out; @@ -3363,7 +3362,7 @@ static int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, return mpol_misplaced(page, vma, addr); } -static int do_numa_page(struct vm_fault *vmf, pte_t pte) +static int do_numa_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct page *page = NULL; @@ -3371,6 +3370,7 @@ static int do_numa_page(struct vm_fault *vmf, pte_t pte) int last_cpupid; int target_nid; bool migrated = false; + pte_t pte = vmf->orig_pte; bool was_writable = pte_write(pte); int flags = 0; @@ -3521,8 +3521,7 @@ static int handle_pte_fault(struct vm_fault *vmf) * So now it's safe to run pte_offset_map(). */ vmf->pte = pte_offset_map(vmf->pmd, vmf->address); - - entry = *vmf->pte; + vmf->orig_pte = *vmf->pte; /* * some architectures can have larger ptes than wordsize, @@ -3533,7 +3532,7 @@ static int handle_pte_fault(struct vm_fault *vmf) * ptl lock held. So here a barrier will do. */ barrier(); - if (pte_none(entry)) { + if (pte_none(vmf->orig_pte)) { pte_unmap(vmf->pte); vmf->pte = NULL; } @@ -3546,19 +3545,20 @@ static int handle_pte_fault(struct vm_fault *vmf) return do_fault(vmf); } - if (!pte_present(entry)) - return do_swap_page(vmf, entry); + if (!pte_present(vmf->orig_pte)) + return do_swap_page(vmf); - if (pte_protnone(entry) && vma_is_accessible(vmf->vma)) - return do_numa_page(vmf, entry); + if (pte_protnone(vmf->orig_pte) && vma_is_accessible(vmf->vma)) + return do_numa_page(vmf); vmf->ptl = pte_lockptr(vmf->vma->vm_mm, vmf->pmd); spin_lock(vmf->ptl); + entry = vmf->orig_pte; if (unlikely(!pte_same(*vmf->pte, entry))) goto unlock; if (vmf->flags & FAULT_FLAG_WRITE) { if (!pte_write(entry)) - return do_wp_page(vmf, entry); + return do_wp_page(vmf); entry = pte_mkdirty(entry); } entry = pte_mkyoung(entry); -- cgit v1.2.3 From 3917048d4572b9cabf6f8f5ad395eb693717367c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 14 Dec 2016 15:07:18 -0800 Subject: mm: allow full handling of COW faults in ->fault handlers Patch series "dax: Clear dirty bits after flushing caches", v5. Patchset to clear dirty bits from radix tree of DAX inodes when caches for corresponding pfns have been flushed. In principle, these patches enable handlers to easily update PTEs and do other work necessary to finish the fault without duplicating the functionality present in the generic code. I'd like to thank Kirill and Ross for reviews of the series! This patch (of 20): To allow full handling of COW faults add memcg field to struct vm_fault and a return value of ->fault() handler meaning that COW fault is fully handled and memcg charge must not be canceled. This will allow us to remove knowledge about special DAX locking from the generic fault code. Link: http://lkml.kernel.org/r/1479460644-25076-9-git-send-email-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Ross Zwisler Acked-by: Kirill A. Shutemov Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 +++- mm/memory.c | 14 +++++++------- 2 files changed, 10 insertions(+), 8 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 39c17a2efcea..6e25f4916d6f 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -301,7 +301,8 @@ struct vm_fault { * the 'address' */ pte_t orig_pte; /* Value of PTE at the time of fault */ - struct page *cow_page; /* Handler may choose to COW */ + struct page *cow_page; /* Page handler may use for COW fault */ + struct mem_cgroup *memcg; /* Cgroup cow_page belongs to */ struct page *page; /* ->fault handlers should return a * page here, unless VM_FAULT_NOPAGE * is set (which is also implied by @@ -1103,6 +1104,7 @@ static inline void clear_page_pfmemalloc(struct page *page) #define VM_FAULT_RETRY 0x0400 /* ->fault blocked, must retry */ #define VM_FAULT_FALLBACK 0x0800 /* huge page fault failed, fall back to small */ #define VM_FAULT_DAX_LOCKED 0x1000 /* ->fault has locked DAX entry */ +#define VM_FAULT_DONE_COW 0x2000 /* ->fault has fully handled COW */ #define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */ diff --git a/mm/memory.c b/mm/memory.c index cf74f7ca911b..02504cd4ca0e 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2844,9 +2844,8 @@ static int __do_fault(struct vm_fault *vmf) int ret; ret = vma->vm_ops->fault(vma, vmf); - if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) - return ret; - if (ret & VM_FAULT_DAX_LOCKED) + if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY | + VM_FAULT_DAX_LOCKED | VM_FAULT_DONE_COW))) return ret; if (unlikely(PageHWPoison(vmf->page))) { @@ -3226,7 +3225,6 @@ static int do_read_fault(struct vm_fault *vmf) static int do_cow_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - struct mem_cgroup *memcg; int ret; if (unlikely(anon_vma_prepare(vma))) @@ -3237,7 +3235,7 @@ static int do_cow_fault(struct vm_fault *vmf) return VM_FAULT_OOM; if (mem_cgroup_try_charge(vmf->cow_page, vma->vm_mm, GFP_KERNEL, - &memcg, false)) { + &vmf->memcg, false)) { put_page(vmf->cow_page); return VM_FAULT_OOM; } @@ -3245,12 +3243,14 @@ static int do_cow_fault(struct vm_fault *vmf) ret = __do_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) goto uncharge_out; + if (ret & VM_FAULT_DONE_COW) + return ret; if (!(ret & VM_FAULT_DAX_LOCKED)) copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma); __SetPageUptodate(vmf->cow_page); - ret |= alloc_set_pte(vmf, memcg, vmf->cow_page); + ret |= alloc_set_pte(vmf, vmf->memcg, vmf->cow_page); if (vmf->pte) pte_unmap_unlock(vmf->pte, vmf->ptl); if (!(ret & VM_FAULT_DAX_LOCKED)) { @@ -3263,7 +3263,7 @@ static int do_cow_fault(struct vm_fault *vmf) goto uncharge_out; return ret; uncharge_out: - mem_cgroup_cancel_charge(vmf->cow_page, memcg, false); + mem_cgroup_cancel_charge(vmf->cow_page, vmf->memcg, false); put_page(vmf->cow_page); return ret; } -- cgit v1.2.3 From 9118c0cbd44262d0015568266f314e645ed6b9ce Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 14 Dec 2016 15:07:21 -0800 Subject: mm: factor out functionality to finish page faults Introduce finish_fault() as a helper function for finishing page faults. It is rather thin wrapper around alloc_set_pte() but since we'd want to call this from DAX code or filesystems, it is still useful to avoid some boilerplate code. Link: http://lkml.kernel.org/r/1479460644-25076-10-git-send-email-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Ross Zwisler Acked-by: Kirill A. Shutemov Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + mm/memory.c | 44 +++++++++++++++++++++++++++++++++++--------- 2 files changed, 36 insertions(+), 9 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 6e25f4916d6f..60a230e6ece7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -620,6 +620,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, struct page *page); +int finish_fault(struct vm_fault *vmf); #endif /* diff --git a/mm/memory.c b/mm/memory.c index 02504cd4ca0e..22f7f6e38515 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3074,6 +3074,38 @@ fault_handled: return ret; } + +/** + * finish_fault - finish page fault once we have prepared the page to fault + * + * @vmf: structure describing the fault + * + * This function handles all that is needed to finish a page fault once the + * page to fault in is prepared. It handles locking of PTEs, inserts PTE for + * given page, adds reverse page mapping, handles memcg charges and LRU + * addition. The function returns 0 on success, VM_FAULT_ code in case of + * error. + * + * The function expects the page to be locked and on success it consumes a + * reference of a page being mapped (for the PTE which maps it). + */ +int finish_fault(struct vm_fault *vmf) +{ + struct page *page; + int ret; + + /* Did we COW the page? */ + if ((vmf->flags & FAULT_FLAG_WRITE) && + !(vmf->vma->vm_flags & VM_SHARED)) + page = vmf->cow_page; + else + page = vmf->page; + ret = alloc_set_pte(vmf, vmf->memcg, page); + if (vmf->pte) + pte_unmap_unlock(vmf->pte, vmf->ptl); + return ret; +} + static unsigned long fault_around_bytes __read_mostly = rounddown_pow_of_two(65536); @@ -3213,9 +3245,7 @@ static int do_read_fault(struct vm_fault *vmf) if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) return ret; - ret |= alloc_set_pte(vmf, NULL, vmf->page); - if (vmf->pte) - pte_unmap_unlock(vmf->pte, vmf->ptl); + ret |= finish_fault(vmf); unlock_page(vmf->page); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) put_page(vmf->page); @@ -3250,9 +3280,7 @@ static int do_cow_fault(struct vm_fault *vmf) copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma); __SetPageUptodate(vmf->cow_page); - ret |= alloc_set_pte(vmf, vmf->memcg, vmf->cow_page); - if (vmf->pte) - pte_unmap_unlock(vmf->pte, vmf->ptl); + ret |= finish_fault(vmf); if (!(ret & VM_FAULT_DAX_LOCKED)) { unlock_page(vmf->page); put_page(vmf->page); @@ -3293,9 +3321,7 @@ static int do_shared_fault(struct vm_fault *vmf) } } - ret |= alloc_set_pte(vmf, NULL, vmf->page); - if (vmf->pte) - pte_unmap_unlock(vmf->pte, vmf->ptl); + ret |= finish_fault(vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) { unlock_page(vmf->page); -- cgit v1.2.3 From b1aa812b21084285e9f6098639be9cd5bf9e05d7 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 14 Dec 2016 15:07:24 -0800 Subject: mm: move handling of COW faults into DAX code Move final handling of COW faults from generic code into DAX fault handler. That way generic code doesn't have to be aware of peculiarities of DAX locking so remove that knowledge and make locking functions private to fs/dax.c. Link: http://lkml.kernel.org/r/1479460644-25076-11-git-send-email-jack@suse.cz Signed-off-by: Jan Kara Acked-by: Kirill A. Shutemov Reviewed-by: Ross Zwisler Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 61 +++++++++++++++++++++++++---------------------------- include/linux/dax.h | 7 ------ include/linux/mm.h | 9 +------- mm/memory.c | 13 ++++-------- 4 files changed, 34 insertions(+), 56 deletions(-) (limited to 'include') diff --git a/fs/dax.c b/fs/dax.c index f1dfae64b11a..e83aa4077df4 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -240,6 +240,23 @@ static void *get_unlocked_mapping_entry(struct address_space *mapping, } } +static void dax_unlock_mapping_entry(struct address_space *mapping, + pgoff_t index) +{ + void *entry, **slot; + + spin_lock_irq(&mapping->tree_lock); + entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot); + if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) || + !slot_locked(mapping, slot))) { + spin_unlock_irq(&mapping->tree_lock); + return; + } + unlock_slot(mapping, slot); + spin_unlock_irq(&mapping->tree_lock); + dax_wake_mapping_entry_waiter(mapping, index, entry, false); +} + static void put_locked_mapping_entry(struct address_space *mapping, pgoff_t index, void *entry) { @@ -433,22 +450,6 @@ void dax_wake_mapping_entry_waiter(struct address_space *mapping, __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key); } -void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index) -{ - void *entry, **slot; - - spin_lock_irq(&mapping->tree_lock); - entry = __radix_tree_lookup(&mapping->page_tree, index, NULL, &slot); - if (WARN_ON_ONCE(!entry || !radix_tree_exceptional_entry(entry) || - !slot_locked(mapping, slot))) { - spin_unlock_irq(&mapping->tree_lock); - return; - } - unlock_slot(mapping, slot); - spin_unlock_irq(&mapping->tree_lock); - dax_wake_mapping_entry_waiter(mapping, index, entry, false); -} - /* * Delete exceptional DAX entry at @index from @mapping. Wait for radix tree * entry to get unlocked before deleting it. @@ -500,10 +501,8 @@ static int dax_load_hole(struct address_space *mapping, void *entry, /* This will replace locked radix tree entry with a hole page */ page = find_or_create_page(mapping, vmf->pgoff, vmf->gfp_mask | __GFP_ZERO); - if (!page) { - put_locked_mapping_entry(mapping, vmf->pgoff, entry); + if (!page) return VM_FAULT_OOM; - } vmf->page = page; return VM_FAULT_LOCKED; } @@ -954,7 +953,7 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, struct iomap iomap = { 0 }; unsigned flags = IOMAP_FAULT; int error, major = 0; - int locked_status = 0; + int vmf_ret = 0; void *entry; /* @@ -1007,13 +1006,11 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, if (error) goto finish_iomap; - if (!radix_tree_exceptional_entry(entry)) { - vmf->page = entry; - locked_status = VM_FAULT_LOCKED; - } else { - vmf->entry = entry; - locked_status = VM_FAULT_DAX_LOCKED; - } + + __SetPageUptodate(vmf->cow_page); + vmf_ret = finish_fault(vmf); + if (!vmf_ret) + vmf_ret = VM_FAULT_DONE_COW; goto finish_iomap; } @@ -1030,7 +1027,7 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, case IOMAP_UNWRITTEN: case IOMAP_HOLE: if (!(vmf->flags & FAULT_FLAG_WRITE)) { - locked_status = dax_load_hole(mapping, entry, vmf); + vmf_ret = dax_load_hole(mapping, entry, vmf); break; } /*FALLTHRU*/ @@ -1042,7 +1039,7 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, finish_iomap: if (ops->iomap_end) { - if (error) { + if (error || (vmf_ret & VM_FAULT_ERROR)) { /* keep previous error */ ops->iomap_end(inode, pos, PAGE_SIZE, 0, flags, &iomap); @@ -1052,7 +1049,7 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, } } unlock_entry: - if (!locked_status || error) + if (vmf_ret != VM_FAULT_LOCKED || error) put_locked_mapping_entry(mapping, vmf->pgoff, entry); out: if (error == -ENOMEM) @@ -1060,9 +1057,9 @@ int dax_iomap_fault(struct vm_area_struct *vma, struct vm_fault *vmf, /* -EBUSY is fine, somebody else faulted on the same PTE */ if (error < 0 && error != -EBUSY) return VM_FAULT_SIGBUS | major; - if (locked_status) { + if (vmf_ret) { WARN_ON_ONCE(error); /* -EBUSY from ops->iomap_end? */ - return locked_status; + return vmf_ret; } return VM_FAULT_NOPAGE | major; } diff --git a/include/linux/dax.h b/include/linux/dax.h index 0afade8bd3d7..f97bcfe79472 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -46,7 +46,6 @@ void dax_wake_mapping_entry_waiter(struct address_space *mapping, #ifdef CONFIG_FS_DAX struct page *read_dax_sector(struct block_device *bdev, sector_t n); -void dax_unlock_mapping_entry(struct address_space *mapping, pgoff_t index); int __dax_zero_page_range(struct block_device *bdev, sector_t sector, unsigned int offset, unsigned int length); #else @@ -55,12 +54,6 @@ static inline struct page *read_dax_sector(struct block_device *bdev, { return ERR_PTR(-ENXIO); } -/* Shouldn't ever be called when dax is disabled. */ -static inline void dax_unlock_mapping_entry(struct address_space *mapping, - pgoff_t index) -{ - BUG(); -} static inline int __dax_zero_page_range(struct block_device *bdev, sector_t sector, unsigned int offset, unsigned int length) { diff --git a/include/linux/mm.h b/include/linux/mm.h index 60a230e6ece7..59a4da1742e5 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -308,12 +308,6 @@ struct vm_fault { * is set (which is also implied by * VM_FAULT_ERROR). */ - void *entry; /* ->fault handler can alternatively - * return locked DAX entry. In that - * case handler should return - * VM_FAULT_DAX_LOCKED and fill in - * entry here. - */ /* These three entries are valid only while holding ptl lock */ pte_t *pte; /* Pointer to pte entry matching * the 'address'. NULL if the page @@ -1104,8 +1098,7 @@ static inline void clear_page_pfmemalloc(struct page *page) #define VM_FAULT_LOCKED 0x0200 /* ->fault locked the returned page */ #define VM_FAULT_RETRY 0x0400 /* ->fault blocked, must retry */ #define VM_FAULT_FALLBACK 0x0800 /* huge page fault failed, fall back to small */ -#define VM_FAULT_DAX_LOCKED 0x1000 /* ->fault has locked DAX entry */ -#define VM_FAULT_DONE_COW 0x2000 /* ->fault has fully handled COW */ +#define VM_FAULT_DONE_COW 0x1000 /* ->fault has fully handled COW */ #define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */ diff --git a/mm/memory.c b/mm/memory.c index 22f7f6e38515..ca3b95fa5fd1 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2845,7 +2845,7 @@ static int __do_fault(struct vm_fault *vmf) ret = vma->vm_ops->fault(vma, vmf); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY | - VM_FAULT_DAX_LOCKED | VM_FAULT_DONE_COW))) + VM_FAULT_DONE_COW))) return ret; if (unlikely(PageHWPoison(vmf->page))) { @@ -3276,17 +3276,12 @@ static int do_cow_fault(struct vm_fault *vmf) if (ret & VM_FAULT_DONE_COW) return ret; - if (!(ret & VM_FAULT_DAX_LOCKED)) - copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma); + copy_user_highpage(vmf->cow_page, vmf->page, vmf->address, vma); __SetPageUptodate(vmf->cow_page); ret |= finish_fault(vmf); - if (!(ret & VM_FAULT_DAX_LOCKED)) { - unlock_page(vmf->page); - put_page(vmf->page); - } else { - dax_unlock_mapping_entry(vma->vm_file->f_mapping, vmf->pgoff); - } + unlock_page(vmf->page); + put_page(vmf->page); if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE | VM_FAULT_RETRY))) goto uncharge_out; return ret; -- cgit v1.2.3 From 66a6197c118540d454913eef24d68d7491ab5d5f Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 14 Dec 2016 15:07:39 -0800 Subject: mm: provide helper for finishing mkwrite faults Provide a helper function for finishing write faults due to PTE being read-only. The helper will be used by DAX to avoid the need of complicating generic MM code with DAX locking specifics. Link: http://lkml.kernel.org/r/1479460644-25076-16-git-send-email-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Ross Zwisler Acked-by: Kirill A. Shutemov Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 1 + mm/memory.c | 67 ++++++++++++++++++++++++++++++++---------------------- 2 files changed, 41 insertions(+), 27 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index 59a4da1742e5..cec967e93f95 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -615,6 +615,7 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma) int alloc_set_pte(struct vm_fault *vmf, struct mem_cgroup *memcg, struct page *page); int finish_fault(struct vm_fault *vmf); +int finish_mkwrite_fault(struct vm_fault *vmf); #endif /* diff --git a/mm/memory.c b/mm/memory.c index 82e7689e3059..bbc25da48a18 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2269,6 +2269,38 @@ oom: return VM_FAULT_OOM; } +/** + * finish_mkwrite_fault - finish page fault for a shared mapping, making PTE + * writeable once the page is prepared + * + * @vmf: structure describing the fault + * + * This function handles all that is needed to finish a write page fault in a + * shared mapping due to PTE being read-only once the mapped page is prepared. + * It handles locking of PTE and modifying it. The function returns + * VM_FAULT_WRITE on success, 0 when PTE got changed before we acquired PTE + * lock. + * + * The function expects the page to be locked or other protection against + * concurrent faults / writeback (such as DAX radix tree locks). + */ +int finish_mkwrite_fault(struct vm_fault *vmf) +{ + WARN_ON_ONCE(!(vmf->vma->vm_flags & VM_SHARED)); + vmf->pte = pte_offset_map_lock(vmf->vma->vm_mm, vmf->pmd, vmf->address, + &vmf->ptl); + /* + * We might have raced with another page fault while we released the + * pte_offset_map_lock. + */ + if (!pte_same(*vmf->pte, vmf->orig_pte)) { + pte_unmap_unlock(vmf->pte, vmf->ptl); + return 0; + } + wp_page_reuse(vmf); + return VM_FAULT_WRITE; +} + /* * Handle write page faults for VM_MIXEDMAP or VM_PFNMAP for a VM_SHARED * mapping @@ -2285,16 +2317,7 @@ static int wp_pfn_shared(struct vm_fault *vmf) ret = vma->vm_ops->pfn_mkwrite(vma, vmf); if (ret & VM_FAULT_ERROR) return ret; - vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, - vmf->address, &vmf->ptl); - /* - * We might have raced with another page fault while we - * released the pte_offset_map_lock. - */ - if (!pte_same(*vmf->pte, vmf->orig_pte)) { - pte_unmap_unlock(vmf->pte, vmf->ptl); - return 0; - } + return finish_mkwrite_fault(vmf); } wp_page_reuse(vmf); return VM_FAULT_WRITE; @@ -2304,7 +2327,6 @@ static int wp_page_shared(struct vm_fault *vmf) __releases(vmf->ptl) { struct vm_area_struct *vma = vmf->vma; - int page_mkwrite = 0; get_page(vmf->page); @@ -2318,26 +2340,17 @@ static int wp_page_shared(struct vm_fault *vmf) put_page(vmf->page); return tmp; } - /* - * Since we dropped the lock we need to revalidate - * the PTE as someone else may have changed it. If - * they did, we just return, as we can count on the - * MMU to tell us if they didn't also make it writable. - */ - vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, - vmf->address, &vmf->ptl); - if (!pte_same(*vmf->pte, vmf->orig_pte)) { + tmp = finish_mkwrite_fault(vmf); + if (unlikely(!tmp || (tmp & + (VM_FAULT_ERROR | VM_FAULT_NOPAGE)))) { unlock_page(vmf->page); - pte_unmap_unlock(vmf->pte, vmf->ptl); put_page(vmf->page); - return 0; + return tmp; } - page_mkwrite = 1; - } - - wp_page_reuse(vmf); - if (!page_mkwrite) + } else { + wp_page_reuse(vmf); lock_page(vmf->page); + } fault_dirty_shared_page(vma, vmf->page); put_page(vmf->page); -- cgit v1.2.3 From cae1240257d9ba4b40eb240124c530de8ee349bc Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 14 Dec 2016 15:07:45 -0800 Subject: mm: export follow_pte() DAX will need to implement its own version of page_check_address(). To avoid duplicating page table walking code, export follow_pte() which does what we need. Link: http://lkml.kernel.org/r/1479460644-25076-18-git-send-email-jack@suse.cz Signed-off-by: Jan Kara Reviewed-by: Ross Zwisler Cc: Kirill A. Shutemov Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 2 ++ mm/memory.c | 4 ++-- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/mm.h b/include/linux/mm.h index cec967e93f95..63926492c06a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1210,6 +1210,8 @@ int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma); void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows); +int follow_pte(struct mm_struct *mm, unsigned long address, pte_t **ptepp, + spinlock_t **ptlp); int follow_pfn(struct vm_area_struct *vma, unsigned long address, unsigned long *pfn); int follow_phys(struct vm_area_struct *vma, unsigned long address, diff --git a/mm/memory.c b/mm/memory.c index 8b7f0656a921..edd899d0decb 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3817,8 +3817,8 @@ out: return -EINVAL; } -static inline int follow_pte(struct mm_struct *mm, unsigned long address, - pte_t **ptepp, spinlock_t **ptlp) +int follow_pte(struct mm_struct *mm, unsigned long address, pte_t **ptepp, + spinlock_t **ptlp) { int res; -- cgit v1.2.3 From 91d9c05ac6c788531136888d31ef18c6a0ec160f Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 14 Dec 2016 15:08:34 -0800 Subject: radix-tree: move rcu_head into a union with private_list I want to be able to reference node->parent after freeing node. Currently node->parent is in a union with rcu_head, so it is overwritten when the node is put on the RCU list. We know that private_list is not referenced after the node is freed, so it is safe for these two members to share space. Link: http://lkml.kernel.org/r/1480369871-5271-50-git-send-email-mawilcox@linuxonhyperv.com Signed-off-by: Matthew Wilcox Tested-by: Kirill A. Shutemov Cc: Konstantin Khlebnikov Cc: Ross Zwisler Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 14 ++++---------- lib/radix-tree.c | 1 + 2 files changed, 5 insertions(+), 10 deletions(-) (limited to 'include') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 744486057e9e..d04073ac3a9f 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -85,18 +85,12 @@ struct radix_tree_node { unsigned char offset; /* Slot offset in parent */ unsigned char count; /* Total entry count */ unsigned char exceptional; /* Exceptional entry count */ + struct radix_tree_node *parent; /* Used when ascending tree */ + void *private_data; /* For tree user */ union { - struct { - /* Used when ascending tree */ - struct radix_tree_node *parent; - /* For tree user */ - void *private_data; - }; - /* Used when freeing node */ - struct rcu_head rcu_head; + struct list_head private_list; /* For tree user */ + struct rcu_head rcu_head; /* Used when freeing node */ }; - /* For tree user */ - struct list_head private_list; void __rcu *slots[RADIX_TREE_MAP_SIZE]; unsigned long tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS]; }; diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 60c10361cbfa..8c5911eae5e2 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -325,6 +325,7 @@ static void radix_tree_node_rcu_free(struct rcu_head *head) tag_clear(node, i, 0); node->slots[0] = NULL; + INIT_LIST_HEAD(&node->private_list); kmem_cache_free(radix_tree_node_cachep, node); } -- cgit v1.2.3 From 148deab223b23734069abcacb5c7118b0e7deadc Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 14 Dec 2016 15:08:49 -0800 Subject: radix-tree: improve multiorder iterators This fixes several interlinked problems with the iterators in the presence of multiorder entries. 1. radix_tree_iter_next() would only advance by one slot, which would result in the iterators returning the same entry more than once if there were sibling entries. 2. radix_tree_next_slot() could return an internal pointer instead of a user pointer if a tagged multiorder entry was immediately followed by an entry of lower order. 3. radix_tree_next_slot() expanded to a lot more code than it used to when multiorder support was compiled in. And I wasn't comfortable with entry_to_node() being in a header file. Fixing radix_tree_iter_next() for the presence of sibling entries necessarily involves examining the contents of the radix tree, so we now need to pass 'slot' to radix_tree_iter_next(), and we need to change the calling convention so it is called *before* dropping the lock which protects the tree. Also rename it to radix_tree_iter_resume(), as some people thought it was necessary to call radix_tree_iter_next() each time around the loop. radix_tree_next_slot() becomes closer to how it looked before multiorder support was introduced. It only checks to see if the next entry in the chunk is a sibling entry or a pointer to a node; this should be rare enough that handling this case out of line is not a performance impact (and such impact is amortised by the fact that the entry we just processed was a multiorder entry). Also, radix_tree_next_slot() used to force a new chunk lookup for untagged entries, which is more expensive than the out of line sibling entry skipping. Link: http://lkml.kernel.org/r/1480369871-5271-55-git-send-email-mawilcox@linuxonhyperv.com Signed-off-by: Matthew Wilcox Tested-by: Kirill A. Shutemov Cc: Konstantin Khlebnikov Cc: Ross Zwisler Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/btrfs/tests/btrfs-tests.c | 2 +- include/linux/radix-tree.h | 71 +++++++-------- lib/radix-tree.c | 138 +++++++++++++++++++++++++---- mm/khugepaged.c | 7 +- mm/shmem.c | 6 +- tools/testing/radix-tree/iteration_check.c | 12 +-- tools/testing/radix-tree/multiorder.c | 28 ++++-- tools/testing/radix-tree/regression3.c | 8 +- tools/testing/radix-tree/test.h | 1 + 9 files changed, 188 insertions(+), 85 deletions(-) (limited to 'include') diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index 73076a0ea6a9..00ee006a8aa2 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -162,7 +162,7 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) slot = radix_tree_iter_retry(&iter); continue; } - slot = radix_tree_iter_next(&iter); + slot = radix_tree_iter_resume(slot, &iter); spin_unlock(&fs_info->buffer_lock); free_extent_buffer_stale(eb); spin_lock(&fs_info->buffer_lock); diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index d04073ac3a9f..289d007d487b 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -403,20 +403,17 @@ __radix_tree_iter_add(struct radix_tree_iter *iter, unsigned long slots) } /** - * radix_tree_iter_next - resume iterating when the chunk may be invalid - * @iter: iterator state + * radix_tree_iter_resume - resume iterating when the chunk may be invalid + * @slot: pointer to current slot + * @iter: iterator state + * Returns: New slot pointer * * If the iterator needs to release then reacquire a lock, the chunk may * have been invalidated by an insertion or deletion. Call this function - * to continue the iteration from the next index. + * before releasing the lock to continue the iteration from the next index. */ -static inline __must_check -void **radix_tree_iter_next(struct radix_tree_iter *iter) -{ - iter->next_index = __radix_tree_iter_add(iter, 1); - iter->tags = 0; - return NULL; -} +void **__must_check radix_tree_iter_resume(void **slot, + struct radix_tree_iter *iter); /** * radix_tree_chunk_size - get current chunk size @@ -430,10 +427,17 @@ radix_tree_chunk_size(struct radix_tree_iter *iter) return (iter->next_index - iter->index) >> iter_shift(iter); } -static inline struct radix_tree_node *entry_to_node(void *ptr) +#ifdef CONFIG_RADIX_TREE_MULTIORDER +void ** __radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, + unsigned flags); +#else +/* Can't happen without sibling entries, but the compiler can't tell that */ +static inline void ** __radix_tree_next_slot(void **slot, + struct radix_tree_iter *iter, unsigned flags) { - return (void *)((unsigned long)ptr & ~RADIX_TREE_INTERNAL_NODE); + return slot; } +#endif /** * radix_tree_next_slot - find next slot in chunk @@ -447,7 +451,7 @@ static inline struct radix_tree_node *entry_to_node(void *ptr) * For tagged lookup it also eats @iter->tags. * * There are several cases where 'slot' can be passed in as NULL to this - * function. These cases result from the use of radix_tree_iter_next() or + * function. These cases result from the use of radix_tree_iter_resume() or * radix_tree_iter_retry(). In these cases we don't end up dereferencing * 'slot' because either: * a) we are doing tagged iteration and iter->tags has been set to 0, or @@ -458,51 +462,31 @@ static __always_inline void ** radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags) { if (flags & RADIX_TREE_ITER_TAGGED) { - void *canon = slot; - iter->tags >>= 1; if (unlikely(!iter->tags)) return NULL; - while (IS_ENABLED(CONFIG_RADIX_TREE_MULTIORDER) && - radix_tree_is_internal_node(slot[1])) { - if (entry_to_node(slot[1]) == canon) { - iter->tags >>= 1; - iter->index = __radix_tree_iter_add(iter, 1); - slot++; - continue; - } - iter->next_index = __radix_tree_iter_add(iter, 1); - return NULL; - } if (likely(iter->tags & 1ul)) { iter->index = __radix_tree_iter_add(iter, 1); - return slot + 1; + slot++; + goto found; } if (!(flags & RADIX_TREE_ITER_CONTIG)) { unsigned offset = __ffs(iter->tags); - iter->tags >>= offset; - iter->index = __radix_tree_iter_add(iter, offset + 1); - return slot + offset + 1; + iter->tags >>= offset++; + iter->index = __radix_tree_iter_add(iter, offset); + slot += offset; + goto found; } } else { long count = radix_tree_chunk_size(iter); - void *canon = slot; while (--count > 0) { slot++; iter->index = __radix_tree_iter_add(iter, 1); - if (IS_ENABLED(CONFIG_RADIX_TREE_MULTIORDER) && - radix_tree_is_internal_node(*slot)) { - if (entry_to_node(*slot) == canon) - continue; - iter->next_index = iter->index; - break; - } - if (likely(*slot)) - return slot; + goto found; if (flags & RADIX_TREE_ITER_CONTIG) { /* forbid switching to the next chunk */ iter->next_index = 0; @@ -511,6 +495,11 @@ radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags) } } return NULL; + + found: + if (unlikely(radix_tree_is_internal_node(*slot))) + return __radix_tree_next_slot(slot, iter, flags); + return slot; } /** @@ -561,6 +550,6 @@ radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, unsigned flags) slot || (slot = radix_tree_next_chunk(root, iter, \ RADIX_TREE_ITER_TAGGED | tag)) ; \ slot = radix_tree_next_slot(slot, iter, \ - RADIX_TREE_ITER_TAGGED)) + RADIX_TREE_ITER_TAGGED | tag)) #endif /* _LINUX_RADIX_TREE_H */ diff --git a/lib/radix-tree.c b/lib/radix-tree.c index dbf4e8c8e100..0a7ac0fb4a65 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -69,6 +69,11 @@ struct radix_tree_preload { }; static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, }; +static inline struct radix_tree_node *entry_to_node(void *ptr) +{ + return (void *)((unsigned long)ptr & ~RADIX_TREE_INTERNAL_NODE); +} + static inline void *node_to_entry(void *ptr) { return (void *)((unsigned long)ptr | RADIX_TREE_INTERNAL_NODE); @@ -1104,6 +1109,120 @@ static inline void __set_iter_shift(struct radix_tree_iter *iter, #endif } +/* Construct iter->tags bit-mask from node->tags[tag] array */ +static void set_iter_tags(struct radix_tree_iter *iter, + struct radix_tree_node *node, unsigned offset, + unsigned tag) +{ + unsigned tag_long = offset / BITS_PER_LONG; + unsigned tag_bit = offset % BITS_PER_LONG; + + iter->tags = node->tags[tag][tag_long] >> tag_bit; + + /* This never happens if RADIX_TREE_TAG_LONGS == 1 */ + if (tag_long < RADIX_TREE_TAG_LONGS - 1) { + /* Pick tags from next element */ + if (tag_bit) + iter->tags |= node->tags[tag][tag_long + 1] << + (BITS_PER_LONG - tag_bit); + /* Clip chunk size, here only BITS_PER_LONG tags */ + iter->next_index = __radix_tree_iter_add(iter, BITS_PER_LONG); + } +} + +#ifdef CONFIG_RADIX_TREE_MULTIORDER +static void **skip_siblings(struct radix_tree_node **nodep, + void **slot, struct radix_tree_iter *iter) +{ + void *sib = node_to_entry(slot - 1); + + while (iter->index < iter->next_index) { + *nodep = rcu_dereference_raw(*slot); + if (*nodep && *nodep != sib) + return slot; + slot++; + iter->index = __radix_tree_iter_add(iter, 1); + iter->tags >>= 1; + } + + *nodep = NULL; + return NULL; +} + +void ** __radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, + unsigned flags) +{ + unsigned tag = flags & RADIX_TREE_ITER_TAG_MASK; + struct radix_tree_node *node = rcu_dereference_raw(*slot); + + slot = skip_siblings(&node, slot, iter); + + while (radix_tree_is_internal_node(node)) { + unsigned offset; + unsigned long next_index; + + if (node == RADIX_TREE_RETRY) + return slot; + node = entry_to_node(node); + iter->shift = node->shift; + + if (flags & RADIX_TREE_ITER_TAGGED) { + offset = radix_tree_find_next_bit(node, tag, 0); + if (offset == RADIX_TREE_MAP_SIZE) + return NULL; + slot = &node->slots[offset]; + iter->index = __radix_tree_iter_add(iter, offset); + set_iter_tags(iter, node, offset, tag); + node = rcu_dereference_raw(*slot); + } else { + offset = 0; + slot = &node->slots[0]; + for (;;) { + node = rcu_dereference_raw(*slot); + if (node) + break; + slot++; + offset++; + if (offset == RADIX_TREE_MAP_SIZE) + return NULL; + } + iter->index = __radix_tree_iter_add(iter, offset); + } + if ((flags & RADIX_TREE_ITER_CONTIG) && (offset > 0)) + goto none; + next_index = (iter->index | shift_maxindex(iter->shift)) + 1; + if (next_index < iter->next_index) + iter->next_index = next_index; + } + + return slot; + none: + iter->next_index = 0; + return NULL; +} +EXPORT_SYMBOL(__radix_tree_next_slot); +#else +static void **skip_siblings(struct radix_tree_node **nodep, + void **slot, struct radix_tree_iter *iter) +{ + return slot; +} +#endif + +void **radix_tree_iter_resume(void **slot, struct radix_tree_iter *iter) +{ + struct radix_tree_node *node; + + slot++; + iter->index = __radix_tree_iter_add(iter, 1); + node = rcu_dereference_raw(*slot); + skip_siblings(&node, slot, iter); + iter->next_index = iter->index; + iter->tags = 0; + return NULL; +} +EXPORT_SYMBOL(radix_tree_iter_resume); + /** * radix_tree_next_chunk - find next chunk of slots for iteration * @@ -1191,23 +1310,8 @@ void **radix_tree_next_chunk(struct radix_tree_root *root, iter->next_index = (index | node_maxindex(node)) + 1; __set_iter_shift(iter, node->shift); - /* Construct iter->tags bit-mask from node->tags[tag] array */ - if (flags & RADIX_TREE_ITER_TAGGED) { - unsigned tag_long, tag_bit; - - tag_long = offset / BITS_PER_LONG; - tag_bit = offset % BITS_PER_LONG; - iter->tags = node->tags[tag][tag_long] >> tag_bit; - /* This never happens if RADIX_TREE_TAG_LONGS == 1 */ - if (tag_long < RADIX_TREE_TAG_LONGS - 1) { - /* Pick tags from next element */ - if (tag_bit) - iter->tags |= node->tags[tag][tag_long + 1] << - (BITS_PER_LONG - tag_bit); - /* Clip chunk size, here only BITS_PER_LONG tags */ - iter->next_index = index + BITS_PER_LONG; - } - } + if (flags & RADIX_TREE_ITER_TAGGED) + set_iter_tags(iter, node, offset, tag); return node->slots + offset; } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 7434a63cac94..e32389a97030 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1446,7 +1446,7 @@ static void collapse_shmem(struct mm_struct *mm, radix_tree_replace_slot(&mapping->page_tree, slot, new_page + (index % HPAGE_PMD_NR)); - slot = radix_tree_iter_next(&iter); + slot = radix_tree_iter_resume(slot, &iter); index++; continue; out_lru: @@ -1546,7 +1546,6 @@ tree_unlocked: /* Put holes back where they were */ radix_tree_delete(&mapping->page_tree, iter.index); - slot = radix_tree_iter_next(&iter); continue; } @@ -1557,11 +1556,11 @@ tree_unlocked: page_ref_unfreeze(page, 2); radix_tree_replace_slot(&mapping->page_tree, slot, page); + slot = radix_tree_iter_resume(slot, &iter); spin_unlock_irq(&mapping->tree_lock); putback_lru_page(page); unlock_page(page); spin_lock_irq(&mapping->tree_lock); - slot = radix_tree_iter_next(&iter); } VM_BUG_ON(nr_none); spin_unlock_irq(&mapping->tree_lock); @@ -1641,8 +1640,8 @@ static void khugepaged_scan_shmem(struct mm_struct *mm, present++; if (need_resched()) { + slot = radix_tree_iter_resume(slot, &iter); cond_resched_rcu(); - slot = radix_tree_iter_next(&iter); } } rcu_read_unlock(); diff --git a/mm/shmem.c b/mm/shmem.c index abd7403aba41..be11c6dd414a 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -661,8 +661,8 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping, swapped++; if (need_resched()) { + slot = radix_tree_iter_resume(slot, &iter); cond_resched_rcu(); - slot = radix_tree_iter_next(&iter); } } @@ -2447,8 +2447,8 @@ static void shmem_tag_pins(struct address_space *mapping) } if (need_resched()) { + slot = radix_tree_iter_resume(slot, &iter); cond_resched_rcu(); - slot = radix_tree_iter_next(&iter); } } rcu_read_unlock(); @@ -2517,8 +2517,8 @@ static int shmem_wait_for_pins(struct address_space *mapping) spin_unlock_irq(&mapping->tree_lock); continue_resched: if (need_resched()) { + slot = radix_tree_iter_resume(slot, &iter); cond_resched_rcu(); - slot = radix_tree_iter_next(&iter); } } rcu_read_unlock(); diff --git a/tools/testing/radix-tree/iteration_check.c b/tools/testing/radix-tree/iteration_check.c index df71cb841385..f328a66899b4 100644 --- a/tools/testing/radix-tree/iteration_check.c +++ b/tools/testing/radix-tree/iteration_check.c @@ -48,8 +48,8 @@ static void *add_entries_fn(void *arg) /* * Iterate over the tagged entries, doing a radix_tree_iter_retry() as we find * things that have been removed and randomly resetting our iteration to the - * next chunk with radix_tree_iter_next(). Both radix_tree_iter_retry() and - * radix_tree_iter_next() cause radix_tree_next_slot() to be called with a + * next chunk with radix_tree_iter_resume(). Both radix_tree_iter_retry() and + * radix_tree_iter_resume() cause radix_tree_next_slot() to be called with a * NULL 'slot' variable. */ static void *tagged_iteration_fn(void *arg) @@ -79,7 +79,7 @@ static void *tagged_iteration_fn(void *arg) } if (rand_r(&seeds[0]) % 50 == 0) { - slot = radix_tree_iter_next(&iter); + slot = radix_tree_iter_resume(slot, &iter); rcu_read_unlock(); rcu_barrier(); rcu_read_lock(); @@ -96,8 +96,8 @@ static void *tagged_iteration_fn(void *arg) /* * Iterate over the entries, doing a radix_tree_iter_retry() as we find things * that have been removed and randomly resetting our iteration to the next - * chunk with radix_tree_iter_next(). Both radix_tree_iter_retry() and - * radix_tree_iter_next() cause radix_tree_next_slot() to be called with a + * chunk with radix_tree_iter_resume(). Both radix_tree_iter_retry() and + * radix_tree_iter_resume() cause radix_tree_next_slot() to be called with a * NULL 'slot' variable. */ static void *untagged_iteration_fn(void *arg) @@ -127,7 +127,7 @@ static void *untagged_iteration_fn(void *arg) } if (rand_r(&seeds[1]) % 50 == 0) { - slot = radix_tree_iter_next(&iter); + slot = radix_tree_iter_resume(slot, &iter); rcu_read_unlock(); rcu_barrier(); rcu_read_lock(); diff --git a/tools/testing/radix-tree/multiorder.c b/tools/testing/radix-tree/multiorder.c index 8d5865c95664..b9be8856d652 100644 --- a/tools/testing/radix-tree/multiorder.c +++ b/tools/testing/radix-tree/multiorder.c @@ -231,11 +231,14 @@ void multiorder_iteration(void) radix_tree_for_each_slot(slot, &tree, &iter, j) { int height = order[i] / RADIX_TREE_MAP_SHIFT; int shift = height * RADIX_TREE_MAP_SHIFT; - int mask = (1 << order[i]) - 1; + unsigned long mask = (1UL << order[i]) - 1; + struct item *item = *slot; - assert(iter.index >= (index[i] &~ mask)); - assert(iter.index <= (index[i] | mask)); + assert((iter.index | mask) == (index[i] | mask)); assert(iter.shift == shift); + assert(!radix_tree_is_internal_node(item)); + assert((item->index | mask) == (index[i] | mask)); + assert(item->order == order[i]); i++; } } @@ -269,7 +272,7 @@ void multiorder_tagged_iteration(void) assert(radix_tree_tag_set(&tree, tag_index[i], 1)); for (j = 0; j < 256; j++) { - int mask, k; + int k; for (i = 0; i < TAG_ENTRIES; i++) { for (k = i; index[k] < tag_index[i]; k++) @@ -279,12 +282,16 @@ void multiorder_tagged_iteration(void) } radix_tree_for_each_tagged(slot, &tree, &iter, j, 1) { + unsigned long mask; + struct item *item = *slot; for (k = i; index[k] < tag_index[i]; k++) ; - mask = (1 << order[k]) - 1; + mask = (1UL << order[k]) - 1; - assert(iter.index >= (tag_index[i] &~ mask)); - assert(iter.index <= (tag_index[i] | mask)); + assert((iter.index | mask) == (tag_index[i] | mask)); + assert(!radix_tree_is_internal_node(item)); + assert((item->index | mask) == (tag_index[i] | mask)); + assert(item->order == order[k]); i++; } } @@ -303,12 +310,15 @@ void multiorder_tagged_iteration(void) } radix_tree_for_each_tagged(slot, &tree, &iter, j, 2) { + struct item *item = *slot; for (k = i; index[k] < tag_index[i]; k++) ; mask = (1 << order[k]) - 1; - assert(iter.index >= (tag_index[i] &~ mask)); - assert(iter.index <= (tag_index[i] | mask)); + assert((iter.index | mask) == (tag_index[i] | mask)); + assert(!radix_tree_is_internal_node(item)); + assert((item->index | mask) == (tag_index[i] | mask)); + assert(item->order == order[k]); i++; } } diff --git a/tools/testing/radix-tree/regression3.c b/tools/testing/radix-tree/regression3.c index 1f06ed73d0a8..b594841fae85 100644 --- a/tools/testing/radix-tree/regression3.c +++ b/tools/testing/radix-tree/regression3.c @@ -5,7 +5,7 @@ * In following radix_tree_next_slot current chunk size becomes zero. * This isn't checked and it tries to dereference null pointer in slot. * - * Helper radix_tree_iter_next reset slot to NULL and next_index to index + 1, + * Helper radix_tree_iter_resume reset slot to NULL and next_index to index + 1, * for tagger iteraction it also must reset cached tags in iterator to abort * next radix_tree_next_slot and go to slow-path into radix_tree_next_chunk. * @@ -88,7 +88,7 @@ void regression3_test(void) printf("slot %ld %p\n", iter.index, *slot); if (!iter.index) { printf("next at %ld\n", iter.index); - slot = radix_tree_iter_next(&iter); + slot = radix_tree_iter_resume(slot, &iter); } } @@ -96,7 +96,7 @@ void regression3_test(void) printf("contig %ld %p\n", iter.index, *slot); if (!iter.index) { printf("next at %ld\n", iter.index); - slot = radix_tree_iter_next(&iter); + slot = radix_tree_iter_resume(slot, &iter); } } @@ -106,7 +106,7 @@ void regression3_test(void) printf("tagged %ld %p\n", iter.index, *slot); if (!iter.index) { printf("next at %ld\n", iter.index); - slot = radix_tree_iter_next(&iter); + slot = radix_tree_iter_resume(slot, &iter); } } diff --git a/tools/testing/radix-tree/test.h b/tools/testing/radix-tree/test.h index 423c528aaee9..617416ec3c5e 100644 --- a/tools/testing/radix-tree/test.h +++ b/tools/testing/radix-tree/test.h @@ -41,6 +41,7 @@ void verify_tag_consistency(struct radix_tree_root *root, unsigned int tag); extern int nr_allocated; /* Normally private parts of lib/radix-tree.c */ +struct radix_tree_node *entry_to_node(void *ptr); void radix_tree_dump(struct radix_tree_root *root); int root_tag_get(struct radix_tree_root *root, unsigned int tag); unsigned long node_maxindex(struct radix_tree_node *); -- cgit v1.2.3 From 478922e2b0f41567e4a530771bfb3f693f857d45 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 14 Dec 2016 15:08:52 -0800 Subject: radix-tree: delete radix_tree_locate_item() This rather complicated function can be better implemented as an iterator. It has only one caller, so move the functionality to the only place that needs it. Update the test suite to follow the same pattern. Link: http://lkml.kernel.org/r/1480369871-5271-56-git-send-email-mawilcox@linuxonhyperv.com Signed-off-by: Matthew Wilcox Acked-by: Konstantin Khlebnikov Tested-by: Kirill A. Shutemov Cc: Ross Zwisler Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 1 - lib/radix-tree.c | 99 ----------------------------------------- mm/shmem.c | 26 ++++++++++- tools/testing/radix-tree/main.c | 8 ++-- tools/testing/radix-tree/test.c | 22 +++++++++ tools/testing/radix-tree/test.h | 2 + 6 files changed, 53 insertions(+), 105 deletions(-) (limited to 'include') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 289d007d487b..a13d3f7c6c65 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -296,7 +296,6 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root, unsigned long nr_to_tag, unsigned int fromtag, unsigned int totag); int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag); -unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item); static inline void radix_tree_preload_end(void) { diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 0a7ac0fb4a65..5bdf1bdbca00 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -1579,105 +1579,6 @@ radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results, } EXPORT_SYMBOL(radix_tree_gang_lookup_tag_slot); -#if defined(CONFIG_SHMEM) && defined(CONFIG_SWAP) -#include /* for cond_resched() */ - -struct locate_info { - unsigned long found_index; - bool stop; -}; - -/* - * This linear search is at present only useful to shmem_unuse_inode(). - */ -static unsigned long __locate(struct radix_tree_node *slot, void *item, - unsigned long index, struct locate_info *info) -{ - unsigned long i; - - do { - unsigned int shift = slot->shift; - - for (i = (index >> shift) & RADIX_TREE_MAP_MASK; - i < RADIX_TREE_MAP_SIZE; - i++, index += (1UL << shift)) { - struct radix_tree_node *node = - rcu_dereference_raw(slot->slots[i]); - if (node == RADIX_TREE_RETRY) - goto out; - if (!radix_tree_is_internal_node(node)) { - if (node == item) { - info->found_index = index; - info->stop = true; - goto out; - } - continue; - } - node = entry_to_node(node); - if (is_sibling_entry(slot, node)) - continue; - slot = node; - break; - } - } while (i < RADIX_TREE_MAP_SIZE); - -out: - if ((index == 0) && (i == RADIX_TREE_MAP_SIZE)) - info->stop = true; - return index; -} - -/** - * radix_tree_locate_item - search through radix tree for item - * @root: radix tree root - * @item: item to be found - * - * Returns index where item was found, or -1 if not found. - * Caller must hold no lock (since this time-consuming function needs - * to be preemptible), and must check afterwards if item is still there. - */ -unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item) -{ - struct radix_tree_node *node; - unsigned long max_index; - unsigned long cur_index = 0; - struct locate_info info = { - .found_index = -1, - .stop = false, - }; - - do { - rcu_read_lock(); - node = rcu_dereference_raw(root->rnode); - if (!radix_tree_is_internal_node(node)) { - rcu_read_unlock(); - if (node == item) - info.found_index = 0; - break; - } - - node = entry_to_node(node); - - max_index = node_maxindex(node); - if (cur_index > max_index) { - rcu_read_unlock(); - break; - } - - cur_index = __locate(node, item, cur_index, &info); - rcu_read_unlock(); - cond_resched(); - } while (!info.stop && cur_index <= max_index); - - return info.found_index; -} -#else -unsigned long radix_tree_locate_item(struct radix_tree_root *root, void *item) -{ - return -1; -} -#endif /* CONFIG_SHMEM && CONFIG_SWAP */ - /** * __radix_tree_delete_node - try to free node after clearing a slot * @root: radix tree root diff --git a/mm/shmem.c b/mm/shmem.c index be11c6dd414a..54287d443806 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1049,6 +1049,30 @@ static void shmem_evict_inode(struct inode *inode) clear_inode(inode); } +static unsigned long find_swap_entry(struct radix_tree_root *root, void *item) +{ + struct radix_tree_iter iter; + void **slot; + unsigned long found = -1; + unsigned int checked = 0; + + rcu_read_lock(); + radix_tree_for_each_slot(slot, root, &iter, 0) { + if (*slot == item) { + found = iter.index; + break; + } + checked++; + if ((checked % 4096) != 0) + continue; + slot = radix_tree_iter_resume(slot, &iter); + cond_resched_rcu(); + } + + rcu_read_unlock(); + return found; +} + /* * If swap found in inode, free it and move page from swapcache to filecache. */ @@ -1062,7 +1086,7 @@ static int shmem_unuse_inode(struct shmem_inode_info *info, int error = 0; radswap = swp_to_radix_entry(swap); - index = radix_tree_locate_item(&mapping->page_tree, radswap); + index = find_swap_entry(&mapping->page_tree, radswap); if (index == -1) return -EAGAIN; /* tell shmem_unuse we found nothing */ diff --git a/tools/testing/radix-tree/main.c b/tools/testing/radix-tree/main.c index 76d9c95aa487..a028dae8a043 100644 --- a/tools/testing/radix-tree/main.c +++ b/tools/testing/radix-tree/main.c @@ -239,7 +239,7 @@ static void __locate_check(struct radix_tree_root *tree, unsigned long index, item_insert_order(tree, index, order); item = item_lookup(tree, index); - index2 = radix_tree_locate_item(tree, item); + index2 = find_item(tree, item); if (index != index2) { printf("index %ld order %d inserted; found %ld\n", index, order, index2); @@ -273,17 +273,17 @@ static void locate_check(void) index += (1UL << order)) { __locate_check(&tree, index + offset, order); } - if (radix_tree_locate_item(&tree, &tree) != -1) + if (find_item(&tree, &tree) != -1) abort(); item_kill_tree(&tree); } } - if (radix_tree_locate_item(&tree, &tree) != -1) + if (find_item(&tree, &tree) != -1) abort(); __locate_check(&tree, -1, 0); - if (radix_tree_locate_item(&tree, &tree) != -1) + if (find_item(&tree, &tree) != -1) abort(); item_kill_tree(&tree); } diff --git a/tools/testing/radix-tree/test.c b/tools/testing/radix-tree/test.c index 0de548939a2e..88bf57f7175e 100644 --- a/tools/testing/radix-tree/test.c +++ b/tools/testing/radix-tree/test.c @@ -151,6 +151,28 @@ void item_full_scan(struct radix_tree_root *root, unsigned long start, assert(nfound == 0); } +/* Use the same pattern as find_swap_entry() in mm/shmem.c */ +unsigned long find_item(struct radix_tree_root *root, void *item) +{ + struct radix_tree_iter iter; + void **slot; + unsigned long found = -1; + unsigned long checked = 0; + + radix_tree_for_each_slot(slot, root, &iter, 0) { + if (*slot == item) { + found = iter.index; + break; + } + checked++; + if ((checked % 4) != 0) + continue; + slot = radix_tree_iter_resume(slot, &iter); + } + + return found; +} + static int verify_node(struct radix_tree_node *slot, unsigned int tag, int tagged) { diff --git a/tools/testing/radix-tree/test.h b/tools/testing/radix-tree/test.h index 617416ec3c5e..3d9d1d30da22 100644 --- a/tools/testing/radix-tree/test.h +++ b/tools/testing/radix-tree/test.h @@ -25,6 +25,8 @@ void item_full_scan(struct radix_tree_root *root, unsigned long start, unsigned long nr, int chunk); void item_kill_tree(struct radix_tree_root *root); +unsigned long find_item(struct radix_tree_root *, void *item); + void tag_check(void); void multiorder_checks(void); void iteration_test(void); -- cgit v1.2.3 From 268f42de718128cd0301293177e79c08c38e39a6 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 14 Dec 2016 15:08:55 -0800 Subject: radix-tree: delete radix_tree_range_tag_if_tagged() This is an exceptionally complicated function with just one caller (tag_pages_for_writeback). We devote a large portion of the runtime of the test suite to testing this one function which has one caller. By introducing the new function radix_tree_iter_tag_set(), we can eliminate all of the complexity while keeping the performance. The caller can now use a fairly standard radix_tree_for_each() loop, and it doesn't need to worry about tricksy things like 'start' wrapping. The test suite continues to spend a large amount of time investigating this function, but now it's testing the underlying primitives such as radix_tree_iter_resume() and the radix_tree_for_each_tagged() iterator which are also used by other parts of the kernel. Link: http://lkml.kernel.org/r/1480369871-5271-57-git-send-email-mawilcox@linuxonhyperv.com Signed-off-by: Matthew Wilcox Tested-by: Kirill A. Shutemov Cc: Konstantin Khlebnikov Cc: Ross Zwisler Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 74 ++++++++++----------- lib/radix-tree.c | 117 ++++++--------------------------- mm/page-writeback.c | 28 +++++--- tools/testing/radix-tree/main.c | 12 ++-- tools/testing/radix-tree/multiorder.c | 13 ++-- tools/testing/radix-tree/regression2.c | 3 +- tools/testing/radix-tree/tag_check.c | 4 +- tools/testing/radix-tree/test.c | 34 ++++++++++ tools/testing/radix-tree/test.h | 3 + 9 files changed, 125 insertions(+), 163 deletions(-) (limited to 'include') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index a13d3f7c6c65..7a8d2516c73a 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -120,6 +120,41 @@ static inline bool radix_tree_empty(struct radix_tree_root *root) return root->rnode == NULL; } +/** + * struct radix_tree_iter - radix tree iterator state + * + * @index: index of current slot + * @next_index: one beyond the last index for this chunk + * @tags: bit-mask for tag-iterating + * @node: node that contains current slot + * @shift: shift for the node that holds our slots + * + * This radix tree iterator works in terms of "chunks" of slots. A chunk is a + * subinterval of slots contained within one radix tree leaf node. It is + * described by a pointer to its first slot and a struct radix_tree_iter + * which holds the chunk's position in the tree and its size. For tagged + * iteration radix_tree_iter also holds the slots' bit-mask for one chosen + * radix tree tag. + */ +struct radix_tree_iter { + unsigned long index; + unsigned long next_index; + unsigned long tags; + struct radix_tree_node *node; +#ifdef CONFIG_RADIX_TREE_MULTIORDER + unsigned int shift; +#endif +}; + +static inline unsigned int iter_shift(const struct radix_tree_iter *iter) +{ +#ifdef CONFIG_RADIX_TREE_MULTIORDER + return iter->shift; +#else + return 0; +#endif +} + /** * Radix-tree synchronization * @@ -283,6 +318,8 @@ void *radix_tree_tag_clear(struct radix_tree_root *root, unsigned long index, unsigned int tag); int radix_tree_tag_get(struct radix_tree_root *root, unsigned long index, unsigned int tag); +void radix_tree_iter_tag_set(struct radix_tree_root *root, + const struct radix_tree_iter *iter, unsigned int tag); unsigned int radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results, unsigned long first_index, unsigned int max_items, @@ -291,10 +328,6 @@ unsigned int radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results, unsigned long first_index, unsigned int max_items, unsigned int tag); -unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root, - unsigned long *first_indexp, unsigned long last_index, - unsigned long nr_to_tag, - unsigned int fromtag, unsigned int totag); int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag); static inline void radix_tree_preload_end(void) @@ -302,39 +335,6 @@ static inline void radix_tree_preload_end(void) preempt_enable(); } -/** - * struct radix_tree_iter - radix tree iterator state - * - * @index: index of current slot - * @next_index: one beyond the last index for this chunk - * @tags: bit-mask for tag-iterating - * @shift: shift for the node that holds our slots - * - * This radix tree iterator works in terms of "chunks" of slots. A chunk is a - * subinterval of slots contained within one radix tree leaf node. It is - * described by a pointer to its first slot and a struct radix_tree_iter - * which holds the chunk's position in the tree and its size. For tagged - * iteration radix_tree_iter also holds the slots' bit-mask for one chosen - * radix tree tag. - */ -struct radix_tree_iter { - unsigned long index; - unsigned long next_index; - unsigned long tags; -#ifdef CONFIG_RADIX_TREE_MULTIORDER - unsigned int shift; -#endif -}; - -static inline unsigned int iter_shift(struct radix_tree_iter *iter) -{ -#ifdef CONFIG_RADIX_TREE_MULTIORDER - return iter->shift; -#else - return 0; -#endif -} - #define RADIX_TREE_ITER_TAG_MASK 0x00FF /* tag index in lower byte */ #define RADIX_TREE_ITER_TAGGED 0x0100 /* lookup tagged slots */ #define RADIX_TREE_ITER_CONTIG 0x0200 /* stop at first hole */ diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 5bdf1bdbca00..d1f4ca5b7989 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -219,6 +219,11 @@ radix_tree_find_next_bit(struct radix_tree_node *node, unsigned int tag, return RADIX_TREE_MAP_SIZE; } +static unsigned int iter_offset(const struct radix_tree_iter *iter) +{ + return (iter->index >> iter_shift(iter)) & RADIX_TREE_MAP_MASK; +} + /* * The maximum index which can be stored in a radix tree */ @@ -1014,6 +1019,18 @@ static void node_tag_set(struct radix_tree_root *root, root_tag_set(root, tag); } +/** + * radix_tree_iter_tag_set - set a tag on the current iterator entry + * @root: radix tree root + * @iter: iterator state + * @tag: tag to set + */ +void radix_tree_iter_tag_set(struct radix_tree_root *root, + const struct radix_tree_iter *iter, unsigned int tag) +{ + node_tag_set(root, iter->node, tag, iter_offset(iter)); +} + /** * radix_tree_tag_clear - clear a tag on a radix tree node * @root: radix tree root @@ -1164,6 +1181,7 @@ void ** __radix_tree_next_slot(void **slot, struct radix_tree_iter *iter, if (node == RADIX_TREE_RETRY) return slot; node = entry_to_node(node); + iter->node = node; iter->shift = node->shift; if (flags & RADIX_TREE_ITER_TAGGED) { @@ -1266,6 +1284,7 @@ void **radix_tree_next_chunk(struct radix_tree_root *root, iter->index = index; iter->next_index = maxindex + 1; iter->tags = 1; + iter->node = NULL; __set_iter_shift(iter, 0); return (void **)&root->rnode; } @@ -1308,6 +1327,7 @@ void **radix_tree_next_chunk(struct radix_tree_root *root, /* Update the iterator state */ iter->index = (index &~ node_maxindex(node)) | (offset << node->shift); iter->next_index = (index | node_maxindex(node)) + 1; + iter->node = node; __set_iter_shift(iter, node->shift); if (flags & RADIX_TREE_ITER_TAGGED) @@ -1317,103 +1337,6 @@ void **radix_tree_next_chunk(struct radix_tree_root *root, } EXPORT_SYMBOL(radix_tree_next_chunk); -/** - * radix_tree_range_tag_if_tagged - for each item in given range set given - * tag if item has another tag set - * @root: radix tree root - * @first_indexp: pointer to a starting index of a range to scan - * @last_index: last index of a range to scan - * @nr_to_tag: maximum number items to tag - * @iftag: tag index to test - * @settag: tag index to set if tested tag is set - * - * This function scans range of radix tree from first_index to last_index - * (inclusive). For each item in the range if iftag is set, the function sets - * also settag. The function stops either after tagging nr_to_tag items or - * after reaching last_index. - * - * The tags must be set from the leaf level only and propagated back up the - * path to the root. We must do this so that we resolve the full path before - * setting any tags on intermediate nodes. If we set tags as we descend, then - * we can get to the leaf node and find that the index that has the iftag - * set is outside the range we are scanning. This reults in dangling tags and - * can lead to problems with later tag operations (e.g. livelocks on lookups). - * - * The function returns the number of leaves where the tag was set and sets - * *first_indexp to the first unscanned index. - * WARNING! *first_indexp can wrap if last_index is ULONG_MAX. Caller must - * be prepared to handle that. - */ -unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root, - unsigned long *first_indexp, unsigned long last_index, - unsigned long nr_to_tag, - unsigned int iftag, unsigned int settag) -{ - struct radix_tree_node *node, *child; - unsigned long maxindex; - unsigned long tagged = 0; - unsigned long index = *first_indexp; - - radix_tree_load_root(root, &child, &maxindex); - last_index = min(last_index, maxindex); - if (index > last_index) - return 0; - if (!nr_to_tag) - return 0; - if (!root_tag_get(root, iftag)) { - *first_indexp = last_index + 1; - return 0; - } - if (!radix_tree_is_internal_node(child)) { - *first_indexp = last_index + 1; - root_tag_set(root, settag); - return 1; - } - - node = entry_to_node(child); - - for (;;) { - unsigned offset = radix_tree_descend(node, &child, index); - if (!child) - goto next; - if (!tag_get(node, iftag, offset)) - goto next; - /* Sibling slots never have tags set on them */ - if (radix_tree_is_internal_node(child)) { - node = entry_to_node(child); - continue; - } - - tagged++; - node_tag_set(root, node, settag, offset); - next: - /* Go to next entry in node */ - index = ((index >> node->shift) + 1) << node->shift; - /* Overflow can happen when last_index is ~0UL... */ - if (index > last_index || !index) - break; - offset = (index >> node->shift) & RADIX_TREE_MAP_MASK; - while (offset == 0) { - /* - * We've fully scanned this node. Go up. Because - * last_index is guaranteed to be in the tree, what - * we do below cannot wander astray. - */ - node = node->parent; - offset = (index >> node->shift) & RADIX_TREE_MAP_MASK; - } - if (is_sibling_entry(node, node->slots[offset])) - goto next; - if (tagged >= nr_to_tag) - break; - } - - *first_indexp = index; - - return tagged; -} -EXPORT_SYMBOL(radix_tree_range_tag_if_tagged); - /** * radix_tree_gang_lookup - perform multiple lookup on a radix tree * @root: radix tree root diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 52e2f8e3b472..290e8b7d3181 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2106,18 +2106,26 @@ void tag_pages_for_writeback(struct address_space *mapping, pgoff_t start, pgoff_t end) { #define WRITEBACK_TAG_BATCH 4096 - unsigned long tagged; - - do { - spin_lock_irq(&mapping->tree_lock); - tagged = radix_tree_range_tag_if_tagged(&mapping->page_tree, - &start, end, WRITEBACK_TAG_BATCH, - PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE); + unsigned long tagged = 0; + struct radix_tree_iter iter; + void **slot; + + spin_lock_irq(&mapping->tree_lock); + radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter, start, + PAGECACHE_TAG_DIRTY) { + if (iter.index > end) + break; + radix_tree_iter_tag_set(&mapping->page_tree, &iter, + PAGECACHE_TAG_TOWRITE); + tagged++; + if ((tagged % WRITEBACK_TAG_BATCH) != 0) + continue; + slot = radix_tree_iter_resume(slot, &iter); spin_unlock_irq(&mapping->tree_lock); - WARN_ON_ONCE(tagged > WRITEBACK_TAG_BATCH); cond_resched(); - /* We check 'start' to handle wrapping when end == ~0UL */ - } while (tagged >= WRITEBACK_TAG_BATCH && start); + spin_lock_irq(&mapping->tree_lock); + } + spin_unlock_irq(&mapping->tree_lock); } EXPORT_SYMBOL(tag_pages_for_writeback); diff --git a/tools/testing/radix-tree/main.c b/tools/testing/radix-tree/main.c index a028dae8a043..170175ca4105 100644 --- a/tools/testing/radix-tree/main.c +++ b/tools/testing/radix-tree/main.c @@ -205,8 +205,7 @@ void copy_tag_check(void) } // printf("\ncopying tags...\n"); - cur = start; - tagged = radix_tree_range_tag_if_tagged(&tree, &cur, end, ITEMS, 0, 1); + tagged = tag_tagged_items(&tree, NULL, start, end, ITEMS, 0, 1); // printf("checking copied tags\n"); assert(tagged == count); @@ -214,16 +213,13 @@ void copy_tag_check(void) /* Copy tags in several rounds */ // printf("\ncopying tags...\n"); - cur = start; - do { - tmp = rand() % (count/10+2); - tagged = radix_tree_range_tag_if_tagged(&tree, &cur, end, tmp, 0, 2); - } while (tmp == tagged); + tmp = rand() % (count / 10 + 2); + tagged = tag_tagged_items(&tree, NULL, start, end, tmp, 0, 2); + assert(tagged == count); // printf("%lu %lu %lu\n", tagged, tmp, count); // printf("checking copied tags\n"); check_copied_tags(&tree, start, end, idx, ITEMS, 0, 2); - assert(tagged < tmp); verify_tag_consistency(&tree, 0); verify_tag_consistency(&tree, 1); verify_tag_consistency(&tree, 2); diff --git a/tools/testing/radix-tree/multiorder.c b/tools/testing/radix-tree/multiorder.c index b9be8856d652..86daf23b3509 100644 --- a/tools/testing/radix-tree/multiorder.c +++ b/tools/testing/radix-tree/multiorder.c @@ -26,7 +26,6 @@ static void __multiorder_tag_test(int index, int order) { RADIX_TREE(tree, GFP_KERNEL); int base, err, i; - unsigned long first = 0; /* our canonical entry */ base = index & ~((1 << order) - 1); @@ -60,7 +59,7 @@ static void __multiorder_tag_test(int index, int order) assert(!radix_tree_tag_get(&tree, i, 1)); } - assert(radix_tree_range_tag_if_tagged(&tree, &first, ~0UL, 10, 0, 1) == 1); + assert(tag_tagged_items(&tree, NULL, 0, ~0UL, 10, 0, 1) == 1); assert(radix_tree_tag_clear(&tree, index, 0)); for_each_index(i, base, order) { @@ -251,7 +250,6 @@ void multiorder_tagged_iteration(void) RADIX_TREE(tree, GFP_KERNEL); struct radix_tree_iter iter; void **slot; - unsigned long first = 0; int i, j; printf("Multiorder tagged iteration test\n"); @@ -296,8 +294,8 @@ void multiorder_tagged_iteration(void) } } - radix_tree_range_tag_if_tagged(&tree, &first, ~0UL, - MT_NUM_ENTRIES, 1, 2); + assert(tag_tagged_items(&tree, NULL, 0, ~0UL, TAG_ENTRIES, 1, 2) == + TAG_ENTRIES); for (j = 0; j < 256; j++) { int mask, k; @@ -323,9 +321,8 @@ void multiorder_tagged_iteration(void) } } - first = 1; - radix_tree_range_tag_if_tagged(&tree, &first, ~0UL, - MT_NUM_ENTRIES, 1, 0); + assert(tag_tagged_items(&tree, NULL, 1, ~0UL, MT_NUM_ENTRIES * 2, 1, 0) + == TAG_ENTRIES); i = 0; radix_tree_for_each_tagged(slot, &tree, &iter, 0, 0) { assert(iter.index == tag_index[i]); diff --git a/tools/testing/radix-tree/regression2.c b/tools/testing/radix-tree/regression2.c index 63bf347aaf33..a41325d7a170 100644 --- a/tools/testing/radix-tree/regression2.c +++ b/tools/testing/radix-tree/regression2.c @@ -50,6 +50,7 @@ #include #include "regression.h" +#include "test.h" #define PAGECACHE_TAG_DIRTY 0 #define PAGECACHE_TAG_WRITEBACK 1 @@ -90,7 +91,7 @@ void regression2_test(void) /* 1. */ start = 0; end = max_slots - 2; - radix_tree_range_tag_if_tagged(&mt_tree, &start, end, 1, + tag_tagged_items(&mt_tree, NULL, start, end, 1, PAGECACHE_TAG_DIRTY, PAGECACHE_TAG_TOWRITE); /* 2. */ diff --git a/tools/testing/radix-tree/tag_check.c b/tools/testing/radix-tree/tag_check.c index 186f6e4914e4..ed5f87d1dce2 100644 --- a/tools/testing/radix-tree/tag_check.c +++ b/tools/testing/radix-tree/tag_check.c @@ -23,7 +23,7 @@ __simple_checks(struct radix_tree_root *tree, unsigned long index, int tag) item_tag_set(tree, index, tag); ret = item_tag_get(tree, index, tag); assert(ret != 0); - ret = radix_tree_range_tag_if_tagged(tree, &first, ~0UL, 10, tag, !tag); + ret = tag_tagged_items(tree, NULL, first, ~0UL, 10, tag, !tag); assert(ret == 1); ret = item_tag_get(tree, index, !tag); assert(ret != 0); @@ -320,7 +320,7 @@ static void single_check(void) assert(ret == 0); verify_tag_consistency(&tree, 0); verify_tag_consistency(&tree, 1); - ret = radix_tree_range_tag_if_tagged(&tree, &first, 10, 10, 0, 1); + ret = tag_tagged_items(&tree, NULL, first, 10, 10, 0, 1); assert(ret == 1); ret = radix_tree_gang_lookup_tag(&tree, (void **)items, 0, BATCH, 1); assert(ret == 1); diff --git a/tools/testing/radix-tree/test.c b/tools/testing/radix-tree/test.c index 88bf57f7175e..e5726e373646 100644 --- a/tools/testing/radix-tree/test.c +++ b/tools/testing/radix-tree/test.c @@ -151,6 +151,40 @@ void item_full_scan(struct radix_tree_root *root, unsigned long start, assert(nfound == 0); } +/* Use the same pattern as tag_pages_for_writeback() in mm/page-writeback.c */ +int tag_tagged_items(struct radix_tree_root *root, pthread_mutex_t *lock, + unsigned long start, unsigned long end, unsigned batch, + unsigned iftag, unsigned thentag) +{ + unsigned long tagged = 0; + struct radix_tree_iter iter; + void **slot; + + if (batch == 0) + batch = 1; + + if (lock) + pthread_mutex_lock(lock); + radix_tree_for_each_tagged(slot, root, &iter, start, iftag) { + if (iter.index > end) + break; + radix_tree_iter_tag_set(root, &iter, thentag); + tagged++; + if ((tagged % batch) != 0) + continue; + slot = radix_tree_iter_resume(slot, &iter); + if (lock) { + pthread_mutex_unlock(lock); + rcu_barrier(); + pthread_mutex_lock(lock); + } + } + if (lock) + pthread_mutex_unlock(lock); + + return tagged; +} + /* Use the same pattern as find_swap_entry() in mm/shmem.c */ unsigned long find_item(struct radix_tree_root *root, void *item) { diff --git a/tools/testing/radix-tree/test.h b/tools/testing/radix-tree/test.h index 3d9d1d30da22..e11e4d260b4e 100644 --- a/tools/testing/radix-tree/test.h +++ b/tools/testing/radix-tree/test.h @@ -25,6 +25,9 @@ void item_full_scan(struct radix_tree_root *root, unsigned long start, unsigned long nr, int chunk); void item_kill_tree(struct radix_tree_root *root); +int tag_tagged_items(struct radix_tree_root *, pthread_mutex_t *, + unsigned long start, unsigned long end, unsigned batch, + unsigned iftag, unsigned thentag); unsigned long find_item(struct radix_tree_root *, void *item); void tag_check(void); -- cgit v1.2.3 From 175542f575723e43f897ddb09d0011c13f7cf0ec Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 14 Dec 2016 15:08:58 -0800 Subject: radix-tree: add radix_tree_join This new function allows for the replacement of many smaller entries in the radix tree with one larger multiorder entry. From the point of view of an RCU walker, they may see a mixture of the smaller entries and the large entry during the same walk, but they will never see NULL for an index which was populated before the join. Link: http://lkml.kernel.org/r/1480369871-5271-58-git-send-email-mawilcox@linuxonhyperv.com Signed-off-by: Matthew Wilcox Tested-by: Kirill A. Shutemov Cc: Konstantin Khlebnikov Cc: Ross Zwisler Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 3 + lib/radix-tree.c | 183 ++++++++++++++++++++++++++++------ tools/testing/radix-tree/multiorder.c | 58 +++++++++++ 3 files changed, 213 insertions(+), 31 deletions(-) (limited to 'include') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 7a8d2516c73a..935293a24f7d 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -335,6 +335,9 @@ static inline void radix_tree_preload_end(void) preempt_enable(); } +int radix_tree_join(struct radix_tree_root *, unsigned long index, + unsigned new_order, void *); + #define RADIX_TREE_ITER_TAG_MASK 0x00FF /* tag index in lower byte */ #define RADIX_TREE_ITER_TAGGED 0x0100 /* lookup tagged slots */ #define RADIX_TREE_ITER_CONTIG 0x0200 /* stop at first hole */ diff --git a/lib/radix-tree.c b/lib/radix-tree.c index d1f4ca5b7989..962cfb3a7671 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -339,17 +339,14 @@ static void radix_tree_node_rcu_free(struct rcu_head *head) { struct radix_tree_node *node = container_of(head, struct radix_tree_node, rcu_head); - int i; /* - * must only free zeroed nodes into the slab. radix_tree_shrink - * can leave us with a non-NULL entry in the first slot, so clear - * that here to make sure. + * Must only free zeroed nodes into the slab. We can be left with + * non-NULL entries by radix_tree_free_nodes, so clear the entries + * and tags here. */ - for (i = 0; i < RADIX_TREE_MAX_TAGS; i++) - tag_clear(node, i, 0); - - node->slots[0] = NULL; + memset(node->slots, 0, sizeof(node->slots)); + memset(node->tags, 0, sizeof(node->tags)); INIT_LIST_HEAD(&node->private_list); kmem_cache_free(radix_tree_node_cachep, node); @@ -678,14 +675,14 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index, shift = radix_tree_load_root(root, &child, &maxindex); /* Make sure the tree is high enough. */ + if (order > 0 && max == ((1UL << order) - 1)) + max++; if (max > maxindex) { int error = radix_tree_extend(root, max, shift); if (error < 0) return error; shift = error; child = root->rnode; - if (order == shift) - shift += RADIX_TREE_MAP_SHIFT; } while (shift > order) { @@ -697,6 +694,8 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index, return -ENOMEM; child->shift = shift; child->offset = offset; + child->count = 0; + child->exceptional = 0; child->parent = node; rcu_assign_pointer(*slot, node_to_entry(child)); if (node) @@ -710,31 +709,121 @@ int __radix_tree_create(struct radix_tree_root *root, unsigned long index, slot = &node->slots[offset]; } + if (nodep) + *nodep = node; + if (slotp) + *slotp = slot; + return 0; +} + #ifdef CONFIG_RADIX_TREE_MULTIORDER - /* Insert pointers to the canonical entry */ - if (order > shift) { - unsigned i, n = 1 << (order - shift); +/* + * Free any nodes below this node. The tree is presumed to not need + * shrinking, and any user data in the tree is presumed to not need a + * destructor called on it. If we need to add a destructor, we can + * add that functionality later. Note that we may not clear tags or + * slots from the tree as an RCU walker may still have a pointer into + * this subtree. We could replace the entries with RADIX_TREE_RETRY, + * but we'll still have to clear those in rcu_free. + */ +static void radix_tree_free_nodes(struct radix_tree_node *node) +{ + unsigned offset = 0; + struct radix_tree_node *child = entry_to_node(node); + + for (;;) { + void *entry = child->slots[offset]; + if (radix_tree_is_internal_node(entry) && + !is_sibling_entry(child, entry)) { + child = entry_to_node(entry); + offset = 0; + continue; + } + offset++; + while (offset == RADIX_TREE_MAP_SIZE) { + struct radix_tree_node *old = child; + offset = child->offset + 1; + child = child->parent; + radix_tree_node_free(old); + if (old == entry_to_node(node)) + return; + } + } +} + +static inline int insert_entries(struct radix_tree_node *node, void **slot, + void *item, unsigned order, bool replace) +{ + struct radix_tree_node *child; + unsigned i, n, tag, offset, tags = 0; + + if (node) { + n = 1 << (order - node->shift); + offset = get_slot_offset(node, slot); + } else { + n = 1; + offset = 0; + } + + if (n > 1) { offset = offset & ~(n - 1); slot = &node->slots[offset]; - child = node_to_entry(slot); - for (i = 0; i < n; i++) { - if (slot[i]) + } + child = node_to_entry(slot); + + for (i = 0; i < n; i++) { + if (slot[i]) { + if (replace) { + node->count--; + for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) + if (tag_get(node, tag, offset + i)) + tags |= 1 << tag; + } else return -EEXIST; } + } - for (i = 1; i < n; i++) { + for (i = 0; i < n; i++) { + struct radix_tree_node *old = slot[i]; + if (i) { rcu_assign_pointer(slot[i], child); - node->count++; + for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) + if (tags & (1 << tag)) + tag_clear(node, tag, offset + i); + } else { + rcu_assign_pointer(slot[i], item); + for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) + if (tags & (1 << tag)) + tag_set(node, tag, offset); } + if (radix_tree_is_internal_node(old) && + !is_sibling_entry(node, old)) + radix_tree_free_nodes(old); + if (radix_tree_exceptional_entry(old)) + node->exceptional--; } -#endif - - if (nodep) - *nodep = node; - if (slotp) - *slotp = slot; - return 0; + if (node) { + node->count += n; + if (radix_tree_exceptional_entry(item)) + node->exceptional += n; + } + return n; } +#else +static inline int insert_entries(struct radix_tree_node *node, void **slot, + void *item, unsigned order, bool replace) +{ + if (*slot) + return -EEXIST; + rcu_assign_pointer(*slot, item); + if (node) { + node->count++; + if (radix_tree_exceptional_entry(item)) + node->exceptional++; + } + return 1; +} +#endif /** * __radix_tree_insert - insert into a radix tree @@ -757,15 +846,13 @@ int __radix_tree_insert(struct radix_tree_root *root, unsigned long index, error = __radix_tree_create(root, index, order, &node, &slot); if (error) return error; - if (*slot != NULL) - return -EEXIST; - rcu_assign_pointer(*slot, item); + + error = insert_entries(node, slot, item, order, false); + if (error < 0) + return error; if (node) { unsigned offset = get_slot_offset(node, slot); - node->count++; - if (radix_tree_exceptional_entry(item)) - node->exceptional++; BUG_ON(tag_get(node, 0, offset)); BUG_ON(tag_get(node, 1, offset)); BUG_ON(tag_get(node, 2, offset)); @@ -942,6 +1029,40 @@ void radix_tree_replace_slot(struct radix_tree_root *root, replace_slot(root, NULL, slot, item, true); } +#ifdef CONFIG_RADIX_TREE_MULTIORDER +/** + * radix_tree_join - replace multiple entries with one multiorder entry + * @root: radix tree root + * @index: an index inside the new entry + * @order: order of the new entry + * @item: new entry + * + * Call this function to replace several entries with one larger entry. + * The existing entries are presumed to not need freeing as a result of + * this call. + * + * The replacement entry will have all the tags set on it that were set + * on any of the entries it is replacing. + */ +int radix_tree_join(struct radix_tree_root *root, unsigned long index, + unsigned order, void *item) +{ + struct radix_tree_node *node; + void **slot; + int error; + + BUG_ON(radix_tree_is_internal_node(item)); + + error = __radix_tree_create(root, index, order, &node, &slot); + if (!error) + error = insert_entries(node, slot, item, order, true); + if (error > 0) + error = 0; + + return error; +} +#endif + /** * radix_tree_tag_set - set a tag on a radix tree node * @root: radix tree root diff --git a/tools/testing/radix-tree/multiorder.c b/tools/testing/radix-tree/multiorder.c index 86daf23b3509..c9f656cf5f52 100644 --- a/tools/testing/radix-tree/multiorder.c +++ b/tools/testing/radix-tree/multiorder.c @@ -332,6 +332,63 @@ void multiorder_tagged_iteration(void) item_kill_tree(&tree); } +static void __multiorder_join(unsigned long index, + unsigned order1, unsigned order2) +{ + unsigned long loc; + void *item, *item2 = item_create(index + 1, order1); + RADIX_TREE(tree, GFP_KERNEL); + + item_insert_order(&tree, index, order2); + item = radix_tree_lookup(&tree, index); + radix_tree_join(&tree, index + 1, order1, item2); + loc = find_item(&tree, item); + if (loc == -1) + free(item); + item = radix_tree_lookup(&tree, index + 1); + assert(item == item2); + item_kill_tree(&tree); +} + +static void __multiorder_join2(unsigned order1, unsigned order2) +{ + RADIX_TREE(tree, GFP_KERNEL); + struct radix_tree_node *node; + void *item1 = item_create(0, order1); + void *item2; + + item_insert_order(&tree, 0, order2); + radix_tree_insert(&tree, 1 << order2, (void *)0x12UL); + item2 = __radix_tree_lookup(&tree, 1 << order2, &node, NULL); + assert(item2 == (void *)0x12UL); + assert(node->exceptional == 1); + + radix_tree_join(&tree, 0, order1, item1); + item2 = __radix_tree_lookup(&tree, 1 << order2, &node, NULL); + assert(item2 == item1); + assert(node->exceptional == 0); + item_kill_tree(&tree); +} + +static void multiorder_join(void) +{ + int i, j, idx; + + for (idx = 0; idx < 1024; idx = idx * 2 + 3) { + for (i = 1; i < 15; i++) { + for (j = 0; j < i; j++) { + __multiorder_join(idx, i, j); + } + } + } + + for (i = 1; i < 15; i++) { + for (j = 0; j < i; j++) { + __multiorder_join2(i, j); + } + } +} + void multiorder_checks(void) { int i; @@ -349,4 +406,5 @@ void multiorder_checks(void) multiorder_tag_tests(); multiorder_iteration(); multiorder_tagged_iteration(); + multiorder_join(); } -- cgit v1.2.3 From e157b555945fb16ddc6cce605a1eb6b4135ea5f1 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 14 Dec 2016 15:09:01 -0800 Subject: radix-tree: add radix_tree_split This new function splits a larger multiorder entry into smaller entries (potentially multi-order entries). These entries are initialised to RADIX_TREE_RETRY to ensure that RCU walkers who see this state aren't confused. The caller should then call radix_tree_for_each_slot() and radix_tree_replace_slot() in order to turn these retry entries into the intended new entries. Tags are replicated from the original multiorder entry into each new entry. Link: http://lkml.kernel.org/r/1480369871-5271-59-git-send-email-mawilcox@linuxonhyperv.com Signed-off-by: Matthew Wilcox Tested-by: Kirill A. Shutemov Cc: Konstantin Khlebnikov Cc: Ross Zwisler Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 12 +++ lib/radix-tree.c | 142 +++++++++++++++++++++++++++++++++- tools/testing/radix-tree/multiorder.c | 64 +++++++++++++++ 3 files changed, 214 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 935293a24f7d..1f4b56120de8 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -80,6 +80,14 @@ static inline bool radix_tree_is_internal_node(void *ptr) #define RADIX_TREE_MAX_PATH (DIV_ROUND_UP(RADIX_TREE_INDEX_BITS, \ RADIX_TREE_MAP_SHIFT)) +/* + * @count is the count of every non-NULL element in the ->slots array + * whether that is an exceptional entry, a retry entry, a user pointer, + * a sibling entry or a pointer to the next level of the tree. + * @exceptional is the count of every element in ->slots which is + * either radix_tree_exceptional_entry() or is a sibling entry for an + * exceptional entry. + */ struct radix_tree_node { unsigned char shift; /* Bits remaining in each slot */ unsigned char offset; /* Slot offset in parent */ @@ -293,6 +301,8 @@ void __radix_tree_replace(struct radix_tree_root *root, struct radix_tree_node *node, void **slot, void *item, radix_tree_update_node_t update_node, void *private); +void radix_tree_iter_replace(struct radix_tree_root *, + const struct radix_tree_iter *, void **slot, void *item); void radix_tree_replace_slot(struct radix_tree_root *root, void **slot, void *item); void __radix_tree_delete_node(struct radix_tree_root *root, @@ -335,6 +345,8 @@ static inline void radix_tree_preload_end(void) preempt_enable(); } +int radix_tree_split(struct radix_tree_root *, unsigned long index, + unsigned new_order); int radix_tree_join(struct radix_tree_root *, unsigned long index, unsigned new_order, void *); diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 962cfb3a7671..ade2ed3f5190 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -22,6 +22,7 @@ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ +#include #include #include #include @@ -758,7 +759,10 @@ static inline int insert_entries(struct radix_tree_node *node, void **slot, unsigned i, n, tag, offset, tags = 0; if (node) { - n = 1 << (order - node->shift); + if (order > node->shift) + n = 1 << (order - node->shift); + else + n = 1; offset = get_slot_offset(node, slot); } else { n = 1; @@ -797,7 +801,8 @@ static inline int insert_entries(struct radix_tree_node *node, void **slot, tag_set(node, tag, offset); } if (radix_tree_is_internal_node(old) && - !is_sibling_entry(node, old)) + !is_sibling_entry(node, old) && + (old != RADIX_TREE_RETRY)) radix_tree_free_nodes(old); if (radix_tree_exceptional_entry(old)) node->exceptional--; @@ -1021,7 +1026,8 @@ void __radix_tree_replace(struct radix_tree_root *root, * NOTE: This cannot be used to switch between non-entries (empty slots), * regular entries, and exceptional entries, as that requires accounting * inside the radix tree node. When switching from one type of entry or - * deleting, use __radix_tree_lookup() and __radix_tree_replace(). + * deleting, use __radix_tree_lookup() and __radix_tree_replace() or + * radix_tree_iter_replace(). */ void radix_tree_replace_slot(struct radix_tree_root *root, void **slot, void *item) @@ -1029,6 +1035,21 @@ void radix_tree_replace_slot(struct radix_tree_root *root, replace_slot(root, NULL, slot, item, true); } +/** + * radix_tree_iter_replace - replace item in a slot + * @root: radix tree root + * @slot: pointer to slot + * @item: new item to store in the slot. + * + * For use with radix_tree_split() and radix_tree_for_each_slot(). + * Caller must hold tree write locked across split and replacement. + */ +void radix_tree_iter_replace(struct radix_tree_root *root, + const struct radix_tree_iter *iter, void **slot, void *item) +{ + __radix_tree_replace(root, iter->node, slot, item, NULL, NULL); +} + #ifdef CONFIG_RADIX_TREE_MULTIORDER /** * radix_tree_join - replace multiple entries with one multiorder entry @@ -1061,6 +1082,117 @@ int radix_tree_join(struct radix_tree_root *root, unsigned long index, return error; } + +/** + * radix_tree_split - Split an entry into smaller entries + * @root: radix tree root + * @index: An index within the large entry + * @order: Order of new entries + * + * Call this function as the first step in replacing a multiorder entry + * with several entries of lower order. After this function returns, + * loop over the relevant portion of the tree using radix_tree_for_each_slot() + * and call radix_tree_iter_replace() to set up each new entry. + * + * The tags from this entry are replicated to all the new entries. + * + * The radix tree should be locked against modification during the entire + * replacement operation. Lock-free lookups will see RADIX_TREE_RETRY which + * should prompt RCU walkers to restart the lookup from the root. + */ +int radix_tree_split(struct radix_tree_root *root, unsigned long index, + unsigned order) +{ + struct radix_tree_node *parent, *node, *child; + void **slot; + unsigned int offset, end; + unsigned n, tag, tags = 0; + + if (!__radix_tree_lookup(root, index, &parent, &slot)) + return -ENOENT; + if (!parent) + return -ENOENT; + + offset = get_slot_offset(parent, slot); + + for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) + if (tag_get(parent, tag, offset)) + tags |= 1 << tag; + + for (end = offset + 1; end < RADIX_TREE_MAP_SIZE; end++) { + if (!is_sibling_entry(parent, parent->slots[end])) + break; + for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) + if (tags & (1 << tag)) + tag_set(parent, tag, end); + /* rcu_assign_pointer ensures tags are set before RETRY */ + rcu_assign_pointer(parent->slots[end], RADIX_TREE_RETRY); + } + rcu_assign_pointer(parent->slots[offset], RADIX_TREE_RETRY); + parent->exceptional -= (end - offset); + + if (order == parent->shift) + return 0; + if (order > parent->shift) { + while (offset < end) + offset += insert_entries(parent, &parent->slots[offset], + RADIX_TREE_RETRY, order, true); + return 0; + } + + node = parent; + + for (;;) { + if (node->shift > order) { + child = radix_tree_node_alloc(root); + if (!child) + goto nomem; + child->shift = node->shift - RADIX_TREE_MAP_SHIFT; + child->offset = offset; + child->count = 0; + child->parent = node; + if (node != parent) { + node->count++; + node->slots[offset] = node_to_entry(child); + for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) + if (tags & (1 << tag)) + tag_set(node, tag, offset); + } + + node = child; + offset = 0; + continue; + } + + n = insert_entries(node, &node->slots[offset], + RADIX_TREE_RETRY, order, false); + BUG_ON(n > RADIX_TREE_MAP_SIZE); + + for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) + if (tags & (1 << tag)) + tag_set(node, tag, offset); + offset += n; + + while (offset == RADIX_TREE_MAP_SIZE) { + if (node == parent) + break; + offset = node->offset; + child = node; + node = node->parent; + rcu_assign_pointer(node->slots[offset], + node_to_entry(child)); + offset++; + } + if ((node == parent) && (offset == end)) + return 0; + } + + nomem: + /* Shouldn't happen; did user forget to preload? */ + /* TODO: free all the allocated nodes */ + WARN_ON(1); + return -ENOMEM; +} #endif /** @@ -1441,8 +1573,10 @@ void **radix_tree_next_chunk(struct radix_tree_root *root, child = rcu_dereference_raw(node->slots[offset]); } - if ((child == NULL) || (child == RADIX_TREE_RETRY)) + if (!child) goto restart; + if (child == RADIX_TREE_RETRY) + break; } while (radix_tree_is_internal_node(child)); /* Update the iterator state */ diff --git a/tools/testing/radix-tree/multiorder.c b/tools/testing/radix-tree/multiorder.c index c9f656cf5f52..fa6effe997a3 100644 --- a/tools/testing/radix-tree/multiorder.c +++ b/tools/testing/radix-tree/multiorder.c @@ -389,6 +389,69 @@ static void multiorder_join(void) } } +static void __multiorder_split(int old_order, int new_order) +{ + RADIX_TREE(tree, GFP_KERNEL); + void **slot; + struct radix_tree_iter iter; + struct radix_tree_node *node; + void *item; + + item_insert_order(&tree, 0, old_order); + radix_tree_tag_set(&tree, 0, 2); + radix_tree_split(&tree, 0, new_order); + radix_tree_for_each_slot(slot, &tree, &iter, 0) { + radix_tree_iter_replace(&tree, &iter, slot, + item_create(iter.index, new_order)); + } + + item_kill_tree(&tree); + + __radix_tree_insert(&tree, 0, old_order, (void *)0x12); + + item = __radix_tree_lookup(&tree, 0, &node, NULL); + assert(item == (void *)0x12); + assert(node->exceptional > 0); + + radix_tree_split(&tree, 0, new_order); + radix_tree_for_each_slot(slot, &tree, &iter, 0) { + radix_tree_iter_replace(&tree, &iter, slot, + item_create(iter.index, new_order)); + } + + item = __radix_tree_lookup(&tree, 0, &node, NULL); + assert(item != (void *)0x12); + assert(node->exceptional == 0); + + item_kill_tree(&tree); + + __radix_tree_insert(&tree, 0, old_order, (void *)0x12); + + item = __radix_tree_lookup(&tree, 0, &node, NULL); + assert(item == (void *)0x12); + assert(node->exceptional > 0); + + radix_tree_split(&tree, 0, new_order); + radix_tree_for_each_slot(slot, &tree, &iter, 0) { + radix_tree_iter_replace(&tree, &iter, slot, (void *)0x16); + } + + item = __radix_tree_lookup(&tree, 0, &node, NULL); + assert(item == (void *)0x16); + assert(node->exceptional > 0); + + item_kill_tree(&tree); +} + +static void multiorder_split(void) +{ + int i, j; + + for (i = 9; i < 19; i++) + for (j = 0; j < i; j++) + __multiorder_split(i, j); +} + void multiorder_checks(void) { int i; @@ -407,4 +470,5 @@ void multiorder_checks(void) multiorder_iteration(); multiorder_tagged_iteration(); multiorder_join(); + multiorder_split(); } -- cgit v1.2.3 From 2791653a6814d170fa893344618563a7b1da95c6 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 14 Dec 2016 15:09:04 -0800 Subject: radix-tree: add radix_tree_split_preload() Calculate how many nodes we need to allocate to split an old_order entry into multiple entries, each of size new_order. The test suite checks that we allocated exactly the right number of nodes; neither too many (checked by rtp->nr == 0), nor too few (checked by comparing nr_allocated before and after the call to radix_tree_split()). Link: http://lkml.kernel.org/r/1480369871-5271-60-git-send-email-mawilcox@linuxonhyperv.com Signed-off-by: Matthew Wilcox Tested-by: Kirill A. Shutemov Cc: Konstantin Khlebnikov Cc: Ross Zwisler Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/radix-tree.h | 1 + lib/radix-tree.c | 24 +++++++++++++++++++- tools/testing/radix-tree/multiorder.c | 42 +++++++++++++++++++++++++++++++++-- tools/testing/radix-tree/test.h | 5 +++++ 4 files changed, 69 insertions(+), 3 deletions(-) (limited to 'include') diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 1f4b56120de8..5dea8f6440e4 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -345,6 +345,7 @@ static inline void radix_tree_preload_end(void) preempt_enable(); } +int radix_tree_split_preload(unsigned old_order, unsigned new_order, gfp_t); int radix_tree_split(struct radix_tree_root *, unsigned long index, unsigned new_order); int radix_tree_join(struct radix_tree_root *, unsigned long index, diff --git a/lib/radix-tree.c b/lib/radix-tree.c index ade2ed3f5190..be1183e62590 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -368,7 +368,7 @@ radix_tree_node_free(struct radix_tree_node *node) * To make use of this facility, the radix tree must be initialised without * __GFP_DIRECT_RECLAIM being passed to INIT_RADIX_TREE(). */ -static int __radix_tree_preload(gfp_t gfp_mask, int nr) +static int __radix_tree_preload(gfp_t gfp_mask, unsigned nr) { struct radix_tree_preload *rtp; struct radix_tree_node *node; @@ -434,6 +434,28 @@ int radix_tree_maybe_preload(gfp_t gfp_mask) } EXPORT_SYMBOL(radix_tree_maybe_preload); +#ifdef CONFIG_RADIX_TREE_MULTIORDER +/* + * Preload with enough objects to ensure that we can split a single entry + * of order @old_order into many entries of size @new_order + */ +int radix_tree_split_preload(unsigned int old_order, unsigned int new_order, + gfp_t gfp_mask) +{ + unsigned top = 1 << (old_order % RADIX_TREE_MAP_SHIFT); + unsigned layers = (old_order / RADIX_TREE_MAP_SHIFT) - + (new_order / RADIX_TREE_MAP_SHIFT); + unsigned nr = 0; + + WARN_ON_ONCE(!gfpflags_allow_blocking(gfp_mask)); + BUG_ON(new_order >= old_order); + + while (layers--) + nr = nr * RADIX_TREE_MAP_SIZE + 1; + return __radix_tree_preload(gfp_mask, top * nr); +} +#endif + /* * The same as function above, but preload number of nodes required to insert * (1 << order) continuous naturally-aligned elements. diff --git a/tools/testing/radix-tree/multiorder.c b/tools/testing/radix-tree/multiorder.c index fa6effe997a3..5421f015f46c 100644 --- a/tools/testing/radix-tree/multiorder.c +++ b/tools/testing/radix-tree/multiorder.c @@ -389,35 +389,67 @@ static void multiorder_join(void) } } +static void check_mem(unsigned old_order, unsigned new_order, unsigned alloc) +{ + struct radix_tree_preload *rtp = &radix_tree_preloads; + if (rtp->nr != 0) + printf("split(%u %u) remaining %u\n", old_order, new_order, + rtp->nr); + /* + * Can't check for equality here as some nodes may have been + * RCU-freed while we ran. But we should never finish with more + * nodes allocated since they should have all been preloaded. + */ + if (nr_allocated > alloc) + printf("split(%u %u) allocated %u %u\n", old_order, new_order, + alloc, nr_allocated); +} + static void __multiorder_split(int old_order, int new_order) { - RADIX_TREE(tree, GFP_KERNEL); + RADIX_TREE(tree, GFP_ATOMIC); void **slot; struct radix_tree_iter iter; struct radix_tree_node *node; void *item; + unsigned alloc; + + radix_tree_preload(GFP_KERNEL); + assert(item_insert_order(&tree, 0, old_order) == 0); + radix_tree_preload_end(); + + /* Wipe out the preloaded cache or it'll confuse check_mem() */ + radix_tree_cpu_dead(0); - item_insert_order(&tree, 0, old_order); radix_tree_tag_set(&tree, 0, 2); + + radix_tree_split_preload(old_order, new_order, GFP_KERNEL); + alloc = nr_allocated; radix_tree_split(&tree, 0, new_order); + check_mem(old_order, new_order, alloc); radix_tree_for_each_slot(slot, &tree, &iter, 0) { radix_tree_iter_replace(&tree, &iter, slot, item_create(iter.index, new_order)); } + radix_tree_preload_end(); item_kill_tree(&tree); + radix_tree_preload(GFP_KERNEL); __radix_tree_insert(&tree, 0, old_order, (void *)0x12); + radix_tree_preload_end(); item = __radix_tree_lookup(&tree, 0, &node, NULL); assert(item == (void *)0x12); assert(node->exceptional > 0); + radix_tree_split_preload(old_order, new_order, GFP_KERNEL); radix_tree_split(&tree, 0, new_order); radix_tree_for_each_slot(slot, &tree, &iter, 0) { radix_tree_iter_replace(&tree, &iter, slot, item_create(iter.index, new_order)); } + radix_tree_preload_end(); item = __radix_tree_lookup(&tree, 0, &node, NULL); assert(item != (void *)0x12); @@ -425,16 +457,20 @@ static void __multiorder_split(int old_order, int new_order) item_kill_tree(&tree); + radix_tree_preload(GFP_KERNEL); __radix_tree_insert(&tree, 0, old_order, (void *)0x12); + radix_tree_preload_end(); item = __radix_tree_lookup(&tree, 0, &node, NULL); assert(item == (void *)0x12); assert(node->exceptional > 0); + radix_tree_split_preload(old_order, new_order, GFP_KERNEL); radix_tree_split(&tree, 0, new_order); radix_tree_for_each_slot(slot, &tree, &iter, 0) { radix_tree_iter_replace(&tree, &iter, slot, (void *)0x16); } + radix_tree_preload_end(); item = __radix_tree_lookup(&tree, 0, &node, NULL); assert(item == (void *)0x16); @@ -471,4 +507,6 @@ void multiorder_checks(void) multiorder_tagged_iteration(); multiorder_join(); multiorder_split(); + + radix_tree_cpu_dead(0); } diff --git a/tools/testing/radix-tree/test.h b/tools/testing/radix-tree/test.h index e11e4d260b4e..7c2611caa6d2 100644 --- a/tools/testing/radix-tree/test.h +++ b/tools/testing/radix-tree/test.h @@ -52,3 +52,8 @@ int root_tag_get(struct radix_tree_root *root, unsigned int tag); unsigned long node_maxindex(struct radix_tree_node *); unsigned long shift_maxindex(unsigned int shift); int radix_tree_cpu_dead(unsigned int cpu); +struct radix_tree_preload { + unsigned nr; + struct radix_tree_node *nodes; +}; +extern struct radix_tree_preload radix_tree_preloads; -- cgit v1.2.3 From 99c494077e2d4282a17120a772eecc00ec3004cc Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 14 Dec 2016 15:09:13 -0800 Subject: idr: add ida_is_empty Two of the USB Gadgets were poking around in the internals of struct ida in order to determine if it is empty. Add the appropriate abstraction. Link: http://lkml.kernel.org/r/1480369871-5271-63-git-send-email-mawilcox@linuxonhyperv.com Signed-off-by: Matthew Wilcox Acked-by: Konstantin Khlebnikov Tested-by: Kirill A. Shutemov Cc: Ross Zwisler Cc: Felipe Balbi Cc: Greg Kroah-Hartman Cc: Michal Nazarewicz Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/usb/gadget/function/f_hid.c | 6 +++--- drivers/usb/gadget/function/f_printer.c | 6 +++--- include/linux/idr.h | 5 +++++ 3 files changed, 11 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/drivers/usb/gadget/function/f_hid.c b/drivers/usb/gadget/function/f_hid.c index 7abd70b2a588..3151d2a0fe59 100644 --- a/drivers/usb/gadget/function/f_hid.c +++ b/drivers/usb/gadget/function/f_hid.c @@ -905,7 +905,7 @@ static void hidg_free_inst(struct usb_function_instance *f) mutex_lock(&hidg_ida_lock); hidg_put_minor(opts->minor); - if (idr_is_empty(&hidg_ida.idr)) + if (ida_is_empty(&hidg_ida)) ghid_cleanup(); mutex_unlock(&hidg_ida_lock); @@ -931,7 +931,7 @@ static struct usb_function_instance *hidg_alloc_inst(void) mutex_lock(&hidg_ida_lock); - if (idr_is_empty(&hidg_ida.idr)) { + if (ida_is_empty(&hidg_ida)) { status = ghid_setup(NULL, HIDG_MINORS); if (status) { ret = ERR_PTR(status); @@ -944,7 +944,7 @@ static struct usb_function_instance *hidg_alloc_inst(void) if (opts->minor < 0) { ret = ERR_PTR(opts->minor); kfree(opts); - if (idr_is_empty(&hidg_ida.idr)) + if (ida_is_empty(&hidg_ida)) ghid_cleanup(); goto unlock; } diff --git a/drivers/usb/gadget/function/f_printer.c b/drivers/usb/gadget/function/f_printer.c index 0de36cda6e41..8054da9276dd 100644 --- a/drivers/usb/gadget/function/f_printer.c +++ b/drivers/usb/gadget/function/f_printer.c @@ -1265,7 +1265,7 @@ static void gprinter_free_inst(struct usb_function_instance *f) mutex_lock(&printer_ida_lock); gprinter_put_minor(opts->minor); - if (idr_is_empty(&printer_ida.idr)) + if (ida_is_empty(&printer_ida)) gprinter_cleanup(); mutex_unlock(&printer_ida_lock); @@ -1289,7 +1289,7 @@ static struct usb_function_instance *gprinter_alloc_inst(void) mutex_lock(&printer_ida_lock); - if (idr_is_empty(&printer_ida.idr)) { + if (ida_is_empty(&printer_ida)) { status = gprinter_setup(PRINTER_MINORS); if (status) { ret = ERR_PTR(status); @@ -1302,7 +1302,7 @@ static struct usb_function_instance *gprinter_alloc_inst(void) if (opts->minor < 0) { ret = ERR_PTR(opts->minor); kfree(opts); - if (idr_is_empty(&printer_ida.idr)) + if (ida_is_empty(&printer_ida)) gprinter_cleanup(); goto unlock; } diff --git a/include/linux/idr.h b/include/linux/idr.h index 083d61e92706..3639a28188c9 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -195,6 +195,11 @@ static inline int ida_get_new(struct ida *ida, int *p_id) return ida_get_new_above(ida, 0, p_id); } +static inline bool ida_is_empty(struct ida *ida) +{ + return idr_is_empty(&ida->idr); +} + void __init idr_init_cache(void); #endif /* __IDR_H__ */ -- cgit v1.2.3 From 444306129a920015a2cc876d13fcbf52382f39bd Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 14 Dec 2016 15:09:19 -0800 Subject: rxrpc: abstract away knowledge of IDR internals Add idr_get_cursor() / idr_set_cursor() APIs, and remove the reference to IDR_SIZE. Link: http://lkml.kernel.org/r/1480369871-5271-65-git-send-email-mawilcox@linuxonhyperv.com Signed-off-by: Matthew Wilcox Reviewed-by: David Howells Tested-by: Kirill A. Shutemov Cc: Konstantin Khlebnikov Cc: Ross Zwisler Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/idr.h | 26 ++++++++++++++++++++++++++ net/rxrpc/af_rxrpc.c | 11 ++++++----- net/rxrpc/conn_client.c | 4 ++-- 3 files changed, 34 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/idr.h b/include/linux/idr.h index 3639a28188c9..1eb755f77f2f 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -55,6 +55,32 @@ struct idr { } #define DEFINE_IDR(name) struct idr name = IDR_INIT(name) +/** + * idr_get_cursor - Return the current position of the cyclic allocator + * @idr: idr handle + * + * The value returned is the value that will be next returned from + * idr_alloc_cyclic() if it is free (otherwise the search will start from + * this position). + */ +static inline unsigned int idr_get_cursor(struct idr *idr) +{ + return READ_ONCE(idr->cur); +} + +/** + * idr_set_cursor - Set the current position of the cyclic allocator + * @idr: idr handle + * @val: new position + * + * The next call to idr_alloc_cyclic() will return @val if it is free + * (otherwise the search will start from this position). + */ +static inline void idr_set_cursor(struct idr *idr, unsigned int val) +{ + WRITE_ONCE(idr->cur, val); +} + /** * DOC: idr sync * idr synchronization (stolen from radix-tree.h) diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c index 2d59c9be40e1..5f63f6dcaabb 100644 --- a/net/rxrpc/af_rxrpc.c +++ b/net/rxrpc/af_rxrpc.c @@ -762,16 +762,17 @@ static const struct net_proto_family rxrpc_family_ops = { static int __init af_rxrpc_init(void) { int ret = -1; + unsigned int tmp; BUILD_BUG_ON(sizeof(struct rxrpc_skb_priv) > FIELD_SIZEOF(struct sk_buff, cb)); get_random_bytes(&rxrpc_epoch, sizeof(rxrpc_epoch)); rxrpc_epoch |= RXRPC_RANDOM_EPOCH; - get_random_bytes(&rxrpc_client_conn_ids.cur, - sizeof(rxrpc_client_conn_ids.cur)); - rxrpc_client_conn_ids.cur &= 0x3fffffff; - if (rxrpc_client_conn_ids.cur == 0) - rxrpc_client_conn_ids.cur = 1; + get_random_bytes(&tmp, sizeof(tmp)); + tmp &= 0x3fffffff; + if (tmp == 0) + tmp = 1; + idr_set_cursor(&rxrpc_client_conn_ids, tmp); ret = -ENOMEM; rxrpc_call_jar = kmem_cache_create( diff --git a/net/rxrpc/conn_client.c b/net/rxrpc/conn_client.c index 60ef9605167e..6cbcdcc29853 100644 --- a/net/rxrpc/conn_client.c +++ b/net/rxrpc/conn_client.c @@ -263,12 +263,12 @@ static bool rxrpc_may_reuse_conn(struct rxrpc_connection *conn) * times the maximum number of client conns away from the current * allocation point to try and keep the IDs concentrated. */ - id_cursor = READ_ONCE(rxrpc_client_conn_ids.cur); + id_cursor = idr_get_cursor(&rxrpc_client_conn_ids); id = conn->proto.cid >> RXRPC_CIDSHIFT; distance = id - id_cursor; if (distance < 0) distance = -distance; - limit = round_up(rxrpc_max_client_connections, IDR_SIZE) * 4; + limit = max(rxrpc_max_client_connections * 4, 1024U); if (distance > limit) goto mark_dont_reuse; -- cgit v1.2.3 From 424251a4a929a1b6dff2056d49135e3805132e32 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 14 Dec 2016 15:09:22 -0800 Subject: idr: reduce the number of bits per level from 8 to 6 In preparation for merging the IDR and radix tree, reduce the fanout at each level from 256 to 64. If this causes a performance problem then a bisect will point to this commit, and we'll have a better idea about what we might do to fix it. Link: http://lkml.kernel.org/r/1480369871-5271-66-git-send-email-mawilcox@linuxonhyperv.com Signed-off-by: Matthew Wilcox Tested-by: Kirill A. Shutemov Cc: Konstantin Khlebnikov Cc: Ross Zwisler Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/idr.h | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'include') diff --git a/include/linux/idr.h b/include/linux/idr.h index 1eb755f77f2f..3c01b89aed67 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -18,12 +18,11 @@ #include /* - * We want shallower trees and thus more bits covered at each layer. 8 - * bits gives us large enough first layer for most use cases and maximum - * tree depth of 4. Each idr_layer is slightly larger than 2k on 64bit and - * 1k on 32bit. + * Using 6 bits at each layer allows us to allocate 7 layers out of each page. + * 8 bits only gave us 3 layers out of every pair of pages, which is less + * efficient except for trees with a largest element between 192-255 inclusive. */ -#define IDR_BITS 8 +#define IDR_BITS 6 #define IDR_SIZE (1 << IDR_BITS) #define IDR_MASK ((1 << IDR_BITS)-1) -- cgit v1.2.3