author     Linus Torvalds <torvalds@linux-foundation.org>   2020-06-08 11:11:38 -0700
committer  Linus Torvalds <torvalds@linux-foundation.org>   2020-06-08 11:11:38 -0700
commit     20b0d06722169e6e66049c8fe6f1a48adffb79c6 (patch)
tree       1b88278ca547c07f58297325aea1ab3c447e844d
parent     63d72b93f2262900c8de74ad0f5a58e0d452c9d3 (diff)
parent     db33ec371be8e45956e8cebb5b0fe641f008430b (diff)
download   linux-20b0d06722169e6e66049c8fe6f1a48adffb79c6.tar.bz2
Merge branch 'akpm' (patches from Andrew)
Merge still more updates from Andrew Morton:
"Various trees. Mainly those parts of MM whose linux-next dependents
are now merged. I'm still sitting on ~160 patches which await merges
from -next.
Subsystems affected by this patch series: mm/proc, ipc, dynamic-debug,
panic, lib, sysctl, mm/gup, mm/pagemap"
* emailed patches from Andrew Morton <akpm@linux-foundation.org>: (52 commits)
doc: cgroup: update note about conditions when oom killer is invoked
module: move the set_fs hack for flush_icache_range to m68k
nommu: use flush_icache_user_range in brk and mmap
binfmt_flat: use flush_icache_user_range
exec: use flush_icache_user_range in read_code
exec: only build read_code when needed
m68k: implement flush_icache_user_range
arm: rename flush_cache_user_range to flush_icache_user_range
xtensa: implement flush_icache_user_range
sh: implement flush_icache_user_range
asm-generic: add a flush_icache_user_range stub
mm: rename flush_icache_user_range to flush_icache_user_page
arm,sparc,unicore32: remove flush_icache_user_range
riscv: use asm-generic/cacheflush.h
powerpc: use asm-generic/cacheflush.h
openrisc: use asm-generic/cacheflush.h
m68knommu: use asm-generic/cacheflush.h
microblaze: use asm-generic/cacheflush.h
ia64: use asm-generic/cacheflush.h
hexagon: use asm-generic/cacheflush.h
...
75 files changed, 701 insertions, 493 deletions
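The arch-by-arch churn in the diff below follows one pattern throughout: an architecture keeps only the cache-flushing hooks it genuinely implements, marks each of them with a self-referencing #define so the generic header knows not to supply its empty fallback, and then pulls in <asm-generic/cacheflush.h> for everything else. A condensed sketch of that pattern for a hypothetical architecture header (the "foo" names are placeholders, not taken from the patch):

	/* SPDX-License-Identifier: GPL-2.0 */
	#ifndef _ASM_FOO_CACHEFLUSH_H
	#define _ASM_FOO_CACHEFLUSH_H

	#include <linux/mm.h>

	/*
	 * This architecture really has to flush its icache, so it provides
	 * its own implementation and advertises it with a self-referencing
	 * define; the generic header guards its stub with #ifndef.
	 */
	extern void flush_icache_range(unsigned long start, unsigned long end);
	#define flush_icache_range flush_icache_range

	/*
	 * Every hook left undefined above (flush_cache_mm(), flush_dcache_page(),
	 * copy_from_user_page(), ...) falls back to the no-op stubs in the
	 * generic header.
	 */
	#include <asm-generic/cacheflush.h>

	#endif /* _ASM_FOO_CACHEFLUSH_H */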
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index c2a4b652bd1a..ce3e05e41724 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1170,6 +1170,13 @@ PAGE_SIZE multiple when read back. Under certain circumstances, the usage may go over the limit temporarily. + In default configuration regular 0-order allocations always + succeed unless OOM killer chooses current task as a victim. + + Some kinds of allocations don't invoke the OOM killer. + Caller could retry them differently, return into userspace + as -ENOMEM or silently ignore in cases like disk readahead. + This is the ultimate protection mechanism. As long as the high limit is used and monitored properly, this limit's utility is limited to providing the final safety net. @@ -1226,17 +1233,9 @@ PAGE_SIZE multiple when read back. The number of time the cgroup's memory usage was reached the limit and allocation was about to fail. - Depending on context result could be invocation of OOM - killer and retrying allocation or failing allocation. - - Failed allocation in its turn could be returned into - userspace as -ENOMEM or silently ignored in cases like - disk readahead. For now OOM in memory cgroup kills - tasks iff shortage has happened inside page fault. - This event is not raised if the OOM killer is not considered as an option, e.g. for failed high-order - allocations. + allocations or if caller asked to not retry attempts. oom_kill The number of processes belonging to this cgroup diff --git a/Documentation/admin-guide/dynamic-debug-howto.rst b/Documentation/admin-guide/dynamic-debug-howto.rst index 0dc2eb8e44e5..1012bd9305e9 100644 --- a/Documentation/admin-guide/dynamic-debug-howto.rst +++ b/Documentation/admin-guide/dynamic-debug-howto.rst @@ -13,6 +13,11 @@ kernel code to obtain additional kernel information. Currently, if ``print_hex_dump_debug()``/``print_hex_dump_bytes()`` calls can be dynamically enabled per-callsite. +If you do not want to enable dynamic debug globally (i.e. in some embedded +system), you may set ``CONFIG_DYNAMIC_DEBUG_CORE`` as basic support of dynamic +debug and add ``ccflags := -DDYNAMIC_DEBUG_MODULE`` into the Makefile of any +modules which you'd like to dynamically debug later. + If ``CONFIG_DYNAMIC_DEBUG`` is not set, ``print_hex_dump_debug()`` is just shortcut for ``print_hex_dump(KERN_DEBUG)``. diff --git a/Documentation/admin-guide/kdump/kdump.rst b/Documentation/admin-guide/kdump/kdump.rst index ac7e131d2935..2da65fef2a1c 100644 --- a/Documentation/admin-guide/kdump/kdump.rst +++ b/Documentation/admin-guide/kdump/kdump.rst @@ -521,6 +521,14 @@ will cause a kdump to occur at the panic() call. In cases where a user wants to specify this during runtime, /proc/sys/kernel/panic_on_warn can be set to 1 to achieve the same behaviour. +Trigger Kdump on add_taint() +============================ + +The kernel parameter panic_on_taint facilitates a conditional call to panic() +from within add_taint() whenever the value set in this bitmask matches with the +bit flag being set by add_taint(). +This will cause a kdump to occur at the add_taint()->panic() call. 
+ Contact ======= diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index f3eeecbb3f63..ed9597e37415 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -1445,7 +1445,7 @@ hardlockup_all_cpu_backtrace= [KNL] Should the hard-lockup detector generate backtraces on all cpus. - Format: <integer> + Format: 0 | 1 hashdist= [KNL,NUMA] Large hashes allocated during boot are distributed across NUMA nodes. Defaults on @@ -1513,9 +1513,9 @@ hung_task_panic= [KNL] Should the hung task detector generate panics. - Format: <integer> + Format: 0 | 1 - A nonzero value instructs the kernel to panic when a + A value of 1 instructs the kernel to panic when a hung task is detected. The default value is controlled by the CONFIG_BOOTPARAM_HUNG_TASK_PANIC build-time option. The value selected by this boot parameter can @@ -3447,6 +3447,19 @@ bit 4: print ftrace buffer bit 5: print all printk messages in buffer + panic_on_taint= Bitmask for conditionally calling panic() in add_taint() + Format: <hex>[,nousertaint] + Hexadecimal bitmask representing the set of TAINT flags + that will cause the kernel to panic when add_taint() is + called with any of the flags in this set. + The optional switch "nousertaint" can be utilized to + prevent userspace forced crashes by writing to sysctl + /proc/sys/kernel/tainted any flagset matching with the + bitmask set on panic_on_taint. + See Documentation/admin-guide/tainted-kernels.rst for + extra details on the taint flags that users can pick + to compose the bitmask to assign to panic_on_taint. + panic_on_warn panic() instead of WARN(). Useful to cause kdump on a WARN(). @@ -4652,9 +4665,9 @@ softlockup_panic= [KNL] Should the soft-lockup detector generate panics. - Format: <integer> + Format: 0 | 1 - A nonzero value instructs the soft-lockup detector + A value of 1 instructs the soft-lockup detector to panic the machine when a soft-lockup occurs. It is also controlled by the kernel.softlockup_panic sysctl and CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC, which is the @@ -4663,7 +4676,7 @@ softlockup_all_cpu_backtrace= [KNL] Should the soft-lockup detector generate backtraces on all cpus. - Format: <integer> + Format: 0 | 1 sonypi.*= [HW] Sony Programmable I/O Control Device driver See Documentation/admin-guide/laptops/sonypi.rst @@ -4956,6 +4969,15 @@ switches= [HW,M68k] + sysctl.*= [KNL] + Set a sysctl parameter, right before loading the init + process, as if the value was written to the respective + /proc/sys/... file. Both '.' and '/' are recognized as + separators. Unrecognized parameters and invalid values + are reported in the kernel log. Sysctls registered + later by a loaded module cannot be set this way. + Example: sysctl.vm.swappiness=40 + sysfs.deprecated=0|1 [KNL] Enable/disable old style sysfs layout for old udev on older distributions. When this option is enabled diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst index 1ebf68d01141..83acf5025488 100644 --- a/Documentation/admin-guide/sysctl/kernel.rst +++ b/Documentation/admin-guide/sysctl/kernel.rst @@ -335,6 +335,20 @@ Path for the hotplug policy agent. Default value is "``/sbin/hotplug``". +hung_task_all_cpu_backtrace: +================ + +If this option is set, the kernel will send an NMI to all CPUs to dump +their backtraces when a hung task is detected. This file shows up if +CONFIG_DETECT_HUNG_TASK and CONFIG_SMP are enabled. 
+ +0: Won't show all CPUs backtraces when a hung task is detected. +This is the default behavior. + +1: Will non-maskably interrupt all CPUs and dump their backtraces when +a hung task is detected. + + hung_task_panic =============== @@ -632,6 +646,22 @@ rate for each task. scanned for a given scan. +oops_all_cpu_backtrace: +================ + +If this option is set, the kernel will send an NMI to all CPUs to dump +their backtraces when an oops event occurs. It should be used as a last +resort in case a panic cannot be triggered (to protect VMs running, for +example) or kdump can't be collected. This file shows up if CONFIG_SMP +is enabled. + +0: Won't show all CPUs backtraces when an oops is detected. +This is the default behavior. + +1: Will non-maskably interrupt all CPUs and dump their backtraces when +an oops event is detected. + + osrelease, ostype & version =========================== @@ -1239,6 +1269,13 @@ ORed together. The letters are seen in "Tainted" line of Oops reports. See :doc:`/admin-guide/tainted-kernels` for more information. +Note: + writes to this sysctl interface will fail with ``EINVAL`` if the kernel is + booted with the command line option ``panic_on_taint=<bitmask>,nousertaint`` + and any of the ORed together values being written to ``tainted`` match with + the bitmask declared on panic_on_taint. + See :doc:`/admin-guide/kernel-parameters` for more details on that particular + kernel command line option and its optional ``nousertaint`` switch. threads-max =========== diff --git a/Documentation/core-api/pin_user_pages.rst b/Documentation/core-api/pin_user_pages.rst index 2e939ff10b86..6068266dd303 100644 --- a/Documentation/core-api/pin_user_pages.rst +++ b/Documentation/core-api/pin_user_pages.rst @@ -148,23 +148,46 @@ NOTE: Some pages, such as DAX pages, cannot be pinned with longterm pins. That's because DAX pages do not have a separate page cache, and so "pinning" implies locking down file system blocks, which is not (yet) supported in that way. -CASE 3: Hardware with page faulting support -------------------------------------------- -Here, a well-written driver doesn't normally need to pin pages at all. However, -if the driver does choose to do so, it can register MMU notifiers for the range, -and will be called back upon invalidation. Either way (avoiding page pinning, or -using MMU notifiers to unpin upon request), there is proper synchronization with -both filesystem and mm (page_mkclean(), munmap(), etc). - -Therefore, neither flag needs to be set. - -In this case, ideally, neither get_user_pages() nor pin_user_pages() should be -called. Instead, the software should be written so that it does not pin pages. -This allows mm and filesystems to operate more efficiently and reliably. +CASE 3: MMU notifier registration, with or without page faulting hardware +------------------------------------------------------------------------- +Device drivers can pin pages via get_user_pages*(), and register for mmu +notifier callbacks for the memory range. Then, upon receiving a notifier +"invalidate range" callback , stop the device from using the range, and unpin +the pages. There may be other possible schemes, such as for example explicitly +synchronizing against pending IO, that accomplish approximately the same thing. 
+ +Or, if the hardware supports replayable page faults, then the device driver can +avoid pinning entirely (this is ideal), as follows: register for mmu notifier +callbacks as above, but instead of stopping the device and unpinning in the +callback, simply remove the range from the device's page tables. + +Either way, as long as the driver unpins the pages upon mmu notifier callback, +then there is proper synchronization with both filesystem and mm +(page_mkclean(), munmap(), etc). Therefore, neither flag needs to be set. CASE 4: Pinning for struct page manipulation only ------------------------------------------------- -Here, normal GUP calls are sufficient, so neither flag needs to be set. +If only struct page data (as opposed to the actual memory contents that a page +is tracking) is affected, then normal GUP calls are sufficient, and neither flag +needs to be set. + +CASE 5: Pinning in order to write to the data within the page +------------------------------------------------------------- +Even though neither DMA nor Direct IO is involved, just a simple case of "pin, +write to a page's data, unpin" can cause a problem. Case 5 may be considered a +superset of Case 1, plus Case 2, plus anything that invokes that pattern. In +other words, if the code is neither Case 1 nor Case 2, it may still require +FOLL_PIN, for patterns like this: + +Correct (uses FOLL_PIN calls): + pin_user_pages() + write to the data within the pages + unpin_user_pages() + +INCORRECT (uses FOLL_GET calls): + get_user_pages() + write to the data within the pages + put_page() page_maybe_dma_pinned(): the whole point of pinning =================================================== diff --git a/arch/alpha/include/asm/cacheflush.h b/arch/alpha/include/asm/cacheflush.h index 89128489cb59..9945ff483eaf 100644 --- a/arch/alpha/include/asm/cacheflush.h +++ b/arch/alpha/include/asm/cacheflush.h @@ -4,19 +4,6 @@ #include <linux/mm.h> -/* Caches aren't brain-dead on the Alpha. */ -#define flush_cache_all() do { } while (0) -#define flush_cache_mm(mm) do { } while (0) -#define flush_cache_dup_mm(mm) do { } while (0) -#define flush_cache_range(vma, start, end) do { } while (0) -#define flush_cache_page(vma, vmaddr, pfn) do { } while (0) -#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 -#define flush_dcache_page(page) do { } while (0) -#define flush_dcache_mmap_lock(mapping) do { } while (0) -#define flush_dcache_mmap_unlock(mapping) do { } while (0) -#define flush_cache_vmap(start, end) do { } while (0) -#define flush_cache_vunmap(start, end) do { } while (0) - /* Note that the following two definitions are _highly_ dependent on the contexts in which they are used in the kernel. I personally think it is criminal how loosely defined these macros are. 
*/ @@ -48,7 +35,7 @@ extern void smp_imb(void); extern void __load_new_mm_context(struct mm_struct *); static inline void -flush_icache_user_range(struct vm_area_struct *vma, struct page *page, +flush_icache_user_page(struct vm_area_struct *vma, struct page *page, unsigned long addr, int len) { if (vma->vm_flags & VM_EXEC) { @@ -59,20 +46,17 @@ flush_icache_user_range(struct vm_area_struct *vma, struct page *page, mm->context[smp_processor_id()] = 0; } } -#else -extern void flush_icache_user_range(struct vm_area_struct *vma, +#define flush_icache_user_page flush_icache_user_page +#else /* CONFIG_SMP */ +extern void flush_icache_user_page(struct vm_area_struct *vma, struct page *page, unsigned long addr, int len); -#endif +#define flush_icache_user_page flush_icache_user_page +#endif /* CONFIG_SMP */ /* This is used only in __do_fault and do_swap_page. */ #define flush_icache_page(vma, page) \ - flush_icache_user_range((vma), (page), 0, 0) + flush_icache_user_page((vma), (page), 0, 0) -#define copy_to_user_page(vma, page, vaddr, dst, src, len) \ -do { memcpy(dst, src, len); \ - flush_icache_user_range(vma, page, vaddr, len); \ -} while (0) -#define copy_from_user_page(vma, page, vaddr, dst, src, len) \ - memcpy(dst, src, len) +#include <asm-generic/cacheflush.h> #endif /* _ALPHA_CACHEFLUSH_H */ diff --git a/arch/alpha/kernel/smp.c b/arch/alpha/kernel/smp.c index 5f90df30be20..52995bf413fe 100644 --- a/arch/alpha/kernel/smp.c +++ b/arch/alpha/kernel/smp.c @@ -740,7 +740,7 @@ ipi_flush_icache_page(void *x) } void -flush_icache_user_range(struct vm_area_struct *vma, struct page *page, +flush_icache_user_page(struct vm_area_struct *vma, struct page *page, unsigned long addr, int len) { struct mm_struct *mm = vma->vm_mm; diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h index 7114b9aa46b8..2e24e765e6d3 100644 --- a/arch/arm/include/asm/cacheflush.h +++ b/arch/arm/include/asm/cacheflush.h @@ -258,11 +258,11 @@ extern void flush_cache_page(struct vm_area_struct *vma, unsigned long user_addr #define flush_cache_dup_mm(mm) flush_cache_mm(mm) /* - * flush_cache_user_range is used when we want to ensure that the + * flush_icache_user_range is used when we want to ensure that the * Harvard caches are synchronised for the user space address range. * This is used for the ARM private sys_cacheflush system call. */ -#define flush_cache_user_range(s,e) __cpuc_coherent_user_range(s,e) +#define flush_icache_user_range(s,e) __cpuc_coherent_user_range(s,e) /* * Perform necessary cache operations to ensure that data previously @@ -318,9 +318,6 @@ extern void flush_kernel_dcache_page(struct page *); #define flush_dcache_mmap_lock(mapping) xa_lock_irq(&mapping->i_pages) #define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&mapping->i_pages) -#define flush_icache_user_range(vma,page,addr,len) \ - flush_dcache_page(page) - /* * We don't appear to need to do anything here. In fact, if we did, we'd * duplicate cache flushing elsewhere performed by flush_dcache_page(). 
diff --git a/arch/arm/kernel/fiq.c b/arch/arm/kernel/fiq.c index cd1234c103fc..98ca3e3fa847 100644 --- a/arch/arm/kernel/fiq.c +++ b/arch/arm/kernel/fiq.c @@ -98,8 +98,8 @@ void set_fiq_handler(void *start, unsigned int length) memcpy(base + offset, start, length); if (!cache_is_vipt_nonaliasing()) - flush_icache_range((unsigned long)base + offset, offset + - length); + flush_icache_range((unsigned long)base + offset, + (unsigned long)base + offset + length); flush_icache_range(0xffff0000 + offset, 0xffff0000 + offset + length); } diff --git a/arch/arm/kernel/traps.c b/arch/arm/kernel/traps.c index 1e70e7227f0f..316a7687f813 100644 --- a/arch/arm/kernel/traps.c +++ b/arch/arm/kernel/traps.c @@ -566,7 +566,7 @@ __do_cache_op(unsigned long start, unsigned long end) if (fatal_signal_pending(current)) return 0; - ret = flush_cache_user_range(start, start + chunk); + ret = flush_icache_user_range(start, start + chunk); if (ret) return ret; diff --git a/arch/arm64/include/asm/cacheflush.h b/arch/arm64/include/asm/cacheflush.h index ce50c1f1f1ea..9384fd8fc13c 100644 --- a/arch/arm64/include/asm/cacheflush.h +++ b/arch/arm64/include/asm/cacheflush.h @@ -94,20 +94,7 @@ static inline void flush_icache_range(unsigned long start, unsigned long end) kick_all_cpus_sync(); } - -static inline void flush_cache_mm(struct mm_struct *mm) -{ -} - -static inline void flush_cache_page(struct vm_area_struct *vma, - unsigned long user_addr, unsigned long pfn) -{ -} - -static inline void flush_cache_range(struct vm_area_struct *vma, - unsigned long start, unsigned long end) -{ -} +#define flush_icache_range flush_icache_range /* * Cache maintenance functions used by the DMA API. No to be used directly. @@ -123,12 +110,7 @@ extern void __dma_flush_area(const void *, size_t); */ extern void copy_to_user_page(struct vm_area_struct *, struct page *, unsigned long, void *, const void *, unsigned long); -#define copy_from_user_page(vma, page, vaddr, dst, src, len) \ - do { \ - memcpy(dst, src, len); \ - } while (0) - -#define flush_cache_dup_mm(mm) flush_cache_mm(mm) +#define copy_to_user_page copy_to_user_page /* * flush_dcache_page is used when the kernel has written to the page @@ -154,29 +136,11 @@ static __always_inline void __flush_icache_all(void) dsb(ish); } -#define flush_dcache_mmap_lock(mapping) do { } while (0) -#define flush_dcache_mmap_unlock(mapping) do { } while (0) - -/* - * We don't appear to need to do anything here. In fact, if we did, we'd - * duplicate cache flushing elsewhere performed by flush_dcache_page(). - */ -#define flush_icache_page(vma,page) do { } while (0) - -/* - * Not required on AArch64 (PIPT or VIPT non-aliasing D-cache). 
- */ -static inline void flush_cache_vmap(unsigned long start, unsigned long end) -{ -} - -static inline void flush_cache_vunmap(unsigned long start, unsigned long end) -{ -} - int set_memory_valid(unsigned long addr, int numpages, int enable); int set_direct_map_invalid_noflush(struct page *page); int set_direct_map_default_noflush(struct page *page); -#endif +#include <asm-generic/cacheflush.h> + +#endif /* __ASM_CACHEFLUSH_H */ diff --git a/arch/c6x/include/asm/cacheflush.h b/arch/c6x/include/asm/cacheflush.h index 4540b40475e6..10922d528de6 100644 --- a/arch/c6x/include/asm/cacheflush.h +++ b/arch/c6x/include/asm/cacheflush.h @@ -17,21 +17,6 @@ #include <asm/string.h> /* - * virtually-indexed cache management (our cache is physically indexed) - */ -#define flush_cache_all() do {} while (0) -#define flush_cache_mm(mm) do {} while (0) -#define flush_cache_dup_mm(mm) do {} while (0) -#define flush_cache_range(mm, start, end) do {} while (0) -#define flush_cache_page(vma, vmaddr, pfn) do {} while (0) -#define flush_cache_vmap(start, end) do {} while (0) -#define flush_cache_vunmap(start, end) do {} while (0) -#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 -#define flush_dcache_page(page) do {} while (0) -#define flush_dcache_mmap_lock(mapping) do {} while (0) -#define flush_dcache_mmap_unlock(mapping) do {} while (0) - -/* * physically-indexed cache management */ #define flush_icache_range(s, e) \ @@ -49,14 +34,12 @@ do { \ (unsigned long) page_address(page) + PAGE_SIZE)); \ } while (0) - #define copy_to_user_page(vma, page, vaddr, dst, src, len) \ do { \ memcpy(dst, src, len); \ flush_icache_range((unsigned) (dst), (unsigned) (dst) + (len)); \ } while (0) -#define copy_from_user_page(vma, page, vaddr, dst, src, len) \ - memcpy(dst, src, len) +#include <asm-generic/cacheflush.h> #endif /* _ASM_C6X_CACHEFLUSH_H */ diff --git a/arch/hexagon/include/asm/cacheflush.h b/arch/hexagon/include/asm/cacheflush.h index fb447de45d54..6eff0730e6ef 100644 --- a/arch/hexagon/include/asm/cacheflush.h +++ b/arch/hexagon/include/asm/cacheflush.h @@ -25,29 +25,17 @@ #define LINESIZE 32 #define LINEBITS 5 -#define flush_cache_all() do { } while (0) -#define flush_cache_mm(mm) do { } while (0) -#define flush_cache_dup_mm(mm) do { } while (0) -#define flush_cache_range(vma, start, end) do { } while (0) -#define flush_cache_page(vma, vmaddr, pfn) do { } while (0) -#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 -#define flush_dcache_page(page) do { } while (0) -#define flush_dcache_mmap_lock(mapping) do { } while (0) -#define flush_dcache_mmap_unlock(mapping) do { } while (0) -#define flush_icache_page(vma, pg) do { } while (0) -#define flush_icache_user_range(vma, pg, adr, len) do { } while (0) -#define flush_cache_vmap(start, end) do { } while (0) -#define flush_cache_vunmap(start, end) do { } while (0) - /* * Flush Dcache range through current map. */ extern void flush_dcache_range(unsigned long start, unsigned long end); +#define flush_dcache_range flush_dcache_range /* * Flush Icache range through current map. 
*/ extern void flush_icache_range(unsigned long start, unsigned long end); +#define flush_icache_range flush_icache_range /* * Memory-management related flushes are there to ensure in non-physically @@ -78,6 +66,7 @@ static inline void update_mmu_cache(struct vm_area_struct *vma, void copy_to_user_page(struct vm_area_struct *vma, struct page *page, unsigned long vaddr, void *dst, void *src, int len); +#define copy_to_user_page copy_to_user_page #define copy_from_user_page(vma, page, vaddr, dst, src, len) \ memcpy(dst, src, len) @@ -85,4 +74,6 @@ void copy_to_user_page(struct vm_area_struct *vma, struct page *page, extern void hexagon_inv_dcache_range(unsigned long start, unsigned long end); extern void hexagon_clean_dcache_range(unsigned long start, unsigned long end); +#include <asm-generic/cacheflush.h> + #endif diff --git a/arch/ia64/include/asm/cacheflush.h b/arch/ia64/include/asm/cacheflush.h index 6d3478f8abc8..708c0fa5d975 100644 --- a/arch/ia64/include/asm/cacheflush.h +++ b/arch/ia64/include/asm/cacheflush.h @@ -12,44 +12,22 @@ #include <asm/page.h> -/* - * Cache flushing routines. This is the kind of stuff that can be very expensive, so try - * to avoid them whenever possible. - */ - -#define flush_cache_all() do { } while (0) -#define flush_cache_mm(mm) do { } while (0) -#define flush_cache_dup_mm(mm) do { } while (0) -#define flush_cache_range(vma, start, end) do { } while (0) -#define flush_cache_page(vma, vmaddr, pfn) do { } while (0) -#define flush_icache_page(vma,page) do { } while (0) -#define flush_cache_vmap(start, end) do { } while (0) -#define flush_cache_vunmap(start, end) do { } while (0) - #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 #define flush_dcache_page(page) \ do { \ clear_bit(PG_arch_1, &(page)->flags); \ } while (0) -#define flush_dcache_mmap_lock(mapping) do { } while (0) -#define flush_dcache_mmap_unlock(mapping) do { } while (0) - -extern void flush_icache_range (unsigned long start, unsigned long end); +extern void flush_icache_range(unsigned long start, unsigned long end); +#define flush_icache_range flush_icache_range extern void clflush_cache_range(void *addr, int size); - -#define flush_icache_user_range(vma, page, user_addr, len) \ +#define flush_icache_user_page(vma, page, user_addr, len) \ do { \ unsigned long _addr = (unsigned long) page_address(page) + ((user_addr) & ~PAGE_MASK); \ flush_icache_range(_addr, _addr + (len)); \ } while (0) -#define copy_to_user_page(vma, page, vaddr, dst, src, len) \ -do { memcpy(dst, src, len); \ - flush_icache_user_range(vma, page, vaddr, len); \ -} while (0) -#define copy_from_user_page(vma, page, vaddr, dst, src, len) \ - memcpy(dst, src, len) +#include <asm-generic/cacheflush.h> #endif /* _ASM_IA64_CACHEFLUSH_H */ diff --git a/arch/m68k/include/asm/cacheflush_mm.h b/arch/m68k/include/asm/cacheflush_mm.h index 1e2544ecaf88..1ac55e7b47f0 100644 --- a/arch/m68k/include/asm/cacheflush_mm.h +++ b/arch/m68k/include/asm/cacheflush_mm.h @@ -254,9 +254,11 @@ static inline void __flush_page_to_ram(void *vaddr) #define flush_dcache_mmap_unlock(mapping) do { } while (0) #define flush_icache_page(vma, page) __flush_page_to_ram(page_address(page)) -extern void flush_icache_user_range(struct vm_area_struct *vma, struct page *page, +extern void flush_icache_user_page(struct vm_area_struct *vma, struct page *page, unsigned long addr, int len); extern void flush_icache_range(unsigned long address, unsigned long endaddr); +extern void flush_icache_user_range(unsigned long address, + unsigned long endaddr); static inline void 
copy_to_user_page(struct vm_area_struct *vma, struct page *page, unsigned long vaddr, @@ -264,7 +266,7 @@ static inline void copy_to_user_page(struct vm_area_struct *vma, { flush_cache_page(vma, vaddr, page_to_pfn(page)); memcpy(dst, src, len); - flush_icache_user_range(vma, page, vaddr, len); + flush_icache_user_page(vma, page, vaddr, len); } static inline void copy_from_user_page(struct vm_area_struct *vma, struct page *page, unsigned long vaddr, diff --git a/arch/m68k/include/asm/cacheflush_no.h b/arch/m68k/include/asm/cacheflush_no.h index 11e9a9dcbfb2..2731f07e7be8 100644 --- a/arch/m68k/include/asm/cacheflush_no.h +++ b/arch/m68k/include/asm/cacheflush_no.h @@ -9,25 +9,8 @@ #include <asm/mcfsim.h> #define flush_cache_all() __flush_cache_all() -#define flush_cache_mm(mm) do { } while (0) -#define flush_cache_dup_mm(mm) do { } while (0) -#define flush_cache_range(vma, start, end) do { } while (0) -#define flush_cache_page(vma, vmaddr) do { } while (0) #define flush_dcache_range(start, len) __flush_dcache_all() -#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 -#define flush_dcache_page(page) do { } while (0) -#define flush_dcache_mmap_lock(mapping) do { } while (0) -#define flush_dcache_mmap_unlock(mapping) do { } while (0) #define flush_icache_range(start, len) __flush_icache_all() -#define flush_icache_page(vma,pg) do { } while (0) -#define flush_icache_user_range(vma,pg,adr,len) do { } while (0) -#define flush_cache_vmap(start, end) do { } while (0) -#define flush_cache_vunmap(start, end) do { } while (0) - -#define copy_to_user_page(vma, page, vaddr, dst, src, len) \ - memcpy(dst, src, len) -#define copy_from_user_page(vma, page, vaddr, dst, src, len) \ - memcpy(dst, src, len) void mcf_cache_push(void); @@ -98,4 +81,6 @@ static inline void cache_clear(unsigned long paddr, int len) __clear_cache_all(); } +#include <asm-generic/cacheflush.h> + #endif /* _M68KNOMMU_CACHEFLUSH_H */ diff --git a/arch/m68k/mm/cache.c b/arch/m68k/mm/cache.c index 079e64898e6a..5ecb3310e874 100644 --- a/arch/m68k/mm/cache.c +++ b/arch/m68k/mm/cache.c @@ -73,7 +73,7 @@ static unsigned long virt_to_phys_slow(unsigned long vaddr) /* Push n pages at kernel virtual address and clear the icache */ /* RZ: use cpush %bc instead of cpush %dc, cinv %ic */ -void flush_icache_range(unsigned long address, unsigned long endaddr) +void flush_icache_user_range(unsigned long address, unsigned long endaddr) { if (CPU_IS_COLDFIRE) { unsigned long start, end; @@ -104,9 +104,18 @@ void flush_icache_range(unsigned long address, unsigned long endaddr) : "di" (FLUSH_I)); } } + +void flush_icache_range(unsigned long address, unsigned long endaddr) +{ + mm_segment_t old_fs = get_fs(); + + set_fs(KERNEL_DS); + flush_icache_user_range(address, endaddr); + set_fs(old_fs); +} EXPORT_SYMBOL(flush_icache_range); -void flush_icache_user_range(struct vm_area_struct *vma, struct page *page, +void flush_icache_user_page(struct vm_area_struct *vma, struct page *page, unsigned long addr, int len) { if (CPU_IS_COLDFIRE) { diff --git a/arch/microblaze/include/asm/cacheflush.h b/arch/microblaze/include/asm/cacheflush.h index 11f56c85056b..39f8fb6768d8 100644 --- a/arch/microblaze/include/asm/cacheflush.h +++ b/arch/microblaze/include/asm/cacheflush.h @@ -57,9 +57,6 @@ void microblaze_cache_init(void); #define invalidate_icache() mbc->iin(); #define invalidate_icache_range(start, end) mbc->iinr(start, end); -#define flush_icache_user_range(vma, pg, adr, len) flush_icache(); -#define flush_icache_page(vma, pg) do { } while (0) - #define enable_dcache() 
mbc->de(); #define disable_dcache() mbc->dd(); /* FIXME for LL-temac driver */ @@ -77,27 +74,9 @@ do { \ flush_dcache_range((unsigned) (addr), (unsigned) (addr) + PAGE_SIZE); \ } while (0); -#define flush_dcache_mmap_lock(mapping) do { } while (0) -#define flush_dcache_mmap_unlock(mapping) do { } while (0) - -#define flush_cache_dup_mm(mm) do { } while (0) -#define flush_cache_vmap(start, end) do { } while (0) -#define flush_cache_vunmap(start, end) do { } while (0) -#define flush_cache_mm(mm) do { } while (0) - #define flush_cache_page(vma, vmaddr, pfn) \ flush_dcache_range(pfn << PAGE_SHIFT, (pfn << PAGE_SHIFT) + PAGE_SIZE); -/* MS: kgdb code use this macro, wrong len with FLASH */ -#if 0 -#define flush_cache_range(vma, start, len) { \ - flush_icache_range((unsigned) (start), (unsigned) (start) + (len)); \ - flush_dcache_range((unsigned) (start), (unsigned) (start) + (len)); \ -} -#endif - -#define flush_cache_range(vma, start, len) do { } while (0) - static inline void copy_to_user_page(struct vm_area_struct *vma, struct page *page, unsigned long vaddr, void *dst, void *src, int len) @@ -109,12 +88,8 @@ static inline void copy_to_user_page(struct vm_area_struct *vma, flush_dcache_range(addr, addr + PAGE_SIZE); } } +#define copy_to_user_page copy_to_user_page -static inline void copy_from_user_page(struct vm_area_struct *vma, - struct page *page, unsigned long vaddr, - void *dst, void *src, int len) -{ - memcpy(dst, src, len); -} +#include <asm-generic/cacheflush.h> #endif /* _ASM_MICROBLAZE_CACHEFLUSH_H */ diff --git a/arch/nds32/include/asm/cacheflush.h b/arch/nds32/include/asm/cacheflush.h index caddded56e77..7d6824f7c0e8 100644 --- a/arch/nds32/include/asm/cacheflush.h +++ b/arch/nds32/include/asm/cacheflush.h @@ -44,9 +44,9 @@ void invalidate_kernel_vmap_range(void *addr, int size); #define flush_dcache_mmap_unlock(mapping) xa_unlock_irq(&(mapping)->i_pages) #else -void flush_icache_user_range(struct vm_area_struct *vma, struct page *page, +void flush_icache_user_page(struct vm_area_struct *vma, struct page *page, unsigned long addr, int len); -#define flush_icache_user_range flush_icache_user_range +#define flush_icache_user_page flush_icache_user_page #include <asm-generic/cacheflush.h> #endif diff --git a/arch/nds32/mm/cacheflush.c b/arch/nds32/mm/cacheflush.c index 254703653b6f..6eb98a7ad27d 100644 --- a/arch/nds32/mm/cacheflush.c +++ b/arch/nds32/mm/cacheflush.c @@ -35,9 +35,8 @@ void flush_icache_page(struct vm_area_struct *vma, struct page *page) kunmap_atomic((void *)kaddr); local_irq_restore(flags); } -EXPORT_SYMBOL(flush_icache_page); -void flush_icache_user_range(struct vm_area_struct *vma, struct page *page, +void flush_icache_user_page(struct vm_area_struct *vma, struct page *page, unsigned long addr, int len) { unsigned long kaddr; diff --git a/arch/openrisc/include/asm/cacheflush.h b/arch/openrisc/include/asm/cacheflush.h index 79d5d7753fe4..eeac40d4a854 100644 --- a/arch/openrisc/include/asm/cacheflush.h +++ b/arch/openrisc/include/asm/cacheflush.h @@ -62,31 +62,12 @@ static inline void flush_dcache_page(struct page *page) clear_bit(PG_dc_clean, &page->flags); } -/* - * Other interfaces are not required since we do not have virtually - * indexed or tagged caches. So we can use the default here. 
- */ -#define flush_cache_all() do { } while (0) -#define flush_cache_mm(mm) do { } while (0) -#define flush_cache_dup_mm(mm) do { } while (0) -#define flush_cache_range(vma, start, end) do { } while (0) -#define flush_cache_page(vma, vmaddr, pfn) do { } while (0) -#define flush_dcache_mmap_lock(mapping) do { } while (0) -#define flush_dcache_mmap_unlock(mapping) do { } while (0) -#define flush_icache_range(start, end) do { } while (0) -#define flush_icache_page(vma, pg) do { } while (0) -#define flush_icache_user_range(vma, pg, adr, len) do { } while (0) -#define flush_cache_vmap(start, end) do { } while (0) -#define flush_cache_vunmap(start, end) do { } while (0) - -#define copy_to_user_page(vma, page, vaddr, dst, src, len) \ - do { \ - memcpy(dst, src, len); \ - if (vma->vm_flags & VM_EXEC) \ - sync_icache_dcache(page); \ - } while (0) +#define flush_icache_user_page(vma, page, addr, len) \ +do { \ + if (vma->vm_flags & VM_EXEC) \ + sync_icache_dcache(page); \ +} while (0) -#define copy_from_user_page(vma, page, vaddr, dst, src, len) \ - memcpy(dst, src, len) +#include <asm-generic/cacheflush.h> #endif /* __ASM_CACHEFLUSH_H */ diff --git a/arch/powerpc/include/asm/cacheflush.h b/arch/powerpc/include/asm/cacheflush.h index e92191b390f3..de600b915a3c 100644 --- a/arch/powerpc/include/asm/cacheflush.h +++ b/arch/powerpc/include/asm/cacheflush.h @@ -4,23 +4,9 @@ #ifndef _ASM_POWERPC_CACHEFLUSH_H #define _ASM_POWERPC_CACHEFLUSH_H -#ifdef __KERNEL__ - #include <linux/mm.h> #include <asm/cputable.h> -/* - * No cache flushing is required when address mappings are changed, - * because the caches on PowerPCs are physically addressed. - */ -#define flush_cache_all() do { } while (0) -#define flush_cache_mm(mm) do { } while (0) -#define flush_cache_dup_mm(mm) do { } while (0) -#define flush_cache_range(vma, start, end) do { } while (0) -#define flush_cache_page(vma, vmaddr, pfn) do { } while (0) -#define flush_icache_page(vma, page) do { } while (0) -#define flush_cache_vunmap(start, end) do { } while (0) - #ifdef CONFIG_PPC_BOOK3S_64 /* * Book3s has no ptesync after setting a pte, so without this ptesync it's @@ -33,20 +19,20 @@ static inline void flush_cache_vmap(unsigned long start, unsigned long end) { asm volatile("ptesync" ::: "memory"); } -#else -static inline void flush_cache_vmap(unsigned long start, unsigned long end) { } -#endif +#define flush_cache_vmap flush_cache_vmap +#endif /* CONFIG_PPC_BOOK3S_64 */ #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 extern void flush_dcache_page(struct page *page); -#define flush_dcache_mmap_lock(mapping) do { } while (0) -#define flush_dcache_mmap_unlock(mapping) do { } while (0) void flush_icache_range(unsigned long start, unsigned long stop); -extern void flush_icache_user_range(struct vm_area_struct *vma, - struct page *page, unsigned long addr, - int len); -extern void flush_dcache_icache_page(struct page *page); +#define flush_icache_range flush_icache_range + +void flush_icache_user_page(struct vm_area_struct *vma, struct page *page, + unsigned long addr, int len); +#define flush_icache_user_page flush_icache_user_page + +void flush_dcache_icache_page(struct page *page); void __flush_dcache_icache(void *page); /** @@ -111,14 +97,6 @@ static inline void invalidate_dcache_range(unsigned long start, mb(); /* sync */ } -#define copy_to_user_page(vma, page, vaddr, dst, src, len) \ - do { \ - memcpy(dst, src, len); \ - flush_icache_user_range(vma, page, vaddr, len); \ - } while (0) -#define copy_from_user_page(vma, page, vaddr, dst, src, len) \ - 
memcpy(dst, src, len) - -#endif /* __KERNEL__ */ +#include <asm-generic/cacheflush.h> #endif /* _ASM_POWERPC_CACHEFLUSH_H */ diff --git a/arch/powerpc/kvm/book3s_64_mmu_hv.c b/arch/powerpc/kvm/book3s_64_mmu_hv.c index 18aed9775a3c..ddfc4c90ebb6 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ b/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -581,7 +581,7 @@ int kvmppc_book3s_hv_page_fault(struct kvm_run *run, struct kvm_vcpu *vcpu, * We always ask for write permission since the common case * is that the page is writable. */ - if (__get_user_pages_fast(hva, 1, 1, &page) == 1) { + if (get_user_page_fast_only(hva, FOLL_WRITE, &page)) { write_ok = true; } else { /* Call KVM generic code to do the slow-path check */ diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c b/arch/powerpc/kvm/book3s_64_mmu_radix.c index 02219e28b1e4..a47fa8b4d0f0 100644 --- a/arch/powerpc/kvm/book3s_64_mmu_radix.c +++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c @@ -795,7 +795,7 @@ int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu, * is that the page is writable. */ hva = gfn_to_hva_memslot(memslot, gfn); - if (!kvm_ro && __get_user_pages_fast(hva, 1, 1, &page) == 1) { + if (!kvm_ro && get_user_page_fast_only(hva, FOLL_WRITE, &page)) { upgrade_write = true; } else { unsigned long pfn; diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 5f7fe13211e9..e2d6a6236aa7 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -577,7 +577,7 @@ void copy_user_page(void *vto, void *vfrom, unsigned long vaddr, flush_dcache_page(pg); } -void flush_icache_user_range(struct vm_area_struct *vma, struct page *page, +void flush_icache_user_page(struct vm_area_struct *vma, struct page *page, unsigned long addr, int len) { unsigned long maddr; @@ -586,7 +586,6 @@ void flush_icache_user_range(struct vm_area_struct *vma, struct page *page, flush_icache_range(maddr, maddr + len); kunmap(page); } -EXPORT_SYMBOL(flush_icache_user_range); /* * System memory should not be in /proc/iomem but various tools expect it diff --git a/arch/powerpc/perf/callchain_64.c b/arch/powerpc/perf/callchain_64.c index b63086b663ef..9cc1a129737e 100644 --- a/arch/powerpc/perf/callchain_64.c +++ b/arch/powerpc/perf/callchain_64.c @@ -30,11 +30,9 @@ int read_user_stack_slow(void __user *ptr, void *buf, int nb) unsigned long addr = (unsigned long) ptr; unsigned long offset; struct page *page; - int nrpages; void *kaddr; - nrpages = __get_user_pages_fast(addr, 1, 1, &page); - if (nrpages == 1) { + if (get_user_page_fast_only(addr, FOLL_WRITE, &page)) { kaddr = page_address(page); /* align address to page boundary */ diff --git a/arch/riscv/include/asm/cacheflush.h b/arch/riscv/include/asm/cacheflush.h index c8677c75f82c..23ff70350992 100644 --- a/arch/riscv/include/asm/cacheflush.h +++ b/arch/riscv/include/asm/cacheflush.h @@ -8,65 +8,6 @@ #include <linux/mm.h> -#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 - -/* - * The cache doesn't need to be flushed when TLB entries change when - * the cache is mapped to physical memory, not virtual memory - */ -static inline void flush_cache_all(void) -{ -} - -static inline void flush_cache_mm(struct mm_struct *mm) -{ -} - -static inline void flush_cache_dup_mm(struct mm_struct *mm) -{ -} - -static inline void flush_cache_range(struct vm_area_struct *vma, - unsigned long start, - unsigned long end) -{ -} - -static inline void flush_cache_page(struct vm_area_struct *vma, - unsigned long vmaddr, - unsigned long pfn) -{ -} - -static inline void flush_dcache_mmap_lock(struct address_space *mapping) -{ -} - -static 
inline void flush_dcache_mmap_unlock(struct address_space *mapping) -{ -} - -static inline void flush_icache_page(struct vm_area_struct *vma, - struct page *page) -{ -} - -static inline void flush_cache_vmap(unsigned long start, unsigned long end) -{ -} - -static inline void flush_cache_vunmap(unsigned long start, unsigned long end) -{ -} - -#define copy_to_user_page(vma, page, vaddr, dst, src, len) \ - do { \ - memcpy(dst, src, len); \ - flush_icache_user_range(vma, page, vaddr, len); \ - } while (0) -#define copy_from_user_page(vma, page, vaddr, dst, src, len) \ - memcpy(dst, src, len) - static inline void local_flush_icache_all(void) { asm volatile ("fence.i" ::: "memory"); @@ -79,13 +20,15 @@ static inline void flush_dcache_page(struct page *page) if (test_bit(PG_dcache_clean, &page->flags)) clear_bit(PG_dcache_clean, &page->flags); } +#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 /* * RISC-V doesn't have an instruction to flush parts of the instruction cache, * so instead we just flush the whole thing. */ #define flush_icache_range(start, end) flush_icache_all() -#define flush_icache_user_range(vma, pg, addr, len) flush_icache_mm(vma->vm_mm, 0) +#define flush_icache_user_page(vma, pg, addr, len) \ + flush_icache_mm(vma->vm_mm, 0) #ifndef CONFIG_SMP @@ -105,4 +48,6 @@ void flush_icache_mm(struct mm_struct *mm, bool local); #define SYS_RISCV_FLUSH_ICACHE_LOCAL 1UL #define SYS_RISCV_FLUSH_ICACHE_ALL (SYS_RISCV_FLUSH_ICACHE_LOCAL) +#include <asm-generic/cacheflush.h> + #endif /* _ASM_RISCV_CACHEFLUSH_H */ diff --git a/arch/sh/include/asm/cacheflush.h b/arch/sh/include/asm/cacheflush.h index b932e42ef028..fe7400079b97 100644 --- a/arch/sh/include/asm/cacheflush.h +++ b/arch/sh/include/asm/cacheflush.h @@ -46,6 +46,7 @@ extern void flush_cache_range(struct vm_area_struct *vma, #define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 1 extern void flush_dcache_page(struct page *page); extern void flush_icache_range(unsigned long start, unsigned long end); +#define flush_icache_user_range flush_icache_range extern void flush_icache_page(struct vm_area_struct *vma, struct page *page); extern void flush_cache_sigtramp(unsigned long address); diff --git a/arch/sparc/include/asm/cacheflush_32.h b/arch/sparc/include/asm/cacheflush_32.h index fb66094a2c30..41c6d734a474 100644 --- a/arch/sparc/include/asm/cacheflush_32.h +++ b/arch/sparc/include/asm/cacheflush_32.h @@ -17,8 +17,6 @@ #define flush_icache_range(start, end) do { } while (0) #define flush_icache_page(vma, pg) do { } while (0) -#define flush_icache_user_range(vma,pg,adr,len) do { } while (0) - #define copy_to_user_page(vma, page, vaddr, dst, src, len) \ do { \ flush_cache_page(vma, vaddr, page_to_pfn(page));\ diff --git a/arch/sparc/include/asm/cacheflush_64.h b/arch/sparc/include/asm/cacheflush_64.h index e7517434d1fa..b9341836597e 100644 --- a/arch/sparc/include/asm/cacheflush_64.h +++ b/arch/sparc/include/asm/cacheflush_64.h @@ -49,7 +49,6 @@ void __flush_dcache_range(unsigned long start, unsigned long end); void flush_dcache_page(struct page *page); #define flush_icache_page(vma, pg) do { } while(0) -#define flush_icache_user_range(vma,pg,adr,len) do { } while (0) void flush_ptrace_access(struct vm_area_struct *, struct page *, unsigned long uaddr, void *kaddr, diff --git a/arch/um/include/asm/tlb.h b/arch/um/include/asm/tlb.h index 70ee60383900..ff9c62828962 100644 --- a/arch/um/include/asm/tlb.h +++ b/arch/um/include/asm/tlb.h @@ -2,6 +2,8 @@ #ifndef __UM_TLB_H #define __UM_TLB_H +#include <linux/mm.h> + #include <asm/tlbflush.h> #include 
<asm-generic/cacheflush.h> #include <asm-generic/tlb.h> diff --git a/arch/unicore32/include/asm/cacheflush.h b/arch/unicore32/include/asm/cacheflush.h index dc8c0b41538f..ff0be92ebc32 100644 --- a/arch/unicore32/include/asm/cacheflush.h +++ b/arch/unicore32/include/asm/cacheflush.h @@ -133,14 +133,6 @@ extern void flush_cache_page(struct vm_area_struct *vma, #define flush_cache_dup_mm(mm) flush_cache_mm(mm) /* - * flush_cache_user_range is used when we want to ensure that the - * Harvard caches are synchronised for the user space address range. - * This is used for the UniCore private sys_cacheflush system call. - */ -#define flush_cache_user_range(vma, start, end) \ - __cpuc_coherent_user_range((start) & PAGE_MASK, PAGE_ALIGN(end)) - -/* * Perform necessary cache operations to ensure that data previously * stored within this range of addresses can be executed by the CPU. */ @@ -170,9 +162,6 @@ extern void flush_dcache_page(struct page *); #define flush_dcache_mmap_lock(mapping) do { } while (0) #define flush_dcache_mmap_unlock(mapping) do { } while (0) -#define flush_icache_user_range(vma, page, addr, len) \ - flush_dcache_page(page) - /* * We don't appear to need to do anything here. In fact, if we did, we'd * duplicate cache flushing elsewhere performed by flush_dcache_page(). diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h index 63feaf2a5f93..b192d917a6d0 100644 --- a/arch/x86/include/asm/cacheflush.h +++ b/arch/x86/include/asm/cacheflush.h @@ -2,6 +2,8 @@ #ifndef _ASM_X86_CACHEFLUSH_H #define _ASM_X86_CACHEFLUSH_H +#include <linux/mm.h> + /* Caches aren't brain-dead on the intel. */ #include <asm-generic/cacheflush.h> #include <asm/special_insns.h> diff --git a/arch/xtensa/include/asm/cacheflush.h b/arch/xtensa/include/asm/cacheflush.h index a0d50be5a8cb..cf907e5bf2f2 100644 --- a/arch/xtensa/include/asm/cacheflush.h +++ b/arch/xtensa/include/asm/cacheflush.h @@ -145,6 +145,8 @@ void local_flush_cache_page(struct vm_area_struct *vma, #endif +#define flush_icache_user_range flush_icache_range + /* Ensure consistency between data and instruction cache. */ #define local_flush_icache_range(start, end) \ do { \ diff --git a/drivers/media/platform/omap3isp/ispvideo.c b/drivers/media/platform/omap3isp/ispvideo.c index 6f769c527fae..10c214bd0903 100644 --- a/drivers/media/platform/omap3isp/ispvideo.c +++ b/drivers/media/platform/omap3isp/ispvideo.c @@ -10,7 +10,6 @@ * Sakari Ailus <sakari.ailus@iki.fi> */ -#include <asm/cacheflush.h> #include <linux/clk.h> #include <linux/mm.h> #include <linux/module.h> @@ -19,6 +18,7 @@ #include <linux/sched.h> #include <linux/slab.h> #include <linux/vmalloc.h> +#include <asm/cacheflush.h> #include <media/v4l2-dev.h> #include <media/v4l2-ioctl.h> diff --git a/drivers/nvdimm/pmem.c b/drivers/nvdimm/pmem.c index 97f948f8f4e6..d1ecd6da11a2 100644 --- a/drivers/nvdimm/pmem.c +++ b/drivers/nvdimm/pmem.c @@ -7,7 +7,6 @@ * Copyright (c) 2015, Boaz Harrosh <boaz@plexistor.com>. 
*/ -#include <asm/cacheflush.h> #include <linux/blkdev.h> #include <linux/hdreg.h> #include <linux/init.h> @@ -25,6 +24,8 @@ #include <linux/dax.h> #include <linux/nd.h> #include <linux/backing-dev.h> +#include <linux/mm.h> +#include <asm/cacheflush.h> #include "pmem.h" #include "pfn.h" #include "nd.h" diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 21a59b598ed8..596132a96cd5 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1762,15 +1762,14 @@ static int set_bit_to_user(int nr, void __user *addr) int bit = nr + (log % PAGE_SIZE) * 8; int r; - r = get_user_pages_fast(log, 1, FOLL_WRITE, &page); + r = pin_user_pages_fast(log, 1, FOLL_WRITE, &page); if (r < 0) return r; BUG_ON(r != 1); base = kmap_atomic(page); set_bit(bit, base); kunmap_atomic(base); - set_page_dirty_lock(page); - put_page(page); + unpin_user_pages_dirty_lock(&page, 1, true); return 0; } diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 9b82bc111d0a..87ce229c63cf 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -854,7 +854,7 @@ static int load_flat_file(struct linux_binprm *bprm, #endif /* CONFIG_BINFMT_FLAT_OLD */ } - flush_icache_range(start_code, end_code); + flush_icache_user_range(start_code, end_code); /* zero the BSS, BRK and stack areas */ if (clear_user((void __user *)(datapos + data_len), bss_len + diff --git a/fs/exec.c b/fs/exec.c index 93ff1c4c7ebb..02d0c5d19be5 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1053,14 +1053,17 @@ out: } EXPORT_SYMBOL_GPL(kernel_read_file_from_fd); +#if defined(CONFIG_HAVE_AOUT) || defined(CONFIG_BINFMT_FLAT) || \ + defined(CONFIG_BINFMT_ELF_FDPIC) ssize_t read_code(struct file *file, unsigned long addr, loff_t pos, size_t len) { ssize_t res = vfs_read(file, (void __user *)addr, len, &pos); if (res > 0) - flush_icache_range(addr, addr + len); + flush_icache_user_range(addr, addr + len); return res; } EXPORT_SYMBOL(read_code); +#endif /* * Maps the mm_struct mm into the current task struct. diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index df2143e05c57..5b405f32971d 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -14,6 +14,7 @@ #include <linux/mm.h> #include <linux/module.h> #include <linux/bpf-cgroup.h> +#include <linux/mount.h> #include "internal.h" static const struct dentry_operations proc_sys_dentry_operations; @@ -1703,3 +1704,147 @@ int __init proc_sys_init(void) return sysctl_init(); } + +struct sysctl_alias { + const char *kernel_param; + const char *sysctl_param; +}; + +/* + * Historically some settings had both sysctl and a command line parameter. + * With the generic sysctl. parameter support, we can handle them at a single + * place and only keep the historical name for compatibility. This is not meant + * to add brand new aliases. When adding existing aliases, consider whether + * the possibly different moment of changing the value (e.g. from early_param + * to the moment do_sysctl_args() is called) is an issue for the specific + * parameter. 
+ */ +static const struct sysctl_alias sysctl_aliases[] = { + {"hardlockup_all_cpu_backtrace", "kernel.hardlockup_all_cpu_backtrace" }, + {"hung_task_panic", "kernel.hung_task_panic" }, + {"numa_zonelist_order", "vm.numa_zonelist_order" }, + {"softlockup_all_cpu_backtrace", "kernel.softlockup_all_cpu_backtrace" }, + {"softlockup_panic", "kernel.softlockup_panic" }, + { } +}; + +static const char *sysctl_find_alias(char *param) +{ + const struct sysctl_alias *alias; + + for (alias = &sysctl_aliases[0]; alias->kernel_param != NULL; alias++) { + if (strcmp(alias->kernel_param, param) == 0) + return alias->sysctl_param; + } + + return NULL; +} + +/* Set sysctl value passed on kernel command line. */ +static int process_sysctl_arg(char *param, char *val, + const char *unused, void *arg) +{ + char *path; + struct vfsmount **proc_mnt = arg; + struct file_system_type *proc_fs_type; + struct file *file; + int len; + int err; + loff_t pos = 0; + ssize_t wret; + + if (strncmp(param, "sysctl", sizeof("sysctl") - 1) == 0) { + param += sizeof("sysctl") - 1; + + if (param[0] != '/' && param[0] != '.') + return 0; + + param++; + } else { + param = (char *) sysctl_find_alias(param); + if (!param) + return 0; + } + + /* + * To set sysctl options, we use a temporary mount of proc, look up the + * respective sys/ file and write to it. To avoid mounting it when no + * options were given, we mount it only when the first sysctl option is + * found. Why not a persistent mount? There are problems with a + * persistent mount of proc in that it forces userspace not to use any + * proc mount options. + */ + if (!*proc_mnt) { + proc_fs_type = get_fs_type("proc"); + if (!proc_fs_type) { + pr_err("Failed to find procfs to set sysctl from command line\n"); + return 0; + } + *proc_mnt = kern_mount(proc_fs_type); + put_filesystem(proc_fs_type); + if (IS_ERR(*proc_mnt)) { + pr_err("Failed to mount procfs to set sysctl from command line\n"); + return 0; + } + } + + path = kasprintf(GFP_KERNEL, "sys/%s", param); + if (!path) + panic("%s: Failed to allocate path for %s\n", __func__, param); + strreplace(path, '.', '/'); + + file = file_open_root((*proc_mnt)->mnt_root, *proc_mnt, path, O_WRONLY, 0); + if (IS_ERR(file)) { + err = PTR_ERR(file); + if (err == -ENOENT) + pr_err("Failed to set sysctl parameter '%s=%s': parameter not found\n", + param, val); + else if (err == -EACCES) + pr_err("Failed to set sysctl parameter '%s=%s': permission denied (read-only?)\n", + param, val); + else + pr_err("Error %pe opening proc file to set sysctl parameter '%s=%s'\n", + file, param, val); + goto out; + } + len = strlen(val); + wret = kernel_write(file, val, len, &pos); + if (wret < 0) { + err = wret; + if (err == -EINVAL) + pr_err("Failed to set sysctl parameter '%s=%s': invalid value\n", + param, val); + else + pr_err("Error %pe writing to proc file to set sysctl parameter '%s=%s'\n", + ERR_PTR(err), param, val); + } else if (wret != len) { + pr_err("Wrote only %zd bytes of %d writing to proc file %s to set sysctl parameter '%s=%s\n", + wret, len, path, param, val); + } + + err = filp_close(file, NULL); + if (err) + pr_err("Error %pe closing proc file to set sysctl parameter '%s=%s\n", + ERR_PTR(err), param, val); +out: + kfree(path); + return 0; +} + +void do_sysctl_args(void) +{ + char *command_line; + struct vfsmount *proc_mnt = NULL; + + command_line = kstrdup(saved_command_line, GFP_KERNEL); + if (!command_line) + panic("%s: Failed to allocate copy of command line\n", __func__); + + parse_args("Setting sysctl args", command_line, + 
NULL, 0, -1, -1, &proc_mnt, process_sysctl_arg); + + if (proc_mnt) + kern_unmount(proc_mnt); + + kfree(command_line); +} diff --git a/include/asm-generic/cacheflush.h b/include/asm-generic/cacheflush.h index cac7404b2bdd..907fa5d16494 100644 --- a/include/asm-generic/cacheflush.h +++ b/include/asm-generic/cacheflush.h @@ -1,11 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __ASM_CACHEFLUSH_H -#define __ASM_CACHEFLUSH_H - -/* Keep includes the same across arches. */ -#include <linux/mm.h> - -#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 +#ifndef _ASM_GENERIC_CACHEFLUSH_H +#define _ASM_GENERIC_CACHEFLUSH_H /* * The cache doesn't need to be flushed when TLB entries change when @@ -45,12 +40,14 @@ static inline void flush_cache_page(struct vm_area_struct *vma, } #endif -#ifndef flush_dcache_page +#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE static inline void flush_dcache_page(struct page *page) { } +#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 #endif + #ifndef flush_dcache_mmap_lock static inline void flush_dcache_mmap_lock(struct address_space *mapping) { @@ -69,6 +66,10 @@ static inline void flush_icache_range(unsigned long start, unsigned long end) } #endif +#ifndef flush_icache_user_range +#define flush_icache_user_range flush_icache_range +#endif + #ifndef flush_icache_page static inline void flush_icache_page(struct vm_area_struct *vma, struct page *page) @@ -76,8 +77,8 @@ static inline void flush_icache_page(struct vm_area_struct *vma, } #endif -#ifndef flush_icache_user_range -static inline void flush_icache_user_range(struct vm_area_struct *vma, +#ifndef flush_icache_user_page +static inline void flush_icache_user_page(struct vm_area_struct *vma, struct page *page, unsigned long addr, int len) { @@ -100,7 +101,7 @@ static inline void flush_cache_vunmap(unsigned long start, unsigned long end) #define copy_to_user_page(vma, page, vaddr, dst, src, len) \ do { \ memcpy(dst, src, len); \ - flush_icache_user_range(vma, page, vaddr, len); \ + flush_icache_user_page(vma, page, vaddr, len); \ } while (0) #endif @@ -109,4 +110,4 @@ static inline void flush_cache_vunmap(unsigned long start, unsigned long end) memcpy(dst, src, len) #endif -#endif /* __ASM_CACHEFLUSH_H */ +#endif /* _ASM_GENERIC_CACHEFLUSH_H */ diff --git a/include/linux/dev_printk.h b/include/linux/dev_printk.h index 5aad06b4ca7b..3028b644b4fb 100644 --- a/include/linux/dev_printk.h +++ b/include/linux/dev_printk.h @@ -109,7 +109,8 @@ void _dev_info(const struct device *dev, const char *fmt, ...) #define dev_info(dev, fmt, ...) \ _dev_info(dev, dev_fmt(fmt), ##__VA_ARGS__) -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #define dev_dbg(dev, fmt, ...) \ dynamic_dev_dbg(dev, dev_fmt(fmt), ##__VA_ARGS__) #elif defined(DEBUG) @@ -181,7 +182,8 @@ do { \ dev_level_ratelimited(dev_notice, dev, fmt, ##__VA_ARGS__) #define dev_info_ratelimited(dev, fmt, ...) \ dev_level_ratelimited(dev_info, dev, fmt, ##__VA_ARGS__) -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) /* descriptor check is first to prevent flooding with "callbacks suppressed" */ #define dev_dbg_ratelimited(dev, fmt, ...) 
\ do { \ diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h index 4cf02ecd67de..abcd5fde30eb 100644 --- a/include/linux/dynamic_debug.h +++ b/include/linux/dynamic_debug.h @@ -48,7 +48,7 @@ struct _ddebug { -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG_CORE) int ddebug_add_module(struct _ddebug *tab, unsigned int n, const char *modname); extern int ddebug_remove_module(const char *mod_name); diff --git a/include/linux/ipc_namespace.h b/include/linux/ipc_namespace.h index c309f43bde45..a06a78c67f19 100644 --- a/include/linux/ipc_namespace.h +++ b/include/linux/ipc_namespace.h @@ -68,6 +68,8 @@ struct ipc_namespace { struct user_namespace *user_ns; struct ucounts *ucounts; + struct llist_node mnt_llist; + struct ns_common ns; } __randomize_layout; diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 9b7a8d74a9d6..82d91547d122 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -520,6 +520,12 @@ static inline u32 int_sqrt64(u64 x) } #endif +#ifdef CONFIG_SMP +extern unsigned int sysctl_oops_all_cpu_backtrace; +#else +#define sysctl_oops_all_cpu_backtrace 0 +#endif /* CONFIG_SMP */ + extern void bust_spinlocks(int yes); extern int oops_in_progress; /* If set, an oops, panic(), BUG() or die() is in progress */ extern int panic_timeout; @@ -528,6 +534,8 @@ extern int panic_on_oops; extern int panic_on_unrecovered_nmi; extern int panic_on_io_nmi; extern int panic_on_warn; +extern unsigned long panic_on_taint; +extern bool panic_on_taint_nousertaint; extern int sysctl_panic_on_rcu_stall; extern int sysctl_panic_on_stackoverflow; @@ -596,6 +604,7 @@ extern enum system_states { #define TAINT_AUX 16 #define TAINT_RANDSTRUCT 17 #define TAINT_FLAGS_COUNT 18 +#define TAINT_FLAGS_MAX ((1UL << TAINT_FLAGS_COUNT) - 1) struct taint_flag { char c_true; /* character printed when tainted */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 86adc71a972f..9d6042178ca7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1706,6 +1706,8 @@ long pin_user_pages(unsigned long start, unsigned long nr_pages, struct vm_area_struct **vmas); long get_user_pages_locked(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, int *locked); +long pin_user_pages_locked(unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, int *locked); long get_user_pages_unlocked(unsigned long start, unsigned long nr_pages, struct page **pages, unsigned int gup_flags); long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, @@ -1824,10 +1826,16 @@ extern int mprotect_fixup(struct vm_area_struct *vma, /* * doesn't attempt to fault and will return short. */ -int __get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages); +int get_user_pages_fast_only(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages); int pin_user_pages_fast_only(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages); + +static inline bool get_user_page_fast_only(unsigned long addr, + unsigned int gup_flags, struct page **pagep) +{ + return get_user_pages_fast_only(addr, 1, gup_flags, pagep) == 1; +} /* * per-process(per-mm_struct) statistics. */ diff --git a/include/linux/net.h b/include/linux/net.h index e10f378194a5..016a9c5faa34 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -264,7 +264,8 @@ do { \ net_ratelimited_function(pr_warn, fmt, ##__VA_ARGS__) #define net_info_ratelimited(fmt, ...) 
\ net_ratelimited_function(pr_info, fmt, ##__VA_ARGS__) -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #define net_dbg_ratelimited(fmt, ...) \ do { \ DEFINE_DYNAMIC_DEBUG_METADATA(descriptor, fmt); \ diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 1a96e9c4ec36..5b364a2e0006 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4942,7 +4942,8 @@ do { \ #define MODULE_ALIAS_NETDEV(device) \ MODULE_ALIAS("netdev-" device) -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #define netdev_dbg(__dev, format, args...) \ do { \ dynamic_netdev_dbg(__dev, format, ##args); \ @@ -5012,7 +5013,8 @@ do { \ #define netif_info(priv, type, dev, fmt, args...) \ netif_level(info, priv, type, dev, fmt, ##args) -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #define netif_dbg(priv, type, netdev, format, args...) \ do { \ if (netif_msg_##type(priv)) \ diff --git a/include/linux/printk.h b/include/linux/printk.h index 3cc2f178bf06..fc8f03c54543 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -399,7 +399,8 @@ extern int kptr_restrict; /* If you are writing a driver, please use dev_dbg instead */ -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #include <linux/dynamic_debug.h> /** @@ -535,7 +536,8 @@ extern int kptr_restrict; #endif /* If you are writing a driver, please use dev_dbg instead */ -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) /* descriptor check is first to prevent flooding with "callbacks suppressed" */ #define pr_debug_ratelimited(fmt, ...) 
\ do { \ @@ -582,7 +584,8 @@ static inline void print_hex_dump_bytes(const char *prefix_str, int prefix_type, #endif -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #define print_hex_dump_debug(prefix_str, prefix_type, rowsize, \ groupsize, buf, len, ascii) \ dynamic_hex_dump(prefix_str, prefix_type, rowsize, \ diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 7b4d3a49b6c5..660ac49f2b53 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -7,6 +7,13 @@ struct ctl_table; #ifdef CONFIG_DETECT_HUNG_TASK + +#ifdef CONFIG_SMP +extern unsigned int sysctl_hung_task_all_cpu_backtrace; +#else +#define sysctl_hung_task_all_cpu_backtrace 0 +#endif /* CONFIG_SMP */ + extern int sysctl_hung_task_check_count; extern unsigned int sysctl_hung_task_panic; extern unsigned long sysctl_hung_task_timeout_secs; diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index f2401e45a3c2..50bb7f383a1b 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -197,6 +197,7 @@ struct ctl_table_header *register_sysctl_paths(const struct ctl_path *path, void unregister_sysctl_table(struct ctl_table_header * table); extern int sysctl_init(void); +void do_sysctl_args(void); extern int pwrsw_enabled; extern int unaligned_enabled; @@ -235,6 +236,9 @@ static inline void setup_sysctl_set(struct ctl_table_set *p, { } +static inline void do_sysctl_args(void) +{ +} #endif /* CONFIG_SYSCTL */ int sysctl_max_threads(struct ctl_table *table, int write, void *buffer, diff --git a/include/linux/xarray.h b/include/linux/xarray.h index 14c893433139..b4d70e7568b2 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -576,7 +576,7 @@ void __xa_clear_mark(struct xarray *, unsigned long index, xa_mark_t); * * Context: Any context. Takes and releases the xa_lock while * disabling softirqs. - * Return: The entry which used to be at this index. + * Return: The old entry at this index or xa_err() if an error happened. */ static inline void *xa_store_bh(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) @@ -602,7 +602,7 @@ static inline void *xa_store_bh(struct xarray *xa, unsigned long index, * * Context: Process context. Takes and releases the xa_lock while * disabling interrupts. - * Return: The entry which used to be at this index. + * Return: The old entry at this index or xa_err() if an error happened. */ static inline void *xa_store_irq(struct xarray *xa, unsigned long index, void *entry, gfp_t gfp) diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 033e7044f29c..ef2f3986c493 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -100,7 +100,8 @@ void ibdev_notice(const struct ib_device *ibdev, const char *format, ...); __printf(2, 3) __cold void ibdev_info(const struct ib_device *ibdev, const char *format, ...); -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) #define ibdev_dbg(__dev, format, args...) \ dynamic_ibdev_dbg(__dev, format, ##args) #else @@ -133,7 +134,8 @@ do { \ #define ibdev_info_ratelimited(ibdev, fmt, ...) 
\ ibdev_level_ratelimited(ibdev_info, ibdev, fmt, ##__VA_ARGS__) -#if defined(CONFIG_DYNAMIC_DEBUG) +#if defined(CONFIG_DYNAMIC_DEBUG) || \ + (defined(CONFIG_DYNAMIC_DEBUG_CORE) && defined(DYNAMIC_DEBUG_MODULE)) /* descriptor check is first to prevent flooding with "callbacks suppressed" */ #define ibdev_dbg_ratelimited(ibdev, fmt, ...) \ do { \ diff --git a/init/main.c b/init/main.c index 76df62fc3e2c..b59e09353881 100644 --- a/init/main.c +++ b/init/main.c @@ -1412,6 +1412,8 @@ static int __ref kernel_init(void *unused) rcu_end_inkernel_boot(); + do_sysctl_args(); + if (ramdisk_execute_command) { ret = run_init_process(ramdisk_execute_command); if (!ret) diff --git a/ipc/msg.c b/ipc/msg.c index caca67368cb5..acd1bc7af55a 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -268,6 +268,8 @@ static void expunge_all(struct msg_queue *msq, int res, * before freeque() is called. msg_ids.rwsem remains locked on exit. */ static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) + __releases(RCU) + __releases(&msq->q_perm) { struct msg_msg *msg, *t; struct msg_queue *msq = container_of(ipcp, struct msg_queue, q_perm); diff --git a/ipc/namespace.c b/ipc/namespace.c index fdc3b5f3f53a..24e7b45320f7 100644 --- a/ipc/namespace.c +++ b/ipc/namespace.c @@ -117,6 +117,10 @@ void free_ipcs(struct ipc_namespace *ns, struct ipc_ids *ids, static void free_ipc_ns(struct ipc_namespace *ns) { + /* mq_put_mnt() waits for a grace period as kern_unmount() + * uses synchronize_rcu(). + */ + mq_put_mnt(ns); sem_exit_ns(ns); msg_exit_ns(ns); shm_exit_ns(ns); @@ -127,6 +131,21 @@ static void free_ipc_ns(struct ipc_namespace *ns) kfree(ns); } +static LLIST_HEAD(free_ipc_list); +static void free_ipc(struct work_struct *unused) +{ + struct llist_node *node = llist_del_all(&free_ipc_list); + struct ipc_namespace *n, *t; + + llist_for_each_entry_safe(n, t, node, mnt_llist) + free_ipc_ns(n); +} + +/* + * The work queue is used to avoid the cost of synchronize_rcu in kern_unmount. + */ +static DECLARE_WORK(free_ipc_work, free_ipc); + /* * put_ipc_ns - drop a reference to an ipc namespace. * @ns: the namespace to put @@ -148,8 +167,9 @@ void put_ipc_ns(struct ipc_namespace *ns) if (refcount_dec_and_lock(&ns->count, &mq_lock)) { mq_clear_sbinfo(ns); spin_unlock(&mq_lock); - mq_put_mnt(ns); - free_ipc_ns(ns); + + if (llist_add(&ns->mnt_llist, &free_ipc_list)) + schedule_work(&free_ipc_work); } } diff --git a/kernel/events/core.c b/kernel/events/core.c index fcfadecd3a08..63d66bbebbd5 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6934,12 +6934,12 @@ static u64 perf_virt_to_phys(u64 virt) * Walking the pages tables for user address. * Interrupts are disabled, so it prevents any tear down * of the page tables. - * Try IRQ-safe __get_user_pages_fast first. + * Try IRQ-safe get_user_page_fast_only first. * If failed, leave phys_addr as 0. */ if (current->mm != NULL) { pagefault_disable(); - if (__get_user_pages_fast(virt, 1, 0, &p) == 1) + if (get_user_page_fast_only(virt, 0, &p)) phys_addr = page_to_phys(p) + virt % PAGE_SIZE; pagefault_enable(); } diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index eddc8db96027..e51ec844c87c 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -1668,7 +1668,7 @@ void __weak arch_uprobe_copy_ixol(struct page *page, unsigned long vaddr, copy_to_page(page, vaddr, src, len); /* - * We probably need flush_icache_user_range() but it needs vma. + * We probably need flush_icache_user_page() but it needs vma. 
* This should work on most of architectures by default. If * architecture needs to do something different it can define * its own version of the function. diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 14a625c16cb3..ce76f490126c 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -53,9 +53,18 @@ int __read_mostly sysctl_hung_task_warnings = 10; static int __read_mostly did_panic; static bool hung_task_show_lock; static bool hung_task_call_panic; +static bool hung_task_show_all_bt; static struct task_struct *watchdog_task; +#ifdef CONFIG_SMP +/* + * Should we dump all CPUs backtraces in a hung task event? + * Defaults to 0, can be changed via sysctl. + */ +unsigned int __read_mostly sysctl_hung_task_all_cpu_backtrace; +#endif /* CONFIG_SMP */ + /* * Should we panic (and reboot, if panic_timeout= is set) when a * hung task is detected: @@ -63,16 +72,6 @@ static struct task_struct *watchdog_task; unsigned int __read_mostly sysctl_hung_task_panic = CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE; -static int __init hung_task_panic_setup(char *str) -{ - int rc = kstrtouint(str, 0, &sysctl_hung_task_panic); - - if (rc) - return rc; - return 1; -} -__setup("hung_task_panic=", hung_task_panic_setup); - static int hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr) { @@ -137,6 +136,9 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) " disables this message.\n"); sched_show_task(t); hung_task_show_lock = true; + + if (sysctl_hung_task_all_cpu_backtrace) + hung_task_show_all_bt = true; } touch_nmi_watchdog(); @@ -201,10 +203,14 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) rcu_read_unlock(); if (hung_task_show_lock) debug_show_all_locks(); - if (hung_task_call_panic) { + + if (hung_task_show_all_bt) { + hung_task_show_all_bt = false; trigger_all_cpu_backtrace(); - panic("hung_task: blocked tasks"); } + + if (hung_task_call_panic) + panic("hung_task: blocked tasks"); } static long hung_timeout_jiffies(unsigned long last_checked, diff --git a/kernel/module.c b/kernel/module.c index ef400c389f49..e8a198588f26 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3344,12 +3344,6 @@ static int check_module_license_and_versions(struct module *mod) static void flush_module_icache(const struct module *mod) { - mm_segment_t old_fs; - - /* flush the icache in correct context */ - old_fs = get_fs(); - set_fs(KERNEL_DS); - /* * Flush the instruction cache, since we've played with text. * Do it before processing of module parameters, so the module @@ -3361,8 +3355,6 @@ static void flush_module_icache(const struct module *mod) + mod->init_layout.size); flush_icache_range((unsigned long)mod->core_layout.base, (unsigned long)mod->core_layout.base + mod->core_layout.size); - - set_fs(old_fs); } int __weak module_frob_arch_sections(Elf_Ehdr *hdr, diff --git a/kernel/panic.c b/kernel/panic.c index b69ee9e76cb2..85568bbfb12b 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -36,6 +36,14 @@ #define PANIC_TIMER_STEP 100 #define PANIC_BLINK_SPD 18 +#ifdef CONFIG_SMP +/* + * Should we dump all CPUs backtraces in an oops event? + * Defaults to 0, can be changed via sysctl. + */ +unsigned int __read_mostly sysctl_oops_all_cpu_backtrace; +#endif /* CONFIG_SMP */ + int panic_on_oops = CONFIG_PANIC_ON_OOPS_VALUE; static unsigned long tainted_mask = IS_ENABLED(CONFIG_GCC_PLUGIN_RANDSTRUCT) ? 
(1 << TAINT_RANDSTRUCT) : 0; @@ -44,6 +52,8 @@ static int pause_on_oops_flag; static DEFINE_SPINLOCK(pause_on_oops_lock); bool crash_kexec_post_notifiers; int panic_on_warn __read_mostly; +unsigned long panic_on_taint; +bool panic_on_taint_nousertaint = false; int panic_timeout = CONFIG_PANIC_TIMEOUT; EXPORT_SYMBOL_GPL(panic_timeout); @@ -434,6 +444,11 @@ void add_taint(unsigned flag, enum lockdep_ok lockdep_ok) pr_warn("Disabling lock debugging due to kernel taint\n"); set_bit(flag, &tainted_mask); + + if (tainted_mask & panic_on_taint) { + panic_on_taint = 0; + panic("panic_on_taint set ..."); + } } EXPORT_SYMBOL(add_taint); @@ -515,6 +530,9 @@ void oops_enter(void) /* can't trust the integrity of the kernel anymore: */ debug_locks_off(); do_oops_enter_exit(); + + if (sysctl_oops_all_cpu_backtrace) + trigger_all_cpu_backtrace(); } /* @@ -686,3 +704,30 @@ static int __init oops_setup(char *s) return 0; } early_param("oops", oops_setup); + +static int __init panic_on_taint_setup(char *s) +{ + char *taint_str; + + if (!s) + return -EINVAL; + + taint_str = strsep(&s, ","); + if (kstrtoul(taint_str, 16, &panic_on_taint)) + return -EINVAL; + + /* make sure panic_on_taint doesn't hold out-of-range TAINT flags */ + panic_on_taint &= TAINT_FLAGS_MAX; + + if (!panic_on_taint) + return -EINVAL; + + if (s && !strcmp(s, "nousertaint")) + panic_on_taint_nousertaint = true; + + pr_info("panic_on_taint: bitmask=0x%lx nousertaint_mode=%sabled\n", + panic_on_taint, panic_on_taint_nousertaint ? "en" : "dis"); + + return 0; +} +early_param("panic_on_taint", panic_on_taint_setup); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 715774d8c55f..db1ce7af2563 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -866,15 +866,23 @@ static int proc_taint(struct ctl_table *table, int write, return err; if (write) { + int i; + + /* + * If we are relying on panic_on_taint not producing + * false positives due to userspace input, bail out + * before setting the requested taint flags. + */ + if (panic_on_taint_nousertaint && (tmptaint & panic_on_taint)) + return -EINVAL; + /* * Poor man's atomic or. 
Not worth adding a primitive * to everyone's atomic.h for this */ - int i; - for (i = 0; i < BITS_PER_LONG && tmptaint >> i; i++) { - if ((tmptaint >> i) & 1) + for (i = 0; i < TAINT_FLAGS_COUNT; i++) + if ((1UL << i) & tmptaint) add_taint(i, LOCKDEP_STILL_OK); - } } return err; @@ -2141,6 +2149,17 @@ static struct ctl_table kern_table[] = { .proc_handler = proc_dointvec, }, #endif +#ifdef CONFIG_SMP + { + .procname = "oops_all_cpu_backtrace", + .data = &sysctl_oops_all_cpu_backtrace, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif /* CONFIG_SMP */ { .procname = "pid_max", .data = &pid_max, @@ -2428,6 +2447,17 @@ static struct ctl_table kern_table[] = { }, #endif #ifdef CONFIG_DETECT_HUNG_TASK +#ifdef CONFIG_SMP + { + .procname = "hung_task_all_cpu_backtrace", + .data = &sysctl_hung_task_all_cpu_backtrace, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, +#endif /* CONFIG_SMP */ { .procname = "hung_task_panic", .data = &sysctl_hung_task_panic, diff --git a/kernel/watchdog.c b/kernel/watchdog.c index 53ff2c81b084..5abb5b22ad13 100644 --- a/kernel/watchdog.c +++ b/kernel/watchdog.c @@ -50,6 +50,11 @@ struct cpumask watchdog_cpumask __read_mostly; unsigned long *watchdog_cpumask_bits = cpumask_bits(&watchdog_cpumask); #ifdef CONFIG_HARDLOCKUP_DETECTOR + +# ifdef CONFIG_SMP +int __read_mostly sysctl_hardlockup_all_cpu_backtrace; +# endif /* CONFIG_SMP */ + /* * Should we panic when a soft-lockup or hard-lockup occurs: */ @@ -82,16 +87,6 @@ static int __init hardlockup_panic_setup(char *str) } __setup("nmi_watchdog=", hardlockup_panic_setup); -# ifdef CONFIG_SMP -int __read_mostly sysctl_hardlockup_all_cpu_backtrace; - -static int __init hardlockup_all_cpu_backtrace_setup(char *str) -{ - sysctl_hardlockup_all_cpu_backtrace = !!simple_strtol(str, NULL, 0); - return 1; -} -__setup("hardlockup_all_cpu_backtrace=", hardlockup_all_cpu_backtrace_setup); -# endif /* CONFIG_SMP */ #endif /* CONFIG_HARDLOCKUP_DETECTOR */ /* @@ -163,6 +158,10 @@ static void lockup_detector_update_enable(void) #define SOFTLOCKUP_RESET ULONG_MAX +#ifdef CONFIG_SMP +int __read_mostly sysctl_softlockup_all_cpu_backtrace; +#endif + /* Global variables, exported for sysctl */ unsigned int __read_mostly softlockup_panic = CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE; @@ -178,13 +177,6 @@ static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts); static DEFINE_PER_CPU(unsigned long, hrtimer_interrupts_saved); static unsigned long soft_lockup_nmi_warn; -static int __init softlockup_panic_setup(char *str) -{ - softlockup_panic = simple_strtoul(str, NULL, 0); - return 1; -} -__setup("softlockup_panic=", softlockup_panic_setup); - static int __init nowatchdog_setup(char *str) { watchdog_user_enabled = 0; @@ -206,17 +198,6 @@ static int __init watchdog_thresh_setup(char *str) } __setup("watchdog_thresh=", watchdog_thresh_setup); -#ifdef CONFIG_SMP -int __read_mostly sysctl_softlockup_all_cpu_backtrace; - -static int __init softlockup_all_cpu_backtrace_setup(char *str) -{ - sysctl_softlockup_all_cpu_backtrace = !!simple_strtol(str, NULL, 0); - return 1; -} -__setup("softlockup_all_cpu_backtrace=", softlockup_all_cpu_backtrace_setup); -#endif - static void __lockup_detector_cleanup(void); /* diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 9bd4eb7f5ec1..333e878d8af9 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -99,6 +99,7 @@ config 
DYNAMIC_DEBUG default n depends on PRINTK depends on (DEBUG_FS || PROC_FS) + select DYNAMIC_DEBUG_CORE help Compiles debug level messages into the kernel, which would not @@ -165,6 +166,17 @@ config DYNAMIC_DEBUG See Documentation/admin-guide/dynamic-debug-howto.rst for additional information. +config DYNAMIC_DEBUG_CORE + bool "Enable core function of dynamic debug support" + depends on PRINTK + depends on (DEBUG_FS || PROC_FS) + help + Enable core functional support of dynamic debug. It is useful + when you want to tie dynamic debug to your kernel modules with + DYNAMIC_DEBUG_MODULE defined for each of them, especially for + the case of embedded system where the kernel image size is + sensitive for people. + config SYMBOLIC_ERRNAME bool "Support symbolic error names in printf" default y if PRINTK diff --git a/lib/Makefile b/lib/Makefile index 32f19b4d1d2a..315516fa4ef4 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -190,7 +190,7 @@ lib-$(CONFIG_GENERIC_BUG) += bug.o obj-$(CONFIG_HAVE_ARCH_TRACEHOOK) += syscall.o -obj-$(CONFIG_DYNAMIC_DEBUG) += dynamic_debug.o +obj-$(CONFIG_DYNAMIC_DEBUG_CORE) += dynamic_debug.o obj-$(CONFIG_SYMBOLIC_ERRNAME) += errname.o obj-$(CONFIG_NLATTR) += nlattr.o diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c index 8f199f403ab5..321437bbf87d 100644 --- a/lib/dynamic_debug.c +++ b/lib/dynamic_debug.c @@ -1032,8 +1032,13 @@ static int __init dynamic_debug_init(void) int verbose_bytes = 0; if (&__start___verbose == &__stop___verbose) { - pr_warn("_ddebug table is empty in a CONFIG_DYNAMIC_DEBUG build\n"); - return 1; + if (IS_ENABLED(CONFIG_DYNAMIC_DEBUG)) { + pr_warn("_ddebug table is empty in a CONFIG_DYNAMIC_DEBUG build\n"); + return 1; + } + pr_info("Ignore empty _ddebug table in a CONFIG_DYNAMIC_DEBUG_CORE build\n"); + ddebug_init_success = 1; + return 0; } iter = __start___verbose; modname = iter->modname; diff --git a/lib/test_sysctl.c b/lib/test_sysctl.c index 566dad3f4196..84eaae22d3a6 100644 --- a/lib/test_sysctl.c +++ b/lib/test_sysctl.c @@ -44,6 +44,8 @@ struct test_sysctl_data { int int_0002; int int_0003[4]; + int boot_int; + unsigned int uint_0001; char string_0001[65]; @@ -61,6 +63,8 @@ static struct test_sysctl_data test_data = { .int_0003[2] = 2, .int_0003[3] = 3, + .boot_int = 0, + .uint_0001 = 314, .string_0001 = "(none)", @@ -92,6 +96,15 @@ static struct ctl_table test_table[] = { .proc_handler = proc_dointvec, }, { + .procname = "boot_int", + .data = &test_data.boot_int, + .maxlen = sizeof(test_data.boot_int), + .mode = 0644, + .proc_handler = proc_dointvec, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { .procname = "uint_0001", .data = &test_data.uint_0001, .maxlen = sizeof(unsigned int), diff --git a/mm/frame_vector.c b/mm/frame_vector.c index c431ca81dad5..4107dbca0056 100644 --- a/mm/frame_vector.c +++ b/mm/frame_vector.c @@ -72,7 +72,7 @@ int get_vaddr_frames(unsigned long start, unsigned int nr_frames, if (!(vma->vm_flags & (VM_IO | VM_PFNMAP))) { vec->got_ref = true; vec->is_pfns = false; - ret = get_user_pages_locked(start, nr_frames, + ret = pin_user_pages_locked(start, nr_frames, gup_flags, (struct page **)(vec->ptrs), &locked); goto out; } @@ -122,7 +122,6 @@ EXPORT_SYMBOL(get_vaddr_frames); */ void put_vaddr_frames(struct frame_vector *vec) { - int i; struct page **pages; if (!vec->got_ref) @@ -135,8 +134,8 @@ void put_vaddr_frames(struct frame_vector *vec) */ if (WARN_ON(IS_ERR(pages))) goto out; - for (i = 0; i < vec->nr_frames; i++) - put_page(pages[i]); + + unpin_user_pages(pages, vec->nr_frames); vec->got_ref 
= false; out: vec->nr_frames = 0; @@ -2035,6 +2035,12 @@ long get_user_pages_locked(unsigned long start, unsigned long nr_pages, */ if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) return -EINVAL; + /* + * FOLL_PIN must only be set internally by the pin_user_pages*() APIs, + * never directly by the caller, so enforce that: + */ + if (WARN_ON_ONCE(gup_flags & FOLL_PIN)) + return -EINVAL; return __get_user_pages_locked(current, current->mm, start, nr_pages, pages, NULL, locked, @@ -2294,7 +2300,7 @@ pte_unmap: * to be special. * * For a futex to be placed on a THP tail page, get_futex_key requires a - * __get_user_pages_fast implementation that can pin pages. Thus it's still + * get_user_pages_fast_only implementation that can pin pages. Thus it's still * useful to have gup_huge_pmd even if we can't operate on ptes. */ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, @@ -2699,7 +2705,7 @@ static inline void gup_pgd_range(unsigned long addr, unsigned long end, #ifndef gup_fast_permitted /* - * Check if it's allowed to use __get_user_pages_fast() for the range, or + * Check if it's allowed to use get_user_pages_fast_only() for the range, or * we need to fall back to the slow version: */ static bool gup_fast_permitted(unsigned long start, unsigned long end) @@ -2811,8 +2817,14 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages, return ret; } - -/* +/** + * get_user_pages_fast_only() - pin user pages in memory + * @start: starting user address + * @nr_pages: number of pages from start to pin + * @gup_flags: flags modifying pin behaviour + * @pages: array that receives pointers to the pages pinned. + * Should be at least nr_pages long. + * * Like get_user_pages_fast() except it's IRQ-safe in that it won't fall back to * the regular GUP. * Note a difference with get_user_pages_fast: this always returns the @@ -2825,8 +2837,8 @@ static int internal_get_user_pages_fast(unsigned long start, int nr_pages, * access can get ambiguous page results. If you call this function without * 'write' set, you'd better be sure that you're ok with that ambiguity. */ -int __get_user_pages_fast(unsigned long start, int nr_pages, int write, - struct page **pages) +int get_user_pages_fast_only(unsigned long start, int nr_pages, + unsigned int gup_flags, struct page **pages) { int nr_pinned; /* @@ -2836,10 +2848,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, * FOLL_FAST_ONLY is required in order to match the API description of * this routine: no fall back to regular ("slow") GUP. */ - unsigned int gup_flags = FOLL_GET | FOLL_FAST_ONLY; - - if (write) - gup_flags |= FOLL_WRITE; + gup_flags |= FOLL_GET | FOLL_FAST_ONLY; nr_pinned = internal_get_user_pages_fast(start, nr_pages, gup_flags, pages); @@ -2855,7 +2864,7 @@ int __get_user_pages_fast(unsigned long start, int nr_pages, int write, return nr_pinned; } -EXPORT_SYMBOL_GPL(__get_user_pages_fast); +EXPORT_SYMBOL_GPL(get_user_pages_fast_only); /** * get_user_pages_fast() - pin user pages in memory @@ -2909,9 +2918,6 @@ EXPORT_SYMBOL_GPL(get_user_pages_fast); * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please * see Documentation/core-api/pin_user_pages.rst for further details. - * - * This is intended for Case 1 (DIO) in Documentation/core-api/pin_user_pages.rst. It - * is NOT intended for Case 2 (RDMA: long-term pins). 
*/ int pin_user_pages_fast(unsigned long start, int nr_pages, unsigned int gup_flags, struct page **pages) @@ -2926,8 +2932,8 @@ int pin_user_pages_fast(unsigned long start, int nr_pages, EXPORT_SYMBOL_GPL(pin_user_pages_fast); /* - * This is the FOLL_PIN equivalent of __get_user_pages_fast(). Behavior is the - * same, except that this one sets FOLL_PIN instead of FOLL_GET. + * This is the FOLL_PIN equivalent of get_user_pages_fast_only(). Behavior + * is the same, except that this one sets FOLL_PIN instead of FOLL_GET. * * The API rules are the same, too: no negative values may be returned. */ @@ -2985,9 +2991,6 @@ EXPORT_SYMBOL_GPL(pin_user_pages_fast_only); * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please * see Documentation/core-api/pin_user_pages.rst for details. - * - * This is intended for Case 1 (DIO) in Documentation/core-api/pin_user_pages.rst. It - * is NOT intended for Case 2 (RDMA: long-term pins). */ long pin_user_pages_remote(struct task_struct *tsk, struct mm_struct *mm, unsigned long start, unsigned long nr_pages, @@ -3021,9 +3024,6 @@ EXPORT_SYMBOL(pin_user_pages_remote); * * FOLL_PIN means that the pages must be released via unpin_user_page(). Please * see Documentation/core-api/pin_user_pages.rst for details. - * - * This is intended for Case 1 (DIO) in Documentation/core-api/pin_user_pages.rst. It - * is NOT intended for Case 2 (RDMA: long-term pins). */ long pin_user_pages(unsigned long start, unsigned long nr_pages, unsigned int gup_flags, struct page **pages, @@ -3055,3 +3055,32 @@ long pin_user_pages_unlocked(unsigned long start, unsigned long nr_pages, return get_user_pages_unlocked(start, nr_pages, pages, gup_flags); } EXPORT_SYMBOL(pin_user_pages_unlocked); + +/* + * pin_user_pages_locked() is the FOLL_PIN variant of get_user_pages_locked(). + * Behavior is the same, except that this one sets FOLL_PIN and rejects + * FOLL_GET. + */ +long pin_user_pages_locked(unsigned long start, unsigned long nr_pages, + unsigned int gup_flags, struct page **pages, + int *locked) +{ + /* + * FIXME: Current FOLL_LONGTERM behavior is incompatible with + * FAULT_FLAG_ALLOW_RETRY because of the FS DAX check requirement on + * vmas. As there are no users of this flag in this call we simply + * disallow this option for now. + */ + if (WARN_ON_ONCE(gup_flags & FOLL_LONGTERM)) + return -EINVAL; + + /* FOLL_GET and FOLL_PIN are mutually exclusive. */ + if (WARN_ON_ONCE(gup_flags & FOLL_GET)) + return -EINVAL; + + gup_flags |= FOLL_PIN; + return __get_user_pages_locked(current, current->mm, start, nr_pages, + pages, NULL, locked, + gup_flags | FOLL_TOUCH); +} +EXPORT_SYMBOL(pin_user_pages_locked); diff --git a/mm/nommu.c b/mm/nommu.c index dfae55f41901..062dc1c90d21 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -433,7 +433,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) /* * Ok, looks good - let it rip. 
*/ - flush_icache_range(mm->brk, brk); + flush_icache_user_range(mm->brk, brk); return mm->brk = brk; } @@ -1277,7 +1277,7 @@ share: /* we flush the region from the icache only when the first executable * mapping of it is made */ if (vma->vm_flags & VM_EXEC && !region->vm_icache_flushed) { - flush_icache_range(region->vm_start, region->vm_end); + flush_icache_user_range(region->vm_start, region->vm_end); region->vm_icache_flushed = true; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 07ae77d97952..727751219003 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5575,15 +5575,6 @@ static int __parse_numa_zonelist_order(char *s) return 0; } -static __init int setup_numa_zonelist_order(char *s) -{ - if (!s) - return 0; - - return __parse_numa_zonelist_order(s); -} -early_param("numa_zonelist_order", setup_numa_zonelist_order); - char numa_zonelist_order[] = "Node"; /* diff --git a/mm/page_idle.c b/mm/page_idle.c index 295512465065..057c61df12db 100644 --- a/mm/page_idle.c +++ b/mm/page_idle.c @@ -4,6 +4,7 @@ #include <linux/fs.h> #include <linux/sysfs.h> #include <linux/kobject.h> +#include <linux/memory_hotplug.h> #include <linux/mm.h> #include <linux/mmzone.h> #include <linux/pagemap.h> @@ -30,13 +31,9 @@ */ static struct page *page_idle_get_page(unsigned long pfn) { - struct page *page; + struct page *page = pfn_to_online_page(pfn); pg_data_t *pgdat; - if (!pfn_valid(pfn)) - return NULL; - - page = pfn_to_page(pfn); if (!page || !PageLRU(page) || !get_page_unless_zero(page)) return NULL; diff --git a/tools/testing/selftests/sysctl/sysctl.sh b/tools/testing/selftests/sysctl/sysctl.sh index 6a970b127c9b..0aa8a86140e9 100755 --- a/tools/testing/selftests/sysctl/sysctl.sh +++ b/tools/testing/selftests/sysctl/sysctl.sh @@ -39,6 +39,7 @@ ALL_TESTS="$ALL_TESTS 0003:1:1:int_0002" ALL_TESTS="$ALL_TESTS 0004:1:1:uint_0001" ALL_TESTS="$ALL_TESTS 0005:3:1:int_0003" ALL_TESTS="$ALL_TESTS 0006:50:1:bitmap_0001" +ALL_TESTS="$ALL_TESTS 0007:1:1:boot_int" test_modprobe() { @@ -122,7 +123,7 @@ test_reqs() function load_req_mod() { - if [ ! -d $DIR ]; then + if [ ! -d $SYSCTL ]; then if ! modprobe -q -n $TEST_DRIVER; then echo "$0: module $TEST_DRIVER not found [SKIP]" exit $ksft_skip @@ -752,6 +753,46 @@ sysctl_test_0006() run_bitmaptest } +sysctl_test_0007() +{ + TARGET="${SYSCTL}/boot_int" + if [ ! -f $TARGET ]; then + echo "Skipping test for $TARGET as it is not present ..." + return $ksft_skip + fi + + if [ -d $DIR ]; then + echo "Boot param test only possible sysctl_test is built-in, not module:" + cat $TEST_DIR/config >&2 + return $ksft_skip + fi + + echo -n "Testing if $TARGET is set to 1 ..." + ORIG=$(cat "${TARGET}") + + if [ x$ORIG = "x1" ]; then + echo "ok" + return 0 + fi + echo "FAIL" + echo "Checking if /proc/cmdline contains setting of the expected parameter ..." + if [ ! -f /proc/cmdline ]; then + echo "/proc/cmdline does not exist, test inconclusive" + return 0 + fi + + FOUND=$(grep -c "sysctl[./]debug[./]test_sysctl[./]boot_int=1" /proc/cmdline) + if [ $FOUND = "1" ]; then + echo "Kernel param found but $TARGET is not 1, TEST FAILED" + rc=1 + test_rc + fi + + echo "Skipping test, expected kernel parameter missing." 
+ echo "To perform this test, make sure kernel is booted with parameter: sysctl.debug.test_sysctl.boot_int=1" + return $ksft_skip +} + list_tests() { echo "Test ID list:" @@ -766,6 +807,7 @@ list_tests() echo "0004 x $(get_test_count 0004) - tests proc_douintvec()" echo "0005 x $(get_test_count 0005) - tests proc_douintvec() array" echo "0006 x $(get_test_count 0006) - tests proc_do_large_bitmap()" + echo "0007 x $(get_test_count 0007) - tests setting sysctl from kernel boot param" } usage() diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 7fa1e38e1659..7b0da1c28e51 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -1740,7 +1740,6 @@ static bool hva_to_pfn_fast(unsigned long addr, bool write_fault, bool *writable, kvm_pfn_t *pfn) { struct page *page[1]; - int npages; /* * Fast pin a writable pfn only if it is a write fault request @@ -1750,8 +1749,7 @@ static bool hva_to_pfn_fast(unsigned long addr, bool write_fault, if (!(write_fault || writable)) return false; - npages = __get_user_pages_fast(addr, 1, 1, page); - if (npages == 1) { + if (get_user_page_fast_only(addr, FOLL_WRITE, page)) { *pfn = page_to_pfn(page[0]); if (writable) @@ -1791,7 +1789,7 @@ static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault, if (unlikely(!write_fault) && writable) { struct page *wpage; - if (__get_user_pages_fast(addr, 1, 1, &wpage) == 1) { + if (get_user_page_fast_only(addr, FOLL_WRITE, &wpage)) { *writable = true; put_page(page); page = wpage; @@ -2003,7 +2001,7 @@ int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn, if (entry < nr_pages) return 0; - return __get_user_pages_fast(addr, nr_pages, 1, pages); + return get_user_pages_fast_only(addr, nr_pages, FOLL_WRITE, pages); } EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic); |