From 1b1ad288b8f1b11f83396e537003722897ecc12b Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Mon, 8 Nov 2021 18:33:43 -0800 Subject: kallsyms: remove arch specific text and data check Patch series "sections: Unify kernel sections range check and use", v4. There are three head files(kallsyms.h, kernel.h and sections.h) which include the kernel sections range check, let's make some cleanup and unify them. 1. cleanup arch specific text/data check and fix address boundary check in kallsyms.h 2. make all the basic/core kernel range check function into sections.h 3. update all the callers, and use the helper in sections.h to simplify the code After this series, we have 5 APIs about kernel sections range check in sections.h * is_kernel_rodata() --- already in sections.h * is_kernel_core_data() --- come from core_kernel_data() in kernel.h * is_kernel_inittext() --- come from kernel.h and kallsyms.h * __is_kernel_text() --- add new internal helper * __is_kernel() --- add new internal helper Note: For the last two helpers, people should not use directly, consider to use corresponding function in kallsyms.h. This patch (of 11): Remove arch specific text and data check after commit 4ba66a976072 ("arch: remove blackfin port"), no need arch-specific text/data check. Link: https://lkml.kernel.org/r/20210930071143.63410-1-wangkefeng.wang@huawei.com Link: https://lkml.kernel.org/r/20210930071143.63410-2-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Sergey Senozhatsky Cc: Arnd Bergmann Cc: Steven Rostedt Cc: Ingo Molnar Cc: David S. Miller Cc: Alexei Starovoitov Cc: Andrey Ryabinin Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Christophe Leroy Cc: Kefeng Wang Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Borislav Petkov Cc: Dmitry Vyukov Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Michal Simek Cc: Petr Mladek Cc: Richard Henderson Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/locking/lockdep.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 8e118caf835e..f2d3bf4960f4 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -818,9 +818,6 @@ static int static_obj(const void *obj) if ((addr >= start) && (addr < end)) return 1; - if (arch_is_kernel_data(addr)) - return 1; - /* * in-kernel percpu var? */ -- cgit v1.2.3 From a20deb3a348719adaf8c12e1bf4b599bfc51836e Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Mon, 8 Nov 2021 18:33:51 -0800 Subject: sections: move and rename core_kernel_data() to is_kernel_core_data() Move core_kernel_data() into sections.h and rename it to is_kernel_core_data(), also make it return bool value, then update all the callers. Link: https://lkml.kernel.org/r/20210930071143.63410-4-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Sergey Senozhatsky Cc: Arnd Bergmann Cc: Steven Rostedt Cc: Ingo Molnar Cc: "David S. Miller" Cc: Alexander Potapenko Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Christophe Leroy Cc: Dmitry Vyukov Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Michael Ellerman Cc: Michal Simek Cc: Paul Mackerras Cc: Petr Mladek Cc: Richard Henderson Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/sections.h | 16 ++++++++++++++++ include/linux/kernel.h | 1 - kernel/extable.c | 18 ------------------ kernel/trace/ftrace.c | 2 +- net/sysctl_net.c | 2 +- 5 files changed, 18 insertions(+), 21 deletions(-) (limited to 'kernel') diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h index 614fc809de34..7cba8ff10d3a 100644 --- a/include/asm-generic/sections.h +++ b/include/asm-generic/sections.h @@ -128,6 +128,22 @@ static inline bool init_section_intersects(void *virt, size_t size) return memory_intersects(__init_begin, __init_end, virt, size); } +/** + * is_kernel_core_data - checks if the pointer address is located in the + * .data section + * + * @addr: address to check + * + * Returns: true if the address is located in .data, false otherwise. + * Note: On some archs it may return true for core RODATA, and false + * for others. But will always be true for core RW data. + */ +static inline bool is_kernel_core_data(unsigned long addr) +{ + return addr >= (unsigned long)_sdata && + addr < (unsigned long)_edata; +} + /** * is_kernel_rodata - checks if the pointer address is located in the * .rodata section diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 46ca4404fb93..23f57a2d5a13 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -227,7 +227,6 @@ extern char *next_arg(char *args, char **param, char **val); extern int core_kernel_text(unsigned long addr); extern int init_kernel_text(unsigned long addr); -extern int core_kernel_data(unsigned long addr); extern int __kernel_text_address(unsigned long addr); extern int kernel_text_address(unsigned long addr); extern int func_ptr_is_kernel_text(void *ptr); diff --git a/kernel/extable.c b/kernel/extable.c index 290661f68e6b..0e3412b48bba 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -82,24 +82,6 @@ int notrace core_kernel_text(unsigned long addr) return 0; } -/** - * core_kernel_data - tell if addr points to kernel data - * @addr: address to test - * - * Returns true if @addr passed in is from the core kernel data - * section. - * - * Note: On some archs it may return true for core RODATA, and false - * for others. But will always be true for core RW data. - */ -int core_kernel_data(unsigned long addr) -{ - if (addr >= (unsigned long)_sdata && - addr < (unsigned long)_edata) - return 1; - return 0; -} - int __kernel_text_address(unsigned long addr) { if (kernel_text_address(addr)) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index feebf57c6458..7aaef2b78760 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -323,7 +323,7 @@ int __register_ftrace_function(struct ftrace_ops *ops) if (!ftrace_enabled && (ops->flags & FTRACE_OPS_FL_PERMANENT)) return -EBUSY; - if (!core_kernel_data((unsigned long)ops)) + if (!is_kernel_core_data((unsigned long)ops)) ops->flags |= FTRACE_OPS_FL_DYNAMIC; add_ftrace_ops(&ftrace_ops_list, ops); diff --git a/net/sysctl_net.c b/net/sysctl_net.c index f6cb0d4d114c..4b45ed631eb8 100644 --- a/net/sysctl_net.c +++ b/net/sysctl_net.c @@ -144,7 +144,7 @@ static void ensure_safe_net_sysctl(struct net *net, const char *path, addr = (unsigned long)ent->data; if (is_module_address(addr)) where = "module"; - else if (core_kernel_data(addr)) + else if (is_kernel_core_data(addr)) where = "kernel"; else continue; -- cgit v1.2.3 From b9ad8fe7b8cab3814d1de41e8ddca602b2646f4b Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Mon, 8 Nov 2021 18:33:54 -0800 Subject: sections: move is_kernel_inittext() into sections.h The is_kernel_inittext() and init_kernel_text() are with same functionality, let's just keep is_kernel_inittext() and move it into sections.h, then update all the callers. Link: https://lkml.kernel.org/r/20210930071143.63410-5-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Sergey Senozhatsky Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Arnd Bergmann Cc: Alexander Potapenko Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Christophe Leroy Cc: "David S. Miller" Cc: Dmitry Vyukov Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Michael Ellerman Cc: Michal Simek Cc: Paul Mackerras Cc: Petr Mladek Cc: Richard Henderson Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/kernel/unwind_orc.c | 2 +- include/asm-generic/sections.h | 14 ++++++++++++++ include/linux/kallsyms.h | 8 -------- include/linux/kernel.h | 1 - kernel/extable.c | 12 ++---------- 5 files changed, 17 insertions(+), 20 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/unwind_orc.c b/arch/x86/kernel/unwind_orc.c index a1202536fc57..d92ec2ced059 100644 --- a/arch/x86/kernel/unwind_orc.c +++ b/arch/x86/kernel/unwind_orc.c @@ -175,7 +175,7 @@ static struct orc_entry *orc_find(unsigned long ip) } /* vmlinux .init slow lookup: */ - if (init_kernel_text(ip)) + if (is_kernel_inittext(ip)) return __orc_find(__start_orc_unwind_ip, __start_orc_unwind, __stop_orc_unwind_ip - __start_orc_unwind_ip, ip); diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h index 7cba8ff10d3a..f82806ca8756 100644 --- a/include/asm-generic/sections.h +++ b/include/asm-generic/sections.h @@ -158,4 +158,18 @@ static inline bool is_kernel_rodata(unsigned long addr) addr < (unsigned long)__end_rodata; } +/** + * is_kernel_inittext - checks if the pointer address is located in the + * .init.text section + * + * @addr: address to check + * + * Returns: true if the address is located in .init.text, false otherwise. + */ +static inline bool is_kernel_inittext(unsigned long addr) +{ + return addr >= (unsigned long)_sinittext && + addr < (unsigned long)_einittext; +} + #endif /* _ASM_GENERIC_SECTIONS_H_ */ diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h index 62ce22b27ea8..bb5d9ec78e13 100644 --- a/include/linux/kallsyms.h +++ b/include/linux/kallsyms.h @@ -24,14 +24,6 @@ struct cred; struct module; -static inline int is_kernel_inittext(unsigned long addr) -{ - if (addr >= (unsigned long)_sinittext - && addr < (unsigned long)_einittext) - return 1; - return 0; -} - static inline int is_kernel_text(unsigned long addr) { if ((addr >= (unsigned long)_stext && addr < (unsigned long)_etext)) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 23f57a2d5a13..be84ab369650 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -226,7 +226,6 @@ extern bool parse_option_str(const char *str, const char *option); extern char *next_arg(char *args, char **param, char **val); extern int core_kernel_text(unsigned long addr); -extern int init_kernel_text(unsigned long addr); extern int __kernel_text_address(unsigned long addr); extern int kernel_text_address(unsigned long addr); extern int func_ptr_is_kernel_text(void *ptr); diff --git a/kernel/extable.c b/kernel/extable.c index 0e3412b48bba..6505207aa7e6 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -62,14 +62,6 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr) return e; } -int init_kernel_text(unsigned long addr) -{ - if (addr >= (unsigned long)_sinittext && - addr < (unsigned long)_einittext) - return 1; - return 0; -} - int notrace core_kernel_text(unsigned long addr) { if (addr >= (unsigned long)_stext && @@ -77,7 +69,7 @@ int notrace core_kernel_text(unsigned long addr) return 1; if (system_state < SYSTEM_FREEING_INITMEM && - init_kernel_text(addr)) + is_kernel_inittext(addr)) return 1; return 0; } @@ -94,7 +86,7 @@ int __kernel_text_address(unsigned long addr) * Since we are after the module-symbols check, there's * no danger of address overlap: */ - if (init_kernel_text(addr)) + if (is_kernel_inittext(addr)) return 1; return 0; } -- cgit v1.2.3 From 808b64565b028bd072c985624d5cbe4b6f5a5a35 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Mon, 8 Nov 2021 18:34:09 -0800 Subject: extable: use is_kernel_text() helper The core_kernel_text() should check the gate area, as it is part of kernel text range, use is_kernel_text() in core_kernel_text(). Link: https://lkml.kernel.org/r/20210930071143.63410-9-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Sergey Senozhatsky Cc: Steven Rostedt Cc: Alexander Potapenko Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Arnd Bergmann Cc: Benjamin Herrenschmidt Cc: Borislav Petkov Cc: Christophe Leroy Cc: "David S. Miller" Cc: Dmitry Vyukov Cc: Ingo Molnar Cc: Ivan Kokshaysky Cc: Matt Turner Cc: Michael Ellerman Cc: Michal Simek Cc: Paul Mackerras Cc: Petr Mladek Cc: Richard Henderson Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/extable.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/extable.c b/kernel/extable.c index 6505207aa7e6..b6f330f0fe74 100644 --- a/kernel/extable.c +++ b/kernel/extable.c @@ -64,8 +64,7 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr) int notrace core_kernel_text(unsigned long addr) { - if (addr >= (unsigned long)_stext && - addr < (unsigned long)_etext) + if (is_kernel_text(addr)) return 1; if (system_state < SYSTEM_FREEING_INITMEM && -- cgit v1.2.3 From ba1f70ddd1809c03660ddcfb757e1c1ced8b676d Mon Sep 17 00:00:00 2001 From: Ran Xiaokai Date: Mon, 8 Nov 2021 18:35:22 -0800 Subject: kernel/fork.c: unshare(): use swap() to make code cleaner Use swap() instead of reimplementing it. Link: https://lkml.kernel.org/r/20210909022046.8151-1-ran.xiaokai@zte.com.cn Signed-off-by: Ran Xiaokai Cc: Gabriel Krisman Bertazi Cc: Peter Zijlstra Cc: Eric W. Biederman Cc: Jens Axboe Cc: Alexey Gladkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 38681ad44c76..acf94e660ce3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -3027,7 +3027,7 @@ int unshare_fd(unsigned long unshare_flags, unsigned int max_fds, int ksys_unshare(unsigned long unshare_flags) { struct fs_struct *fs, *new_fs = NULL; - struct files_struct *fd, *new_fd = NULL; + struct files_struct *new_fd = NULL; struct cred *new_cred = NULL; struct nsproxy *new_nsproxy = NULL; int do_sysvsem = 0; @@ -3114,11 +3114,8 @@ int ksys_unshare(unsigned long unshare_flags) spin_unlock(&fs->lock); } - if (new_fd) { - fd = current->files; - current->files = new_fd; - new_fd = fd; - } + if (new_fd) + swap(current->files, new_fd); task_unlock(current); -- cgit v1.2.3 From 741ddd4519c4d21eb7313e89a2c4ccc44a3dd6b9 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 8 Nov 2021 18:35:34 -0800 Subject: kcov: allocate per-CPU memory on the relevant node During boot kcov allocates per-CPU memory which is used later if remote/ softirq processing is enabled. Allocate the per-CPU memory on the CPU local node to avoid cross node memory access. Link: https://lkml.kernel.org/r/20210923164741.1859522-4-bigeasy@linutronix.de Link: https://lore.kernel.org/r/20210830172627.267989-4-bigeasy@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Acked-by: Dmitry Vyukov Acked-by: Marco Elver Tested-by: Marco Elver Reviewed-by: Andrey Konovalov Cc: Clark Williams Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kcov.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/kcov.c b/kernel/kcov.c index 80bfe71bbe13..4f910231d99a 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -1034,8 +1034,8 @@ static int __init kcov_init(void) int cpu; for_each_possible_cpu(cpu) { - void *area = vmalloc(CONFIG_KCOV_IRQ_AREA_SIZE * - sizeof(unsigned long)); + void *area = vmalloc_node(CONFIG_KCOV_IRQ_AREA_SIZE * + sizeof(unsigned long), cpu_to_node(cpu)); if (!area) return -ENOMEM; per_cpu_ptr(&kcov_percpu_data, cpu)->irq_area = area; -- cgit v1.2.3 From 22036abe17c9f6e295bd9d767312cfb92fc9cf0a Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 8 Nov 2021 18:35:37 -0800 Subject: kcov: avoid enable+disable interrupts if !in_task() kcov_remote_start() may need to allocate memory in the in_task() case (otherwise per-CPU memory has been pre-allocated) and therefore requires enabled interrupts. The interrupts are enabled before checking if the allocation is required so if no allocation is required then the interrupts are needlessly enabled and disabled again. Enable interrupts only if memory allocation is performed. Link: https://lkml.kernel.org/r/20210923164741.1859522-5-bigeasy@linutronix.de Link: https://lore.kernel.org/r/20210830172627.267989-5-bigeasy@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Acked-by: Dmitry Vyukov Acked-by: Marco Elver Tested-by: Marco Elver Reviewed-by: Andrey Konovalov Cc: Clark Williams Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kcov.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/kcov.c b/kernel/kcov.c index 4f910231d99a..620dc4ffeb68 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -869,19 +869,19 @@ void kcov_remote_start(u64 handle) size = CONFIG_KCOV_IRQ_AREA_SIZE; area = this_cpu_ptr(&kcov_percpu_data)->irq_area; } - spin_unlock_irqrestore(&kcov_remote_lock, flags); + spin_unlock(&kcov_remote_lock); /* Can only happen when in_task(). */ if (!area) { + local_irqrestore(flags); area = vmalloc(size * sizeof(unsigned long)); if (!area) { kcov_put(kcov); return; } + local_irq_save(flags); } - local_irq_save(flags); - /* Reset coverage size. */ *(u64 *)area = 0; -- cgit v1.2.3 From d5d2c51f1e5f56ed01d2c773974630c007e5e5f5 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Mon, 8 Nov 2021 18:35:40 -0800 Subject: kcov: replace local_irq_save() with a local_lock_t The kcov code mixes local_irq_save() and spin_lock() in kcov_remote_{start|end}(). This creates a warning on PREEMPT_RT because local_irq_save() disables interrupts and spin_lock_t is turned into a sleeping lock which can not be acquired in a section with disabled interrupts. The kcov_remote_lock is used to synchronize the access to the hash-list kcov_remote_map. The local_irq_save() block protects access to the per-CPU data kcov_percpu_data. There is no compelling reason to change the lock type to raw_spin_lock_t to make it work with local_irq_save(). Changing it would require to move memory allocation (in kcov_remote_add()) and deallocation outside of the locked section. Adding an unlimited amount of entries to the hashlist will increase the IRQ-off time during lookup. It could be argued that this is debug code and the latency does not matter. There is however no need to do so and it would allow to use this facility in an RT enabled build. Using a local_lock_t instead of local_irq_save() has the befit of adding a protection scope within the source which makes it obvious what is protected. On a !PREEMPT_RT && !LOCKDEP build the local_lock_irqsave() maps directly to local_irq_save() so there is overhead at runtime. Replace the local_irq_save() section with a local_lock_t. Link: https://lkml.kernel.org/r/20210923164741.1859522-6-bigeasy@linutronix.de Link: https://lore.kernel.org/r/20210830172627.267989-6-bigeasy@linutronix.de Reported-by: Clark Williams Signed-off-by: Sebastian Andrzej Siewior Acked-by: Dmitry Vyukov Acked-by: Marco Elver Tested-by: Marco Elver Reviewed-by: Andrey Konovalov Cc: Steven Rostedt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kcov.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/kcov.c b/kernel/kcov.c index 620dc4ffeb68..36ca640c4f8e 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -88,6 +88,7 @@ static struct list_head kcov_remote_areas = LIST_HEAD_INIT(kcov_remote_areas); struct kcov_percpu_data { void *irq_area; + local_lock_t lock; unsigned int saved_mode; unsigned int saved_size; @@ -96,7 +97,9 @@ struct kcov_percpu_data { int saved_sequence; }; -static DEFINE_PER_CPU(struct kcov_percpu_data, kcov_percpu_data); +static DEFINE_PER_CPU(struct kcov_percpu_data, kcov_percpu_data) = { + .lock = INIT_LOCAL_LOCK(lock), +}; /* Must be called with kcov_remote_lock locked. */ static struct kcov_remote *kcov_remote_find(u64 handle) @@ -824,7 +827,7 @@ void kcov_remote_start(u64 handle) if (!in_task() && !in_serving_softirq()) return; - local_irq_save(flags); + local_lock_irqsave(&kcov_percpu_data.lock, flags); /* * Check that kcov_remote_start() is not called twice in background @@ -832,7 +835,7 @@ void kcov_remote_start(u64 handle) */ mode = READ_ONCE(t->kcov_mode); if (WARN_ON(in_task() && kcov_mode_enabled(mode))) { - local_irq_restore(flags); + local_unlock_irqrestore(&kcov_percpu_data.lock, flags); return; } /* @@ -841,14 +844,15 @@ void kcov_remote_start(u64 handle) * happened while collecting coverage from a background thread. */ if (WARN_ON(in_serving_softirq() && t->kcov_softirq)) { - local_irq_restore(flags); + local_unlock_irqrestore(&kcov_percpu_data.lock, flags); return; } spin_lock(&kcov_remote_lock); remote = kcov_remote_find(handle); if (!remote) { - spin_unlock_irqrestore(&kcov_remote_lock, flags); + spin_unlock(&kcov_remote_lock); + local_unlock_irqrestore(&kcov_percpu_data.lock, flags); return; } kcov_debug("handle = %llx, context: %s\n", handle, @@ -873,13 +877,13 @@ void kcov_remote_start(u64 handle) /* Can only happen when in_task(). */ if (!area) { - local_irqrestore(flags); + local_unlock_irqrestore(&kcov_percpu_data.lock, flags); area = vmalloc(size * sizeof(unsigned long)); if (!area) { kcov_put(kcov); return; } - local_irq_save(flags); + local_lock_irqsave(&kcov_percpu_data.lock, flags); } /* Reset coverage size. */ @@ -891,7 +895,7 @@ void kcov_remote_start(u64 handle) } kcov_start(t, kcov, size, area, mode, sequence); - local_irq_restore(flags); + local_unlock_irqrestore(&kcov_percpu_data.lock, flags); } EXPORT_SYMBOL(kcov_remote_start); @@ -965,12 +969,12 @@ void kcov_remote_stop(void) if (!in_task() && !in_serving_softirq()) return; - local_irq_save(flags); + local_lock_irqsave(&kcov_percpu_data.lock, flags); mode = READ_ONCE(t->kcov_mode); barrier(); if (!kcov_mode_enabled(mode)) { - local_irq_restore(flags); + local_unlock_irqrestore(&kcov_percpu_data.lock, flags); return; } /* @@ -978,12 +982,12 @@ void kcov_remote_stop(void) * actually found the remote handle and started collecting coverage. */ if (in_serving_softirq() && !t->kcov_softirq) { - local_irq_restore(flags); + local_unlock_irqrestore(&kcov_percpu_data.lock, flags); return; } /* Make sure that kcov_softirq is only set when in softirq. */ if (WARN_ON(!in_serving_softirq() && t->kcov_softirq)) { - local_irq_restore(flags); + local_unlock_irqrestore(&kcov_percpu_data.lock, flags); return; } @@ -1013,7 +1017,7 @@ void kcov_remote_stop(void) spin_unlock(&kcov_remote_lock); } - local_irq_restore(flags); + local_unlock_irqrestore(&kcov_percpu_data.lock, flags); /* Get in kcov_remote_start(). */ kcov_put(kcov); -- cgit v1.2.3 From b78dfa059fdd7d0f791132dc0cef39ceafdfd62e Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 8 Nov 2021 18:35:46 -0800 Subject: kernel/resource: clean up and optimize iomem_is_exclusive() Patch series "virtio-mem: disallow mapping virtio-mem memory via /dev/mem", v5. Let's add the basic infrastructure to exclude some physical memory regions marked as "IORESOURCE_SYSTEM_RAM" completely from /dev/mem access, even though they are not marked IORESOURCE_BUSY and even though "iomem=relaxed" is set. Resource IORESOURCE_EXCLUSIVE for that purpose instead of adding new flags to express something similar to "soft-busy" or "not busy yet, but already prepared by a driver and not to be mapped by user space". Use it for virtio-mem, to disallow mapping any virtio-mem memory via /dev/mem to user space after the virtio-mem driver was loaded. This patch (of 3): We end up traversing subtrees of ranges we are not interested in; let's optimize this case, skipping such subtrees, cleaning up the function a bit. For example, in the following configuration (/proc/iomem): 00000000-00000fff : Reserved 00001000-00057fff : System RAM 00058000-00058fff : Reserved 00059000-0009cfff : System RAM 0009d000-000fffff : Reserved 000a0000-000bffff : PCI Bus 0000:00 000c0000-000c3fff : PCI Bus 0000:00 000c4000-000c7fff : PCI Bus 0000:00 000c8000-000cbfff : PCI Bus 0000:00 000cc000-000cffff : PCI Bus 0000:00 000d0000-000d3fff : PCI Bus 0000:00 000d4000-000d7fff : PCI Bus 0000:00 000d8000-000dbfff : PCI Bus 0000:00 000dc000-000dffff : PCI Bus 0000:00 000e0000-000e3fff : PCI Bus 0000:00 000e4000-000e7fff : PCI Bus 0000:00 000e8000-000ebfff : PCI Bus 0000:00 000ec000-000effff : PCI Bus 0000:00 000f0000-000fffff : PCI Bus 0000:00 000f0000-000fffff : System ROM 00100000-3fffffff : System RAM 40000000-403fffff : Reserved 40000000-403fffff : pnp 00:00 40400000-80a79fff : System RAM ... We don't have to look at any children of "0009d000-000fffff : Reserved" if we can just skip these 15 items directly because the parent range is not of interest. Link: https://lkml.kernel.org/r/20210920142856.17758-1-david@redhat.com Link: https://lkml.kernel.org/r/20210920142856.17758-2-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Dan Williams Cc: Arnd Bergmann Cc: Greg Kroah-Hartman Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: "Rafael J. Wysocki" Cc: Hanjun Guo Cc: Andy Shevchenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/resource.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index ca9f5198a01f..2999f57da38c 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -73,6 +73,18 @@ static struct resource *next_resource(struct resource *p) return p->sibling; } +static struct resource *next_resource_skip_children(struct resource *p) +{ + while (!p->sibling && p->parent) + p = p->parent; + return p->sibling; +} + +#define for_each_resource(_root, _p, _skip_children) \ + for ((_p) = (_root)->child; (_p); \ + (_p) = (_skip_children) ? next_resource_skip_children(_p) : \ + next_resource(_p)) + static void *r_next(struct seq_file *m, void *v, loff_t *pos) { struct resource *p = v; @@ -1712,10 +1724,9 @@ static int strict_iomem_checks; */ bool iomem_is_exclusive(u64 addr) { - struct resource *p = &iomem_resource; - bool err = false; - loff_t l; + bool skip_children = false, err = false; int size = PAGE_SIZE; + struct resource *p; if (!strict_iomem_checks) return false; @@ -1723,15 +1734,19 @@ bool iomem_is_exclusive(u64 addr) addr = addr & PAGE_MASK; read_lock(&resource_lock); - for (p = p->child; p ; p = r_next(NULL, p, &l)) { + for_each_resource(&iomem_resource, p, skip_children) { /* * We can probably skip the resources without * IORESOURCE_IO attribute? */ if (p->start >= addr + size) break; - if (p->end < addr) + if (p->end < addr) { + skip_children = true; continue; + } + skip_children = false; + /* * A resource is exclusive if IORESOURCE_EXCLUSIVE is set * or CONFIG_IO_STRICT_DEVMEM is enabled and the -- cgit v1.2.3 From a9e7b8d4f663d13975e9bbe849735c8659c2a1f1 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Mon, 8 Nov 2021 18:35:50 -0800 Subject: kernel/resource: disallow access to exclusive system RAM regions virtio-mem dynamically exposes memory inside a device memory region as system RAM to Linux, coordinating with the hypervisor which parts are actually "plugged" and consequently usable/accessible. On the one hand, the virtio-mem driver adds/removes whole memory blocks, creating/removing busy IORESOURCE_SYSTEM_RAM resources, on the other hand, it logically (un)plugs memory inside added memory blocks, dynamically either exposing them to the buddy or hiding them from the buddy and marking them PG_offline. In contrast to physical devices, like a DIMM, the virtio-mem driver is required to actually make use of any of the device-provided memory, because it performs the handshake with the hypervisor. virtio-mem memory cannot simply be access via /dev/mem without a driver. There is no safe way to: a) Access plugged memory blocks via /dev/mem, as they might contain unplugged holes or might get silently unplugged by the virtio-mem driver and consequently turned inaccessible. b) Access unplugged memory blocks via /dev/mem because the virtio-mem driver is required to make them actually accessible first. The virtio-spec states that unplugged memory blocks MUST NOT be written, and only selected unplugged memory blocks MAY be read. We want to make sure, this is the case in sane environments -- where the virtio-mem driver was loaded. We want to make sure that in a sane environment, nobody "accidentially" accesses unplugged memory inside the device managed region. For example, a user might spot a memory region in /proc/iomem and try accessing it via /dev/mem via gdb or dumping it via something else. By the time the mmap() happens, the memory might already have been removed by the virtio-mem driver silently: the mmap() would succeeed and user space might accidentially access unplugged memory. So once the driver was loaded and detected the device along the device-managed region, we just want to disallow any access via /dev/mem to it. In an ideal world, we would mark the whole region as busy ("owned by a driver") and exclude it; however, that would be wrong, as we don't really have actual system RAM at these ranges added to Linux ("busy system RAM"). Instead, we want to mark such ranges as "not actual busy system RAM but still soft-reserved and prepared by a driver for future use." Let's teach iomem_is_exclusive() to reject access to any range with "IORESOURCE_SYSTEM_RAM | IORESOURCE_EXCLUSIVE", even if not busy and even if "iomem=relaxed" is set. Introduce EXCLUSIVE_SYSTEM_RAM to make it easier for applicable drivers to depend on this setting in their Kconfig. For now, there are no applicable ranges and we'll modify virtio-mem next to properly set IORESOURCE_EXCLUSIVE on the parent resource container it creates to contain all actual busy system RAM added via add_memory_driver_managed(). Link: https://lkml.kernel.org/r/20210920142856.17758-3-david@redhat.com Signed-off-by: David Hildenbrand Reviewed-by: Dan Williams Cc: Andy Shevchenko Cc: Arnd Bergmann Cc: Greg Kroah-Hartman Cc: Hanjun Guo Cc: Jason Wang Cc: "Michael S. Tsirkin" Cc: "Rafael J. Wysocki" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/resource.c | 29 +++++++++++++++++++---------- mm/Kconfig | 7 +++++++ 2 files changed, 26 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/resource.c b/kernel/resource.c index 2999f57da38c..5ad3eba619ba 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -1719,26 +1719,23 @@ static int strict_iomem_checks; #endif /* - * check if an address is reserved in the iomem resource tree - * returns true if reserved, false if not reserved. + * Check if an address is exclusive to the kernel and must not be mapped to + * user space, for example, via /dev/mem. + * + * Returns true if exclusive to the kernel, otherwise returns false. */ bool iomem_is_exclusive(u64 addr) { + const unsigned int exclusive_system_ram = IORESOURCE_SYSTEM_RAM | + IORESOURCE_EXCLUSIVE; bool skip_children = false, err = false; int size = PAGE_SIZE; struct resource *p; - if (!strict_iomem_checks) - return false; - addr = addr & PAGE_MASK; read_lock(&resource_lock); for_each_resource(&iomem_resource, p, skip_children) { - /* - * We can probably skip the resources without - * IORESOURCE_IO attribute? - */ if (p->start >= addr + size) break; if (p->end < addr) { @@ -1747,12 +1744,24 @@ bool iomem_is_exclusive(u64 addr) } skip_children = false; + /* + * IORESOURCE_SYSTEM_RAM resources are exclusive if + * IORESOURCE_EXCLUSIVE is set, even if they + * are not busy and even if "iomem=relaxed" is set. The + * responsible driver dynamically adds/removes system RAM within + * such an area and uncontrolled access is dangerous. + */ + if ((p->flags & exclusive_system_ram) == exclusive_system_ram) { + err = true; + break; + } + /* * A resource is exclusive if IORESOURCE_EXCLUSIVE is set * or CONFIG_IO_STRICT_DEVMEM is enabled and the * resource is busy. */ - if ((p->flags & IORESOURCE_BUSY) == 0) + if (!strict_iomem_checks || !(p->flags & IORESOURCE_BUSY)) continue; if (IS_ENABLED(CONFIG_IO_STRICT_DEVMEM) || p->flags & IORESOURCE_EXCLUSIVE) { diff --git a/mm/Kconfig b/mm/Kconfig index ae1f151c2924..068ce591a13a 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -109,6 +109,13 @@ config NUMA_KEEP_MEMINFO config MEMORY_ISOLATION bool +# IORESOURCE_SYSTEM_RAM regions in the kernel resource tree that are marked +# IORESOURCE_EXCLUSIVE cannot be mapped to user space, for example, via +# /dev/mem. +config EXCLUSIVE_SYSTEM_RAM + def_bool y + depends on !DEVMEM || STRICT_DEVMEM + # # Only be set on architectures that have completely implemented memory hotplug # feature. If you are not sure, don't touch it. -- cgit v1.2.3