From 06f4e94898918bcad00cdd4d349313a439d6911e Mon Sep 17 00:00:00 2001 From: Zefan Li Date: Tue, 9 Aug 2016 11:25:01 +0800 Subject: cpuset: make sure new tasks conform to the current config of the cpuset A new task inherits cpus_allowed and mems_allowed masks from its parent, but if someone changes cpuset's config by writing to cpuset.cpus/cpuset.mems before this new task is inserted into the cgroup's task list, the new task won't be updated accordingly. Signed-off-by: Zefan Li Signed-off-by: Tejun Heo Cc: stable@vger.kernel.org --- kernel/cpuset.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'kernel') diff --git a/kernel/cpuset.c b/kernel/cpuset.c index c7fd2778ed50..c27e53326bef 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2069,6 +2069,20 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) mutex_unlock(&cpuset_mutex); } +/* + * Make sure the new task conform to the current state of its parent, + * which could have been changed by cpuset just after it inherits the + * state from the parent and before it sits on the cgroup's task list. + */ +void cpuset_fork(struct task_struct *task) +{ + if (task_css_is_root(task, cpuset_cgrp_id)) + return; + + set_cpus_allowed_ptr(task, ¤t->cpus_allowed); + task->mems_allowed = current->mems_allowed; +} + struct cgroup_subsys cpuset_cgrp_subsys = { .css_alloc = cpuset_css_alloc, .css_online = cpuset_css_online, @@ -2079,6 +2093,7 @@ struct cgroup_subsys cpuset_cgrp_subsys = { .attach = cpuset_attach, .post_attach = cpuset_post_attach, .bind = cpuset_bind, + .fork = cpuset_fork, .legacy_cftypes = files, .early_init = true, }; -- cgit v1.2.3 From 568ac888215c7fb2fabe8ea739b00ec3c1f5d440 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Wed, 10 Aug 2016 15:43:06 -0400 Subject: cgroup: reduce read locked section of cgroup_threadgroup_rwsem during fork cgroup_threadgroup_rwsem is acquired in read mode during process exit and fork. It is also grabbed in write mode during __cgroups_proc_write(). I've recently run into a scenario with lots of memory pressure and OOM and I am beginning to see systemd __switch_to+0x1f8/0x350 __schedule+0x30c/0x990 schedule+0x48/0xc0 percpu_down_write+0x114/0x170 __cgroup_procs_write.isra.12+0xb8/0x3c0 cgroup_file_write+0x74/0x1a0 kernfs_fop_write+0x188/0x200 __vfs_write+0x6c/0xe0 vfs_write+0xc0/0x230 SyS_write+0x6c/0x110 system_call+0x38/0xb4 This thread is waiting on the reader of cgroup_threadgroup_rwsem to exit. The reader itself is under memory pressure and has gone into reclaim after fork. There are times the reader also ends up waiting on oom_lock as well. __switch_to+0x1f8/0x350 __schedule+0x30c/0x990 schedule+0x48/0xc0 jbd2_log_wait_commit+0xd4/0x180 ext4_evict_inode+0x88/0x5c0 evict+0xf8/0x2a0 dispose_list+0x50/0x80 prune_icache_sb+0x6c/0x90 super_cache_scan+0x190/0x210 shrink_slab.part.15+0x22c/0x4c0 shrink_zone+0x288/0x3c0 do_try_to_free_pages+0x1dc/0x590 try_to_free_pages+0xdc/0x260 __alloc_pages_nodemask+0x72c/0xc90 alloc_pages_current+0xb4/0x1a0 page_table_alloc+0xc0/0x170 __pte_alloc+0x58/0x1f0 copy_page_range+0x4ec/0x950 copy_process.isra.5+0x15a0/0x1870 _do_fork+0xa8/0x4b0 ppc_clone+0x8/0xc In the meanwhile, all processes exiting/forking are blocked almost stalling the system. This patch moves the threadgroup_change_begin from before cgroup_fork() to just before cgroup_canfork(). There is no nee to worry about threadgroup changes till the task is actually added to the threadgroup. This avoids having to call reclaim with cgroup_threadgroup_rwsem held. tj: Subject and description edits. Signed-off-by: Balbir Singh Acked-by: Zefan Li Cc: Oleg Nesterov Cc: Andrew Morton Cc: stable@vger.kernel.org # v4.2+ Signed-off-by: Tejun Heo --- kernel/fork.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index 52e725d4a866..aaf782327bf3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1404,7 +1404,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->real_start_time = ktime_get_boot_ns(); p->io_context = NULL; p->audit_context = NULL; - threadgroup_change_begin(current); cgroup_fork(p); #ifdef CONFIG_NUMA p->mempolicy = mpol_dup(p->mempolicy); @@ -1556,6 +1555,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, INIT_LIST_HEAD(&p->thread_group); p->task_works = NULL; + threadgroup_change_begin(current); /* * Ensure that the cgroup subsystem policies allow the new process to be * forked. It should be noted the the new process's css_set can be changed @@ -1656,6 +1656,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, bad_fork_cancel_cgroup: cgroup_cancel_fork(p); bad_fork_free_pid: + threadgroup_change_end(current); if (pid != &init_struct_pid) free_pid(pid); bad_fork_cleanup_thread: @@ -1688,7 +1689,6 @@ bad_fork_cleanup_policy: mpol_put(p->mempolicy); bad_fork_cleanup_threadgroup_lock: #endif - threadgroup_change_end(current); delayacct_tsk_free(p); bad_fork_cleanup_count: atomic_dec(&p->cred->user->processes); -- cgit v1.2.3 From 485a252a5559b45d7df04c819ec91177c62c270b Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 10 Aug 2016 16:28:09 -0700 Subject: seccomp: Fix tracer exit notifications during fatal signals This fixes a ptrace vs fatal pending signals bug as manifested in seccomp now that seccomp was reordered to happen after ptrace. The short version is that seccomp should not attempt to call do_exit() while fatal signals are pending under a tracer. The existing code was trying to be as defensively paranoid as possible, but it now ends up confusing ptrace. Instead, the syscall can just be skipped (which solves the original concern that the do_exit() was addressing) and normal signal handling, tracer notification, and process death can happen. Paraphrasing from the original bug report: If a tracee task is in a PTRACE_EVENT_SECCOMP trap, or has been resumed after such a trap but not yet been scheduled, and another task in the thread-group calls exit_group(), then the tracee task exits without the ptracer receiving a PTRACE_EVENT_EXIT notification. Test case here: https://gist.github.com/khuey/3c43ac247c72cef8c956ca73281c9be7 The bug happens because when __seccomp_filter() detects fatal_signal_pending(), it calls do_exit() without dequeuing the fatal signal. When do_exit() sends the PTRACE_EVENT_EXIT notification and that task is descheduled, __schedule() notices that there is a fatal signal pending and changes its state from TASK_TRACED to TASK_RUNNING. That prevents the ptracer's waitpid() from returning the ptrace event. A more detailed analysis is here: https://github.com/mozilla/rr/issues/1762#issuecomment-237396255. Reported-by: Robert O'Callahan Reported-by: Kyle Huey Tested-by: Kyle Huey Fixes: 93e35efb8de4 ("x86/ptrace: run seccomp after ptrace") Signed-off-by: Kees Cook Acked-by: Oleg Nesterov Acked-by: James Morris --- kernel/seccomp.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/seccomp.c b/kernel/seccomp.c index ef6c6c3f9d8a..0db7c8a2afe2 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -605,12 +605,16 @@ static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd, ptrace_event(PTRACE_EVENT_SECCOMP, data); /* * The delivery of a fatal signal during event - * notification may silently skip tracer notification. - * Terminating the task now avoids executing a system - * call that may not be intended. + * notification may silently skip tracer notification, + * which could leave us with a potentially unmodified + * syscall that the tracer would have liked to have + * changed. Since the process is about to die, we just + * force the syscall to be skipped and let the signal + * kill the process and correctly handle any tracer exit + * notifications. */ if (fatal_signal_pending(current)) - do_exit(SIGSYS); + goto skip; /* Check if the tracer forced the syscall to be skipped. */ this_syscall = syscall_get_nr(current, task_pt_regs(current)); if (this_syscall < 0) -- cgit v1.2.3 From cd81a9170e69e018bbaba547c1fd85a585f5697a Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Tue, 23 Aug 2016 16:20:38 +0200 Subject: mm: introduce get_task_exe_file For more convenient access if one has a pointer to the task. As a minor nit take advantage of the fact that only task lock + rcu are needed to safely grab ->exe_file. This saves mm refcount dance. Use the helper in proc_exe_link. Signed-off-by: Mateusz Guzik Acked-by: Konstantin Khlebnikov Acked-by: Richard Guy Briggs Cc: # 4.3.x Signed-off-by: Paul Moore --- fs/proc/base.c | 7 +------ include/linux/mm.h | 1 + kernel/fork.c | 23 +++++++++++++++++++++++ 3 files changed, 25 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/fs/proc/base.c b/fs/proc/base.c index 0d163a84082d..da8b1943ba04 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1552,18 +1552,13 @@ static const struct file_operations proc_pid_set_comm_operations = { static int proc_exe_link(struct dentry *dentry, struct path *exe_path) { struct task_struct *task; - struct mm_struct *mm; struct file *exe_file; task = get_proc_task(d_inode(dentry)); if (!task) return -ENOENT; - mm = get_task_mm(task); + exe_file = get_task_exe_file(task); put_task_struct(task); - if (!mm) - return -ENOENT; - exe_file = get_mm_exe_file(mm); - mmput(mm); if (exe_file) { *exe_path = exe_file->f_path; path_get(&exe_file->f_path); diff --git a/include/linux/mm.h b/include/linux/mm.h index 8f468e0d2534..004c73a988b7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1987,6 +1987,7 @@ extern void mm_drop_all_locks(struct mm_struct *mm); extern void set_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file); extern struct file *get_mm_exe_file(struct mm_struct *mm); +extern struct file *get_task_exe_file(struct task_struct *task); extern bool may_expand_vm(struct mm_struct *, vm_flags_t, unsigned long npages); extern void vm_stat_account(struct mm_struct *, vm_flags_t, long npages); diff --git a/kernel/fork.c b/kernel/fork.c index d277e83ed3e0..42451aeb245f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -773,6 +773,29 @@ struct file *get_mm_exe_file(struct mm_struct *mm) } EXPORT_SYMBOL(get_mm_exe_file); +/** + * get_task_exe_file - acquire a reference to the task's executable file + * + * Returns %NULL if task's mm (if any) has no associated executable file or + * this is a kernel thread with borrowed mm (see the comment above get_task_mm). + * User must release file via fput(). + */ +struct file *get_task_exe_file(struct task_struct *task) +{ + struct file *exe_file = NULL; + struct mm_struct *mm; + + task_lock(task); + mm = task->mm; + if (mm) { + if (!(task->flags & PF_KTHREAD)) + exe_file = get_mm_exe_file(mm); + } + task_unlock(task); + return exe_file; +} +EXPORT_SYMBOL(get_task_exe_file); + /** * get_task_mm - acquire a reference to the task's mm * -- cgit v1.2.3 From 5efc244346f9f338765da3d592f7947b0afdc4b5 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Tue, 23 Aug 2016 16:20:39 +0200 Subject: audit: fix exe_file access in audit_exe_compare Prior to the change the function would blindly deference mm, exe_file and exe_file->f_inode, each of which could have been NULL or freed. Use get_task_exe_file to safely obtain stable exe_file. Signed-off-by: Mateusz Guzik Acked-by: Konstantin Khlebnikov Acked-by: Richard Guy Briggs Cc: # 4.3.x Signed-off-by: Paul Moore --- kernel/audit_watch.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 3cf1c5978d39..4846691957da 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -19,6 +19,7 @@ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */ +#include #include #include #include @@ -544,10 +545,11 @@ int audit_exe_compare(struct task_struct *tsk, struct audit_fsnotify_mark *mark) unsigned long ino; dev_t dev; - rcu_read_lock(); - exe_file = rcu_dereference(tsk->mm->exe_file); + exe_file = get_task_exe_file(tsk); + if (!exe_file) + return 0; ino = exe_file->f_inode->i_ino; dev = exe_file->f_inode->i_sb->s_dev; - rcu_read_unlock(); + fput(exe_file); return audit_mark_compare(mark, ino, dev); } -- cgit v1.2.3 From 070c43eea5043e950daa423707ae3c77e2f48edb Mon Sep 17 00:00:00 2001 From: Thiago Jung Bauermann Date: Thu, 1 Sep 2016 16:14:44 -0700 Subject: kexec: fix double-free when failing to relocate the purgatory If kexec_apply_relocations fails, kexec_load_purgatory frees pi->sechdrs and pi->purgatory_buf. This is redundant, because in case of error kimage_file_prepare_segments calls kimage_file_post_load_cleanup, which will also free those buffers. This causes two warnings like the following, one for pi->sechdrs and the other for pi->purgatory_buf: kexec-bzImage64: Loading purgatory failed ------------[ cut here ]------------ WARNING: CPU: 1 PID: 2119 at mm/vmalloc.c:1490 __vunmap+0xc1/0xd0 Trying to vfree() nonexistent vm area (ffffc90000e91000) Modules linked in: CPU: 1 PID: 2119 Comm: kexec Not tainted 4.8.0-rc3+ #5 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Call Trace: dump_stack+0x4d/0x65 __warn+0xcb/0xf0 warn_slowpath_fmt+0x4f/0x60 ? find_vmap_area+0x19/0x70 ? kimage_file_post_load_cleanup+0x47/0xb0 __vunmap+0xc1/0xd0 vfree+0x2e/0x70 kimage_file_post_load_cleanup+0x5e/0xb0 SyS_kexec_file_load+0x448/0x680 ? putname+0x54/0x60 ? do_sys_open+0x190/0x1f0 entry_SYSCALL_64_fastpath+0x13/0x8f ---[ end trace 158bb74f5950ca2b ]--- Fix by setting pi->sechdrs an pi->purgatory_buf to NULL, since vfree won't try to free a NULL pointer. Link: http://lkml.kernel.org/r/1472083546-23683-1-git-send-email-bauerman@linux.vnet.ibm.com Signed-off-by: Thiago Jung Bauermann Acked-by: Baoquan He Cc: "Eric W. Biederman" Cc: Vivek Goyal Cc: Dave Young Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec_file.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'kernel') diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c index 503bc2d348e5..037c321c5618 100644 --- a/kernel/kexec_file.c +++ b/kernel/kexec_file.c @@ -887,7 +887,10 @@ int kexec_load_purgatory(struct kimage *image, unsigned long min, return 0; out: vfree(pi->sechdrs); + pi->sechdrs = NULL; + vfree(pi->purgatory_buf); + pi->purgatory_buf = NULL; return ret; } -- cgit v1.2.3 From 236dec051078a8691950f56949612b4b74107e48 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 1 Sep 2016 16:14:47 -0700 Subject: kconfig: tinyconfig: provide whole choice blocks to avoid warnings Using "make tinyconfig" produces a couple of annoying warnings that show up for build test machines all the time: .config:966:warning: override: NOHIGHMEM changes choice state .config:965:warning: override: SLOB changes choice state .config:963:warning: override: KERNEL_XZ changes choice state .config:962:warning: override: CC_OPTIMIZE_FOR_SIZE changes choice state .config:933:warning: override: SLOB changes choice state .config:930:warning: override: CC_OPTIMIZE_FOR_SIZE changes choice state .config:870:warning: override: SLOB changes choice state .config:868:warning: override: KERNEL_XZ changes choice state .config:867:warning: override: CC_OPTIMIZE_FOR_SIZE changes choice state I've made a previous attempt at fixing them and we discussed a number of alternatives. I tried changing the Makefile to use "merge_config.sh -n $(fragment-list)" but couldn't get that to work properly. This is yet another approach, based on the observation that we do want to see a warning for conflicting 'choice' options, and that we can simply make them non-conflicting by listing all other options as disabled. This is a trivial patch that we can apply independent of plans for other changes. Link: http://lkml.kernel.org/r/20160829214952.1334674-2-arnd@arndb.de Link: https://storage.kernelci.org/mainline/v4.7-rc6/x86-tinyconfig/build.log https://patchwork.kernel.org/patch/9212749/ Signed-off-by: Arnd Bergmann Reviewed-by: Josh Triplett Reviewed-by: Masahiro Yamada Acked-by: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/configs/tiny.config | 2 ++ kernel/configs/tiny.config | 8 ++++++++ 2 files changed, 10 insertions(+) (limited to 'kernel') diff --git a/arch/x86/configs/tiny.config b/arch/x86/configs/tiny.config index 4e2ecfa23c15..4b429df40d7a 100644 --- a/arch/x86/configs/tiny.config +++ b/arch/x86/configs/tiny.config @@ -1 +1,3 @@ CONFIG_NOHIGHMEM=y +# CONFIG_HIGHMEM4G is not set +# CONFIG_HIGHMEM64G is not set diff --git a/kernel/configs/tiny.config b/kernel/configs/tiny.config index c2de56ab0fce..7fa0c4ae6394 100644 --- a/kernel/configs/tiny.config +++ b/kernel/configs/tiny.config @@ -1,4 +1,12 @@ +# CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE is not set CONFIG_CC_OPTIMIZE_FOR_SIZE=y +# CONFIG_KERNEL_GZIP is not set +# CONFIG_KERNEL_BZIP2 is not set +# CONFIG_KERNEL_LZMA is not set CONFIG_KERNEL_XZ=y +# CONFIG_KERNEL_LZO is not set +# CONFIG_KERNEL_LZ4 is not set CONFIG_OPTIMIZE_INLINING=y +# CONFIG_SLAB is not set +# CONFIG_SLUB is not set CONFIG_SLOB=y -- cgit v1.2.3 From 19feeff18bbfde659baa58c2346f15a24d7c405e Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Thu, 1 Sep 2016 16:15:04 -0700 Subject: printk/nmi: avoid direct printk()-s from __printk_nmi_flush() __printk_nmi_flush() can be called from nmi_panic(), therefore it has to test whether it's executed in NMI context and thus must route the messages through deferred printk() or via direct printk(). This is to avoid potential deadlocks, as described in commit cf9b1106c81c ("printk/nmi: flush NMI messages on the system panic"). However there remain two places where __printk_nmi_flush() does unconditional direct printk() calls: - pr_err("printk_nmi_flush: internal error ...") - pr_cont("\n") Factor out print_nmi_seq_line() parts into a new printk_nmi_flush_line() function, which takes care of in_nmi(), and use it in __printk_nmi_flush() for printing and error-reporting. Link: http://lkml.kernel.org/r/20160830161354.581-1-sergey.senozhatsky@gmail.com Signed-off-by: Sergey Senozhatsky Cc: Petr Mladek Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/nmi.c | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/printk/nmi.c b/kernel/printk/nmi.c index b69eb8a2876f..16bab471c7e2 100644 --- a/kernel/printk/nmi.c +++ b/kernel/printk/nmi.c @@ -99,26 +99,32 @@ again: return add; } -/* - * printk one line from the temporary buffer from @start index until - * and including the @end index. - */ -static void print_nmi_seq_line(struct nmi_seq_buf *s, int start, int end) +static void printk_nmi_flush_line(const char *text, int len) { - const char *buf = s->buffer + start; - /* * The buffers are flushed in NMI only on panic. The messages must * go only into the ring buffer at this stage. Consoles will get * explicitly called later when a crashdump is not generated. */ if (in_nmi()) - printk_deferred("%.*s", (end - start) + 1, buf); + printk_deferred("%.*s", len, text); else - printk("%.*s", (end - start) + 1, buf); + printk("%.*s", len, text); } +/* + * printk one line from the temporary buffer from @start index until + * and including the @end index. + */ +static void printk_nmi_flush_seq_line(struct nmi_seq_buf *s, + int start, int end) +{ + const char *buf = s->buffer + start; + + printk_nmi_flush_line(buf, (end - start) + 1); +} + /* * Flush data from the associated per_CPU buffer. The function * can be called either via IRQ work or independently. @@ -150,9 +156,11 @@ more: * the buffer an unexpected way. If we printed something then * @len must only increase. */ - if (i && i >= len) - pr_err("printk_nmi_flush: internal error: i=%d >= len=%zu\n", - i, len); + if (i && i >= len) { + const char *msg = "printk_nmi_flush: internal error\n"; + + printk_nmi_flush_line(msg, strlen(msg)); + } if (!len) goto out; /* Someone else has already flushed the buffer. */ @@ -166,14 +174,14 @@ more: /* Print line by line. */ for (; i < size; i++) { if (s->buffer[i] == '\n') { - print_nmi_seq_line(s, last_i, i); + printk_nmi_flush_seq_line(s, last_i, i); last_i = i + 1; } } /* Check if there was a partial line. */ if (last_i < size) { - print_nmi_seq_line(s, last_i, size - 1); - pr_cont("\n"); + printk_nmi_flush_seq_line(s, last_i, size - 1); + printk_nmi_flush_line("\n", strlen("\n")); } /* -- cgit v1.2.3 From c11600e4fed67ae4cd6a8096936afd445410e8ed Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Thu, 1 Sep 2016 16:15:07 -0700 Subject: mm, mempolicy: task->mempolicy must be NULL before dropping final reference KASAN allocates memory from the page allocator as part of kmem_cache_free(), and that can reference current->mempolicy through any number of allocation functions. It needs to be NULL'd out before the final reference is dropped to prevent a use-after-free bug: BUG: KASAN: use-after-free in alloc_pages_current+0x363/0x370 at addr ffff88010b48102c CPU: 0 PID: 15425 Comm: trinity-c2 Not tainted 4.8.0-rc2+ #140 ... Call Trace: dump_stack kasan_object_err kasan_report_error __asan_report_load2_noabort alloc_pages_current <-- use after free depot_save_stack save_stack kasan_slab_free kmem_cache_free __mpol_put <-- free do_exit This patch sets current->mempolicy to NULL before dropping the final reference. Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1608301442180.63329@chino.kir.corp.google.com Fixes: cd11016e5f52 ("mm, kasan: stackdepot implementation. Enable stackdepot for SLAB") Signed-off-by: David Rientjes Reported-by: Vegard Nossum Acked-by: Andrey Ryabinin Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: [4.6+] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mempolicy.h | 4 ++++ kernel/exit.c | 7 +------ mm/mempolicy.c | 17 +++++++++++++++++ 3 files changed, 22 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 4429d255c8ab..5e5b2969d931 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -195,6 +195,7 @@ static inline bool vma_migratable(struct vm_area_struct *vma) } extern int mpol_misplaced(struct page *, struct vm_area_struct *, unsigned long); +extern void mpol_put_task_policy(struct task_struct *); #else @@ -297,5 +298,8 @@ static inline int mpol_misplaced(struct page *page, struct vm_area_struct *vma, return -1; /* no node preference */ } +static inline void mpol_put_task_policy(struct task_struct *task) +{ +} #endif /* CONFIG_NUMA */ #endif diff --git a/kernel/exit.c b/kernel/exit.c index 2f974ae042a6..091a78be3b09 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -848,12 +848,7 @@ void do_exit(long code) TASKS_RCU(preempt_enable()); exit_notify(tsk, group_dead); proc_exit_connector(tsk); -#ifdef CONFIG_NUMA - task_lock(tsk); - mpol_put(tsk->mempolicy); - tsk->mempolicy = NULL; - task_unlock(tsk); -#endif + mpol_put_task_policy(tsk); #ifdef CONFIG_FUTEX if (unlikely(current->pi_state_cache)) kfree(current->pi_state_cache); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index d8c4e38fb5f4..2da72a5b6ecc 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -2336,6 +2336,23 @@ out: return ret; } +/* + * Drop the (possibly final) reference to task->mempolicy. It needs to be + * dropped after task->mempolicy is set to NULL so that any allocation done as + * part of its kmem_cache_free(), such as by KASAN, doesn't reference a freed + * policy. + */ +void mpol_put_task_policy(struct task_struct *task) +{ + struct mempolicy *pol; + + task_lock(task); + pol = task->mempolicy; + task->mempolicy = NULL; + task_unlock(task); + mpol_put(pol); +} + static void sp_delete(struct shared_policy *sp, struct sp_node *n) { pr_debug("deleting %lx-l%lx\n", n->start, n->end); -- cgit v1.2.3 From 735f2770a770156100f534646158cb58cb8b2939 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 1 Sep 2016 16:15:13 -0700 Subject: kernel/fork: fix CLONE_CHILD_CLEARTID regression in nscd Commit fec1d0115240 ("[PATCH] Disable CLONE_CHILD_CLEARTID for abnormal exit") has caused a subtle regression in nscd which uses CLONE_CHILD_CLEARTID to clear the nscd_certainly_running flag in the shared databases, so that the clients are notified when nscd is restarted. Now, when nscd uses a non-persistent database, clients that have it mapped keep thinking the database is being updated by nscd, when in fact nscd has created a new (anonymous) one (for non-persistent databases it uses an unlinked file as backend). The original proposal for the CLONE_CHILD_CLEARTID change claimed (https://lkml.org/lkml/2006/10/25/233): : The NPTL library uses the CLONE_CHILD_CLEARTID flag on clone() syscalls : on behalf of pthread_create() library calls. This feature is used to : request that the kernel clear the thread-id in user space (at an address : provided in the syscall) when the thread disassociates itself from the : address space, which is done in mm_release(). : : Unfortunately, when a multi-threaded process incurs a core dump (such as : from a SIGSEGV), the core-dumping thread sends SIGKILL signals to all of : the other threads, which then proceed to clear their user-space tids : before synchronizing in exit_mm() with the start of core dumping. This : misrepresents the state of process's address space at the time of the : SIGSEGV and makes it more difficult for someone to debug NPTL and glibc : problems (misleading him/her to conclude that the threads had gone away : before the fault). : : The fix below is to simply avoid the CLONE_CHILD_CLEARTID action if a : core dump has been initiated. The resulting patch from Roland (https://lkml.org/lkml/2006/10/26/269) seems to have a larger scope than the original patch asked for. It seems that limitting the scope of the check to core dumping should work for SIGSEGV issue describe above. [Changelog partly based on Andreas' description] Fixes: fec1d0115240 ("[PATCH] Disable CLONE_CHILD_CLEARTID for abnormal exit") Link: http://lkml.kernel.org/r/1471968749-26173-1-git-send-email-mhocko@kernel.org Signed-off-by: Michal Hocko Tested-by: William Preston Acked-by: Oleg Nesterov Cc: Roland McGrath Cc: Andreas Schwab Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/fork.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index aaf782327bf3..93bdba13d7d9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -913,14 +913,12 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm) deactivate_mm(tsk, mm); /* - * If we're exiting normally, clear a user-space tid field if - * requested. We leave this alone when dying by signal, to leave - * the value intact in a core dump, and to save the unnecessary - * trouble, say, a killed vfork parent shouldn't touch this mm. - * Userland only wants this done for a sys_exit. + * Signal userspace if we're not exiting with a core dump + * because we want to leave the value intact for debugging + * purposes. */ if (tsk->clear_child_tid) { - if (!(tsk->flags & PF_SIGNALED) && + if (!(tsk->signal->flags & SIGNAL_GROUP_COREDUMP) && atomic_read(&mm->mm_users) > 1) { /* * We don't check the error code - if userspace has -- cgit v1.2.3 From 08d072599234c959b0b82b63fa252c129225a899 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Fri, 2 Sep 2016 14:38:23 +0800 Subject: tick/nohz: Fix softlockup on scheduler stalls in kvm guest MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit tick_nohz_start_idle() is prevented to be called if the idle tick can't be stopped since commit 1f3b0f8243cb934 ("tick/nohz: Optimize nohz idle enter"). As a result, after suspend/resume the host machine, full dynticks kvm guest will softlockup: NMI watchdog: BUG: soft lockup - CPU#0 stuck for 26s! [swapper/0:0] Call Trace: default_idle+0x31/0x1a0 arch_cpu_idle+0xf/0x20 default_idle_call+0x2a/0x50 cpu_startup_entry+0x39b/0x4d0 rest_init+0x138/0x140 ? rest_init+0x5/0x140 start_kernel+0x4c1/0x4ce ? set_init_arg+0x55/0x55 ? early_idt_handler_array+0x120/0x120 x86_64_start_reservations+0x24/0x26 x86_64_start_kernel+0x142/0x14f In addition, cat /proc/stat | grep cpu in guest or host: cpu 398 16 5049 15754 5490 0 1 46 0 0 cpu0 206 5 450 0 0 0 1 14 0 0 cpu1 81 0 3937 3149 1514 0 0 9 0 0 cpu2 45 6 332 6052 2243 0 0 11 0 0 cpu3 65 2 328 6552 1732 0 0 11 0 0 The idle and iowait states are weird 0 for cpu0(housekeeping). The bug is present in both guest and host kernels, and they both have cpu0's idle and iowait states issue, however, host kernel's suspend/resume path etc will touch watchdog to avoid the softlockup. - The watchdog will not be touched in tick_nohz_stop_idle path (need be touched since the scheduler stall is expected) if idle_active flags are not detected. - The idle and iowait states will not be accounted when exit idle loop (resched or interrupt) if idle start time and idle_active flags are not set. This patch fixes it by reverting commit 1f3b0f8243cb934 since can't stop idle tick doesn't mean can't be idle. Fixes: 1f3b0f8243cb934 ("tick/nohz: Optimize nohz idle enter") Signed-off-by: Wanpeng Li Cc: Sanjeev Yadav Cc: Gaurav Jindal Cc: stable@vger.kernel.org Cc: kvm@vger.kernel.org Cc: Radim Krčmář Cc: Peter Zijlstra Cc: Paolo Bonzini Link: http://lkml.kernel.org/r/1472798303-4154-1-git-send-email-wanpeng.li@hotmail.com Signed-off-by: Thomas Gleixner --- kernel/time/tick-sched.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 204fdc86863d..2ec7c00228f3 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -908,10 +908,11 @@ static void __tick_nohz_idle_enter(struct tick_sched *ts) ktime_t now, expires; int cpu = smp_processor_id(); + now = tick_nohz_start_idle(ts); + if (can_stop_idle_tick(cpu, ts)) { int was_stopped = ts->tick_stopped; - now = tick_nohz_start_idle(ts); ts->idle_calls++; expires = tick_nohz_stop_sched_tick(ts, now, cpu); -- cgit v1.2.3 From c86d06ba2818c5126078cb0cf4e0175ec381045b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 5 Sep 2016 08:38:13 -0400 Subject: PM / QoS: avoid calling cancel_delayed_work_sync() during early boot of_clk_init() ends up calling into pm_qos_update_request() very early during boot where irq is expected to stay disabled. pm_qos_update_request() uses cancel_delayed_work_sync() which correctly assumes that irq is enabled on invocation and unconditionally disables and re-enables it. Gate cancel_delayed_work_sync() invocation with kevented_up() to avoid enabling irq unexpectedly during early boot. Signed-off-by: Tejun Heo Reported-and-tested-by: Qiao Zhou Link: http://lkml.kernel.org/r/d2501c4c-8e7b-bea3-1b01-000b36b5dfe9@asrmicro.com Signed-off-by: Rafael J. Wysocki --- kernel/power/qos.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/power/qos.c b/kernel/power/qos.c index 97b0df71303e..168ff442ebde 100644 --- a/kernel/power/qos.c +++ b/kernel/power/qos.c @@ -482,7 +482,16 @@ void pm_qos_update_request(struct pm_qos_request *req, return; } - cancel_delayed_work_sync(&req->work); + /* + * This function may be called very early during boot, for example, + * from of_clk_init(), where irq needs to stay disabled. + * cancel_delayed_work_sync() assumes that irq is enabled on + * invocation and re-enables it on return. Avoid calling it until + * workqueue is initialized. + */ + if (keventd_up()) + cancel_delayed_work_sync(&req->work); + __pm_qos_update_request(req, new_value); } EXPORT_SYMBOL_GPL(pm_qos_update_request); -- cgit v1.2.3 From 9049771f7d5490a302589976984810064c83ab40 Mon Sep 17 00:00:00 2001 From: Dan Williams Date: Wed, 7 Sep 2016 08:51:21 -0700 Subject: mm: fix cache mode of dax pmd mappings track_pfn_insert() in vmf_insert_pfn_pmd() is marking dax mappings as uncacheable rendering them impractical for application usage. DAX-pte mappings are cached and the goal of establishing DAX-pmd mappings is to attain more performance, not dramatically less (3 orders of magnitude). track_pfn_insert() relies on a previous call to reserve_memtype() to establish the expected page_cache_mode for the range. While memremap() arranges for reserve_memtype() to be called, devm_memremap_pages() does not. So, teach track_pfn_insert() and untrack_pfn() how to handle tracking without a vma, and arrange for devm_memremap_pages() to establish the write-back-cache reservation in the memtype tree. Cc: Cc: Matthew Wilcox Cc: Ross Zwisler Cc: Nilesh Choudhury Cc: Kirill A. Shutemov Reported-by: Toshi Kani Reported-by: Kai Zhang Acked-by: Andrew Morton Signed-off-by: Dan Williams --- arch/x86/mm/pat.c | 17 ++++++++++------- kernel/memremap.c | 9 +++++++++ 2 files changed, 19 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/arch/x86/mm/pat.c b/arch/x86/mm/pat.c index ecb1b69c1651..170cc4ff057b 100644 --- a/arch/x86/mm/pat.c +++ b/arch/x86/mm/pat.c @@ -927,9 +927,10 @@ int track_pfn_copy(struct vm_area_struct *vma) } /* - * prot is passed in as a parameter for the new mapping. If the vma has a - * linear pfn mapping for the entire range reserve the entire vma range with - * single reserve_pfn_range call. + * prot is passed in as a parameter for the new mapping. If the vma has + * a linear pfn mapping for the entire range, or no vma is provided, + * reserve the entire pfn + size range with single reserve_pfn_range + * call. */ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, unsigned long pfn, unsigned long addr, unsigned long size) @@ -938,11 +939,12 @@ int track_pfn_remap(struct vm_area_struct *vma, pgprot_t *prot, enum page_cache_mode pcm; /* reserve the whole chunk starting from paddr */ - if (addr == vma->vm_start && size == (vma->vm_end - vma->vm_start)) { + if (!vma || (addr == vma->vm_start + && size == (vma->vm_end - vma->vm_start))) { int ret; ret = reserve_pfn_range(paddr, size, prot, 0); - if (!ret) + if (ret == 0 && vma) vma->vm_flags |= VM_PAT; return ret; } @@ -997,7 +999,7 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, resource_size_t paddr; unsigned long prot; - if (!(vma->vm_flags & VM_PAT)) + if (vma && !(vma->vm_flags & VM_PAT)) return; /* free the chunk starting from pfn or the whole chunk */ @@ -1011,7 +1013,8 @@ void untrack_pfn(struct vm_area_struct *vma, unsigned long pfn, size = vma->vm_end - vma->vm_start; } free_pfn_range(paddr, size); - vma->vm_flags &= ~VM_PAT; + if (vma) + vma->vm_flags &= ~VM_PAT; } /* diff --git a/kernel/memremap.c b/kernel/memremap.c index 251d16b4cb41..b501e390bb34 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -247,6 +247,7 @@ static void devm_memremap_pages_release(struct device *dev, void *data) align_start = res->start & ~(SECTION_SIZE - 1); align_size = ALIGN(resource_size(res), SECTION_SIZE); arch_remove_memory(align_start, align_size); + untrack_pfn(NULL, PHYS_PFN(align_start), align_size); pgmap_radix_release(res); dev_WARN_ONCE(dev, pgmap->altmap && pgmap->altmap->alloc, "%s: failed to free all reserved pages\n", __func__); @@ -282,6 +283,7 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, struct percpu_ref *ref, struct vmem_altmap *altmap) { resource_size_t key, align_start, align_size, align_end; + pgprot_t pgprot = PAGE_KERNEL; struct dev_pagemap *pgmap; struct page_map *page_map; int error, nid, is_ram; @@ -351,6 +353,11 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, if (nid < 0) nid = numa_mem_id(); + error = track_pfn_remap(NULL, &pgprot, PHYS_PFN(align_start), 0, + align_size); + if (error) + goto err_pfn_remap; + error = arch_add_memory(nid, align_start, align_size, true); if (error) goto err_add_memory; @@ -371,6 +378,8 @@ void *devm_memremap_pages(struct device *dev, struct resource *res, return __va(res->start); err_add_memory: + untrack_pfn(NULL, PHYS_PFN(align_start), align_size); + err_pfn_remap: err_radix: pgmap_radix_release(res); devres_free(page_map); -- cgit v1.2.3