From 90228fc110303549aa1d4d86083bf585df8624c3 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 23 Dec 2012 03:33:38 -0500 Subject: switch compat_sys_sigaltstack() to COMPAT_SYSCALL_DEFINE Makes sigaltstack conversion easier to split into per-architecture parts. Signed-off-by: Al Viro --- kernel/signal.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/signal.c b/kernel/signal.c index 7aaa51d8e5b8..00b4a6d4449d 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -3119,8 +3119,9 @@ int __save_altstack(stack_t __user *uss, unsigned long sp) #ifdef CONFIG_COMPAT #ifdef CONFIG_GENERIC_SIGALTSTACK -asmlinkage long compat_sys_sigaltstack(const compat_stack_t __user *uss_ptr, - compat_stack_t __user *uoss_ptr) +COMPAT_SYSCALL_DEFINE2(sigaltstack, + const compat_stack_t __user *, uss_ptr, + compat_stack_t __user *, uoss_ptr) { stack_t uss, uoss; int ret; -- cgit v1.2.3 From 8d9807b109497ca41d363dc7b6ff2bb6c0d52524 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 23 Dec 2012 14:56:40 -0500 Subject: switch compat_sys_wait4() and compat_sys_waitid() to COMPAT_SYSCALL_DEFINE Strictly speaking, ppc64 needs it for C ABI compliance. Realistically I would be very surprised if e.g. passing 0xffffffff as 'options' argument to waitid() from 32bit task would cause problems, but yes, it puts us into undefined behaviour territory. ppc64 expects int argument to be passed in 64bit register with bits 31..63 containing the same value. SYSCALL_DEFINE on ppc provides a wrapper that normalizes the value passed from userland; so does COMPAT_SYSCALL_DEFINE. Plain declaration of compat_sys_something() with an int argument obviously doesn't. Again, for wait4 and waitid I would be extremely surprised if gcc started to produce code depending on that value having been properly sign-extended - the argument(s) in question end up passed blindly to sys_wait4 and sys_waitid resp. and normalization for native syscalls takes care of their use there. Still, better to use COMPAT_SYSCALL_DEFINE here than worry about nasal daemons... Signed-off-by: Al Viro --- kernel/compat.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index f6150e92dfc9..0770ac57c62b 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -535,9 +535,11 @@ asmlinkage long compat_sys_getrusage(int who, struct compat_rusage __user *ru) return 0; } -asmlinkage long -compat_sys_wait4(compat_pid_t pid, compat_uint_t __user *stat_addr, int options, - struct compat_rusage __user *ru) +COMPAT_SYSCALL_DEFINE4(wait4, + compat_pid_t, pid, + compat_uint_t __user *, stat_addr, + int, options, + struct compat_rusage __user *, ru) { if (!ru) { return sys_wait4(pid, stat_addr, options, NULL); @@ -564,9 +566,10 @@ compat_sys_wait4(compat_pid_t pid, compat_uint_t __user *stat_addr, int options, } } -asmlinkage long compat_sys_waitid(int which, compat_pid_t pid, - struct compat_siginfo __user *uinfo, int options, - struct compat_rusage __user *uru) +COMPAT_SYSCALL_DEFINE5(waitid, + int, which, compat_pid_t, pid, + struct compat_siginfo __user *, uinfo, int, options, + struct compat_rusage __user *, uru) { siginfo_t info; struct rusage ru; -- cgit v1.2.3 From a566c288826ad4502e43b59570214f18173d7744 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 23 Dec 2012 23:14:49 -0500 Subject: x32: fix waitid() It needs 64bit rusage and 32bit siginfo. glibc never calls it with non-NULL rusage pointer, or we would've seen breakage already... Signed-off-by: Al Viro --- kernel/compat.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index 0770ac57c62b..e5cc33c7122c 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -587,7 +587,11 @@ COMPAT_SYSCALL_DEFINE5(waitid, return ret; if (uru) { - ret = put_compat_rusage(&ru, uru); + /* sys_waitid() overwrites everything in ru */ + if (COMPAT_USE_64BIT_TIME) + ret = copy_to_user(uru, &ru, sizeof(ru)); + else + ret = put_compat_rusage(&ru, uru); if (ret) return ret; } -- cgit v1.2.3 From b2ddedcd21f44a5873ee3d6ff6118a2318e01e18 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 24 Dec 2012 12:31:00 -0500 Subject: x32: fix sigtimedwait It needs 64bit timespec. As it is, we end up truncating the timeout to whole seconds; usually it doesn't matter, but for having all sub-second timeouts truncated to one jiffy is visibly wrong. Signed-off-by: Al Viro --- kernel/compat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/compat.c b/kernel/compat.c index e5cc33c7122c..36700e9e2be9 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -1001,7 +1001,7 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, sigset_from_compat(&s, &s32); if (uts) { - if (get_compat_timespec(&t, uts)) + if (compat_get_timespec(&t, uts)) return -EFAULT; } -- cgit v1.2.3 From 52441fa8f2f1ccc9fa97607c6ccf8b46b9fd15ae Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Thu, 3 Jan 2013 11:09:53 +1030 Subject: module: prevent warning when finit_module a 0 sized file If we try to finit_module on a file sized 0 bytes vmalloc will scream and spit out a warning. Since modules have to be bigger than 0 bytes anyways we can just check that beforehand and avoid the warning. Signed-off-by: Sasha Levin Signed-off-by: Rusty Russell --- kernel/module.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 250092c1d57d..41bc1189b061 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2527,6 +2527,13 @@ static int copy_module_from_fd(int fd, struct load_info *info) err = -EFBIG; goto out; } + + /* Don't hand 0 to vmalloc, it whines. */ + if (stat.size == 0) { + err = -EINVAL; + goto out; + } + info->hdr = vmalloc(stat.size); if (!info->hdr) { err = -ENOMEM; -- cgit v1.2.3 From 2df8f8a6a897ebf4c5613b5be6103d33b2a21520 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 11 Jan 2013 16:14:10 -0500 Subject: tracing: Fix regression with irqsoff tracer and tracing_on file Commit 02404baf1b47 "tracing: Remove deprecated tracing_enabled file" removed the tracing_enabled file as it never worked properly and the tracing_on file should be used instead. But the tracing_on file didn't call into the tracers start/stop routines like the tracing_enabled file did. This caused trace-cmd to break when it enabled the irqsoff tracer. If you just did "echo irqsoff > current_tracer" then it would work properly. But the tool trace-cmd disables tracing first by writing "0" into the tracing_on file. Then it writes "irqsoff" into current_tracer and then writes "1" into tracing_on. Unfortunately, the above commit changed the irqsoff tracer to check the tracing_on status instead of the tracing_enabled status. If it's disabled then it does not start the tracer internals. The problem is that writing "1" into tracing_on does not call the tracers "start" routine like writing "1" into tracing_enabled did. This makes the irqsoff tracer not start when using the trace-cmd tool, and is a regression for userspace. Simple fix is to have the tracing_on file call the tracers start() method when being enabled (and the stop() method when disabled). Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 1bbfa0446507..f3ec1cfb0de1 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4817,10 +4817,17 @@ rb_simple_write(struct file *filp, const char __user *ubuf, return ret; if (buffer) { - if (val) + mutex_lock(&trace_types_lock); + if (val) { ring_buffer_record_on(buffer); - else + if (current_trace->start) + current_trace->start(tr); + } else { ring_buffer_record_off(buffer); + if (current_trace->stop) + current_trace->stop(tr); + } + mutex_unlock(&trace_types_lock); } (*ppos)++; -- cgit v1.2.3 From 0d21b0e3477395e7ff2acc269f15df6e6a8d356d Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 12 Jan 2013 11:38:44 +1030 Subject: module: add new state MODULE_STATE_UNFORMED. You should never look at such a module, so it's excised from all paths which traverse the modules list. We add the state at the end, to avoid gratuitous ABI break (ksplice). Signed-off-by: Rusty Russell --- include/linux/module.h | 10 ++++---- kernel/debug/kdb/kdb_main.c | 2 ++ kernel/module.c | 57 +++++++++++++++++++++++++++++++++++++++++---- 3 files changed, 59 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/include/linux/module.h b/include/linux/module.h index 7760c6d344a3..1375ee3f03aa 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -199,11 +199,11 @@ struct module_use { struct module *source, *target; }; -enum module_state -{ - MODULE_STATE_LIVE, - MODULE_STATE_COMING, - MODULE_STATE_GOING, +enum module_state { + MODULE_STATE_LIVE, /* Normal state. */ + MODULE_STATE_COMING, /* Full formed, running module_init. */ + MODULE_STATE_GOING, /* Going away. */ + MODULE_STATE_UNFORMED, /* Still setting it up. */ }; /** diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 4d5f8d5612f3..8875254120b6 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1970,6 +1970,8 @@ static int kdb_lsmod(int argc, const char **argv) kdb_printf("Module Size modstruct Used by\n"); list_for_each_entry(mod, kdb_modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; kdb_printf("%-20s%8u 0x%p ", mod->name, mod->core_size, (void *)mod); diff --git a/kernel/module.c b/kernel/module.c index 41bc1189b061..c3a2ee8e3679 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -188,6 +188,7 @@ struct load_info { ongoing or failed initialization etc. */ static inline int strong_try_module_get(struct module *mod) { + BUG_ON(mod && mod->state == MODULE_STATE_UNFORMED); if (mod && mod->state == MODULE_STATE_COMING) return -EBUSY; if (try_module_get(mod)) @@ -343,6 +344,9 @@ bool each_symbol_section(bool (*fn)(const struct symsearch *arr, #endif }; + if (mod->state == MODULE_STATE_UNFORMED) + continue; + if (each_symbol_in_section(arr, ARRAY_SIZE(arr), mod, fn, data)) return true; } @@ -450,16 +454,24 @@ const struct kernel_symbol *find_symbol(const char *name, EXPORT_SYMBOL_GPL(find_symbol); /* Search for module by name: must hold module_mutex. */ -struct module *find_module(const char *name) +static struct module *find_module_all(const char *name, + bool even_unformed) { struct module *mod; list_for_each_entry(mod, &modules, list) { + if (!even_unformed && mod->state == MODULE_STATE_UNFORMED) + continue; if (strcmp(mod->name, name) == 0) return mod; } return NULL; } + +struct module *find_module(const char *name) +{ + return find_module_all(name, false); +} EXPORT_SYMBOL_GPL(find_module); #ifdef CONFIG_SMP @@ -525,6 +537,8 @@ bool is_module_percpu_address(unsigned long addr) preempt_disable(); list_for_each_entry_rcu(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; if (!mod->percpu_size) continue; for_each_possible_cpu(cpu) { @@ -1048,6 +1062,8 @@ static ssize_t show_initstate(struct module_attribute *mattr, case MODULE_STATE_GOING: state = "going"; break; + default: + BUG(); } return sprintf(buffer, "%s\n", state); } @@ -1786,6 +1802,8 @@ void set_all_modules_text_rw(void) mutex_lock(&module_mutex); list_for_each_entry_rcu(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; if ((mod->module_core) && (mod->core_text_size)) { set_page_attributes(mod->module_core, mod->module_core + mod->core_text_size, @@ -1807,6 +1825,8 @@ void set_all_modules_text_ro(void) mutex_lock(&module_mutex); list_for_each_entry_rcu(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; if ((mod->module_core) && (mod->core_text_size)) { set_page_attributes(mod->module_core, mod->module_core + mod->core_text_size, @@ -2998,7 +3018,8 @@ static bool finished_loading(const char *name) mutex_lock(&module_mutex); mod = find_module(name); - ret = !mod || mod->state != MODULE_STATE_COMING; + ret = !mod || mod->state == MODULE_STATE_LIVE + || mod->state == MODULE_STATE_GOING; mutex_unlock(&module_mutex); return ret; @@ -3361,6 +3382,8 @@ const char *module_address_lookup(unsigned long addr, preempt_disable(); list_for_each_entry_rcu(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; if (within_module_init(addr, mod) || within_module_core(addr, mod)) { if (modname) @@ -3384,6 +3407,8 @@ int lookup_module_symbol_name(unsigned long addr, char *symname) preempt_disable(); list_for_each_entry_rcu(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; if (within_module_init(addr, mod) || within_module_core(addr, mod)) { const char *sym; @@ -3408,6 +3433,8 @@ int lookup_module_symbol_attrs(unsigned long addr, unsigned long *size, preempt_disable(); list_for_each_entry_rcu(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; if (within_module_init(addr, mod) || within_module_core(addr, mod)) { const char *sym; @@ -3435,6 +3462,8 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type, preempt_disable(); list_for_each_entry_rcu(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; if (symnum < mod->num_symtab) { *value = mod->symtab[symnum].st_value; *type = mod->symtab[symnum].st_info; @@ -3477,9 +3506,12 @@ unsigned long module_kallsyms_lookup_name(const char *name) ret = mod_find_symname(mod, colon+1); *colon = ':'; } else { - list_for_each_entry_rcu(mod, &modules, list) + list_for_each_entry_rcu(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; if ((ret = mod_find_symname(mod, name)) != 0) break; + } } preempt_enable(); return ret; @@ -3494,6 +3526,8 @@ int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *, int ret; list_for_each_entry(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; for (i = 0; i < mod->num_symtab; i++) { ret = fn(data, mod->strtab + mod->symtab[i].st_name, mod, mod->symtab[i].st_value); @@ -3509,6 +3543,7 @@ static char *module_flags(struct module *mod, char *buf) { int bx = 0; + BUG_ON(mod->state == MODULE_STATE_UNFORMED); if (mod->taints || mod->state == MODULE_STATE_GOING || mod->state == MODULE_STATE_COMING) { @@ -3550,6 +3585,10 @@ static int m_show(struct seq_file *m, void *p) struct module *mod = list_entry(p, struct module, list); char buf[8]; + /* We always ignore unformed modules. */ + if (mod->state == MODULE_STATE_UNFORMED) + return 0; + seq_printf(m, "%s %u", mod->name, mod->init_size + mod->core_size); print_unload_info(m, mod); @@ -3610,6 +3649,8 @@ const struct exception_table_entry *search_module_extables(unsigned long addr) preempt_disable(); list_for_each_entry_rcu(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; if (mod->num_exentries == 0) continue; @@ -3658,10 +3699,13 @@ struct module *__module_address(unsigned long addr) if (addr < module_addr_min || addr > module_addr_max) return NULL; - list_for_each_entry_rcu(mod, &modules, list) + list_for_each_entry_rcu(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; if (within_module_core(addr, mod) || within_module_init(addr, mod)) return mod; + } return NULL; } EXPORT_SYMBOL_GPL(__module_address); @@ -3714,8 +3758,11 @@ void print_modules(void) printk(KERN_DEFAULT "Modules linked in:"); /* Most callers should already have preempt disabled, but make sure */ preempt_disable(); - list_for_each_entry_rcu(mod, &modules, list) + list_for_each_entry_rcu(mod, &modules, list) { + if (mod->state == MODULE_STATE_UNFORMED) + continue; printk(" %s%s", mod->name, module_flags(mod, buf)); + } preempt_enable(); if (last_unloaded_module[0]) printk(" [last unloaded: %s]", last_unloaded_module); -- cgit v1.2.3 From 1fb9341ac34825aa40354e74d9a2c69df7d2c304 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Sat, 12 Jan 2013 13:27:34 +1030 Subject: module: put modules in list much earlier. Prarit's excellent bug report: > In recent Fedora releases (F17 & F18) some users have reported seeing > messages similar to > > [ 15.478160] kvm: Could not allocate 304 bytes percpu data > [ 15.478174] PERCPU: allocation failed, size=304 align=32, alloc from > reserved chunk failed > > during system boot. In some cases, users have also reported seeing this > message along with a failed load of other modules. > > What is happening is systemd is loading an instance of the kvm module for > each cpu found (see commit e9bda3b). When the module load occurs the kernel > currently allocates the modules percpu data area prior to checking to see > if the module is already loaded or is in the process of being loaded. If > the module is already loaded, or finishes load, the module loading code > releases the current instance's module's percpu data. Now we have a new state MODULE_STATE_UNFORMED, we can insert the module into the list (and thus guarantee its uniqueness) before we allocate the per-cpu region. Reported-by: Prarit Bhargava Signed-off-by: Rusty Russell Tested-by: Prarit Bhargava --- kernel/module.c | 90 +++++++++++++++++++++++++++++++-------------------------- lib/bug.c | 1 + 2 files changed, 50 insertions(+), 41 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index c3a2ee8e3679..ec535aa63c98 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3017,7 +3017,7 @@ static bool finished_loading(const char *name) bool ret; mutex_lock(&module_mutex); - mod = find_module(name); + mod = find_module_all(name, true); ret = !mod || mod->state == MODULE_STATE_LIVE || mod->state == MODULE_STATE_GOING; mutex_unlock(&module_mutex); @@ -3141,6 +3141,32 @@ static int load_module(struct load_info *info, const char __user *uargs, goto free_copy; } + /* + * We try to place it in the list now to make sure it's unique + * before we dedicate too many resources. In particular, + * temporary percpu memory exhaustion. + */ + mod->state = MODULE_STATE_UNFORMED; +again: + mutex_lock(&module_mutex); + if ((old = find_module_all(mod->name, true)) != NULL) { + if (old->state == MODULE_STATE_COMING + || old->state == MODULE_STATE_UNFORMED) { + /* Wait in case it fails to load. */ + mutex_unlock(&module_mutex); + err = wait_event_interruptible(module_wq, + finished_loading(mod->name)); + if (err) + goto free_module; + goto again; + } + err = -EEXIST; + mutex_unlock(&module_mutex); + goto free_module; + } + list_add_rcu(&mod->list, &modules); + mutex_unlock(&module_mutex); + #ifdef CONFIG_MODULE_SIG mod->sig_ok = info->sig_ok; if (!mod->sig_ok) @@ -3150,7 +3176,7 @@ static int load_module(struct load_info *info, const char __user *uargs, /* Now module is in final location, initialize linked lists, etc. */ err = module_unload_init(mod); if (err) - goto free_module; + goto unlink_mod; /* Now we've got everything in the final locations, we can * find optional sections. */ @@ -3185,54 +3211,33 @@ static int load_module(struct load_info *info, const char __user *uargs, goto free_arch_cleanup; } - /* Mark state as coming so strong_try_module_get() ignores us. */ - mod->state = MODULE_STATE_COMING; - - /* Now sew it into the lists so we can get lockdep and oops - * info during argument parsing. No one should access us, since - * strong_try_module_get() will fail. - * lockdep/oops can run asynchronous, so use the RCU list insertion - * function to insert in a way safe to concurrent readers. - * The mutex protects against concurrent writers. - */ -again: - mutex_lock(&module_mutex); - if ((old = find_module(mod->name)) != NULL) { - if (old->state == MODULE_STATE_COMING) { - /* Wait in case it fails to load. */ - mutex_unlock(&module_mutex); - err = wait_event_interruptible(module_wq, - finished_loading(mod->name)); - if (err) - goto free_arch_cleanup; - goto again; - } - err = -EEXIST; - goto unlock; - } - - /* This has to be done once we're sure module name is unique. */ dynamic_debug_setup(info->debug, info->num_debug); - /* Find duplicate symbols */ + mutex_lock(&module_mutex); + /* Find duplicate symbols (must be called under lock). */ err = verify_export_symbols(mod); if (err < 0) - goto ddebug; + goto ddebug_cleanup; + /* This relies on module_mutex for list integrity. */ module_bug_finalize(info->hdr, info->sechdrs, mod); - list_add_rcu(&mod->list, &modules); + + /* Mark state as coming so strong_try_module_get() ignores us, + * but kallsyms etc. can see us. */ + mod->state = MODULE_STATE_COMING; + mutex_unlock(&module_mutex); /* Module is ready to execute: parsing args may do that. */ err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, -32768, 32767, &ddebug_dyndbg_module_param_cb); if (err < 0) - goto unlink; + goto bug_cleanup; /* Link in to syfs. */ err = mod_sysfs_setup(mod, info, mod->kp, mod->num_kp); if (err < 0) - goto unlink; + goto bug_cleanup; /* Get rid of temporary copy. */ free_copy(info); @@ -3242,16 +3247,13 @@ again: return do_init_module(mod); - unlink: + bug_cleanup: + /* module_bug_cleanup needs module_mutex protection */ mutex_lock(&module_mutex); - /* Unlink carefully: kallsyms could be walking list. */ - list_del_rcu(&mod->list); module_bug_cleanup(mod); - wake_up_all(&module_wq); - ddebug: - dynamic_debug_remove(info->debug); - unlock: mutex_unlock(&module_mutex); + ddebug_cleanup: + dynamic_debug_remove(info->debug); synchronize_sched(); kfree(mod->args); free_arch_cleanup: @@ -3260,6 +3262,12 @@ again: free_modinfo(mod); free_unload: module_unload_free(mod); + unlink_mod: + mutex_lock(&module_mutex); + /* Unlink carefully: kallsyms could be walking list. */ + list_del_rcu(&mod->list); + wake_up_all(&module_wq); + mutex_unlock(&module_mutex); free_module: module_deallocate(mod, info); free_copy: diff --git a/lib/bug.c b/lib/bug.c index a28c1415357c..d0cdf14c651a 100644 --- a/lib/bug.c +++ b/lib/bug.c @@ -55,6 +55,7 @@ static inline unsigned long bug_addr(const struct bug_entry *bug) } #ifdef CONFIG_MODULES +/* Updates are protected by module mutex */ static LIST_HEAD(module_bug_list); static const struct bug_entry *module_find_bug(unsigned long bugaddr) -- cgit v1.2.3 From 250bfd3d8e7e19cb649dd94689f0af2ce3474060 Mon Sep 17 00:00:00 2001 From: Liu Bo Date: Mon, 14 Jan 2013 10:54:11 +0800 Subject: tracing: Fix regression of trace_pipe Commit 0fb9656d "tracing: Make tracing_enabled be equal to tracing_on" changes the behaviour of trace_pipe, ie. it makes trace_pipe return if we've read something and tracing is enabled, and this means that we have to 'cat trace_pipe' again and again while running tests. IMO the right way is if tracing is enabled, we always block and wait for ring buffer, or we may lose what we want since ring buffer's size is limited. Link: http://lkml.kernel.org/r/1358132051-5410-1-git-send-email-bo.li.liu@oracle.com Signed-off-by: Liu Bo Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f3ec1cfb0de1..3c13e46d7d24 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3454,7 +3454,7 @@ static int tracing_wait_pipe(struct file *filp) return -EINTR; /* - * We block until we read something and tracing is enabled. + * We block until we read something and tracing is disabled. * We still block if tracing is disabled, but we have never * read anything. This allows a user to cat this file, and * then enable tracing. But after we have read something, @@ -3462,7 +3462,7 @@ static int tracing_wait_pipe(struct file *filp) * * iter->pos will be 0 if we haven't read anything. */ - if (tracing_is_enabled() && iter->pos) + if (!tracing_is_enabled() && iter->pos) break; } -- cgit v1.2.3 From 774a1221e862b343388347bac9b318767336b20b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 15 Jan 2013 18:52:51 -0800 Subject: module, async: async_synchronize_full() on module init iff async is used If the default iosched is built as module, the kernel may deadlock while trying to load the iosched module on device probe if the probing was running off async. This is because async_synchronize_full() at the end of module init ends up waiting for the async job which initiated the module loading. async A modprobe 1. finds a device 2. registers the block device 3. request_module(default iosched) 4. modprobe in userland 5. load and init module 6. async_synchronize_full() Async A waits for modprobe to finish in request_module() and modprobe waits for async A to finish in async_synchronize_full(). Because there's no easy to track dependency once control goes out to userland, implementing properly nested flushing is difficult. For now, make module init perform async_synchronize_full() iff module init has queued async jobs as suggested by Linus. This avoids the described deadlock because iosched module doesn't use async and thus wouldn't invoke async_synchronize_full(). This is hacky and incomplete. It will deadlock if async module loading nests; however, this works around the known problem case and seems to be the best of bad options. For more details, please refer to the following thread. http://thread.gmane.org/gmane.linux.kernel/1420814 Signed-off-by: Tejun Heo Reported-by: Alex Riesen Tested-by: Ming Lei Tested-by: Alex Riesen Cc: Arjan van de Ven Cc: Jens Axboe Signed-off-by: Linus Torvalds --- include/linux/sched.h | 1 + kernel/async.c | 3 +++ kernel/module.c | 27 +++++++++++++++++++++++++-- 3 files changed, 29 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 206bb089c06b..6fc8f45de4e9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1810,6 +1810,7 @@ extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, #define PF_MEMALLOC 0x00000800 /* Allocating memory */ #define PF_NPROC_EXCEEDED 0x00001000 /* set_user noticed that RLIMIT_NPROC was exceeded */ #define PF_USED_MATH 0x00002000 /* if unset the fpu must be initialized before use */ +#define PF_USED_ASYNC 0x00004000 /* used async_schedule*(), used by module init */ #define PF_NOFREEZE 0x00008000 /* this thread should not be frozen */ #define PF_FROZEN 0x00010000 /* frozen for system suspend */ #define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */ diff --git a/kernel/async.c b/kernel/async.c index 9d3118384858..a1d585c351d6 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -196,6 +196,9 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct a atomic_inc(&entry_count); spin_unlock_irqrestore(&async_lock, flags); + /* mark that this task has queued an async job, used by module init */ + current->flags |= PF_USED_ASYNC; + /* schedule for execution */ queue_work(system_unbound_wq, &entry->work); diff --git a/kernel/module.c b/kernel/module.c index 250092c1d57d..b10b048367e1 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3013,6 +3013,12 @@ static int do_init_module(struct module *mod) { int ret = 0; + /* + * We want to find out whether @mod uses async during init. Clear + * PF_USED_ASYNC. async_schedule*() will set it. + */ + current->flags &= ~PF_USED_ASYNC; + blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_COMING, mod); @@ -3058,8 +3064,25 @@ static int do_init_module(struct module *mod) blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_LIVE, mod); - /* We need to finish all async code before the module init sequence is done */ - async_synchronize_full(); + /* + * We need to finish all async code before the module init sequence + * is done. This has potential to deadlock. For example, a newly + * detected block device can trigger request_module() of the + * default iosched from async probing task. Once userland helper + * reaches here, async_synchronize_full() will wait on the async + * task waiting on request_module() and deadlock. + * + * This deadlock is avoided by perfomring async_synchronize_full() + * iff module init queued any async jobs. This isn't a full + * solution as it will deadlock the same if module loading from + * async jobs nests more than once; however, due to the various + * constraints, this hack seems to be the best option for now. + * Please refer to the following thread for details. + * + * http://thread.gmane.org/gmane.linux.kernel/1420814 + */ + if (current->flags & PF_USED_ASYNC) + async_synchronize_full(); mutex_lock(&module_mutex); /* Drop initial reference. */ -- cgit v1.2.3 From b1e0318b8cd4bdbb0fbc48967b0350483ad9bd69 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 19 Jan 2013 22:13:34 -0500 Subject: sys_clone() needs asmlinkage_protect Cc: stable@vger.kernel.org Signed-off-by: Al Viro --- kernel/fork.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/fork.c b/kernel/fork.c index a31b823b3c2d..e05cff2429b5 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1660,8 +1660,10 @@ SYSCALL_DEFINE5(clone, unsigned long, clone_flags, unsigned long, newsp, int, tls_val) #endif { - return do_fork(clone_flags, newsp, 0, - parent_tidptr, child_tidptr); + long ret = do_fork(clone_flags, newsp, 0, parent_tidptr, child_tidptr); + asmlinkage_protect(5, ret, clone_flags, newsp, + parent_tidptr, child_tidptr, tls_val); + return ret; } #endif -- cgit v1.2.3 From edea0d03ee5f0ae0051b6adb6681ebdf976b1ca4 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 20 Jan 2013 20:25:47 +0100 Subject: ia64: kill thread_matches(), unexport ptrace_check_attach() The ia64 function "thread_matches()" has no users since commit e868a55c2a8c ("[IA64] remove find_thread_for_addr()"). Remove it. This allows us to make ptrace_check_attach() static to kernel/ptrace.c, which is good since we'll need to change the semantics of it and fix up all the callers. Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- arch/ia64/kernel/ptrace.c | 27 --------------------------- include/linux/ptrace.h | 1 - kernel/ptrace.c | 2 +- 3 files changed, 1 insertion(+), 29 deletions(-) (limited to 'kernel') diff --git a/arch/ia64/kernel/ptrace.c b/arch/ia64/kernel/ptrace.c index 4265ff64219b..b7a5fffe0924 100644 --- a/arch/ia64/kernel/ptrace.c +++ b/arch/ia64/kernel/ptrace.c @@ -672,33 +672,6 @@ ptrace_attach_sync_user_rbs (struct task_struct *child) read_unlock(&tasklist_lock); } -static inline int -thread_matches (struct task_struct *thread, unsigned long addr) -{ - unsigned long thread_rbs_end; - struct pt_regs *thread_regs; - - if (ptrace_check_attach(thread, 0) < 0) - /* - * If the thread is not in an attachable state, we'll - * ignore it. The net effect is that if ADDR happens - * to overlap with the portion of the thread's - * register backing store that is currently residing - * on the thread's kernel stack, then ptrace() may end - * up accessing a stale value. But if the thread - * isn't stopped, that's a problem anyhow, so we're - * doing as well as we can... - */ - return 0; - - thread_regs = task_pt_regs(thread); - thread_rbs_end = ia64_get_user_rbs_end(thread, thread_regs, NULL); - if (!on_kernel_rbs(addr, thread_regs->ar_bspstore, thread_rbs_end)) - return 0; - - return 1; /* looks like we've got a winner */ -} - /* * Write f32-f127 back to task->thread.fph if it has been modified. */ diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 1693775ecfe8..89573a33ab3c 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -45,7 +45,6 @@ extern long arch_ptrace(struct task_struct *child, long request, extern int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len); extern int ptrace_writedata(struct task_struct *tsk, char __user *src, unsigned long dst, int len); extern void ptrace_disable(struct task_struct *); -extern int ptrace_check_attach(struct task_struct *task, bool ignore_state); extern int ptrace_request(struct task_struct *child, long request, unsigned long addr, unsigned long data); extern void ptrace_notify(int exit_code); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 1599157336a6..612a56126851 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -139,7 +139,7 @@ void __ptrace_unlink(struct task_struct *child) * RETURNS: * 0 on success, -ESRCH if %child is not ready. */ -int ptrace_check_attach(struct task_struct *child, bool ignore_state) +static int ptrace_check_attach(struct task_struct *child, bool ignore_state) { int ret = -ESRCH; -- cgit v1.2.3 From ee61abb3223e28a1a14a8429c0319755d20d3e40 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 20 Jan 2013 20:22:58 -0800 Subject: module: fix missing module_mutex unlock Commit 1fb9341ac348 ("module: put modules in list much earlier") moved some of the module initialization code around, and in the process changed the exit paths too. But for the duplicate export symbol error case the change made the ddebug_cleanup path jump to after the module mutex unlock, even though it happens with the mutex held. Rusty has some patches to split this function up into some helper functions, hopefully the mess of complex goto targets will go away eventually. Reported-by: Dan Carpenter Cc: Rusty Russell Signed-off-by: Linus Torvalds --- kernel/module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index d25e359279ae..eab08274ec9b 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -3274,8 +3274,8 @@ again: /* module_bug_cleanup needs module_mutex protection */ mutex_lock(&module_mutex); module_bug_cleanup(mod); - mutex_unlock(&module_mutex); ddebug_cleanup: + mutex_unlock(&module_mutex); dynamic_debug_remove(info->debug); synchronize_sched(); kfree(mod->args); -- cgit v1.2.3 From c1bf08ac26e92122faab9f6c32ea8aba94612dae Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 14 Dec 2012 09:48:15 -0500 Subject: ftrace: Be first to run code modification on modules If some other kernel subsystem has a module notifier, and adds a kprobe to a ftrace mcount point (now that kprobes work on ftrace points), when the ftrace notifier runs it will fail and disable ftrace, as well as kprobes that are attached to ftrace points. Here's the error: WARNING: at kernel/trace/ftrace.c:1618 ftrace_bug+0x239/0x280() Hardware name: Bochs Modules linked in: fat(+) stap_56d28a51b3fe546293ca0700b10bcb29__8059(F) nfsv4 auth_rpcgss nfs dns_resolver fscache xt_nat iptable_nat nf_conntrack_ipv4 nf_defrag_ipv4 nf_nat_ipv4 nf_nat nf_conntrack lockd sunrpc ppdev parport_pc parport microcode virtio_net i2c_piix4 drm_kms_helper ttm drm i2c_core [last unloaded: bid_shared] Pid: 8068, comm: modprobe Tainted: GF 3.7.0-0.rc8.git0.1.fc19.x86_64 #1 Call Trace: [] warn_slowpath_common+0x7f/0xc0 [] ? __probe_kernel_read+0x46/0x70 [] ? 0xffffffffa017ffff [] ? 0xffffffffa017ffff [] warn_slowpath_null+0x1a/0x20 [] ftrace_bug+0x239/0x280 [] ftrace_process_locs+0x376/0x520 [] ftrace_module_notify+0x47/0x50 [] notifier_call_chain+0x4d/0x70 [] __blocking_notifier_call_chain+0x58/0x80 [] blocking_notifier_call_chain+0x16/0x20 [] sys_init_module+0x73/0x220 [] system_call_fastpath+0x16/0x1b ---[ end trace 9ef46351e53bbf80 ]--- ftrace failed to modify [] init_once+0x0/0x20 [fat] actual: cc:bb:d2:4b:e1 A kprobe was added to the init_once() function in the fat module on load. But this happened before ftrace could have touched the code. As ftrace didn't run yet, the kprobe system had no idea it was a ftrace point and simply added a breakpoint to the code (0xcc in the cc:bb:d2:4b:e1). Then when ftrace went to modify the location from a call to mcount/fentry into a nop, it didn't see a call op, but instead it saw the breakpoint op and not knowing what to do with it, ftrace shut itself down. The solution is to simply give the ftrace module notifier the max priority. This should have been done regardless, as the core code ftrace modification also happens very early on in boot up. This makes the module modification closer to core modification. Link: http://lkml.kernel.org/r/20130107140333.593683061@goodmis.org Cc: stable@vger.kernel.org Acked-by: Masami Hiramatsu Reported-by: Frank Ch. Eigler Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 3ffe4c5ad3f3..41473b4ad7a4 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3998,7 +3998,7 @@ static int ftrace_module_notify(struct notifier_block *self, struct notifier_block ftrace_module_nb = { .notifier_call = ftrace_module_notify, - .priority = 0, + .priority = INT_MAX, /* Run before anything that can use kprobes */ }; extern unsigned long __start_mcount_loc[]; -- cgit v1.2.3 From 910ffdb18a6408e14febbb6e4b6840fd2c928c82 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 21 Jan 2013 20:47:41 +0100 Subject: ptrace: introduce signal_wake_up_state() and ptrace_signal_wake_up() Cleanup and preparation for the next change. signal_wake_up(resume => true) is overused. None of ptrace/jctl callers actually want to wakeup a TASK_WAKEKILL task, but they can't specify the necessary mask. Turn signal_wake_up() into signal_wake_up_state(state), reintroduce signal_wake_up() as a trivial helper, and add ptrace_signal_wake_up() which adds __TASK_TRACED. This way ptrace_signal_wake_up() can work "inside" ptrace_request() even if the tracee doesn't have the TASK_WAKEKILL bit set. Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- include/linux/sched.h | 11 ++++++++++- kernel/ptrace.c | 8 ++++---- kernel/signal.c | 14 ++++---------- 3 files changed, 18 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index 6fc8f45de4e9..d2112477ff5e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2714,7 +2714,16 @@ static inline void thread_group_cputime_init(struct signal_struct *sig) extern void recalc_sigpending_and_wake(struct task_struct *t); extern void recalc_sigpending(void); -extern void signal_wake_up(struct task_struct *t, int resume_stopped); +extern void signal_wake_up_state(struct task_struct *t, unsigned int state); + +static inline void signal_wake_up(struct task_struct *t, bool resume) +{ + signal_wake_up_state(t, resume ? TASK_WAKEKILL : 0); +} +static inline void ptrace_signal_wake_up(struct task_struct *t, bool resume) +{ + signal_wake_up_state(t, resume ? __TASK_TRACED : 0); +} /* * Wrappers for p->thread_info->cpu access. No-op on UP. diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 612a56126851..62f7c2774b16 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -117,7 +117,7 @@ void __ptrace_unlink(struct task_struct *child) * TASK_KILLABLE sleeps. */ if (child->jobctl & JOBCTL_STOP_PENDING || task_is_traced(child)) - signal_wake_up(child, task_is_traced(child)); + ptrace_signal_wake_up(child, true); spin_unlock(&child->sighand->siglock); } @@ -317,7 +317,7 @@ static int ptrace_attach(struct task_struct *task, long request, */ if (task_is_stopped(task) && task_set_jobctl_pending(task, JOBCTL_TRAP_STOP | JOBCTL_TRAPPING)) - signal_wake_up(task, 1); + signal_wake_up_state(task, __TASK_STOPPED); spin_unlock(&task->sighand->siglock); @@ -737,7 +737,7 @@ int ptrace_request(struct task_struct *child, long request, * tracee into STOP. */ if (likely(task_set_jobctl_pending(child, JOBCTL_TRAP_STOP))) - signal_wake_up(child, child->jobctl & JOBCTL_LISTENING); + ptrace_signal_wake_up(child, child->jobctl & JOBCTL_LISTENING); unlock_task_sighand(child, &flags); ret = 0; @@ -763,7 +763,7 @@ int ptrace_request(struct task_struct *child, long request, * start of this trap and now. Trigger re-trap. */ if (child->jobctl & JOBCTL_TRAP_NOTIFY) - signal_wake_up(child, true); + ptrace_signal_wake_up(child, true); ret = 0; } unlock_task_sighand(child, &flags); diff --git a/kernel/signal.c b/kernel/signal.c index 53cd5c4d1172..6e97aa6fa32c 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -680,23 +680,17 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info) * No need to set need_resched since signal event passing * goes through ->blocked */ -void signal_wake_up(struct task_struct *t, int resume) +void signal_wake_up_state(struct task_struct *t, unsigned int state) { - unsigned int mask; - set_tsk_thread_flag(t, TIF_SIGPENDING); - /* - * For SIGKILL, we want to wake it up in the stopped/traced/killable + * TASK_WAKEKILL also means wake it up in the stopped/traced/killable * case. We don't check t->state here because there is a race with it * executing another processor and just now entering stopped state. * By using wake_up_state, we ensure the process will wake up and * handle its death signal. */ - mask = TASK_INTERRUPTIBLE; - if (resume) - mask |= TASK_WAKEKILL; - if (!wake_up_state(t, mask)) + if (!wake_up_state(t, state | TASK_INTERRUPTIBLE)) kick_process(t); } @@ -844,7 +838,7 @@ static void ptrace_trap_notify(struct task_struct *t) assert_spin_locked(&t->sighand->siglock); task_set_jobctl_pending(t, JOBCTL_TRAP_NOTIFY); - signal_wake_up(t, t->jobctl & JOBCTL_LISTENING); + ptrace_signal_wake_up(t, t->jobctl & JOBCTL_LISTENING); } /* -- cgit v1.2.3 From 9899d11f654474d2d54ea52ceaa2a1f4db3abd68 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 21 Jan 2013 20:48:00 +0100 Subject: ptrace: ensure arch_ptrace/ptrace_request can never race with SIGKILL putreg() assumes that the tracee is not running and pt_regs_access() can safely play with its stack. However a killed tracee can return from ptrace_stop() to the low-level asm code and do RESTORE_REST, this means that debugger can actually read/modify the kernel stack until the tracee does SAVE_REST again. set_task_blockstep() can race with SIGKILL too and in some sense this race is even worse, the very fact the tracee can be woken up breaks the logic. As Linus suggested we can clear TASK_WAKEKILL around the arch_ptrace() call, this ensures that nobody can ever wakeup the tracee while the debugger looks at it. Not only this fixes the mentioned problems, we can do some cleanups/simplifications in arch_ptrace() paths. Probably ptrace_unfreeze_traced() needs more callers, for example it makes sense to make the tracee killable for oom-killer before access_process_vm(). While at it, add the comment into may_ptrace_stop() to explain why ptrace_stop() still can't rely on SIGKILL and signal_pending_state(). Reported-by: Salman Qazi Reported-by: Suleiman Souhlal Suggested-by: Linus Torvalds Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- arch/x86/kernel/step.c | 9 +++---- kernel/ptrace.c | 64 ++++++++++++++++++++++++++++++++++++++++++-------- kernel/signal.c | 5 ++++ 3 files changed, 64 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/step.c b/arch/x86/kernel/step.c index cd3b2438a980..9b4d51d0c0d0 100644 --- a/arch/x86/kernel/step.c +++ b/arch/x86/kernel/step.c @@ -165,10 +165,11 @@ void set_task_blockstep(struct task_struct *task, bool on) * Ensure irq/preemption can't change debugctl in between. * Note also that both TIF_BLOCKSTEP and debugctl should * be changed atomically wrt preemption. - * FIXME: this means that set/clear TIF_BLOCKSTEP is simply - * wrong if task != current, SIGKILL can wakeup the stopped - * tracee and set/clear can play with the running task, this - * can confuse the next __switch_to_xtra(). + * + * NOTE: this means that set/clear TIF_BLOCKSTEP is only safe if + * task is current or it can't be running, otherwise we can race + * with __switch_to_xtra(). We rely on ptrace_freeze_traced() but + * PTRACE_KILL is not safe. */ local_irq_disable(); debugctl = get_debugctlmsr(); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 62f7c2774b16..6cbeaae4406d 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -122,6 +122,40 @@ void __ptrace_unlink(struct task_struct *child) spin_unlock(&child->sighand->siglock); } +/* Ensure that nothing can wake it up, even SIGKILL */ +static bool ptrace_freeze_traced(struct task_struct *task) +{ + bool ret = false; + + /* Lockless, nobody but us can set this flag */ + if (task->jobctl & JOBCTL_LISTENING) + return ret; + + spin_lock_irq(&task->sighand->siglock); + if (task_is_traced(task) && !__fatal_signal_pending(task)) { + task->state = __TASK_TRACED; + ret = true; + } + spin_unlock_irq(&task->sighand->siglock); + + return ret; +} + +static void ptrace_unfreeze_traced(struct task_struct *task) +{ + if (task->state != __TASK_TRACED) + return; + + WARN_ON(!task->ptrace || task->parent != current); + + spin_lock_irq(&task->sighand->siglock); + if (__fatal_signal_pending(task)) + wake_up_state(task, __TASK_TRACED); + else + task->state = TASK_TRACED; + spin_unlock_irq(&task->sighand->siglock); +} + /** * ptrace_check_attach - check whether ptracee is ready for ptrace operation * @child: ptracee to check for @@ -151,24 +185,29 @@ static int ptrace_check_attach(struct task_struct *child, bool ignore_state) * be changed by us so it's not changing right after this. */ read_lock(&tasklist_lock); - if ((child->ptrace & PT_PTRACED) && child->parent == current) { + if (child->ptrace && child->parent == current) { + WARN_ON(child->state == __TASK_TRACED); /* * child->sighand can't be NULL, release_task() * does ptrace_unlink() before __exit_signal(). */ - spin_lock_irq(&child->sighand->siglock); - WARN_ON_ONCE(task_is_stopped(child)); - if (ignore_state || (task_is_traced(child) && - !(child->jobctl & JOBCTL_LISTENING))) + if (ignore_state || ptrace_freeze_traced(child)) ret = 0; - spin_unlock_irq(&child->sighand->siglock); } read_unlock(&tasklist_lock); - if (!ret && !ignore_state) - ret = wait_task_inactive(child, TASK_TRACED) ? 0 : -ESRCH; + if (!ret && !ignore_state) { + if (!wait_task_inactive(child, __TASK_TRACED)) { + /* + * This can only happen if may_ptrace_stop() fails and + * ptrace_stop() changes ->state back to TASK_RUNNING, + * so we should not worry about leaking __TASK_TRACED. + */ + WARN_ON(child->state == __TASK_TRACED); + ret = -ESRCH; + } + } - /* All systems go.. */ return ret; } @@ -900,6 +939,8 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, unsigned long, addr, goto out_put_task_struct; ret = arch_ptrace(child, request, addr, data); + if (ret || request != PTRACE_DETACH) + ptrace_unfreeze_traced(child); out_put_task_struct: put_task_struct(child); @@ -1039,8 +1080,11 @@ asmlinkage long compat_sys_ptrace(compat_long_t request, compat_long_t pid, ret = ptrace_check_attach(child, request == PTRACE_KILL || request == PTRACE_INTERRUPT); - if (!ret) + if (!ret) { ret = compat_arch_ptrace(child, request, addr, data); + if (ret || request != PTRACE_DETACH) + ptrace_unfreeze_traced(child); + } out_put_task_struct: put_task_struct(child); diff --git a/kernel/signal.c b/kernel/signal.c index 6e97aa6fa32c..3d09cf6cde75 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -1794,6 +1794,10 @@ static inline int may_ptrace_stop(void) * If SIGKILL was already sent before the caller unlocked * ->siglock we must see ->core_state != NULL. Otherwise it * is safe to enter schedule(). + * + * This is almost outdated, a task with the pending SIGKILL can't + * block in TASK_TRACED. But PTRACE_EVENT_EXIT can be reported + * after SIGKILL was already dequeued. */ if (unlikely(current->mm->core_state) && unlikely(current->mm == current->parent->mm)) @@ -1919,6 +1923,7 @@ static void ptrace_stop(int exit_code, int why, int clear_code, siginfo_t *info) if (gstop_done) do_notify_parent_cldstop(current, false, why); + /* tasklist protects us from ptrace_freeze_traced() */ __set_current_state(TASK_RUNNING); if (clear_code) current->exit_code = 0; -- cgit v1.2.3 From 9067ac85d533651b98c2ff903182a20cbb361fcb Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 21 Jan 2013 20:48:17 +0100 Subject: wake_up_process() should be never used to wakeup a TASK_STOPPED/TRACED task wake_up_process() should never wakeup a TASK_STOPPED/TRACED task. Change it to use TASK_NORMAL and add the WARN_ON(). TASK_ALL has no other users, probably can be killed. Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds --- kernel/sched/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 257002c13bb0..26058d0bebba 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1523,7 +1523,8 @@ out: */ int wake_up_process(struct task_struct *p) { - return try_to_wake_up(p, TASK_ALL, 0); + WARN_ON(task_is_stopped_or_traced(p)); + return try_to_wake_up(p, TASK_NORMAL, 0); } EXPORT_SYMBOL(wake_up_process); -- cgit v1.2.3 From f56c3196f251012de9b3ebaff55732a9074fdaae Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 22 Jan 2013 16:15:15 -0800 Subject: async: fix __lowest_in_progress() Commit 083b804c4d3e ("async: use workqueue for worker pool") made it possible that async jobs are moved from pending to running out-of-order. While pending async jobs will be queued and dispatched for execution in the same order, nothing guarantees they'll enter "1) move self to the running queue" of async_run_entry_fn() in the same order. Before the conversion, async implemented its own worker pool. An async worker, upon being woken up, fetches the first item from the pending list, which kept the executing lists sorted. The conversion to workqueue was done by adding work_struct to each async_entry and async just schedules the work item. The queueing and dispatching of such work items are still in order but now each worker thread is associated with a specific async_entry and moves that specific async_entry to the executing list. So, depending on which worker reaches that point earlier, which is non-deterministic, we may end up moving an async_entry with larger cookie before one with smaller one. This broke __lowest_in_progress(). running->domain may not be properly sorted and is not guaranteed to contain lower cookies than pending list when not empty. Fix it by ensuring sort-inserting to the running list and always looking at both pending and running when trying to determine the lowest cookie. Over time, the async synchronization implementation became quite messy. We better restructure it such that each async_entry is linked to two lists - one global and one per domain - and not move it when execution starts. There's no reason to distinguish pending and running. They behave the same for synchronization purposes. Signed-off-by: Tejun Heo Cc: Arjan van de Ven Cc: stable@vger.kernel.org Signed-off-by: Linus Torvalds --- kernel/async.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/async.c b/kernel/async.c index a1d585c351d6..6f34904a0b53 100644 --- a/kernel/async.c +++ b/kernel/async.c @@ -86,18 +86,27 @@ static atomic_t entry_count; */ static async_cookie_t __lowest_in_progress(struct async_domain *running) { + async_cookie_t first_running = next_cookie; /* infinity value */ + async_cookie_t first_pending = next_cookie; /* ditto */ struct async_entry *entry; + /* + * Both running and pending lists are sorted but not disjoint. + * Take the first cookies from both and return the min. + */ if (!list_empty(&running->domain)) { entry = list_first_entry(&running->domain, typeof(*entry), list); - return entry->cookie; + first_running = entry->cookie; } - list_for_each_entry(entry, &async_pending, list) - if (entry->running == running) - return entry->cookie; + list_for_each_entry(entry, &async_pending, list) { + if (entry->running == running) { + first_pending = entry->cookie; + break; + } + } - return next_cookie; /* "infinity" value */ + return min(first_running, first_pending); } static async_cookie_t lowest_in_progress(struct async_domain *running) @@ -118,13 +127,17 @@ static void async_run_entry_fn(struct work_struct *work) { struct async_entry *entry = container_of(work, struct async_entry, work); + struct async_entry *pos; unsigned long flags; ktime_t uninitialized_var(calltime), delta, rettime; struct async_domain *running = entry->running; - /* 1) move self to the running queue */ + /* 1) move self to the running queue, make sure it stays sorted */ spin_lock_irqsave(&async_lock, flags); - list_move_tail(&entry->list, &running->domain); + list_for_each_entry_reverse(pos, &running->domain, list) + if (entry->cookie < pos->cookie) + break; + list_move_tail(&entry->list, &pos->list); spin_unlock_irqrestore(&async_lock, flags); /* 2) run (and print duration) */ -- cgit v1.2.3