From 574732c73d155320f9358d9ee5d84beb0f4ecee2 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 23 Dec 2014 15:05:36 +1030 Subject: param: initialize store function to NULL if not available. I rebased Kees' 'param: do not set store func without write perm' on top of my 'params: cleanup sysfs allocation'. However, my patch uses krealloc which doesn't zero memory, leaving .store unset. Reported-by: Sasha Levin Cc: Kees Cook Signed-off-by: Rusty Russell --- kernel/params.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index 0af9b2c4e56c..bd65d136a470 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -648,6 +648,8 @@ static __modinit int add_sysfs_param(struct module_kobject *mk, /* Do not allow runtime DAC changes to make param writable. */ if ((kp->perm & (S_IWUSR | S_IWGRP | S_IWOTH)) != 0) mk->mp->attrs[mk->mp->num].mattr.store = param_attr_store; + else + mk->mp->attrs[mk->mp->num].mattr.store = NULL; mk->mp->attrs[mk->mp->num].mattr.attr.name = (char *)name; mk->mp->attrs[mk->mp->num].mattr.attr.mode = kp->perm; mk->mp->num++; -- cgit v1.2.3 From 6ada1fc0e1c4775de0e043e1bd3ae9d065491aa5 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Wed, 3 Dec 2014 19:22:48 -0500 Subject: time: settimeofday: Validate the values of tv from user An unvalidated user input is multiplied by a constant, which can result in an undefined behaviour for large values. While this is validated later, we should avoid triggering undefined behaviour. Cc: Thomas Gleixner Cc: Ingo Molnar Cc: stable Signed-off-by: Sasha Levin [jstultz: include trivial milisecond->microsecond correction noticed by Andy] Signed-off-by: John Stultz --- include/linux/time.h | 13 +++++++++++++ kernel/time/time.c | 4 ++++ 2 files changed, 17 insertions(+) (limited to 'kernel') diff --git a/include/linux/time.h b/include/linux/time.h index 8c42cf8d2444..5989b0ead1ec 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -99,6 +99,19 @@ static inline bool timespec_valid_strict(const struct timespec *ts) return true; } +static inline bool timeval_valid(const struct timeval *tv) +{ + /* Dates before 1970 are bogus */ + if (tv->tv_sec < 0) + return false; + + /* Can't have more microseconds then a second */ + if (tv->tv_usec < 0 || tv->tv_usec >= USEC_PER_SEC) + return false; + + return true; +} + extern struct timespec timespec_trunc(struct timespec t, unsigned gran); #define CURRENT_TIME (current_kernel_time()) diff --git a/kernel/time/time.c b/kernel/time/time.c index a9ae20fb0b11..22d5d3b73970 100644 --- a/kernel/time/time.c +++ b/kernel/time/time.c @@ -196,6 +196,10 @@ SYSCALL_DEFINE2(settimeofday, struct timeval __user *, tv, if (tv) { if (copy_from_user(&user_tv, tv, sizeof(*tv))) return -EFAULT; + + if (!timeval_valid(&user_tv)) + return -EINVAL; + new_ts.tv_sec = user_tv.tv_sec; new_ts.tv_nsec = user_tv.tv_usec * NSEC_PER_USEC; } -- cgit v1.2.3 From 5e5aeb4367b450a28f447f6d5ab57d8f2ab16a5f Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Wed, 3 Dec 2014 19:25:05 -0500 Subject: time: adjtimex: Validate the ADJ_FREQUENCY values Verify that the frequency value from userspace is valid and makes sense. Unverified values can cause overflows later on. 
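The user-visible effect of the two validation patches above can be exercised from userspace through the ordinary settimeofday(2) and adjtimex(2) wrappers. The sketch below is illustrative only: run it with CAP_SYS_TIME (otherwise the permission checks return EPERM first), and on kernels without these patches the failure point and errno may differ.

  /* Minimal sketch: out-of-range values are rejected with EINVAL up front. */
  #include <errno.h>
  #include <limits.h>
  #include <stdio.h>
  #include <string.h>
  #include <sys/time.h>
  #include <sys/timex.h>

  int main(void)
  {
          struct timeval tv = { .tv_sec = 1, .tv_usec = 1000000 };        /* tv_usec >= USEC_PER_SEC */
          struct timex tx = { .modes = ADJ_FREQUENCY, .freq = LONG_MAX }; /* would overflow PPM scaling */

          if (settimeofday(&tv, NULL) < 0)
                  printf("settimeofday: %s\n", strerror(errno)); /* expect EINVAL */
          if (adjtimex(&tx) < 0)
                  printf("adjtimex: %s\n", strerror(errno));     /* expect EINVAL */
          return 0;
  }
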
Cc: Thomas Gleixner Cc: Ingo Molnar Cc: stable Signed-off-by: Sasha Levin [jstultz: Fix up bug for negative values and drop redunent cap check] Signed-off-by: John Stultz --- kernel/time/ntp.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'kernel') diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 87a346fd6d61..28bf91c60a0b 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -633,6 +633,13 @@ int ntp_validate_timex(struct timex *txc) if ((txc->modes & ADJ_SETOFFSET) && (!capable(CAP_SYS_TIME))) return -EPERM; + if (txc->modes & ADJ_FREQUENCY) { + if (LONG_MIN / PPM_SCALE > txc->freq) + return -EINVAL; + if (LONG_MAX / PPM_SCALE < txc->freq) + return -EINVAL; + } + return 0; } -- cgit v1.2.3 From 8f86f83709c585742dea5dd7f0d2b79c43f992ec Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 13 Jan 2015 11:20:43 -0500 Subject: ftrace: Fix updating of filters for shared global_ops filters As the set_ftrace_filter affects both the function tracer as well as the function graph tracer, the ops that represent each have a shared ftrace_ops_hash structure. This allows both to be updated when the filter files are updated. But if function graph is enabled and the global_ops (function tracing) ops is not, then it is possible that the filter could be changed without the update happening for the function graph ops. This will cause the changes to not take place and may even cause a ftrace_bug to occur as it could mess with the trampoline accounting. The solution is to check if the ops uses the shared global_ops filter and if the ops itself is not enabled, to check if there's another ops that is enabled and also shares the global_ops filter. In that case, the modification still needs to be executed. Link: http://lkml.kernel.org/r/20150114154329.055980438@goodmis.org Cc: stable@vger.kernel.org # 3.17+ Reviewed-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 929a733d302e..2b35d0ba578d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4008,8 +4008,32 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) static void ftrace_ops_update_code(struct ftrace_ops *ops, struct ftrace_hash *old_hash) { - if (ops->flags & FTRACE_OPS_FL_ENABLED && ftrace_enabled) + struct ftrace_ops *op; + + if (!ftrace_enabled) + return; + + if (ops->flags & FTRACE_OPS_FL_ENABLED) { ftrace_run_modify_code(ops, FTRACE_UPDATE_CALLS, old_hash); + return; + } + + /* + * If this is the shared global_ops filter, then we need to + * check if there is another ops that shares it, is enabled. + * If so, we still need to run the modify code. + */ + if (ops->func_hash != &global_ops.local_hash) + return; + + do_for_each_ftrace_op(op, ftrace_ops_list) { + if (op->func_hash == &global_ops.local_hash && + op->flags & FTRACE_OPS_FL_ENABLED) { + ftrace_run_modify_code(op, FTRACE_UPDATE_CALLS, old_hash); + /* Only need to do this once */ + return; + } + } while_for_each_ftrace_op(op); } static int -- cgit v1.2.3 From 7485058eea40783ac142a60c3e799fc66ce72583 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Tue, 13 Jan 2015 14:03:38 -0500 Subject: ftrace: Check both notrace and filter for old hash Using just the filter for checking for trampolines or regs is not enough when updating the code against the records that represent all functions. 
Both the filter hash and the notrace hash need to be checked. To trigger this bug (using trace-cmd and perf): # perf probe -a do_fork # trace-cmd start -B foo -e probe # trace-cmd record -p function_graph -n do_fork sleep 1 The trace-cmd record at the end clears the filter before it disables function_graph tracing and then that causes the accounting of the ftrace function records to become incorrect and causes ftrace to bug. Link: http://lkml.kernel.org/r/20150114154329.358378039@goodmis.org Cc: stable@vger.kernel.org [ still need to switch old_hash_ops to old_ops_hash ] Reviewed-by: Masami Hiramatsu Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 2b35d0ba578d..224e768bdc73 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2497,12 +2497,14 @@ static void ftrace_run_update_code(int command) } static void ftrace_run_modify_code(struct ftrace_ops *ops, int command, - struct ftrace_hash *old_hash) + struct ftrace_ops_hash *old_hash) { ops->flags |= FTRACE_OPS_FL_MODIFYING; - ops->old_hash.filter_hash = old_hash; + ops->old_hash.filter_hash = old_hash->filter_hash; + ops->old_hash.notrace_hash = old_hash->notrace_hash; ftrace_run_update_code(command); ops->old_hash.filter_hash = NULL; + ops->old_hash.notrace_hash = NULL; ops->flags &= ~FTRACE_OPS_FL_MODIFYING; } @@ -3579,7 +3581,7 @@ static struct ftrace_ops trace_probe_ops __read_mostly = static int ftrace_probe_registered; -static void __enable_ftrace_function_probe(struct ftrace_hash *old_hash) +static void __enable_ftrace_function_probe(struct ftrace_ops_hash *old_hash) { int ret; int i; @@ -3637,6 +3639,7 @@ int register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, void *data) { + struct ftrace_ops_hash old_hash_ops; struct ftrace_func_probe *entry; struct ftrace_hash **orig_hash = &trace_probe_ops.func_hash->filter_hash; struct ftrace_hash *old_hash = *orig_hash; @@ -3658,6 +3661,10 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, mutex_lock(&trace_probe_ops.func_hash->regex_lock); + old_hash_ops.filter_hash = old_hash; + /* Probes only have filters */ + old_hash_ops.notrace_hash = NULL; + hash = alloc_and_copy_ftrace_hash(FTRACE_HASH_DEFAULT_BITS, old_hash); if (!hash) { count = -ENOMEM; @@ -3718,7 +3725,7 @@ register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops, ret = ftrace_hash_move(&trace_probe_ops, 1, orig_hash, hash); - __enable_ftrace_function_probe(old_hash); + __enable_ftrace_function_probe(&old_hash_ops); if (!ret) free_ftrace_hash_rcu(old_hash); @@ -4006,7 +4013,7 @@ ftrace_match_addr(struct ftrace_hash *hash, unsigned long ip, int remove) } static void ftrace_ops_update_code(struct ftrace_ops *ops, - struct ftrace_hash *old_hash) + struct ftrace_ops_hash *old_hash) { struct ftrace_ops *op; @@ -4041,6 +4048,7 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, unsigned long ip, int remove, int reset, int enable) { struct ftrace_hash **orig_hash; + struct ftrace_ops_hash old_hash_ops; struct ftrace_hash *old_hash; struct ftrace_hash *hash; int ret; @@ -4077,9 +4085,11 @@ ftrace_set_hash(struct ftrace_ops *ops, unsigned char *buf, int len, mutex_lock(&ftrace_lock); old_hash = *orig_hash; + old_hash_ops.filter_hash = ops->func_hash->filter_hash; + old_hash_ops.notrace_hash = ops->func_hash->notrace_hash; ret = ftrace_hash_move(ops, enable, orig_hash, hash); 
if (!ret) { - ftrace_ops_update_code(ops, old_hash); + ftrace_ops_update_code(ops, &old_hash_ops); free_ftrace_hash_rcu(old_hash); } mutex_unlock(&ftrace_lock); @@ -4291,6 +4301,7 @@ static void __init set_ftrace_early_filters(void) int ftrace_regex_release(struct inode *inode, struct file *file) { struct seq_file *m = (struct seq_file *)file->private_data; + struct ftrace_ops_hash old_hash_ops; struct ftrace_iterator *iter; struct ftrace_hash **orig_hash; struct ftrace_hash *old_hash; @@ -4324,10 +4335,12 @@ int ftrace_regex_release(struct inode *inode, struct file *file) mutex_lock(&ftrace_lock); old_hash = *orig_hash; + old_hash_ops.filter_hash = iter->ops->func_hash->filter_hash; + old_hash_ops.notrace_hash = iter->ops->func_hash->notrace_hash; ret = ftrace_hash_move(iter->ops, filter_hash, orig_hash, iter->hash); if (!ret) { - ftrace_ops_update_code(iter->ops, old_hash); + ftrace_ops_update_code(iter->ops, &old_hash_ops); free_ftrace_hash_rcu(old_hash); } mutex_unlock(&ftrace_lock); -- cgit v1.2.3 From 83829b74f54186a154ae6b586c05cdb838c2ebc6 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 14 Jan 2015 12:24:37 -0500 Subject: tracing: Remove extra call to init_ftrace_syscalls() trace_init() calls init_ftrace_syscalls() and then calls trace_event_init() which also calls init_ftrace_syscalls(). It makes more sense to only call it from trace_event_init(). Calling it twice wastes memory, as it allocates the syscall events twice, and loses the first copy of it. Link: http://lkml.kernel.org/r/54AF53BD.5070303@huawei.com Link: http://lkml.kernel.org/r/20150115040505.930398632@goodmis.org Reported-by: Wang Nan Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 1 - 1 file changed, 1 deletion(-) (limited to 'kernel') diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2e767972e99c..4a9079b9f082 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -6918,7 +6918,6 @@ void __init trace_init(void) tracepoint_printk = 0; } tracer_alloc_buffers(); - init_ftrace_syscalls(); trace_event_init(); } -- cgit v1.2.3 From ce1039bd3a89e99e4f624e75fb1777fc92d76eb3 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (Red Hat)" Date: Wed, 14 Jan 2015 12:53:45 -0500 Subject: tracing: Fix enabling of syscall events on the command line Commit 5f893b2639b2 "tracing: Move enabling tracepoints to just after rcu_init()" broke the enabling of system call events from the command line. The reason was that the enabling of command line trace events was moved before PID 1 started, and the syscall tracepoints require that all tasks have the TIF_SYSCALL_TRACEPOINT flag set. But the swapper task (pid 0) is not part of that. Since the swapper task is the only task that is running at this early in boot, no task gets the flag set, and the tracepoint never gets reached. Instead of setting the swapper task flag (there should be no reason to do that), re-enabled trace events again after the init thread (PID 1) has been started. It requires disabling all command line events and re-enabling them, as just enabling them again will not reset the logic to set the TIF_SYSCALL_TRACEPOINT flag, as the syscall tracepoint will be fooled into thinking that it was already set, and wont try setting it again. For this reason, we must first disable it and re-enable it. 
Link: http://lkml.kernel.org/r/1421188517-18312-1-git-send-email-mpe@ellerman.id.au Link: http://lkml.kernel.org/r/20150115040506.216066449@goodmis.org Reported-by: Michael Ellerman Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 69 ++++++++++++++++++++++++++++++++++++--------- 1 file changed, 55 insertions(+), 14 deletions(-) (limited to 'kernel') diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 366a78a3e61e..b03a0ea77b99 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -2429,12 +2429,39 @@ static __init int event_trace_memsetup(void) return 0; } +static __init void +early_enable_events(struct trace_array *tr, bool disable_first) +{ + char *buf = bootup_event_buf; + char *token; + int ret; + + while (true) { + token = strsep(&buf, ","); + + if (!token) + break; + if (!*token) + continue; + + /* Restarting syscalls requires that we stop them first */ + if (disable_first) + ftrace_set_clr_event(tr, token, 0); + + ret = ftrace_set_clr_event(tr, token, 1); + if (ret) + pr_warn("Failed to enable trace event: %s\n", token); + + /* Put back the comma to allow this to be called again */ + if (buf) + *(buf - 1) = ','; + } +} + static __init int event_trace_enable(void) { struct trace_array *tr = top_trace_array(); struct ftrace_event_call **iter, *call; - char *buf = bootup_event_buf; - char *token; int ret; if (!tr) @@ -2456,18 +2483,7 @@ static __init int event_trace_enable(void) */ __trace_early_add_events(tr); - while (true) { - token = strsep(&buf, ","); - - if (!token) - break; - if (!*token) - continue; - - ret = ftrace_set_clr_event(tr, token, 1); - if (ret) - pr_warn("Failed to enable trace event: %s\n", token); - } + early_enable_events(tr, false); trace_printk_start_comm(); @@ -2478,6 +2494,31 @@ static __init int event_trace_enable(void) return 0; } +/* + * event_trace_enable() is called from trace_event_init() first to + * initialize events and perhaps start any events that are on the + * command line. Unfortunately, there are some events that will not + * start this early, like the system call tracepoints that need + * to set the TIF_SYSCALL_TRACEPOINT flag of pid 1. But event_trace_enable() + * is called before pid 1 starts, and this flag is never set, making + * the syscall tracepoint never get reached, but the event is enabled + * regardless (and not doing anything). + */ +static __init int event_trace_enable_again(void) +{ + struct trace_array *tr; + + tr = top_trace_array(); + if (!tr) + return -ENODEV; + + early_enable_events(tr, true); + + return 0; +} + +early_initcall(event_trace_enable_again); + static __init int event_trace_init(void) { struct trace_array *tr; -- cgit v1.2.3 From 29187a9eeaf362d8422e62e17a22a6e115277a49 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 16 Jan 2015 14:21:16 -0500 Subject: workqueue: fix subtle pool management issue which can stall whole worker_pool A worker_pool's forward progress is guaranteed by the fact that the last idle worker assumes the manager role to create more workers and summon the rescuers if creating workers doesn't succeed in timely manner before proceeding to execute work items. This manager role is implemented in manage_workers(), which indicates whether the worker may proceed to work item execution with its return value. This is necessary because multiple workers may contend for the manager role, and, if there already is a manager, others should proceed to work item execution. 
Unfortunately, the function also indicates that the worker may proceed to work item execution if need_to_create_worker() is false at the head of the function. need_to_create_worker() tests the following conditions. pending work items && !nr_running && !nr_idle The first and third conditions are protected by pool->lock and thus won't change while holding pool->lock; however, nr_running can change asynchronously as other workers block and resume and while it's likely to be zero, as someone woke this worker up in the first place, some other workers could have become runnable inbetween making it non-zero. If this happens, manage_worker() could return false even with zero nr_idle making the worker, the last idle one, proceed to execute work items. If then all workers of the pool end up blocking on a resource which can only be released by a work item which is pending on that pool, the whole pool can deadlock as there's no one to create more workers or summon the rescuers. This patch fixes the problem by removing the early exit condition from maybe_create_worker() and making manage_workers() return false iff there's already another manager, which ensures that the last worker doesn't start executing work items. We can leave the early exit condition alone and just ignore the return value but the only reason it was put there is because the manage_workers() used to perform both creations and destructions of workers and thus the function may be invoked while the pool is trying to reduce the number of workers. Now that manage_workers() is called only when more workers are needed, the only case this early exit condition is triggered is rare race conditions rendering it pointless. Tested with simulated workload and modified workqueue code which trigger the pool deadlock reliably without this patch. Signed-off-by: Tejun Heo Reported-by: Eric Sandeen Link: http://lkml.kernel.org/g/54B019F4.8030009@sandeen.net Cc: Dave Chinner Cc: Lai Jiangshan Cc: stable@vger.kernel.org --- kernel/workqueue.c | 25 ++++++++----------------- 1 file changed, 8 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 6202b08f1933..beeeac9e0e3e 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1841,17 +1841,11 @@ static void pool_mayday_timeout(unsigned long __pool) * spin_lock_irq(pool->lock) which may be released and regrabbed * multiple times. Does GFP_KERNEL allocations. Called only from * manager. - * - * Return: - * %false if no action was taken and pool->lock stayed locked, %true - * otherwise. */ -static bool maybe_create_worker(struct worker_pool *pool) +static void maybe_create_worker(struct worker_pool *pool) __releases(&pool->lock) __acquires(&pool->lock) { - if (!need_to_create_worker(pool)) - return false; restart: spin_unlock_irq(&pool->lock); @@ -1877,7 +1871,6 @@ restart: */ if (need_to_create_worker(pool)) goto restart; - return true; } /** @@ -1897,16 +1890,14 @@ restart: * multiple times. Does GFP_KERNEL allocations. * * Return: - * %false if the pool don't need management and the caller can safely start - * processing works, %true indicates that the function released pool->lock - * and reacquired it to perform some management function and that the - * conditions that the caller verified while holding the lock before - * calling the function might no longer be true. 
+ * %false if the pool doesn't need management and the caller can safely + * start processing works, %true if management function was performed and + * the conditions that the caller verified before calling the function may + * no longer be true. */ static bool manage_workers(struct worker *worker) { struct worker_pool *pool = worker->pool; - bool ret = false; /* * Anyone who successfully grabs manager_arb wins the arbitration @@ -1919,12 +1910,12 @@ static bool manage_workers(struct worker *worker) * actual management, the pool may stall indefinitely. */ if (!mutex_trylock(&pool->manager_arb)) - return ret; + return false; - ret |= maybe_create_worker(pool); + maybe_create_worker(pool); mutex_unlock(&pool->manager_arb); - return ret; + return true; } /** -- cgit v1.2.3 From fc7f0dd381720ea5ee5818645f7d0e9dece41cb0 Mon Sep 17 00:00:00 2001 From: Louis Langholtz Date: Thu, 15 Jan 2015 22:04:46 -0700 Subject: kernel: avoid overflow in cmp_range Avoid overflow possibility. [ The overflow is purely theoretical, since this is used for memory ranges that aren't even close to using the full 64 bits, but this is the right thing to do regardless. - Linus ] Signed-off-by: Louis Langholtz Cc: Yinghai Lu Cc: Peter Anvin Cc: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/range.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/range.c b/kernel/range.c index 322ea8e93e4b..82cfc285b046 100644 --- a/kernel/range.c +++ b/kernel/range.c @@ -113,12 +113,12 @@ static int cmp_range(const void *x1, const void *x2) { const struct range *r1 = x1; const struct range *r2 = x2; - s64 start1, start2; - start1 = r1->start; - start2 = r2->start; - - return start1 - start2; + if (r1->start < r2->start) + return -1; + if (r1->start > r2->start) + return 1; + return 0; } int clean_sort_range(struct range *range, int az) -- cgit v1.2.3 From c772be52319de9756fd82f36d37a6d3e003441e3 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 20 Jan 2015 09:07:04 +1030 Subject: param: fix uninitialized read with CONFIG_DEBUG_LOCK_ALLOC ignore_lockdep is uninitialized, and sysfs_attr_init() doesn't initialize it, so memset to 0. Reported-by: Huang Ying Cc: Eric W. Biederman Signed-off-by: Rusty Russell --- kernel/params.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/params.c b/kernel/params.c index bd65d136a470..728e05b167de 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -642,6 +642,7 @@ static __modinit int add_sysfs_param(struct module_kobject *mk, mk->mp->grp.attrs = new_attrs; /* Tack new one on the end. */ + memset(&mk->mp->attrs[mk->mp->num], 0, sizeof(mk->mp->attrs[0])); sysfs_attr_init(&mk->mp->attrs[mk->mp->num].mattr.attr); mk->mp->attrs[mk->mp->num].param = kp; mk->mp->attrs[mk->mp->num].mattr.show = param_attr_show; -- cgit v1.2.3 From d453cded05ee219b77815ea194dc36efa5398bca Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 20 Jan 2015 09:07:04 +1030 Subject: module_arch_freeing_init(): new hook for archs before module->module_init freed. Archs have been abusing module_free() to clean up their arch-specific allocations. Since module_free() is also (ab)used by BPF and trace code, let's keep it to simple allocations, and provide a hook called before that. This means that avr32, ia64, parisc and s390 no longer need to implement their own module_free() at all. avr32 doesn't need module_finalize() either. 
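To illustrate the new contract (a hypothetical sketch, not taken from any in-tree architecture): an arch that keeps per-module init-time state now frees only that state in module_arch_freeing_init() and leaves mod->module_init itself to the generic module code, while architectures with nothing to clean up simply rely on the empty __weak default added in kernel/module.c. The arch.init_syminfo field below is invented for the example.

  /* hypothetical arch/<arch>/kernel/module.c */
  void module_arch_freeing_init(struct module *mod)
  {
          vfree(mod->arch.init_syminfo);  /* invented arch-private init data */
          mod->arch.init_syminfo = NULL;
          /* do not free mod->module_init here; the module core handles that */
  }
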
Signed-off-by: Rusty Russell Cc: Chris Metcalf Cc: Haavard Skinnemoen Cc: Hans-Christian Egtvedt Cc: Tony Luck Cc: Fenghua Yu Cc: "James E.J. Bottomley" Cc: Helge Deller Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: linux-kernel@vger.kernel.org Cc: linux-ia64@vger.kernel.org Cc: linux-parisc@vger.kernel.org Cc: linux-s390@vger.kernel.org --- arch/avr32/kernel/module.c | 13 +------------ arch/ia64/kernel/module.c | 6 ++---- arch/parisc/kernel/module.c | 6 +----- arch/s390/kernel/module.c | 10 +++------- arch/tile/kernel/module.c | 2 +- include/linux/moduleloader.h | 2 ++ kernel/module.c | 7 +++++++ 7 files changed, 17 insertions(+), 29 deletions(-) (limited to 'kernel') diff --git a/arch/avr32/kernel/module.c b/arch/avr32/kernel/module.c index 2c9412908024..164efa009e5b 100644 --- a/arch/avr32/kernel/module.c +++ b/arch/avr32/kernel/module.c @@ -19,12 +19,10 @@ #include #include -void module_free(struct module *mod, void *module_region) +void module_arch_freeing_init(struct module *mod) { vfree(mod->arch.syminfo); mod->arch.syminfo = NULL; - - vfree(module_region); } static inline int check_rela(Elf32_Rela *rela, struct module *module, @@ -291,12 +289,3 @@ int apply_relocate_add(Elf32_Shdr *sechdrs, const char *strtab, return ret; } - -int module_finalize(const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, - struct module *module) -{ - vfree(module->arch.syminfo); - module->arch.syminfo = NULL; - - return 0; -} diff --git a/arch/ia64/kernel/module.c b/arch/ia64/kernel/module.c index 24603be24c14..29754aae5177 100644 --- a/arch/ia64/kernel/module.c +++ b/arch/ia64/kernel/module.c @@ -305,14 +305,12 @@ plt_target (struct plt_entry *plt) #endif /* !USE_BRL */ void -module_free (struct module *mod, void *module_region) +module_arch_freeing_init (struct module *mod) { - if (mod && mod->arch.init_unw_table && - module_region == mod->module_init) { + if (mod->arch.init_unw_table) { unw_remove_unwind_table(mod->arch.init_unw_table); mod->arch.init_unw_table = NULL; } - vfree(module_region); } /* Have we already seen one of these relocations? 
*/ diff --git a/arch/parisc/kernel/module.c b/arch/parisc/kernel/module.c index 50dfafc3f2c1..5822e8e200e6 100644 --- a/arch/parisc/kernel/module.c +++ b/arch/parisc/kernel/module.c @@ -298,14 +298,10 @@ static inline unsigned long count_stubs(const Elf_Rela *rela, unsigned long n) } #endif - -/* Free memory returned from module_alloc */ -void module_free(struct module *mod, void *module_region) +void module_arch_freeing_init(struct module *mod) { kfree(mod->arch.section); mod->arch.section = NULL; - - vfree(module_region); } /* Additional bytes needed in front of individual sections */ diff --git a/arch/s390/kernel/module.c b/arch/s390/kernel/module.c index b89b59158b95..409d152585be 100644 --- a/arch/s390/kernel/module.c +++ b/arch/s390/kernel/module.c @@ -55,14 +55,10 @@ void *module_alloc(unsigned long size) } #endif -/* Free memory returned from module_alloc */ -void module_free(struct module *mod, void *module_region) +void module_arch_freeing_init(struct module *mod) { - if (mod) { - vfree(mod->arch.syminfo); - mod->arch.syminfo = NULL; - } - vfree(module_region); + vfree(mod->arch.syminfo); + mod->arch.syminfo = NULL; } static void check_rela(Elf_Rela *rela, struct module *me) diff --git a/arch/tile/kernel/module.c b/arch/tile/kernel/module.c index 96447c9160a0..62a597e810d6 100644 --- a/arch/tile/kernel/module.c +++ b/arch/tile/kernel/module.c @@ -83,7 +83,7 @@ void module_free(struct module *mod, void *module_region) 0, 0, 0, NULL, NULL, 0); /* - * FIXME: If module_region == mod->module_init, trim exception + * FIXME: Add module_arch_freeing_init to trim exception * table entries. */ } diff --git a/include/linux/moduleloader.h b/include/linux/moduleloader.h index 7eeb9bbfb816..054eac853090 100644 --- a/include/linux/moduleloader.h +++ b/include/linux/moduleloader.h @@ -82,4 +82,6 @@ int module_finalize(const Elf_Ehdr *hdr, /* Any cleanup needed when module leaves. */ void module_arch_cleanup(struct module *mod); +/* Any cleanup before freeing mod->module_init */ +void module_arch_freeing_init(struct module *mod); #endif diff --git a/kernel/module.c b/kernel/module.c index 3965511ae133..68be0b1f9e7f 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1804,6 +1804,10 @@ void __weak module_arch_cleanup(struct module *mod) { } +void __weak module_arch_freeing_init(struct module *mod) +{ +} + /* Free a module, remove from lists, etc. */ static void free_module(struct module *mod) { @@ -1841,6 +1845,7 @@ static void free_module(struct module *mod) /* This may be NULL, but that's OK */ unset_module_init_ro_nx(mod); + module_arch_freeing_init(mod); module_free(mod, mod->module_init); kfree(mod->args); percpu_modfree(mod); @@ -2930,6 +2935,7 @@ static struct module *layout_and_allocate(struct load_info *info, int flags) static void module_deallocate(struct module *mod, struct load_info *info) { percpu_modfree(mod); + module_arch_freeing_init(mod); module_free(mod, mod->module_init); module_free(mod, mod->module_core); } @@ -3055,6 +3061,7 @@ static int do_init_module(struct module *mod) mod->strtab = mod->core_strtab; #endif unset_module_init_ro_nx(mod); + module_arch_freeing_init(mod); module_free(mod, mod->module_init); mod->module_init = NULL; mod->init_size = 0; -- cgit v1.2.3 From be1f221c0445a4157d177197c236f888d3581914 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 20 Jan 2015 09:07:05 +1030 Subject: module: remove mod arg from module_free, rename module_memfree(). 
Nothing needs the module pointer any more, and the next patch will call it from RCU, where the module itself might no longer exist. Removing the arg is the safest approach. This just codifies the use of the module_alloc/module_free pattern which ftrace and bpf use. Signed-off-by: Rusty Russell Acked-by: Alexei Starovoitov Cc: Mikael Starvik Cc: Jesper Nilsson Cc: Ralf Baechle Cc: Ley Foon Tan Cc: Benjamin Herrenschmidt Cc: Chris Metcalf Cc: Steven Rostedt Cc: x86@kernel.org Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: Masami Hiramatsu Cc: linux-cris-kernel@axis.com Cc: linux-kernel@vger.kernel.org Cc: linux-mips@linux-mips.org Cc: nios2-dev@lists.rocketboards.org Cc: linuxppc-dev@lists.ozlabs.org Cc: sparclinux@vger.kernel.org Cc: netdev@vger.kernel.org --- arch/cris/kernel/module.c | 2 +- arch/mips/net/bpf_jit.c | 2 +- arch/nios2/kernel/module.c | 2 +- arch/powerpc/net/bpf_jit_comp.c | 2 +- arch/sparc/net/bpf_jit_comp.c | 4 ++-- arch/tile/kernel/module.c | 2 +- arch/x86/kernel/ftrace.c | 2 +- include/linux/moduleloader.h | 2 +- kernel/bpf/core.c | 2 +- kernel/kprobes.c | 2 +- kernel/module.c | 14 +++++++------- 11 files changed, 18 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/arch/cris/kernel/module.c b/arch/cris/kernel/module.c index 51123f985eb5..af04cb6b6dc9 100644 --- a/arch/cris/kernel/module.c +++ b/arch/cris/kernel/module.c @@ -36,7 +36,7 @@ void *module_alloc(unsigned long size) } /* Free memory returned from module_alloc */ -void module_free(struct module *mod, void *module_region) +void module_memfree(void *module_region) { kfree(module_region); } diff --git a/arch/mips/net/bpf_jit.c b/arch/mips/net/bpf_jit.c index 9fd6834a2172..5d6139390bf8 100644 --- a/arch/mips/net/bpf_jit.c +++ b/arch/mips/net/bpf_jit.c @@ -1388,7 +1388,7 @@ out: void bpf_jit_free(struct bpf_prog *fp) { if (fp->jited) - module_free(NULL, fp->bpf_func); + module_memfree(fp->bpf_func); bpf_prog_unlock_free(fp); } diff --git a/arch/nios2/kernel/module.c b/arch/nios2/kernel/module.c index cc924a38f22a..e2e3f13f98d5 100644 --- a/arch/nios2/kernel/module.c +++ b/arch/nios2/kernel/module.c @@ -36,7 +36,7 @@ void *module_alloc(unsigned long size) } /* Free memory returned from module_alloc */ -void module_free(struct module *mod, void *module_region) +void module_memfree(void *module_region) { kfree(module_region); } diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c index 1ca125b9c226..d1916b577f2c 100644 --- a/arch/powerpc/net/bpf_jit_comp.c +++ b/arch/powerpc/net/bpf_jit_comp.c @@ -699,7 +699,7 @@ out: void bpf_jit_free(struct bpf_prog *fp) { if (fp->jited) - module_free(NULL, fp->bpf_func); + module_memfree(fp->bpf_func); bpf_prog_unlock_free(fp); } diff --git a/arch/sparc/net/bpf_jit_comp.c b/arch/sparc/net/bpf_jit_comp.c index f33e7c7a3bf7..7931eeeb649a 100644 --- a/arch/sparc/net/bpf_jit_comp.c +++ b/arch/sparc/net/bpf_jit_comp.c @@ -776,7 +776,7 @@ cond_branch: f_offset = addrs[i + filter[i].jf]; if (unlikely(proglen + ilen > oldproglen)) { pr_err("bpb_jit_compile fatal error\n"); kfree(addrs); - module_free(NULL, image); + module_memfree(image); return; } memcpy(image + proglen, temp, ilen); @@ -822,7 +822,7 @@ out: void bpf_jit_free(struct bpf_prog *fp) { if (fp->jited) - module_free(NULL, fp->bpf_func); + module_memfree(fp->bpf_func); bpf_prog_unlock_free(fp); } diff --git a/arch/tile/kernel/module.c b/arch/tile/kernel/module.c index 62a597e810d6..2305084c9b93 100644 --- a/arch/tile/kernel/module.c +++ b/arch/tile/kernel/module.c @@ -74,7 
+74,7 @@ error: /* Free memory returned from module_alloc */ -void module_free(struct module *mod, void *module_region) +void module_memfree(void *module_region) { vfree(module_region); diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index 2142376dc8c6..8b7b0a51e742 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -674,7 +674,7 @@ static inline void *alloc_tramp(unsigned long size) } static inline void tramp_free(void *tramp) { - module_free(NULL, tramp); + module_memfree(tramp); } #else /* Trampolines can only be created if modules are supported */ diff --git a/include/linux/moduleloader.h b/include/linux/moduleloader.h index 054eac853090..f7556261fe3c 100644 --- a/include/linux/moduleloader.h +++ b/include/linux/moduleloader.h @@ -26,7 +26,7 @@ unsigned int arch_mod_section_prepend(struct module *mod, unsigned int section); void *module_alloc(unsigned long size); /* Free memory returned from module_alloc. */ -void module_free(struct module *mod, void *module_region); +void module_memfree(void *module_region); /* * Apply the given relocation to the (simplified) ELF. Return -error diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index d6594e457a25..a64e7a207d2b 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -163,7 +163,7 @@ bpf_jit_binary_alloc(unsigned int proglen, u8 **image_ptr, void bpf_jit_binary_free(struct bpf_binary_header *hdr) { - module_free(NULL, hdr); + module_memfree(hdr); } #endif /* CONFIG_BPF_JIT */ diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 06f58309fed2..ee619929cf90 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -127,7 +127,7 @@ static void *alloc_insn_page(void) static void free_insn_page(void *page) { - module_free(NULL, page); + module_memfree(page); } struct kprobe_insn_cache kprobe_insn_slots = { diff --git a/kernel/module.c b/kernel/module.c index 68be0b1f9e7f..1f85fd5c89d3 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1795,7 +1795,7 @@ static void unset_module_core_ro_nx(struct module *mod) { } static void unset_module_init_ro_nx(struct module *mod) { } #endif -void __weak module_free(struct module *mod, void *module_region) +void __weak module_memfree(void *module_region) { vfree(module_region); } @@ -1846,7 +1846,7 @@ static void free_module(struct module *mod) /* This may be NULL, but that's OK */ unset_module_init_ro_nx(mod); module_arch_freeing_init(mod); - module_free(mod, mod->module_init); + module_memfree(mod->module_init); kfree(mod->args); percpu_modfree(mod); @@ -1855,7 +1855,7 @@ static void free_module(struct module *mod) /* Finally, free the core (containing the module structure) */ unset_module_core_ro_nx(mod); - module_free(mod, mod->module_core); + module_memfree(mod->module_core); #ifdef CONFIG_MPU update_protections(current->mm); @@ -2790,7 +2790,7 @@ static int move_module(struct module *mod, struct load_info *info) */ kmemleak_ignore(ptr); if (!ptr) { - module_free(mod, mod->module_core); + module_memfree(mod->module_core); return -ENOMEM; } memset(ptr, 0, mod->init_size); @@ -2936,8 +2936,8 @@ static void module_deallocate(struct module *mod, struct load_info *info) { percpu_modfree(mod); module_arch_freeing_init(mod); - module_free(mod, mod->module_init); - module_free(mod, mod->module_core); + module_memfree(mod->module_init); + module_memfree(mod->module_core); } int __weak module_finalize(const Elf_Ehdr *hdr, @@ -3062,7 +3062,7 @@ static int do_init_module(struct module *mod) #endif unset_module_init_ro_nx(mod); module_arch_freeing_init(mod); - 
module_free(mod, mod->module_init); + module_memfree(mod->module_init); mod->module_init = NULL; mod->init_size = 0; mod->init_ro_size = 0; -- cgit v1.2.3 From c749637909eea5d4090c6f50b89c2c20b534a280 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 20 Jan 2015 09:07:05 +1030 Subject: module: fix race in kallsyms resolution during module load success. The kallsyms routines (module_symbol_name, lookup_module_* etc) disable preemption to walk the modules rather than taking the module_mutex: this is because they are used for symbol resolution during oopses. This works because there are synchronize_sched() and synchronize_rcu() in the unload and failure paths. However, there's one case which doesn't have that: the normal case where module loading succeeds, and we free the init section. We don't want a synchronize_rcu() there, because it would slow down module loading: this bug was introduced in 2009 to speed module loading in the first place. Thus, we want to do the free in an RCU callback. We do this in the simplest possible way by allocating a new rcu_head: if we put it in the module structure we'd have to worry about that getting freed. Reported-by: Rui Xiang Signed-off-by: Rusty Russell --- kernel/module.c | 55 ++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 42 insertions(+), 13 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 1f85fd5c89d3..ed4ec9c30bd2 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2989,10 +2989,31 @@ static void do_mod_ctors(struct module *mod) #endif } +/* For freeing module_init on success, in case kallsyms traversing */ +struct mod_initfree { + struct rcu_head rcu; + void *module_init; +}; + +static void do_free_init(struct rcu_head *head) +{ + struct mod_initfree *m = container_of(head, struct mod_initfree, rcu); + module_memfree(m->module_init); + kfree(m); +} + /* This is where the real work happens */ static int do_init_module(struct module *mod) { int ret = 0; + struct mod_initfree *freeinit; + + freeinit = kmalloc(sizeof(*freeinit), GFP_KERNEL); + if (!freeinit) { + ret = -ENOMEM; + goto fail; + } + freeinit->module_init = mod->module_init; /* * We want to find out whether @mod uses async during init. Clear @@ -3005,18 +3026,7 @@ static int do_init_module(struct module *mod) if (mod->init != NULL) ret = do_one_initcall(mod->init); if (ret < 0) { - /* - * Init routine failed: abort. Try to protect us from - * buggy refcounters. - */ - mod->state = MODULE_STATE_GOING; - synchronize_sched(); - module_put(mod); - blocking_notifier_call_chain(&module_notify_list, - MODULE_STATE_GOING, mod); - free_module(mod); - wake_up_all(&module_wq); - return ret; + goto fail_free_freeinit; } if (ret > 0) { pr_warn("%s: '%s'->init suspiciously returned %d, it should " @@ -3062,15 +3072,34 @@ static int do_init_module(struct module *mod) #endif unset_module_init_ro_nx(mod); module_arch_freeing_init(mod); - module_memfree(mod->module_init); mod->module_init = NULL; mod->init_size = 0; mod->init_ro_size = 0; mod->init_text_size = 0; + /* + * We want to free module_init, but be aware that kallsyms may be + * walking this with preempt disabled. In all the failure paths, + * we call synchronize_rcu/synchronize_sched, but we don't want + * to slow down the success path, so use actual RCU here. + */ + call_rcu(&freeinit->rcu, do_free_init); mutex_unlock(&module_mutex); wake_up_all(&module_wq); return 0; + +fail_free_freeinit: + kfree(freeinit); +fail: + /* Try to protect us from buggy refcounters. 
*/ + mod->state = MODULE_STATE_GOING; + synchronize_sched(); + module_put(mod); + blocking_notifier_call_chain(&module_notify_list, + MODULE_STATE_GOING, mod); + free_module(mod); + wake_up_all(&module_wq); + return ret; } static int may_init_module(void) -- cgit v1.2.3 From d5db139ab3764640e0882a1746e7b9fdee33fd87 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 22 Jan 2015 11:13:14 +1030 Subject: module: make module_refcount() a signed integer. James Bottomley points out that it will be -1 during unload. It's only used for diagnostics, so let's not hide that as it could be a clue as to what's gone wrong. Cc: Jason Wessel Acked-and-documention-added-by: James Bottomley Reviewed-by: Masami Hiramatsu Signed-off-by: Rusty Russell --- include/linux/module.h | 2 +- kernel/debug/kdb/kdb_main.c | 2 +- kernel/module.c | 17 +++++++++++++---- 3 files changed, 15 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/module.h b/include/linux/module.h index ebfb0e153c6a..b653d7c0a05a 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -444,7 +444,7 @@ extern void __module_put_and_exit(struct module *mod, long code) #define module_put_and_exit(code) __module_put_and_exit(THIS_MODULE, code) #ifdef CONFIG_MODULE_UNLOAD -unsigned long module_refcount(struct module *mod); +int module_refcount(struct module *mod); void __symbol_put(const char *symbol); #define symbol_put(x) __symbol_put(VMLINUX_SYMBOL_STR(x)) void symbol_put_addr(void *addr); diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 379650b984f8..2934889f2cce 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -1979,7 +1979,7 @@ static int kdb_lsmod(int argc, const char **argv) kdb_printf("%-20s%8u 0x%p ", mod->name, mod->core_size, (void *)mod); #ifdef CONFIG_MODULE_UNLOAD - kdb_printf("%4ld ", module_refcount(mod)); + kdb_printf("%4d ", module_refcount(mod)); #endif if (mod->state == MODULE_STATE_GOING) kdb_printf(" (Unloading)"); diff --git a/kernel/module.c b/kernel/module.c index ed4ec9c30bd2..d856e96a3cce 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -772,9 +772,18 @@ static int try_stop_module(struct module *mod, int flags, int *forced) return 0; } -unsigned long module_refcount(struct module *mod) +/** + * module_refcount - return the refcount or -1 if unloading + * + * @mod: the module we're checking + * + * Returns: + * -1 if the module is in the process of unloading + * otherwise the number of references in the kernel to the module + */ +int module_refcount(struct module *mod) { - return (unsigned long)atomic_read(&mod->refcnt) - MODULE_REF_BASE; + return atomic_read(&mod->refcnt) - MODULE_REF_BASE; } EXPORT_SYMBOL(module_refcount); @@ -856,7 +865,7 @@ static inline void print_unload_info(struct seq_file *m, struct module *mod) struct module_use *use; int printed_something = 0; - seq_printf(m, " %lu ", module_refcount(mod)); + seq_printf(m, " %i ", module_refcount(mod)); /* * Always include a trailing , so userspace can differentiate @@ -908,7 +917,7 @@ EXPORT_SYMBOL_GPL(symbol_put_addr); static ssize_t show_refcnt(struct module_attribute *mattr, struct module_kobject *mk, char *buffer) { - return sprintf(buffer, "%lu\n", module_refcount(mk->mod)); + return sprintf(buffer, "%i\n", module_refcount(mk->mod)); } static struct module_attribute modinfo_refcnt = -- cgit v1.2.3 From 3c606d35fe9766ede86a45273a628b320be13b45 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Thu, 22 Jan 2015 10:19:43 -0500 Subject: cgroup: prevent mount 
hang due to memory controller lifetime Since b2052564e66d ("mm: memcontrol: continue cache reclaim from offlined groups"), re-mounting the memory controller after using it is very likely to hang. The cgroup core assumes that any remaining references after deleting a cgroup are temporary in nature, and synchroneously waits for them, but the above-mentioned commit has left-over page cache pin its css until it is reclaimed naturally. That being said, swap entries and charged kernel memory have been doing the same indefinite pinning forever, the bug is just more likely to trigger with left-over page cache. Reparenting kernel memory is highly impractical, which leaves changing the cgroup assumptions to reflect this: once a controller has been mounted and used, it has internal state that is independent from mount and cgroup lifetime. It can be unmounted and remounted, but it can't be reconfigured during subsequent mounts. Don't offline the controller root as long as there are any children, dead or alive. A remount will no longer wait for these old references to drain, it will simply mount the persistent controller state again. Reported-by: "Suzuki K. Poulose" Reported-by: Will Deacon Signed-off-by: Johannes Weiner Signed-off-by: Tejun Heo --- kernel/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index bb263d0caab3..04cfe8ace520 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1909,7 +1909,7 @@ static void cgroup_kill_sb(struct super_block *sb) * * And don't kill the default root. */ - if (css_has_online_children(&root->cgrp.self) || + if (!list_empty(&root->cgrp.self.children) || root == &cgrp_dfl_root) cgroup_put(&root->cgrp); else -- cgit v1.2.3 From e9d1b4f3c60997fe197bf0243cb4a41a44387a88 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 8 Jan 2015 14:30:22 -0800 Subject: x86, mpx: Strictly enforce empty prctl() args Description from Michael Kerrisk. He suggested an identical patch to one I had already coded up and tested. commit fe3d197f8431 "x86, mpx: On-demand kernel allocation of bounds tables" added two new prctl() operations, PR_MPX_ENABLE_MANAGEMENT and PR_MPX_DISABLE_MANAGEMENT. However, no checks were included to ensure that unused arguments are zero, as is done in many existing prctl()s and as should be done for all new prctl()s. This patch adds the required checks. 
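A minimal userspace sketch of the user-visible change follows. It is illustrative only: prctl(2) is standard, the PR_MPX_* constants come from the 3.19 uapi <linux/prctl.h> (the fallback value is an assumption), and the second call may still fail on kernels or CPUs without MPX support.

  #include <errno.h>
  #include <stdio.h>
  #include <string.h>
  #include <sys/prctl.h>

  #ifndef PR_MPX_ENABLE_MANAGEMENT
  # define PR_MPX_ENABLE_MANAGEMENT 43    /* assumed to match the 3.19 uapi header */
  #endif

  int main(void)
  {
          /* Non-zero unused arguments are now rejected. */
          if (prctl(PR_MPX_ENABLE_MANAGEMENT, 1, 0, 0, 0) < 0)
                  printf("junk arg2:   %s\n", strerror(errno)); /* expect EINVAL */

          /* The correct form: all unused arguments zero. */
          if (prctl(PR_MPX_ENABLE_MANAGEMENT, 0, 0, 0, 0) < 0)
                  printf("proper call: %s\n", strerror(errno)); /* may still fail without MPX */
          return 0;
  }
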
Suggested-by: Andy Lutomirski Suggested-by: Michael Kerrisk Signed-off-by: Dave Hansen Cc: Dave Hansen Link: http://lkml.kernel.org/r/20150108223022.7F56FD13@viggo.jf.intel.com Signed-off-by: Thomas Gleixner --- kernel/sys.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index a8c9f5a7dda6..ea9c88109894 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2210,9 +2210,13 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, up_write(&me->mm->mmap_sem); break; case PR_MPX_ENABLE_MANAGEMENT: + if (arg2 || arg3 || arg4 || arg5) + return -EINVAL; error = MPX_ENABLE_MANAGEMENT(me); break; case PR_MPX_DISABLE_MANAGEMENT: + if (arg2 || arg3 || arg4 || arg5) + return -EINVAL; error = MPX_DISABLE_MANAGEMENT(me); break; default: -- cgit v1.2.3 From 8ebe667c41e054384df19f2f382bc415badfaee1 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Thu, 22 Jan 2015 17:11:08 -0800 Subject: bpf: rcu lock must not be held when calling copy_to_user() BUG: sleeping function called from invalid context at mm/memory.c:3732 in_atomic(): 0, irqs_disabled(): 0, pid: 671, name: test_maps 1 lock held by test_maps/671: #0: (rcu_read_lock){......}, at: [<0000000000264190>] map_lookup_elem+0xe8/0x260 Call Trace: ([<0000000000115b7e>] show_trace+0x12e/0x150) [<0000000000115c40>] show_stack+0xa0/0x100 [<00000000009b163c>] dump_stack+0x74/0xc8 [<000000000017424a>] ___might_sleep+0x23a/0x248 [<00000000002b58e8>] might_fault+0x70/0xe8 [<0000000000264230>] map_lookup_elem+0x188/0x260 [<0000000000264716>] SyS_bpf+0x20e/0x840 Fix it by allocating temporary buffer to store map element value. Fixes: db20fd2b0108 ("bpf: add lookup/update/delete/iterate methods to BPF maps") Reported-by: Michael Holzheu Signed-off-by: Alexei Starovoitov Acked-by: Daniel Borkmann Signed-off-by: David S. Miller --- kernel/bpf/syscall.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 088ac0b1b106..536edc2be307 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -150,7 +150,7 @@ static int map_lookup_elem(union bpf_attr *attr) int ufd = attr->map_fd; struct fd f = fdget(ufd); struct bpf_map *map; - void *key, *value; + void *key, *value, *ptr; int err; if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM)) @@ -169,20 +169,29 @@ static int map_lookup_elem(union bpf_attr *attr) if (copy_from_user(key, ukey, map->key_size) != 0) goto free_key; - err = -ENOENT; - rcu_read_lock(); - value = map->ops->map_lookup_elem(map, key); + err = -ENOMEM; + value = kmalloc(map->value_size, GFP_USER); if (!value) - goto err_unlock; + goto free_key; + + rcu_read_lock(); + ptr = map->ops->map_lookup_elem(map, key); + if (ptr) + memcpy(value, ptr, map->value_size); + rcu_read_unlock(); + + err = -ENOENT; + if (!ptr) + goto free_value; err = -EFAULT; if (copy_to_user(uvalue, value, map->value_size) != 0) - goto err_unlock; + goto free_value; err = 0; -err_unlock: - rcu_read_unlock(); +free_value: + kfree(value); free_key: kfree(key); err_put: -- cgit v1.2.3 From c3c87e770458aa004bd7ed3f29945ff436fd6511 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 23 Jan 2015 11:19:48 +0100 Subject: perf: Tighten (and fix) the grouping condition The fix from 9fc81d87420d ("perf: Fix events installation during moving group") was incomplete in that it failed to recognise that creating a group with events for different CPUs is semantically broken -- they cannot be co-scheduled. 
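To make the tightened rule concrete, the sketch below (based on the perf_event_open(2) man page, illustrative only) creates a group that is still allowed because leader and sibling share the same task context; grouping events that cannot be co-scheduled in one context -- a different task, or per-CPU events pinned to different CPUs -- is rejected at creation time, typically with EINVAL.

  #define _GNU_SOURCE
  #include <linux/perf_event.h>
  #include <stdio.h>
  #include <sys/ioctl.h>
  #include <sys/syscall.h>
  #include <sys/types.h>
  #include <unistd.h>

  static int perf_open(struct perf_event_attr *attr, pid_t pid, int cpu, int group_fd)
  {
          return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, 0);
  }

  int main(void)
  {
          struct perf_event_attr attr = {
                  .type     = PERF_TYPE_HARDWARE,
                  .size     = sizeof(attr),
                  .config   = PERF_COUNT_HW_CPU_CYCLES,
                  .disabled = 1,
          };
          /* Leader and sibling both count the calling task on any CPU
           * (pid 0, cpu -1): same context, so the group is valid. */
          int leader = perf_open(&attr, 0, -1, -1);

          attr.config   = PERF_COUNT_HW_INSTRUCTIONS;
          attr.disabled = 0;
          int sibling = perf_open(&attr, 0, -1, leader);
          if (sibling < 0)
                  perror("perf_event_open(sibling)");

          ioctl(leader, PERF_EVENT_IOC_ENABLE, PERF_IOC_FLAG_GROUP);
          /* ... run the workload, then read() a u64 count from each fd ... */
          return 0;
  }
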
Furthermore, it leads to real breakage where, when we create an event for CPU Y and then migrate it to form a group on CPU X, the code gets confused where the counter is programmed -- triggered in practice as well by me via the perf fuzzer. Fix this by tightening the rules for creating groups. Only allow grouping of counters that can be co-scheduled in the same context. This means for the same task and/or the same cpu. Fixes: 9fc81d87420d ("perf: Fix events installation during moving group") Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Link: http://lkml.kernel.org/r/20150123125834.090683288@infradead.org Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 6 ------ kernel/events/core.c | 15 +++++++++++++-- 2 files changed, 13 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 4f7a61ca4b39..664de5a4ec46 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -450,11 +450,6 @@ struct perf_event { #endif /* CONFIG_PERF_EVENTS */ }; -enum perf_event_context_type { - task_context, - cpu_context, -}; - /** * struct perf_event_context - event context structure * @@ -462,7 +457,6 @@ enum perf_event_context_type { */ struct perf_event_context { struct pmu *pmu; - enum perf_event_context_type type; /* * Protect the states of the events in the list, * nr_active, and the list: diff --git a/kernel/events/core.c b/kernel/events/core.c index 882f835a0d85..19efcf13375a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -6776,7 +6776,6 @@ skip_type: __perf_event_init_context(&cpuctx->ctx); lockdep_set_class(&cpuctx->ctx.mutex, &cpuctx_mutex); lockdep_set_class(&cpuctx->ctx.lock, &cpuctx_lock); - cpuctx->ctx.type = cpu_context; cpuctx->ctx.pmu = pmu; __perf_cpu_hrtimer_init(cpuctx, cpu); @@ -7420,7 +7419,19 @@ SYSCALL_DEFINE5(perf_event_open, * task or CPU context: */ if (move_group) { - if (group_leader->ctx->type != ctx->type) + /* + * Make sure we're both on the same task, or both + * per-cpu events. + */ + if (group_leader->ctx->task != ctx->task) + goto err_context; + + /* + * Make sure we're both events for the same CPU; + * grouping events for different CPUs is broken; since + * you can never concurrently schedule them anyhow. + */ + if (group_leader->cpu != event->cpu) goto err_context; } else { if (group_leader->ctx != ctx) -- cgit v1.2.3 From 00845eb968ead28007338b2bb852b8beef816583 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 1 Feb 2015 12:23:32 -0800 Subject: sched: don't cause task state changes in nested sleep debugging MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 8eb23b9f35aa ("sched: Debug nested sleeps") added code to report on nested sleep conditions, which we generally want to avoid because the inner sleeping operation can re-set the thread state to TASK_RUNNING, but that will then cause the outer sleep loop not actually sleep when it calls schedule. However, that's actually valid traditional behavior, with the inner sleep being some fairly rare case (like taking a sleeping lock that normally doesn't actually need to sleep). And the debug code would actually change the state of the task to TASK_RUNNING internally, which makes that kind of traditional and working code not work at all, because now the nested sleep doesn't just sometimes cause the outer one to not block, but will cause it to happen every time. 
In particular, it will cause the cardbus kernel daemon (pccardd) to basically busy-loop doing scheduling, converting a laptop into a heater, as reported by Bruno Prémont. But there may be other legacy uses of that nested sleep model in other drivers that are also likely to never get converted to the new model. This fixes both cases: - don't set TASK_RUNNING when the nested condition happens (note: even if WARN_ONCE() only _warns_ once, the return value isn't whether the warning happened, but whether the condition for the warning was true. So despite the warning only happening once, the "if (WARN_ON(..))" would trigger for every nested sleep. - in the cases where we knowingly disable the warning by using "sched_annotate_sleep()", don't change the task state (that is used for all core scheduling decisions), instead use '->task_state_change' that is used for the debugging decision itself. (Credit for the second part of the fix goes to Oleg Nesterov: "Can't we avoid this subtle change in behaviour DEBUG_ATOMIC_SLEEP adds?" with the suggested change to use 'task_state_change' as part of the test) Reported-and-bisected-by: Bruno Prémont Tested-by: Rafael J Wysocki Acked-by: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner , Cc: Ilya Dryomov , Cc: Mike Galbraith Cc: Ingo Molnar Cc: Peter Hurley , Cc: Davidlohr Bueso , Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 2 +- kernel/sched/core.c | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 5449d2f4a1ef..64ce58bee6f5 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -176,7 +176,7 @@ extern int _cond_resched(void); */ # define might_sleep() \ do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0) -# define sched_annotate_sleep() __set_current_state(TASK_RUNNING) +# define sched_annotate_sleep() (current->task_state_change = 0) #else static inline void ___might_sleep(const char *file, int line, int preempt_offset) { } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c0accc00566e..e628cb11b560 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7292,13 +7292,12 @@ void __might_sleep(const char *file, int line, int preempt_offset) * since we will exit with TASK_RUNNING make sure we enter with it, * otherwise we will destroy state. */ - if (WARN_ONCE(current->state != TASK_RUNNING, + WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, "do not call blocking ops when !TASK_RUNNING; " "state=%lx set at [<%p>] %pS\n", current->state, (void *)current->task_state_change, - (void *)current->task_state_change)) - __set_current_state(TASK_RUNNING); + (void *)current->task_state_change); ___might_sleep(file, line, preempt_offset); } -- cgit v1.2.3
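For context, the "traditional" pattern the commit above is careful not to break looks roughly like the following kernel-style pseudocode ('lock', 'done' and 'event_pending()' are invented names used only for illustration):

  for (;;) {
          set_current_state(TASK_INTERRUPTIBLE);

          mutex_lock(&lock);      /* rarely sleeps; if it does, the task wakes
                                   * up in TASK_RUNNING again               */
          done = event_pending();
          mutex_unlock(&lock);

          if (done)
                  break;
          schedule();             /* must still be able to block here */
  }
  __set_current_state(TASK_RUNNING);

The DEBUG_ATOMIC_SLEEP warning can still fire (once) for the inner mutex_lock(), but with this change it no longer forces the task back to TASK_RUNNING itself, so the outer schedule() keeps blocking as this kind of legacy code expects.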