From 1f5320d5972aa50d3e8d2b227b636b370e608359 Mon Sep 17 00:00:00 2001 From: Daisuke Nishimura Date: Thu, 4 Oct 2012 16:37:16 +0900 Subject: cgroup: notify_on_release may not be triggered in some cases notify_on_release must be triggered when the last process in a cgroup is move to another. But if the first(and only) process in a cgroup is moved to another, notify_on_release is not triggered. # mkdir /cgroup/cpu/SRC # mkdir /cgroup/cpu/DST # # echo 1 >/cgroup/cpu/SRC/notify_on_release # echo 1 >/cgroup/cpu/DST/notify_on_release # # sleep 300 & [1] 8629 # # echo 8629 >/cgroup/cpu/SRC/tasks # echo 8629 >/cgroup/cpu/DST/tasks -> notify_on_release for /SRC must be triggered at this point, but it isn't. This is because put_css_set() is called before setting CGRP_RELEASABLE in cgroup_task_migrate(), and is a regression introduce by the commit:74a1166d(cgroups: make procs file writable), which was merged into v3.0. Cc: Ben Blum Cc: # v3.0.x and later Acked-by: Li Zefan Signed-off-by: Daisuke Nishimura Signed-off-by: Tejun Heo --- kernel/cgroup.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 13774b3b39aa..d1739fc7eb94 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1962,9 +1962,8 @@ static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, * trading it for newcg is protected by cgroup_mutex, we're safe to drop * it here; it will be freed under RCU. */ - put_css_set(oldcg); - set_bit(CGRP_RELEASABLE, &oldcgrp->flags); + put_css_set(oldcg); } /** -- cgit v1.2.3 From 9bb71308b8133d643648776243e4d5599b1c193d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 18 Oct 2012 17:52:07 -0700 Subject: Revert "cgroup: Drop task_lock(parent) on cgroup_fork()" This reverts commit 7e381b0eb1e1a9805c37335562e8dc02e7d7848c. The commit incorrectly assumed that fork path always performed threadgroup_change_begin/end() and depended on that for synchronization against task exit and cgroup migration paths instead of explicitly grabbing task_lock(). threadgroup_change is not locked when forking a new process (as opposed to a new thread in the same process) and even if it were it wouldn't be effective as different processes use different threadgroup locks. Revert the incorrect optimization. Signed-off-by: Tejun Heo LKML-Reference: <20121008020000.GB2575@localhost> Acked-by: Li Zefan Bitterly-Acked-by: Frederic Weisbecker Cc: stable@vger.kernel.org --- kernel/cgroup.c | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index d1739fc7eb94..75aec12c78a0 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4814,31 +4814,20 @@ static const struct file_operations proc_cgroupstats_operations = { * * A pointer to the shared css_set was automatically copied in * fork.c by dup_task_struct(). However, we ignore that copy, since - * it was not made under the protection of RCU, cgroup_mutex or - * threadgroup_change_begin(), so it might no longer be a valid - * cgroup pointer. cgroup_attach_task() might have already changed - * current->cgroups, allowing the previously referenced cgroup - * group to be removed and freed. - * - * Outside the pointer validity we also need to process the css_set - * inheritance between threadgoup_change_begin() and - * threadgoup_change_end(), this way there is no leak in any process - * wide migration performed by cgroup_attach_proc() that could otherwise - * miss a thread because it is too early or too late in the fork stage. + * it was not made under the protection of RCU or cgroup_mutex, so + * might no longer be a valid cgroup pointer. cgroup_attach_task() might + * have already changed current->cgroups, allowing the previously + * referenced cgroup group to be removed and freed. * * At the point that cgroup_fork() is called, 'current' is the parent * task, and the passed argument 'child' points to the child task. */ void cgroup_fork(struct task_struct *child) { - /* - * We don't need to task_lock() current because current->cgroups - * can't be changed concurrently here. The parent obviously hasn't - * exited and called cgroup_exit(), and we are synchronized against - * cgroup migration through threadgroup_change_begin(). - */ + task_lock(current); child->cgroups = current->cgroups; get_css_set(child->cgroups); + task_unlock(current); INIT_LIST_HEAD(&child->cg_list); } -- cgit v1.2.3 From d87838321124061f6c935069d97f37010fa417e6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 18 Oct 2012 17:40:30 -0700 Subject: Revert "cgroup: Remove task_lock() from cgroup_post_fork()" This reverts commit 7e3aa30ac8c904a706518b725c451bb486daaae9. The commit incorrectly assumed that fork path always performed threadgroup_change_begin/end() and depended on that for synchronization against task exit and cgroup migration paths instead of explicitly grabbing task_lock(). threadgroup_change is not locked when forking a new process (as opposed to a new thread in the same process) and even if it were it wouldn't be effective as different processes use different threadgroup locks. Revert the incorrect optimization. Signed-off-by: Tejun Heo LKML-Reference: <20121008020000.GB2575@localhost> Acked-by: Li Zefan Cc: Frederic Weisbecker Cc: stable@vger.kernel.org --- kernel/cgroup.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 75aec12c78a0..f24f724620dd 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4883,19 +4883,10 @@ void cgroup_post_fork(struct task_struct *child) */ if (use_task_css_set_links) { write_lock(&css_set_lock); - if (list_empty(&child->cg_list)) { - /* - * It's safe to use child->cgroups without task_lock() - * here because we are protected through - * threadgroup_change_begin() against concurrent - * css_set change in cgroup_task_migrate(). Also - * the task can't exit at that point until - * wake_up_new_task() is called, so we are protected - * against cgroup_exit() setting child->cgroup to - * init_css_set. - */ + task_lock(child); + if (list_empty(&child->cg_list)) list_add(&child->cg_list, &child->cgroups->tasks); - } + task_unlock(child); write_unlock(&css_set_lock); } } -- cgit v1.2.3 From c0158ca64da5732dfb86a3f28944e9626776692f Mon Sep 17 00:00:00 2001 From: Dan Magenheimer Date: Thu, 18 Oct 2012 16:31:37 -0700 Subject: workqueue: cancel_delayed_work() should return %false if work item is idle 57b30ae77b ("workqueue: reimplement cancel_delayed_work() using try_to_grab_pending()") made cancel_delayed_work() always return %true unless someone else is also trying to cancel the work item, which is broken - if the target work item is idle, the return value should be %false. try_to_grab_pending() indicates that the target work item was idle by zero return value. Use it for return. Note that this brings cancel_delayed_work() in line with __cancel_work_timer() in return value handling. Signed-off-by: Dan Magenheimer Signed-off-by: Tejun Heo LKML-Reference: <444a6439-b1a4-4740-9e7e-bc37267cfe73@default> --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index d951daa0ca9a..042d221d33cc 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -2982,7 +2982,7 @@ bool cancel_delayed_work(struct delayed_work *dwork) set_work_cpu_and_clear_pending(&dwork->work, work_cpu(&dwork->work)); local_irq_restore(flags); - return true; + return ret; } EXPORT_SYMBOL(cancel_delayed_work); -- cgit v1.2.3 From f2302505775fd13ba93f034206f1e2a587017929 Mon Sep 17 00:00:00 2001 From: Andrew Vagin Date: Thu, 25 Oct 2012 13:38:07 -0700 Subject: pidns: limit the nesting depth of pid namespaces 'struct pid' is a "variable sized struct" - a header with an array of upids at the end. The size of the array depends on a level (depth) of pid namespaces. Now a level of pidns is not limited, so 'struct pid' can be more than one page. Looks reasonable, that it should be less than a page. MAX_PIS_NS_LEVEL is not calculated from PAGE_SIZE, because in this case it depends on architectures, config options and it will be reduced, if someone adds a new fields in struct pid or struct upid. I suggest to set MAX_PIS_NS_LEVEL = 32, because it saves ability to expand "struct pid" and it's more than enough for all known for me use-cases. When someone finds a reasonable use case, we can add a config option or a sysctl parameter. In addition it will reduce the effect of another problem, when we have many nested namespaces and the oldest one starts dying. zap_pid_ns_processe will be called for each namespace and find_vpid will be called for each process in a namespace. find_vpid will be called minimum max_level^2 / 2 times. The reason of that is that when we found a bit in pidmap, we can't determine this pidns is top for this process or it isn't. vpid is a heavy operation, so a fork bomb, which create many nested namespace, can make a system inaccessible for a long time. For example my system becomes inaccessible for a few minutes with 4000 processes. [akpm@linux-foundation.org: return -EINVAL in response to excessive nesting, not -ENOMEM] Signed-off-by: Andrew Vagin Acked-by: Oleg Nesterov Cc: Cyrill Gorcunov Cc: "Eric W. Biederman" Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/pid_namespace.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index eb00be205811..7b07cc0dfb75 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -71,12 +71,22 @@ err_alloc: return NULL; } +/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */ +#define MAX_PID_NS_LEVEL 32 + static struct pid_namespace *create_pid_namespace(struct pid_namespace *parent_pid_ns) { struct pid_namespace *ns; unsigned int level = parent_pid_ns->level + 1; - int i, err = -ENOMEM; + int i; + int err; + + if (level > MAX_PID_NS_LEVEL) { + err = -EINVAL; + goto out; + } + err = -ENOMEM; ns = kmem_cache_zalloc(pid_ns_cachep, GFP_KERNEL); if (ns == NULL) goto out; -- cgit v1.2.3 From 2008713c7174e5c0f207bac684c6df0939047009 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 24 Oct 2012 14:11:48 -0700 Subject: Makefile: Documentation for external tool should be correct If one includes documentation for an external tool, it should be correct. This is not: 1. Overriding the input to rngd should typically be neither necessary nor desired. This is especially so since newer versions of rngd support a number of different *types* of sources. 2. The default kernel-exported device is called /dev/hwrng not /dev/hwrandom nor /dev/hw_random (both of which were used in the past; however, kernel and udev seem to have converged on /dev/hwrng.) Overall it is better if the documentation for rngd is kept with rngd rather than in a kernel Makefile. Signed-off-by: H. Peter Anvin Cc: David Howells Cc: Jeff Garzik Signed-off-by: Linus Torvalds --- kernel/Makefile | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/Makefile b/kernel/Makefile index 0dfeca4324ee..86e3285ae7e5 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -174,10 +174,8 @@ signing_key.priv signing_key.x509: x509.genkey @echo "###" @echo "### If this takes a long time, you might wish to run rngd in the" @echo "### background to keep the supply of entropy topped up. It" - @echo "### needs to be run as root, and should use a hardware random" - @echo "### number generator if one is available, eg:" - @echo "###" - @echo "### rngd -r /dev/hwrandom" + @echo "### needs to be run as root, and uses a hardware random" + @echo "### number generator if one is available." @echo "###" openssl req -new -nodes -utf8 $(sign_key_with_hash) -days 36500 -batch \ -x509 -config x509.genkey \ -- cgit v1.2.3 From 59ef28b1f14899b10d6b2682c7057ca00a9a3f47 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Thu, 25 Oct 2012 10:49:25 +1030 Subject: module: fix out-by-one error in kallsyms Masaki found and patched a kallsyms issue: the last symbol in a module's symtab wasn't transferred. This is because we manually copy the zero'th entry (which is always empty) then copy the rest in a loop starting at 1, though from src[0]. His fix was minimal, I prefer to rewrite the loops in more standard form. There are two loops: one to get the size, and one to copy. Make these identical: always count entry 0 and any defined symbol in an allocated non-init section. This bug exists since the following commit was introduced. module: reduce symbol table for loaded modules (v2) commit: 4a4962263f07d14660849ec134ee42b63e95ea9a LKML: http://lkml.org/lkml/2012/10/24/27 Reported-by: Masaki Kimura Cc: stable@kernel.org --- kernel/module.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/module.c b/kernel/module.c index 6085f5ef88ea..6e48c3a43599 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -2293,12 +2293,17 @@ static void layout_symtab(struct module *mod, struct load_info *info) src = (void *)info->hdr + symsect->sh_offset; nsrc = symsect->sh_size / sizeof(*src); + /* strtab always starts with a nul, so offset 0 is the empty string. */ + strtab_size = 1; + /* Compute total space required for the core symbols' strtab. */ - for (ndst = i = strtab_size = 1; i < nsrc; ++i, ++src) - if (is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) { - strtab_size += strlen(&info->strtab[src->st_name]) + 1; + for (ndst = i = 0; i < nsrc; i++) { + if (i == 0 || + is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) { + strtab_size += strlen(&info->strtab[src[i].st_name])+1; ndst++; } + } /* Append room for core symbols at end of core part. */ info->symoffs = ALIGN(mod->core_size, symsect->sh_addralign ?: 1); @@ -2332,15 +2337,15 @@ static void add_kallsyms(struct module *mod, const struct load_info *info) mod->core_symtab = dst = mod->module_core + info->symoffs; mod->core_strtab = s = mod->module_core + info->stroffs; src = mod->symtab; - *dst = *src; *s++ = 0; - for (ndst = i = 1; i < mod->num_symtab; ++i, ++src) { - if (!is_core_symbol(src, info->sechdrs, info->hdr->e_shnum)) - continue; - - dst[ndst] = *src; - dst[ndst++].st_name = s - mod->core_strtab; - s += strlcpy(s, &mod->strtab[src->st_name], KSYM_NAME_LEN) + 1; + for (ndst = i = 0; i < mod->num_symtab; i++) { + if (i == 0 || + is_core_symbol(src+i, info->sechdrs, info->hdr->e_shnum)) { + dst[ndst] = src[i]; + dst[ndst++].st_name = s - mod->core_strtab; + s += strlcpy(s, &mod->strtab[src[i].st_name], + KSYM_NAME_LEN) + 1; + } } mod->core_num_syms = ndst; } -- cgit v1.2.3