From eff30363c0b8b057f773108589bfd8881659fe74 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 20 Apr 2010 22:41:18 +0100 Subject: CRED: Fix double free in prepare_usermodehelper_creds() error handling Patch 570b8fb505896e007fd3bb07573ba6640e51851d: Author: Mathieu Desnoyers Date: Tue Mar 30 00:04:00 2010 +0100 Subject: CRED: Fix memory leak in error handling attempts to fix a memory leak in the error handling by making the offending return statement into a jump down to the bottom of the function where a kfree(tgcred) is inserted. This is, however, incorrect, as it does a kfree() after doing put_cred() if security_prepare_creds() fails. That will result in a double free if 'error' is jumped to as put_cred() will also attempt to free the new tgcred record by virtue of it being pointed to by the new cred record. Signed-off-by: David Howells Signed-off-by: James Morris --- kernel/cred.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/cred.c b/kernel/cred.c index e1dbe9eef800..ce1a52b9e8a3 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -398,6 +398,8 @@ struct cred *prepare_usermodehelper_creds(void) error: put_cred(new); + return NULL; + free_tgcred: #ifdef CONFIG_KEYS kfree(tgcred); -- cgit v1.2.3 From e134d200d57d43b171dcb0b55c178a1a0c7db14a Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 21 Apr 2010 10:28:25 +0100 Subject: CRED: Fix a race in creds_are_invalid() in credentials debugging creds_are_invalid() reads both cred->usage and cred->subscribers and then compares them to make sure the number of processes subscribed to a cred struct never exceeds the refcount of that cred struct. The problem is that this can cause a race with both copy_creds() and exit_creds() as the two counters, whilst they are of atomic_t type, are only atomic with respect to themselves, and not atomic with respect to each other. This means that if creds_are_invalid() can read the values on one CPU whilst they're being modified on another CPU, and so can observe an evolving state in which the subscribers count now is greater than the usage count a moment before. Switching the order in which the counts are read cannot help, so the thing to do is to remove that particular check. I had considered rechecking the values to see if they're in flux if the test fails, but I can't guarantee they won't appear the same, even if they've changed several times in the meantime. Note that this can only happen if CONFIG_DEBUG_CREDENTIALS is enabled. The problem is only likely to occur with multithreaded programs, and can be tested by the tst-eintr1 program from glibc's "make check". The symptoms look like: CRED: Invalid credentials CRED: At include/linux/cred.h:240 CRED: Specified credentials: ffff88003dda5878 [real][eff] CRED: ->magic=43736564, put_addr=(null) CRED: ->usage=766, subscr=766 CRED: ->*uid = { 0,0,0,0 } CRED: ->*gid = { 0,0,0,0 } CRED: ->security is ffff88003d72f538 CRED: ->security {359, 359} ------------[ cut here ]------------ kernel BUG at kernel/cred.c:850! ... RIP: 0010:[] [] __invalid_creds+0x4e/0x52 ... Call Trace: [] copy_creds+0x6b/0x23f Note the ->usage=766 and subscr=766. The values appear the same because they've been re-read since the check was made. Reported-by: Roland McGrath Signed-off-by: David Howells Signed-off-by: James Morris --- kernel/cred.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cred.c b/kernel/cred.c index ce1a52b9e8a3..62af1816c235 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -793,8 +793,6 @@ bool creds_are_invalid(const struct cred *cred) { if (cred->magic != CRED_MAGIC) return true; - if (atomic_read(&cred->usage) < atomic_read(&cred->subscribers)) - return true; #ifdef CONFIG_SECURITY_SELINUX if (selinux_is_enabled()) { if ((unsigned long) cred->security < PAGE_SIZE) -- cgit v1.2.3 From 4b402210486c6414fe5fbfd85934a0a22da56b04 Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Fri, 16 Apr 2010 23:20:00 +0200 Subject: mutex: Don't spin when the owner CPU is offline or other weird cases Due to recent load-balancer changes that delay the task migration to the next wakeup, the adaptive mutex spinning ends up in a live lock when the owner's CPU gets offlined because the cpu_online() check lives before the owner running check. This patch changes mutex_spin_on_owner() to return 0 (don't spin) in any case where we aren't sure about the owner struct validity or CPU number, and if the said CPU is offline. There is no point going back & re-evaluate spinning in corner cases like that, let's just go to sleep. Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Peter Zijlstra LKML-Reference: <1271212509.13059.135.camel@pasglop> Signed-off-by: Ingo Molnar --- kernel/sched.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index 6af210a7de70..de0bd26e520a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -3780,7 +3780,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) * the mutex owner just released it and exited. */ if (probe_kernel_address(&owner->cpu, cpu)) - goto out; + return 0; #else cpu = owner->cpu; #endif @@ -3790,14 +3790,14 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) * the cpu field may no longer be valid. */ if (cpu >= nr_cpumask_bits) - goto out; + return 0; /* * We need to validate that we can do a * get_cpu() and that we have the percpu area. */ if (!cpu_online(cpu)) - goto out; + return 0; rq = cpu_rq(cpu); @@ -3816,7 +3816,7 @@ int mutex_spin_on_owner(struct mutex *lock, struct thread_info *owner) cpu_relax(); } -out: + return 1; } #endif -- cgit v1.2.3 From 46da27664887fb95cedba53eafcf876de812c8c1 Mon Sep 17 00:00:00 2001 From: Andreas Schwab Date: Fri, 23 Apr 2010 13:17:44 -0400 Subject: kernel/sys.c: fix compat uname machine On ppc64 you get this error: $ setarch ppc -R true setarch: ppc: Unrecognized architecture because uname still reports ppc64 as the machine. So mask off the personality flags when checking for PER_LINUX32. Signed-off-by: Andreas Schwab Reviewed-by: Christoph Hellwig Acked-by: David S. Miller Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/sys.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/sys.c b/kernel/sys.c index 6d1a7e0f9d5b..7cb426a58965 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -1118,7 +1118,7 @@ DECLARE_RWSEM(uts_sem); #ifdef COMPAT_UTS_MACHINE #define override_architecture(name) \ - (current->personality == PER_LINUX32 && \ + (personality(current->personality) == PER_LINUX32 && \ copy_to_user(name->machine, COMPAT_UTS_MACHINE, \ sizeof(COMPAT_UTS_MACHINE))) #else -- cgit v1.2.3 From 47dd5be2d6a82b8153e059a1d09eb3879d485bfd Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 30 Apr 2010 07:23:51 +0200 Subject: workqueue: flush_delayed_work: keep the original workqueue for re-queueing flush_delayed_work() always uses keventd_wq for re-queueing, but it should use the workqueue this dwork was queued on. Signed-off-by: Oleg Nesterov Signed-off-by: Tejun Heo --- kernel/workqueue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/workqueue.c b/kernel/workqueue.c index dee48658805c..5bfb213984b2 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -774,7 +774,7 @@ void flush_delayed_work(struct delayed_work *dwork) { if (del_timer_sync(&dwork->timer)) { struct cpu_workqueue_struct *cwq; - cwq = wq_per_cpu(keventd_wq, get_cpu()); + cwq = wq_per_cpu(get_wq_data(&dwork->work)->wq, get_cpu()); __queue_work(cwq, &dwork->work); put_cpu(); } -- cgit v1.2.3 From 8b08ca52f5942c21564bbb90ccfb61053f2c26a1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 21 Apr 2010 13:02:07 -0700 Subject: rcu: Fix RCU lockdep splat in set_task_cpu on fork path Add an RCU read-side critical section to suppress this false positive. Located-by: Eric Paris Signed-off-by: Peter Zijlstra Signed-off-by: Paul E. McKenney Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com Cc: eric.dumazet@gmail.com LKML-Reference: <1271880131-3951-1-git-send-email-paulmck@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'kernel') diff --git a/kernel/sched.c b/kernel/sched.c index de0bd26e520a..3c2a54f70ffe 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -323,6 +323,15 @@ static inline struct task_group *task_group(struct task_struct *p) /* Change a task's cfs_rq and parent entity if it moves across CPUs/groups */ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { + /* + * Strictly speaking this rcu_read_lock() is not needed since the + * task_group is tied to the cgroup, which in turn can never go away + * as long as there are tasks attached to it. + * + * However since task_group() uses task_subsys_state() which is an + * rcu_dereference() user, this quiets CONFIG_PROVE_RCU. + */ + rcu_read_lock(); #ifdef CONFIG_FAIR_GROUP_SCHED p->se.cfs_rq = task_group(p)->cfs_rq[cpu]; p->se.parent = task_group(p)->se[cpu]; @@ -332,6 +341,7 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu) p->rt.rt_rq = task_group(p)->rt_rq[cpu]; p->rt.parent = task_group(p)->rt_se[cpu]; #endif + rcu_read_unlock(); } #else -- cgit v1.2.3 From 8b46f880841aac821af8efa6581bb0e46b8b9845 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 21 Apr 2010 13:02:08 -0700 Subject: rcu: Fix RCU lockdep splat on freezer_fork path Add an RCU read-side critical section to suppress this false positive. Located-by: Eric Paris Signed-off-by: Paul E. McKenney Acked-by: Li Zefan Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: mathieu.desnoyers@polymtl.ca Cc: josh@joshtriplett.org Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org Cc: Valdis.Kletnieks@vt.edu Cc: dhowells@redhat.com Cc: eric.dumazet@gmail.com LKML-Reference: <1271880131-3951-2-git-send-email-paulmck@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar --- kernel/cgroup_freezer.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/cgroup_freezer.c b/kernel/cgroup_freezer.c index da5e13975531..e5c0244962b0 100644 --- a/kernel/cgroup_freezer.c +++ b/kernel/cgroup_freezer.c @@ -205,9 +205,12 @@ static void freezer_fork(struct cgroup_subsys *ss, struct task_struct *task) * No lock is needed, since the task isn't on tasklist yet, * so it can't be moved to another cgroup, which means the * freezer won't be removed and will be valid during this - * function call. + * function call. Nevertheless, apply RCU read-side critical + * section to suppress RCU lockdep false positives. */ + rcu_read_lock(); freezer = task_freezer(task); + rcu_read_unlock(); /* * The root cgroup is non-freezable, so we can skip the -- cgit v1.2.3 From 048c852051d2bd5da54a4488bc1f16b0fc74c695 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sat, 1 May 2010 10:11:35 +0200 Subject: perf: Fix resource leak in failure path of perf_event_open() perf_event_open() kfrees event after init failure which doesn't release all resources allocated by perf_event_alloc(). Use free_event() instead. Signed-off-by: Tejun Heo Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: LKML-Reference: <4BDBE237.1040809@kernel.org> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 2f3fbf84215a..3d1552d3c12b 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -4897,7 +4897,7 @@ err_fput_free_put_context: err_free_put_context: if (err < 0) - kfree(event); + free_event(event); err_put_context: if (err < 0) -- cgit v1.2.3 From 9a9686b634acc5cb6b7c601c171ae64af0318a24 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 22 Apr 2010 17:29:24 +0800 Subject: cgroup: Fix an RCU warning in cgroup_path() with CONFIG_PROVE_RCU=y, a warning can be triggered: # mount -t cgroup -o debug xxx /mnt # cat /proc/$$/cgroup ... kernel/cgroup.c:1649 invoked rcu_dereference_check() without protection! ... This is a false-positive, because cgroup_path() can be called with either rcu_read_lock() held or cgroup_mutex held. Signed-off-by: Li Zefan Signed-off-by: Paul E. McKenney --- kernel/cgroup.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index e2769e13980c..4ca928db890c 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -1646,7 +1646,9 @@ static inline struct cftype *__d_cft(struct dentry *dentry) int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) { char *start; - struct dentry *dentry = rcu_dereference(cgrp->dentry); + struct dentry *dentry = rcu_dereference_check(cgrp->dentry, + rcu_read_lock_held() || + cgroup_lock_is_held()); if (!dentry || cgrp == dummytop) { /* @@ -1662,13 +1664,17 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) *--start = '\0'; for (;;) { int len = dentry->d_name.len; + if ((start -= len) < buf) return -ENAMETOOLONG; - memcpy(start, cgrp->dentry->d_name.name, len); + memcpy(start, dentry->d_name.name, len); cgrp = cgrp->parent; if (!cgrp) break; - dentry = rcu_dereference(cgrp->dentry); + + dentry = rcu_dereference_check(cgrp->dentry, + rcu_read_lock_held() || + cgroup_lock_is_held()); if (!cgrp->parent) continue; if (--start < buf) -- cgit v1.2.3 From fae9c791703606636c1220e47f6690660042ce7f Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 22 Apr 2010 17:30:00 +0800 Subject: cgroup: Fix an RCU warning in alloc_css_id() With CONFIG_PROVE_RCU=y, a warning can be triggered: # mount -t cgroup -o memory xxx /mnt # mkdir /mnt/0 ... kernel/cgroup.c:4442 invoked rcu_dereference_check() without protection! ... This is a false-positive. It's safe to directly access parent_css->id. Signed-off-by: Li Zefan Signed-off-by: Paul E. McKenney --- kernel/cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 4ca928db890c..3a53c771e503 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4561,13 +4561,13 @@ static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent, { int subsys_id, i, depth = 0; struct cgroup_subsys_state *parent_css, *child_css; - struct css_id *child_id, *parent_id = NULL; + struct css_id *child_id, *parent_id; subsys_id = ss->subsys_id; parent_css = parent->subsys[subsys_id]; child_css = child->subsys[subsys_id]; - depth = css_depth(parent_css) + 1; parent_id = parent_css->id; + depth = parent_id->depth; child_id = get_new_cssid(ss, depth); if (IS_ERR(child_id)) -- cgit v1.2.3 From b629317e66fb1c6066c550dded45ab85a936163c Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 22 Apr 2010 17:30:40 +0800 Subject: sched: Fix an RCU warning in print_task() With CONFIG_PROVE_RCU=y, a warning can be triggered: $ cat /proc/sched_debug ... kernel/cgroup.c:1649 invoked rcu_dereference_check() without protection! ... Both cgroup_path() and task_group() should be called with either rcu_read_lock or cgroup_mutex held. The rcu_dereference_check() does include cgroup_lock_is_held(), so we know that this lock is not held. Therefore, in a CONFIG_PREEMPT kernel, to say nothing of a CONFIG_PREEMPT_RT kernel, the original code could have ended up copying a string out of the freelist. This patch inserts RCU read-side primitives needed to prevent this scenario. Signed-off-by: Li Zefan Signed-off-by: Paul E. McKenney --- kernel/sched_debug.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c index 9b49db144037..19be00ba6123 100644 --- a/kernel/sched_debug.c +++ b/kernel/sched_debug.c @@ -114,7 +114,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) { char path[64]; + rcu_read_lock(); cgroup_path(task_group(p)->css.cgroup, path, sizeof(path)); + rcu_read_unlock(); SEQ_printf(m, " %s", path); } #endif -- cgit v1.2.3 From ee84b8243b07c33a5c8aed42b4b2da60cb16d1d2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 6 May 2010 09:28:41 -0700 Subject: rcu: create rcu_my_thread_group_empty() wrapper Some RCU-lockdep splat repairs need to know whether they are running in a single-threaded process. Unfortunately, the thread_group_empty() primitive is defined in sched.h, and can induce #include hell. This commit therefore introduces a rcu_my_thread_group_empty() wrapper that is defined in rcupdate.c, thus avoiding the need to include sched.h everywhere. Signed-off-by: "Paul E. McKenney" --- include/linux/rcupdate.h | 2 ++ kernel/rcupdate.c | 11 +++++++++++ 2 files changed, 13 insertions(+) (limited to 'kernel') diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 07db2feb8572..db266bbed23f 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -190,6 +190,8 @@ static inline int rcu_read_lock_sched_held(void) #ifdef CONFIG_PROVE_RCU +extern int rcu_my_thread_group_empty(void); + /** * rcu_dereference_check - rcu_dereference with debug checking * @p: The pointer to read, prior to dereferencing diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 03a7ea1579f6..49d808e833b0 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -122,3 +122,14 @@ void wakeme_after_rcu(struct rcu_head *head) rcu = container_of(head, struct rcu_synchronize, head); complete(&rcu->completion); } + +#ifdef CONFIG_PROVE_RCU +/* + * wrapper function to avoid #include problems. + */ +int rcu_my_thread_group_empty(void) +{ + return thread_group_empty(current); +} +EXPORT_SYMBOL_GPL(rcu_my_thread_group_empty); +#endif /* #ifdef CONFIG_PROVE_RCU */ -- cgit v1.2.3 From 34441427aab4bdb3069a4ffcda69a99357abcb2e Mon Sep 17 00:00:00 2001 From: Robin Holt Date: Tue, 11 May 2010 14:06:46 -0700 Subject: revert "procfs: provide stack information for threads" and its fixup commits Originally, commit d899bf7b ("procfs: provide stack information for threads") attempted to introduce a new feature for showing where the threadstack was located and how many pages are being utilized by the stack. Commit c44972f1 ("procfs: disable per-task stack usage on NOMMU") was applied to fix the NO_MMU case. Commit 89240ba0 ("x86, fs: Fix x86 procfs stack information for threads on 64-bit") was applied to fix a bug in ia32 executables being loaded. Commit 9ebd4eba7 ("procfs: fix /proc//stat stack pointer for kernel threads") was applied to fix a bug which had kernel threads printing a userland stack address. Commit 1306d603f ('proc: partially revert "procfs: provide stack information for threads"') was then applied to revert the stack pages being used to solve a significant performance regression. This patch nearly undoes the effect of all these patches. The reason for reverting these is it provides an unusable value in field 28. For x86_64, a fork will result in the task->stack_start value being updated to the current user top of stack and not the stack start address. This unpredictability of the stack_start value makes it worthless. That includes the intended use of showing how much stack space a thread has. Other architectures will get different values. As an example, ia64 gets 0. The do_fork() and copy_process() functions appear to treat the stack_start and stack_size parameters as architecture specific. I only partially reverted c44972f1 ("procfs: disable per-task stack usage on NOMMU") . If I had completely reverted it, I would have had to change mm/Makefile only build pagewalk.o when CONFIG_PROC_PAGE_MONITOR is configured. Since I could not test the builds without significant effort, I decided to not change mm/Makefile. I only partially reverted 89240ba0 ("x86, fs: Fix x86 procfs stack information for threads on 64-bit") . I left the KSTK_ESP() change in place as that seemed worthwhile. Signed-off-by: Robin Holt Cc: Stefani Seibold Cc: KOSAKI Motohiro Cc: Michal Simek Cc: Ingo Molnar Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/proc.txt | 3 +-- fs/compat.c | 2 -- fs/exec.c | 2 -- fs/proc/array.c | 3 +-- fs/proc/task_mmu.c | 19 ------------------- include/linux/sched.h | 1 - kernel/fork.c | 2 -- 7 files changed, 2 insertions(+), 30 deletions(-) (limited to 'kernel') diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt index a4f30faa4f1f..1e359b62c40a 100644 --- a/Documentation/filesystems/proc.txt +++ b/Documentation/filesystems/proc.txt @@ -316,7 +316,7 @@ address perms offset dev inode pathname 08049000-0804a000 rw-p 00001000 03:00 8312 /opt/test 0804a000-0806b000 rw-p 00000000 00:00 0 [heap] a7cb1000-a7cb2000 ---p 00000000 00:00 0 -a7cb2000-a7eb2000 rw-p 00000000 00:00 0 [threadstack:001ff4b4] +a7cb2000-a7eb2000 rw-p 00000000 00:00 0 a7eb2000-a7eb3000 ---p 00000000 00:00 0 a7eb3000-a7ed5000 rw-p 00000000 00:00 0 a7ed5000-a8008000 r-xp 00000000 03:00 4222 /lib/libc.so.6 @@ -352,7 +352,6 @@ is not associated with a file: [stack] = the stack of the main process [vdso] = the "virtual dynamic shared object", the kernel system call handler - [threadstack:xxxxxxxx] = the stack of the thread, xxxxxxxx is the stack size or if empty, the mapping is anonymous. diff --git a/fs/compat.c b/fs/compat.c index 4b6ed03cc478..05448730f840 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1531,8 +1531,6 @@ int compat_do_execve(char * filename, if (retval < 0) goto out; - current->stack_start = current->mm->start_stack; - /* execve succeeded */ current->fs->in_exec = 0; current->in_execve = 0; diff --git a/fs/exec.c b/fs/exec.c index 49cdaa19e5b9..e6e94c626c2c 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1387,8 +1387,6 @@ int do_execve(char * filename, if (retval < 0) goto out; - current->stack_start = current->mm->start_stack; - /* execve succeeded */ current->fs->in_exec = 0; current->in_execve = 0; diff --git a/fs/proc/array.c b/fs/proc/array.c index e51f2ec2c5e5..885ab5513ac5 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -81,7 +81,6 @@ #include #include #include -#include #include #include @@ -495,7 +494,7 @@ static int do_task_stat(struct seq_file *m, struct pid_namespace *ns, rsslim, mm ? mm->start_code : 0, mm ? mm->end_code : 0, - (permitted && mm) ? task->stack_start : 0, + (permitted && mm) ? mm->start_stack : 0, esp, eip, /* The signal information here is obsolete. diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 070553427dd5..47f5b145f56e 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -247,25 +247,6 @@ static void show_map_vma(struct seq_file *m, struct vm_area_struct *vma) } else if (vma->vm_start <= mm->start_stack && vma->vm_end >= mm->start_stack) { name = "[stack]"; - } else { - unsigned long stack_start; - struct proc_maps_private *pmp; - - pmp = m->private; - stack_start = pmp->task->stack_start; - - if (vma->vm_start <= stack_start && - vma->vm_end >= stack_start) { - pad_len_spaces(m, len); - seq_printf(m, - "[threadstack:%08lx]", -#ifdef CONFIG_STACK_GROWSUP - vma->vm_end - stack_start -#else - stack_start - vma->vm_start -#endif - ); - } } } else { name = "[vdso]"; diff --git a/include/linux/sched.h b/include/linux/sched.h index dad7f668ebf7..2b7b81df78b3 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1497,7 +1497,6 @@ struct task_struct { /* bitmask of trace recursion */ unsigned long trace_recursion; #endif /* CONFIG_TRACING */ - unsigned long stack_start; #ifdef CONFIG_CGROUP_MEM_RES_CTLR /* memcg uses this to do batch job */ struct memcg_batch_info { int do_batch; /* incremented when batch uncharge started */ diff --git a/kernel/fork.c b/kernel/fork.c index 44b0791b0a2e..4c14942a0ee3 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1114,8 +1114,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, p->bts = NULL; - p->stack_start = stack_start; - /* Perform scheduler related setup. Assign this task to a CPU. */ sched_fork(p, clone_flags); -- cgit v1.2.3 From 475f9aa6aa538befcbd0fa95bdebada600f247cd Mon Sep 17 00:00:00 2001 From: Vitaly Mayatskikh Date: Tue, 11 May 2010 14:06:51 -0700 Subject: kexec: fix OOPS in crash_kernel_shrink Two "echo 0 > /sys/kernel/kexec_crash_size" OOPSes kernel. Also content of this file is invalid after first shrink to zero: it shows 1 instead of 0. This scenario is unlikely to happen often (root privs, valid crashkernel= in cmdline, dump-capture kernel not loaded), I hit it only by chance. This patch fixes it. Signed-off-by: Vitaly Mayatskikh Cc: Cong Wang Cc: Neil Horman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kexec.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/kexec.c b/kernel/kexec.c index 87ebe8adc474..474a84715eac 100644 --- a/kernel/kexec.c +++ b/kernel/kexec.c @@ -1134,11 +1134,9 @@ int crash_shrink_memory(unsigned long new_size) free_reserved_phys_range(end, crashk_res.end); - if (start == end) { - crashk_res.end = end; + if (start == end) release_resource(&crashk_res); - } else - crashk_res.end = end - 1; + crashk_res.end = end - 1; unlock: mutex_unlock(&kexec_mutex); -- cgit v1.2.3 From 11cad320a4f4bc53d3585c85600c782faa12b99e Mon Sep 17 00:00:00 2001 From: Vitaliy Gusev Date: Tue, 11 May 2010 14:06:56 -0700 Subject: bsdacct: use del_timer_sync() in acct_exit_ns() acct_exit_ns --> acct_file_reopen deletes timer without check timer execution on other CPUs. So acct_timeout() can change an unmapped memory. Signed-off-by: Vitaliy Gusev Cc: Pavel Emelyanov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/acct.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) (limited to 'kernel') diff --git a/kernel/acct.c b/kernel/acct.c index 24f8c81fc48d..e4c0e1fee9b0 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -353,17 +353,18 @@ restart: void acct_exit_ns(struct pid_namespace *ns) { - struct bsd_acct_struct *acct; + struct bsd_acct_struct *acct = ns->bacct; - spin_lock(&acct_lock); - acct = ns->bacct; - if (acct != NULL) { - if (acct->file != NULL) - acct_file_reopen(acct, NULL, NULL); + if (acct == NULL) + return; - kfree(acct); - } + del_timer_sync(&acct->timer); + spin_lock(&acct_lock); + if (acct->file != NULL) + acct_file_reopen(acct, NULL, NULL); spin_unlock(&acct_lock); + + kfree(acct); } /* -- cgit v1.2.3 From 7f0f15464185a92f9d8791ad231bcd7bf6df54e4 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 11 May 2010 14:06:58 -0700 Subject: memcg: fix css_id() RCU locking for real Commit ad4ba375373937817404fd92239ef4cadbded23b ("memcg: css_id() must be called under rcu_read_lock()") modifies memcontol.c for fixing RCU check message. But Andrew Morton pointed out that the fix doesn't seems sane and it was just for hidining lockdep messages. This is a patch for do proper things. Checking again, all places, accessing without rcu_read_lock, that commit fixies was intentional.... all callers of css_id() has reference count on it. So, it's not necessary to be under rcu_read_lock(). Considering again, we can use rcu_dereference_check for css_id(). We know css->id is valid if css->refcnt > 0. (css->id never changes and freed after css->refcnt going to be 0.) This patch makes use of rcu_dereference_check() in css_id/depth and remove unnecessary rcu-read-lock added by the commit. Signed-off-by: KAMEZAWA Hiroyuki Cc: "Paul E. McKenney" Cc: Daisuke Nishimura Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 15 +++++++++++++-- mm/memcontrol.c | 19 +++++-------------- 2 files changed, 18 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 3a53c771e503..6db8b7f297a1 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4435,7 +4435,15 @@ __setup("cgroup_disable=", cgroup_disable); */ unsigned short css_id(struct cgroup_subsys_state *css) { - struct css_id *cssid = rcu_dereference(css->id); + struct css_id *cssid; + + /* + * This css_id() can return correct value when somone has refcnt + * on this or this is under rcu_read_lock(). Once css->id is allocated, + * it's unchanged until freed. + */ + cssid = rcu_dereference_check(css->id, + rcu_read_lock_held() || atomic_read(&css->refcnt)); if (cssid) return cssid->id; @@ -4445,7 +4453,10 @@ EXPORT_SYMBOL_GPL(css_id); unsigned short css_depth(struct cgroup_subsys_state *css) { - struct css_id *cssid = rcu_dereference(css->id); + struct css_id *cssid; + + cssid = rcu_dereference_check(css->id, + rcu_read_lock_held() || atomic_read(&css->refcnt)); if (cssid) return cssid->depth; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0f711c213d2e..595d03f33b2c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2314,9 +2314,7 @@ mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) /* record memcg information */ if (do_swap_account && swapout && memcg) { - rcu_read_lock(); swap_cgroup_record(ent, css_id(&memcg->css)); - rcu_read_unlock(); mem_cgroup_get(memcg); } if (swapout && memcg) @@ -2373,10 +2371,8 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry, { unsigned short old_id, new_id; - rcu_read_lock(); old_id = css_id(&from->css); new_id = css_id(&to->css); - rcu_read_unlock(); if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { mem_cgroup_swap_statistics(from, false); @@ -4044,16 +4040,11 @@ static int is_target_pte_for_mc(struct vm_area_struct *vma, put_page(page); } /* throught */ - if (ent.val && do_swap_account && !ret) { - unsigned short id; - rcu_read_lock(); - id = css_id(&mc.from->css); - rcu_read_unlock(); - if (id == lookup_swap_cgroup(ent)) { - ret = MC_TARGET_SWAP; - if (target) - target->ent = ent; - } + if (ent.val && do_swap_account && !ret && + css_id(&mc.from->css) == lookup_swap_cgroup(ent)) { + ret = MC_TARGET_SWAP; + if (target) + target->ent = ent; } return ret; } -- cgit v1.2.3 From 747388d78a0ae768fd82b55c4ed38aa646a72364 Mon Sep 17 00:00:00 2001 From: KAMEZAWA Hiroyuki Date: Tue, 11 May 2010 14:06:59 -0700 Subject: memcg: fix css_is_ancestor() RCU locking Some callers (in memcontrol.c) calls css_is_ancestor() without rcu_read_lock. Because css_is_ancestor() has to access RCU protected data, it should be under rcu_read_lock(). This makes css_is_ancestor() itself does safe access to RCU protected area. (At least, "root" can have refcnt==0 if it's not an ancestor of "child". So, we need rcu_read_lock().) Signed-off-by: KAMEZAWA Hiroyuki Cc: "Paul E. McKenney" Cc: Daisuke Nishimura Cc: Balbir Singh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/cgroup.c | 31 ++++++++++++++++++++++++++----- mm/memcontrol.c | 4 ---- 2 files changed, 26 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 6db8b7f297a1..6d870f2d1228 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -4464,15 +4464,36 @@ unsigned short css_depth(struct cgroup_subsys_state *css) } EXPORT_SYMBOL_GPL(css_depth); +/** + * css_is_ancestor - test "root" css is an ancestor of "child" + * @child: the css to be tested. + * @root: the css supporsed to be an ancestor of the child. + * + * Returns true if "root" is an ancestor of "child" in its hierarchy. Because + * this function reads css->id, this use rcu_dereference() and rcu_read_lock(). + * But, considering usual usage, the csses should be valid objects after test. + * Assuming that the caller will do some action to the child if this returns + * returns true, the caller must take "child";s reference count. + * If "child" is valid object and this returns true, "root" is valid, too. + */ + bool css_is_ancestor(struct cgroup_subsys_state *child, const struct cgroup_subsys_state *root) { - struct css_id *child_id = rcu_dereference(child->id); - struct css_id *root_id = rcu_dereference(root->id); + struct css_id *child_id; + struct css_id *root_id; + bool ret = true; - if (!child_id || !root_id || (child_id->depth < root_id->depth)) - return false; - return child_id->stack[root_id->depth] == root_id->id; + rcu_read_lock(); + child_id = rcu_dereference(child->id); + root_id = rcu_dereference(root->id); + if (!child_id + || !root_id + || (child_id->depth < root_id->depth) + || (child_id->stack[root_id->depth] != root_id->id)) + ret = false; + rcu_read_unlock(); + return ret; } static void __free_css_id_cb(struct rcu_head *head) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 595d03f33b2c..8a79a6f0f029 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -811,12 +811,10 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem) * enabled in "curr" and "curr" is a child of "mem" in *cgroup* * hierarchy(even if use_hierarchy is disabled in "mem"). */ - rcu_read_lock(); if (mem->use_hierarchy) ret = css_is_ancestor(&curr->css, &mem->css); else ret = (curr == mem); - rcu_read_unlock(); css_put(&curr->css); return ret; } @@ -1603,7 +1601,6 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, * There is a small race that "from" or "to" can be * freed by rmdir, so we use css_tryget(). */ - rcu_read_lock(); from = mc.from; to = mc.to; if (from && css_tryget(&from->css)) { @@ -1624,7 +1621,6 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, do_continue = (to == mem_over_limit); css_put(&to->css); } - rcu_read_unlock(); if (do_continue) { DEFINE_WAIT(wait); prepare_to_wait(&mc.waitq, &wait, -- cgit v1.2.3