From 7b959fc582741227a1c4cba710d6aff8fb183128 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Wed, 22 May 2013 18:34:09 +0900 Subject: kprobes: Fix to free gone and unused optprobes Fix to free gone and unused optprobes. This bug will cause a kernel panic if the user reuses the killed and unused probe. Reported at: http://sourceware.org/ml/systemtap/2013-q2/msg00142.html In the normal path, an optprobe on an init function is unregistered when a module goes live. unregister_kprobe(kp) -> __unregister_kprobe_top ->__disable_kprobe ->disarm_kprobe(ap == op) ->__disarm_kprobe ->unoptimize_kprobe : the op is queued on unoptimizing_list and do nothing in __unregister_kprobe_bottom After a while (usually wait 5 jiffies), kprobe_optimizer runs to unoptimize and free optprobe. kprobe_optimizer ->do_unoptimize_kprobes ->arch_unoptimize_kprobes : moved to free_list ->do_free_cleaned_kprobes ->hlist_del: the op is removed ->free_aggr_kprobe ->arch_remove_optimized_kprobe ->arch_remove_kprobe ->kfree: the op is freed Here, if kprobes_module_callback is called and the delayed unoptimizing probe is picked BEFORE kprobe_optimizer runs, kprobes_module_callback ->kill_kprobe ->kill_optimized_kprobe : dequeued from unoptimizing_list <=!!! ->arch_remove_optimized_kprobe ->arch_remove_kprobe (but op is not freed, and on the kprobe hash table) This doesn't happen if the probe unregistration is done AFTER kprobes_module_callback is called (because at that time the op is gone), and kprobe-tracer does it. To fix this bug, this patch changes kprobes_module_callback to enqueue the op to freeing_list at kill_optimized_kprobe only if the op is unused. The unused probes on freeing_list will be freed in do_free_cleaned_kprobes. Note that this calls arch_remove_*kprobe twice on the same probe. Thus those functions have to check the double free. Fortunately, most of arch codes already checked that except for mips. This will be fixed in the next patch. Signed-off-by: Masami Hiramatsu Cc: Timo Juhani Lindfors Cc: Ananth N Mavinakayanahalli Cc: Anil S Keshavamurthy Cc: Frank Ch. Eigler Cc: systemtap@sourceware.org Cc: yrl.pp-manager.tt@hitachi.com Cc: David S. Miller Cc: "David S. Miller" Link: http://lkml.kernel.org/r/20130522093409.9084.63554.stgit@mhiramat-M0-7522 [ Minor edits. ] Signed-off-by: Ingo Molnar --- kernel/kprobes.c | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 3fed7f0cbcdf..bddf3b201a48 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -467,6 +467,7 @@ static struct kprobe *__kprobes get_optimized_kprobe(unsigned long addr) /* Optimization staging list, protected by kprobe_mutex */ static LIST_HEAD(optimizing_list); static LIST_HEAD(unoptimizing_list); +static LIST_HEAD(freeing_list); static void kprobe_optimizer(struct work_struct *work); static DECLARE_DELAYED_WORK(optimizing_work, kprobe_optimizer); @@ -504,7 +505,7 @@ static __kprobes void do_optimize_kprobes(void) * Unoptimize (replace a jump with a breakpoint and remove the breakpoint * if need) kprobes listed on unoptimizing_list. */ -static __kprobes void do_unoptimize_kprobes(struct list_head *free_list) +static __kprobes void do_unoptimize_kprobes(void) { struct optimized_kprobe *op, *tmp; @@ -515,9 +516,9 @@ static __kprobes void do_unoptimize_kprobes(struct list_head *free_list) /* Ditto to do_optimize_kprobes */ get_online_cpus(); mutex_lock(&text_mutex); - arch_unoptimize_kprobes(&unoptimizing_list, free_list); + arch_unoptimize_kprobes(&unoptimizing_list, &freeing_list); /* Loop free_list for disarming */ - list_for_each_entry_safe(op, tmp, free_list, list) { + list_for_each_entry_safe(op, tmp, &freeing_list, list) { /* Disarm probes if marked disabled */ if (kprobe_disabled(&op->kp)) arch_disarm_kprobe(&op->kp); @@ -536,11 +537,11 @@ static __kprobes void do_unoptimize_kprobes(struct list_head *free_list) } /* Reclaim all kprobes on the free_list */ -static __kprobes void do_free_cleaned_kprobes(struct list_head *free_list) +static __kprobes void do_free_cleaned_kprobes(void) { struct optimized_kprobe *op, *tmp; - list_for_each_entry_safe(op, tmp, free_list, list) { + list_for_each_entry_safe(op, tmp, &freeing_list, list) { BUG_ON(!kprobe_unused(&op->kp)); list_del_init(&op->list); free_aggr_kprobe(&op->kp); @@ -556,8 +557,6 @@ static __kprobes void kick_kprobe_optimizer(void) /* Kprobe jump optimizer */ static __kprobes void kprobe_optimizer(struct work_struct *work) { - LIST_HEAD(free_list); - mutex_lock(&kprobe_mutex); /* Lock modules while optimizing kprobes */ mutex_lock(&module_mutex); @@ -566,7 +565,7 @@ static __kprobes void kprobe_optimizer(struct work_struct *work) * Step 1: Unoptimize kprobes and collect cleaned (unused and disarmed) * kprobes before waiting for quiesence period. */ - do_unoptimize_kprobes(&free_list); + do_unoptimize_kprobes(); /* * Step 2: Wait for quiesence period to ensure all running interrupts @@ -581,7 +580,7 @@ static __kprobes void kprobe_optimizer(struct work_struct *work) do_optimize_kprobes(); /* Step 4: Free cleaned kprobes after quiesence period */ - do_free_cleaned_kprobes(&free_list); + do_free_cleaned_kprobes(); mutex_unlock(&module_mutex); mutex_unlock(&kprobe_mutex); @@ -723,8 +722,19 @@ static void __kprobes kill_optimized_kprobe(struct kprobe *p) if (!list_empty(&op->list)) /* Dequeue from the (un)optimization queue */ list_del_init(&op->list); - op->kp.flags &= ~KPROBE_FLAG_OPTIMIZED; + + if (kprobe_unused(p)) { + /* Enqueue if it is unused */ + list_add(&op->list, &freeing_list); + /* + * Remove unused probes from the hash list. After waiting + * for synchronization, this probe is reclaimed. + * (reclaiming is done by do_free_cleaned_kprobes().) + */ + hlist_del_rcu(&op->kp.hlist); + } + /* Don't touch the code, because it is already freed. */ arch_remove_optimized_kprobe(op); } -- cgit v1.2.3 From 26cb63ad11e04047a64309362674bcbbd6a6f246 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 28 May 2013 10:55:48 +0200 Subject: perf: Fix perf mmap bugs Vince reported a problem found by his perf specific trinity fuzzer. Al noticed 2 problems with perf's mmap(): - it has issues against fork() since we use vma->vm_mm for accounting. - it has an rb refcount leak on double mmap(). We fix the issues against fork() by using VM_DONTCOPY; I don't think there's code out there that uses this; we didn't hear about weird accounting problems/crashes. If we do need this to work, the previously proposed VM_PINNED could make this work. Aside from the rb reference leak spotted by Al, Vince's example prog was indeed doing a double mmap() through the use of perf_event_set_output(). This exposes another problem, since we now have 2 events with one buffer, the accounting gets screwy because we account per event. Fix this by making the buffer responsible for its own accounting. Reported-by: Vince Weaver Signed-off-by: Peter Zijlstra Cc: Al Viro Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Link: http://lkml.kernel.org/r/20130528085548.GA12193@twins.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 3 +-- kernel/events/core.c | 37 ++++++++++++++++++++----------------- kernel/events/internal.h | 3 +++ 3 files changed, 24 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index f463a46424e2..c5b6dbf9c2fc 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -389,8 +389,7 @@ struct perf_event { /* mmap bits */ struct mutex mmap_mutex; atomic_t mmap_count; - int mmap_locked; - struct user_struct *mmap_user; + struct ring_buffer *rb; struct list_head rb_entry; diff --git a/kernel/events/core.c b/kernel/events/core.c index 9dc297faf7c0..ae752cd4a086 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -2917,7 +2917,7 @@ static void free_event_rcu(struct rcu_head *head) kfree(event); } -static void ring_buffer_put(struct ring_buffer *rb); +static bool ring_buffer_put(struct ring_buffer *rb); static void free_event(struct perf_event *event) { @@ -3582,13 +3582,13 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event) return rb; } -static void ring_buffer_put(struct ring_buffer *rb) +static bool ring_buffer_put(struct ring_buffer *rb) { struct perf_event *event, *n; unsigned long flags; if (!atomic_dec_and_test(&rb->refcount)) - return; + return false; spin_lock_irqsave(&rb->event_lock, flags); list_for_each_entry_safe(event, n, &rb->event_list, rb_entry) { @@ -3598,6 +3598,7 @@ static void ring_buffer_put(struct ring_buffer *rb) spin_unlock_irqrestore(&rb->event_lock, flags); call_rcu(&rb->rcu_head, rb_free_rcu); + return true; } static void perf_mmap_open(struct vm_area_struct *vma) @@ -3612,18 +3613,20 @@ static void perf_mmap_close(struct vm_area_struct *vma) struct perf_event *event = vma->vm_file->private_data; if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { - unsigned long size = perf_data_size(event->rb); - struct user_struct *user = event->mmap_user; struct ring_buffer *rb = event->rb; + struct user_struct *mmap_user = rb->mmap_user; + int mmap_locked = rb->mmap_locked; + unsigned long size = perf_data_size(rb); - atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm); - vma->vm_mm->pinned_vm -= event->mmap_locked; rcu_assign_pointer(event->rb, NULL); ring_buffer_detach(event, rb); mutex_unlock(&event->mmap_mutex); - ring_buffer_put(rb); - free_uid(user); + if (ring_buffer_put(rb)) { + atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm); + vma->vm_mm->pinned_vm -= mmap_locked; + free_uid(mmap_user); + } } } @@ -3676,9 +3679,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) WARN_ON_ONCE(event->ctx->parent_ctx); mutex_lock(&event->mmap_mutex); if (event->rb) { - if (event->rb->nr_pages == nr_pages) - atomic_inc(&event->rb->refcount); - else + if (event->rb->nr_pages != nr_pages) ret = -EINVAL; goto unlock; } @@ -3720,12 +3721,14 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) ret = -ENOMEM; goto unlock; } - rcu_assign_pointer(event->rb, rb); + + rb->mmap_locked = extra; + rb->mmap_user = get_current_user(); atomic_long_add(user_extra, &user->locked_vm); - event->mmap_locked = extra; - event->mmap_user = get_current_user(); - vma->vm_mm->pinned_vm += event->mmap_locked; + vma->vm_mm->pinned_vm += extra; + + rcu_assign_pointer(event->rb, rb); perf_event_update_userpage(event); @@ -3734,7 +3737,7 @@ unlock: atomic_inc(&event->mmap_count); mutex_unlock(&event->mmap_mutex); - vma->vm_flags |= VM_DONTEXPAND | VM_DONTDUMP; + vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP; vma->vm_ops = &perf_mmap_vmops; return ret; diff --git a/kernel/events/internal.h b/kernel/events/internal.h index eb675c4d59df..5bc6c8e9b851 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -31,6 +31,9 @@ struct ring_buffer { spinlock_t event_lock; struct list_head event_list; + int mmap_locked; + struct user_struct *mmap_user; + struct perf_event_mmap_page *user_page; void *data_pages[0]; }; -- cgit v1.2.3 From 45eacc692771bd2b1ea3d384e6345cab3da10861 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 15 May 2013 22:16:32 +0200 Subject: vtime: Use consistent clocks among nohz accounting While computing the cputime delta of dynticks CPUs, we are mixing up clocks of differents natures: * local_clock() which takes care of unstable clock sources and fix these if needed. * sched_clock() which is the weaker version of local_clock(). It doesn't compute any fixup in case of unstable source. If the clock source is stable, those two clocks are the same and we can safely compute the difference against two random points. Otherwise it results in random deltas as sched_clock() can randomly drift away, back or forward, from local_clock(). As a consequence, some strange behaviour with unstable tsc has been observed such as non progressing constant zero cputime. (The 'top' command showing no load). Fix this by only using local_clock(), or its irq safe/remote equivalent, in vtime code. Reported-by: Mike Galbraith Suggested-by: Mike Galbraith Cc: Steven Rostedt Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Borislav Petkov Cc: Li Zhong Cc: Mike Galbraith Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- include/linux/vtime.h | 4 ++-- kernel/sched/core.c | 2 +- kernel/sched/cputime.c | 6 +++--- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/include/linux/vtime.h b/include/linux/vtime.h index 71a5782d8c59..b1dd2db80076 100644 --- a/include/linux/vtime.h +++ b/include/linux/vtime.h @@ -34,7 +34,7 @@ static inline void vtime_user_exit(struct task_struct *tsk) } extern void vtime_guest_enter(struct task_struct *tsk); extern void vtime_guest_exit(struct task_struct *tsk); -extern void vtime_init_idle(struct task_struct *tsk); +extern void vtime_init_idle(struct task_struct *tsk, int cpu); #else static inline void vtime_account_irq_exit(struct task_struct *tsk) { @@ -45,7 +45,7 @@ static inline void vtime_user_enter(struct task_struct *tsk) { } static inline void vtime_user_exit(struct task_struct *tsk) { } static inline void vtime_guest_enter(struct task_struct *tsk) { } static inline void vtime_guest_exit(struct task_struct *tsk) { } -static inline void vtime_init_idle(struct task_struct *tsk) { } +static inline void vtime_init_idle(struct task_struct *tsk, int cpu) { } #endif #ifdef CONFIG_IRQ_TIME_ACCOUNTING diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 58453b8272fd..e1a27f918723 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -4745,7 +4745,7 @@ void __cpuinit init_idle(struct task_struct *idle, int cpu) */ idle->sched_class = &idle_sched_class; ftrace_graph_init_idle_task(idle, cpu); - vtime_init_idle(idle); + vtime_init_idle(idle, cpu); #if defined(CONFIG_SMP) sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); #endif diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index cc2dc3eea8a3..b5ccba22603b 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -747,17 +747,17 @@ void arch_vtime_task_switch(struct task_struct *prev) write_seqlock(¤t->vtime_seqlock); current->vtime_snap_whence = VTIME_SYS; - current->vtime_snap = sched_clock(); + current->vtime_snap = sched_clock_cpu(smp_processor_id()); write_sequnlock(¤t->vtime_seqlock); } -void vtime_init_idle(struct task_struct *t) +void vtime_init_idle(struct task_struct *t, int cpu) { unsigned long flags; write_seqlock_irqsave(&t->vtime_seqlock, flags); t->vtime_snap_whence = VTIME_SYS; - t->vtime_snap = sched_clock(); + t->vtime_snap = sched_clock_cpu(cpu); write_sequnlock_irqrestore(&t->vtime_seqlock, flags); } -- cgit v1.2.3 From 521921bad1192fb1b8f9b6a5aa673635848b8b5f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 16 May 2013 01:21:38 +0200 Subject: kvm: Move guest entry/exit APIs to context_tracking The kvm_host.h header file doesn't handle well inclusion when archs don't support KVM. This results in build crashes for such archs when they want to implement context tracking because this subsystem includes kvm_host.h in order to implement the guest_enter/exit APIs but it doesn't handle KVM off case. To fix this, move the guest_enter()/guest_exit() declarations and generic implementation to the context tracking headers. These generic APIs actually belong to this subsystem, besides other domains boundary tracking like user_enter() et al. KVM now properly becomes a user of this library, not the other buggy way around. Reported-by: Kevin Hilman Reviewed-by: Kevin Hilman Tested-by: Kevin Hilman Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Paul E. McKenney Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Kevin Hilman Cc: Marcelo Tosatti Cc: Gleb Natapov Signed-off-by: Ingo Molnar --- include/linux/context_tracking.h | 35 +++++++++++++++++++++++++++++++++++ include/linux/kvm_host.h | 37 +------------------------------------ kernel/context_tracking.c | 1 - 3 files changed, 36 insertions(+), 37 deletions(-) (limited to 'kernel') diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index 365f4a61bf04..fc09d7b0dacf 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -3,6 +3,7 @@ #include #include +#include #include struct context_tracking { @@ -19,6 +20,26 @@ struct context_tracking { } state; }; +static inline void __guest_enter(void) +{ + /* + * This is running in ioctl context so we can avoid + * the call to vtime_account() with its unnecessary idle check. + */ + vtime_account_system(current); + current->flags |= PF_VCPU; +} + +static inline void __guest_exit(void) +{ + /* + * This is running in ioctl context so we can avoid + * the call to vtime_account() with its unnecessary idle check. + */ + vtime_account_system(current); + current->flags &= ~PF_VCPU; +} + #ifdef CONFIG_CONTEXT_TRACKING DECLARE_PER_CPU(struct context_tracking, context_tracking); @@ -35,6 +56,9 @@ static inline bool context_tracking_active(void) extern void user_enter(void); extern void user_exit(void); +extern void guest_enter(void); +extern void guest_exit(void); + static inline enum ctx_state exception_enter(void) { enum ctx_state prev_ctx; @@ -57,6 +81,17 @@ extern void context_tracking_task_switch(struct task_struct *prev, static inline bool context_tracking_in_user(void) { return false; } static inline void user_enter(void) { } static inline void user_exit(void) { } + +static inline void guest_enter(void) +{ + __guest_enter(); +} + +static inline void guest_exit(void) +{ + __guest_exit(); +} + static inline enum ctx_state exception_enter(void) { return 0; } static inline void exception_exit(enum ctx_state prev_ctx) { } static inline void context_tracking_task_switch(struct task_struct *prev, diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index f0eea07d2c2b..8db53cfaccdb 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -760,42 +761,6 @@ static inline int kvm_iommu_unmap_guest(struct kvm *kvm) } #endif -static inline void __guest_enter(void) -{ - /* - * This is running in ioctl context so we can avoid - * the call to vtime_account() with its unnecessary idle check. - */ - vtime_account_system(current); - current->flags |= PF_VCPU; -} - -static inline void __guest_exit(void) -{ - /* - * This is running in ioctl context so we can avoid - * the call to vtime_account() with its unnecessary idle check. - */ - vtime_account_system(current); - current->flags &= ~PF_VCPU; -} - -#ifdef CONFIG_CONTEXT_TRACKING -extern void guest_enter(void); -extern void guest_exit(void); - -#else /* !CONFIG_CONTEXT_TRACKING */ -static inline void guest_enter(void) -{ - __guest_enter(); -} - -static inline void guest_exit(void) -{ - __guest_exit(); -} -#endif /* !CONFIG_CONTEXT_TRACKING */ - static inline void kvm_guest_enter(void) { unsigned long flags; diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 65349f07b878..85bdde1137eb 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -15,7 +15,6 @@ */ #include -#include #include #include #include -- cgit v1.2.3 From 1a7f829f094dd7951e7d46c571a18080e455a436 Mon Sep 17 00:00:00 2001 From: Li Zhong Date: Fri, 17 May 2013 16:44:04 +0800 Subject: nohz: Fix notifier return val that enforce timekeeping In tick_nohz_cpu_down_callback() if the cpu is the one handling timekeeping, we must return something that stops the CPU_DOWN_PREPARE notifiers and then start notify CPU_DOWN_FAILED on the already called notifier call backs. However traditional errno values are not handled by the notifier unless these are encapsulated using errno_to_notifier(). Hence the current -EINVAL is misinterpreted and converted to junk after notifier_to_errno(), leaving the notifier subsystem to random behaviour such as eventually allowing the cpu to go down. Fix this by using the standard NOTIFY_BAD instead. Signed-off-by: Li Zhong Reviewed-by: Srivatsa S. Bhat Acked-by: Steven Rostedt Cc: Paul E. McKenney Cc: Ingo Molnar Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Borislav Petkov Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/time/tick-sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index f4208138fbf4..0cf1c1453181 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -306,7 +306,7 @@ static int __cpuinit tick_nohz_cpu_down_callback(struct notifier_block *nfb, * we can't safely shutdown that CPU. */ if (have_nohz_full_mask && tick_do_timer_cpu == cpu) - return -EINVAL; + return NOTIFY_BAD; break; } return NOTIFY_OK; -- cgit v1.2.3 From f5d00c1f9adb350c24c5301600f7bf2da99b66de Mon Sep 17 00:00:00 2001 From: Jiri Bohac Date: Tue, 28 May 2013 15:29:03 +0200 Subject: tick: Remove useless timekeeping duty attribution to broadcast source Since 7300711e ("clockevents: broadcast fixup possible waiters"), the timekeeping duty is assigned to the CPU that handles the tick broadcast clock device by the time it is set in one shot mode. This is an issue in full dynticks mode where the timekeeping duty must stay handled by the boot CPU for now. Otherwise it prevents secondary CPUs from offlining and this breaks suspend/shutdown/reboot/... As it appears there is no reason for this timekeeping duty to be moved to the broadcast CPU, besides nothing prevent it from being later re-assigned to another target, let's simply remove it. Signed-off-by: Jiri Bohac Reported-by: Steven Rostedt Acked-by: Thomas Gleixner Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Borislav Petkov Signed-off-by: Frederic Weisbecker Signed-off-by: Ingo Molnar --- kernel/time/tick-broadcast.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 0c739423b0f9..b4c245580b79 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -698,10 +698,6 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) bc->event_handler = tick_handle_oneshot_broadcast; - /* Take the do_timer update */ - if (!tick_nohz_full_cpu(cpu)) - tick_do_timer_cpu = cpu; - /* * We must be careful here. There might be other CPUs * waiting for periodic broadcast. We need to set the -- cgit v1.2.3 From d7880812b3594d3c6dcbe3cfd71dabb17347d082 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 10 Jun 2013 16:52:03 +0200 Subject: idle: Add the stack canary init to cpu_startup_entry() Moving x86 to the generic idle implementation (commit 7d1a9417 "x86: Use generic idle loop") wreckaged the stack protector. I stupidly missed that boot_init_stack_canary() must be inlined from a function which never returns, but I put that call into arch_cpu_idle_prepare() which of course returns. I pondered to play tricks with arch_cpu_idle_prepare() first, but then I noticed, that the other archs which have implemented the stackprotector (ARM and SH) do not initialize the canary for the non-boot cpus. So I decided to move the boot_init_stack_canary() call into cpu_startup_entry() ifdeffed with an CONFIG_X86 for now. This #ifdef is just a temporary measure as I don't want to inflict the boot_init_stack_canary() call on ARM and SH that late in the cycle. I'll queue a patch for 3.11 which removes the #ifdef if the ARM/SH maintainers have no objection. Reported-by: Wouter van Kesteren Cc: x86@kernel.org Cc: Russell King Cc: Paul Mundt Signed-off-by: Thomas Gleixner --- arch/x86/kernel/process.c | 12 ------------ kernel/cpu/idle.c | 16 ++++++++++++++++ 2 files changed, 16 insertions(+), 12 deletions(-) (limited to 'kernel') diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 4e7a37ff03ab..81a5f5e8f142 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -277,18 +277,6 @@ void exit_idle(void) } #endif -void arch_cpu_idle_prepare(void) -{ - /* - * If we're the non-boot CPU, nothing set the stack canary up - * for us. CPU0 already has it initialized but no harm in - * doing it again. This is a good place for updating it, as - * we wont ever return from this function (so the invalid - * canaries already on the stack wont ever trigger). - */ - boot_init_stack_canary(); -} - void arch_cpu_idle_enter(void) { local_touch_nmi(); diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c index d5585f5e038e..bf2ee1aafa0e 100644 --- a/kernel/cpu/idle.c +++ b/kernel/cpu/idle.c @@ -5,6 +5,7 @@ #include #include #include +#include #include @@ -112,6 +113,21 @@ static void cpu_idle_loop(void) void cpu_startup_entry(enum cpuhp_state state) { + /* + * This #ifdef needs to die, but it's too late in the cycle to + * make this generic (arm and sh have never invoked the canary + * init for the non boot cpus!). Will be fixed in 3.11 + */ +#ifdef CONFIG_X86 + /* + * If we're the non-boot CPU, nothing set the stack canary up + * for us. The boot CPU already has it initialized but no harm + * in doing it again. This is a good place for updating it, as + * we wont ever return from this function (so the invalid + * canaries already on the stack wont ever trigger). + */ + boot_init_stack_canary(); +#endif current_set_polling(); arch_cpu_idle_prepare(); cpu_idle_loop(); -- cgit v1.2.3 From 29ce3785b22da47c49f4ef6e14b9014fa5dee261 Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Wed, 8 May 2013 14:05:34 -0700 Subject: idle: Enable interrupts in the weak arch_cpu_idle() implementation PARISC bootup triggers the warning at kernel/cpu/idle.c:96. That's caused by the weak arch_cpu_idle() implementation, which is provided to avoid that architectures implement idle_poll over and over. The switchover to polling mode happens in the first call of the weak arch_cpu_idle() implementation, but that code fails to reenable interrupts and therefor triggers the warning. Fix this by enabling interrupts in the weak arch_cpu_idle() code. [ tglx: Made the changelog match the patch ] Signed-off-by: James Bottomley Reviewed-by: Srivatsa S. Bhat Link: http://lkml.kernel.org/r/1371236142.2726.43.camel@dabdike Signed-off-by: Thomas Gleixner --- kernel/cpu/idle.c | 1 + 1 file changed, 1 insertion(+) (limited to 'kernel') diff --git a/kernel/cpu/idle.c b/kernel/cpu/idle.c index bf2ee1aafa0e..e695c0a0bcb5 100644 --- a/kernel/cpu/idle.c +++ b/kernel/cpu/idle.c @@ -59,6 +59,7 @@ void __weak arch_cpu_idle_dead(void) { } void __weak arch_cpu_idle(void) { cpu_idle_force_poll = 1; + local_irq_enable(); } /* -- cgit v1.2.3 From 0541881502a1276149889fe468662ff6a8fc8f6d Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 13 Jun 2013 13:17:02 -0700 Subject: range: Do not add new blank slot with add_range_with_merge Joshua reported: Commit cd7b304dfaf1 (x86, range: fix missing merge during add range) broke mtrr cleanup on his setup in 3.9.5. corresponding commit in upstream is fbe06b7bae7c. The reason is add_range_with_merge could generate blank spot. We could avoid that by searching new expanded start/end, that new range should include all connected ranges in range array. At last add the new expanded start/end to the range array. Also move up left array so do not add new blank slot in the range array. -v2: move left array to avoid enhance add_range() -v3: include fix from Joshua about memmove declaring when DYN_DEBUG is used. Reported-by: Joshua Covington Tested-by: Joshua Covington Signed-off-by: Yinghai Lu Link: http://lkml.kernel.org/r/1371154622-8929-3-git-send-email-yinghai@kernel.org Cc: v3.9 Signed-off-by: H. Peter Anvin --- kernel/range.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) (limited to 'kernel') diff --git a/kernel/range.c b/kernel/range.c index eb911dbce267..322ea8e93e4b 100644 --- a/kernel/range.c +++ b/kernel/range.c @@ -4,7 +4,7 @@ #include #include #include - +#include #include int add_range(struct range *range, int az, int nr_range, u64 start, u64 end) @@ -32,9 +32,8 @@ int add_range_with_merge(struct range *range, int az, int nr_range, if (start >= end) return nr_range; - /* Try to merge it with old one: */ + /* get new start/end: */ for (i = 0; i < nr_range; i++) { - u64 final_start, final_end; u64 common_start, common_end; if (!range[i].end) @@ -45,14 +44,16 @@ int add_range_with_merge(struct range *range, int az, int nr_range, if (common_start > common_end) continue; - final_start = min(range[i].start, start); - final_end = max(range[i].end, end); + /* new start/end, will add it back at last */ + start = min(range[i].start, start); + end = max(range[i].end, end); - /* clear it and add it back for further merge */ - range[i].start = 0; - range[i].end = 0; - return add_range_with_merge(range, az, nr_range, - final_start, final_end); + memmove(&range[i], &range[i + 1], + (nr_range - (i + 1)) * sizeof(range[i])); + range[nr_range - 1].start = 0; + range[nr_range - 1].end = 0; + nr_range--; + i--; } /* Need to add it: */ -- cgit v1.2.3 From 9bb5d40cd93c9dd4be74834b1dcb1ba03629716b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 4 Jun 2013 10:44:21 +0200 Subject: perf: Fix mmap() accounting hole Vince's fuzzer once again found holes. This time it spotted a leak in the locked page accounting. When an event had redirected output and its close() was the last reference to the buffer we didn't have a vm context to undo accounting. Change the code to destroy the buffer on the last munmap() and detach all redirected events at that time. This provides us the right context to undo the vm accounting. Reported-and-tested-by: Vince Weaver Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/20130604084421.GI8923@twins.programming.kicks-ass.net Cc: Signed-off-by: Ingo Molnar --- kernel/events/core.c | 228 ++++++++++++++++++++++++++++++++--------------- kernel/events/internal.h | 3 +- 2 files changed, 159 insertions(+), 72 deletions(-) (limited to 'kernel') diff --git a/kernel/events/core.c b/kernel/events/core.c index ae752cd4a086..b391907d5352 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -196,9 +196,6 @@ static void cpu_ctx_sched_in(struct perf_cpu_context *cpuctx, static void update_context_time(struct perf_event_context *ctx); static u64 perf_event_time(struct perf_event *event); -static void ring_buffer_attach(struct perf_event *event, - struct ring_buffer *rb); - void __weak perf_event_print_debug(void) { } extern __weak const char *perf_pmu_name(void) @@ -2917,7 +2914,8 @@ static void free_event_rcu(struct rcu_head *head) kfree(event); } -static bool ring_buffer_put(struct ring_buffer *rb); +static void ring_buffer_put(struct ring_buffer *rb); +static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb); static void free_event(struct perf_event *event) { @@ -2942,15 +2940,30 @@ static void free_event(struct perf_event *event) if (has_branch_stack(event)) { static_key_slow_dec_deferred(&perf_sched_events); /* is system-wide event */ - if (!(event->attach_state & PERF_ATTACH_TASK)) + if (!(event->attach_state & PERF_ATTACH_TASK)) { atomic_dec(&per_cpu(perf_branch_stack_events, event->cpu)); + } } } if (event->rb) { - ring_buffer_put(event->rb); - event->rb = NULL; + struct ring_buffer *rb; + + /* + * Can happen when we close an event with re-directed output. + * + * Since we have a 0 refcount, perf_mmap_close() will skip + * over us; possibly making our ring_buffer_put() the last. + */ + mutex_lock(&event->mmap_mutex); + rb = event->rb; + if (rb) { + rcu_assign_pointer(event->rb, NULL); + ring_buffer_detach(event, rb); + ring_buffer_put(rb); /* could be last */ + } + mutex_unlock(&event->mmap_mutex); } if (is_cgroup_event(event)) @@ -3188,30 +3201,13 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) unsigned int events = POLL_HUP; /* - * Race between perf_event_set_output() and perf_poll(): perf_poll() - * grabs the rb reference but perf_event_set_output() overrides it. - * Here is the timeline for two threads T1, T2: - * t0: T1, rb = rcu_dereference(event->rb) - * t1: T2, old_rb = event->rb - * t2: T2, event->rb = new rb - * t3: T2, ring_buffer_detach(old_rb) - * t4: T1, ring_buffer_attach(rb1) - * t5: T1, poll_wait(event->waitq) - * - * To avoid this problem, we grab mmap_mutex in perf_poll() - * thereby ensuring that the assignment of the new ring buffer - * and the detachment of the old buffer appear atomic to perf_poll() + * Pin the event->rb by taking event->mmap_mutex; otherwise + * perf_event_set_output() can swizzle our rb and make us miss wakeups. */ mutex_lock(&event->mmap_mutex); - - rcu_read_lock(); - rb = rcu_dereference(event->rb); - if (rb) { - ring_buffer_attach(event, rb); + rb = event->rb; + if (rb) events = atomic_xchg(&rb->poll, 0); - } - rcu_read_unlock(); - mutex_unlock(&event->mmap_mutex); poll_wait(file, &event->waitq, wait); @@ -3521,16 +3517,12 @@ static void ring_buffer_attach(struct perf_event *event, return; spin_lock_irqsave(&rb->event_lock, flags); - if (!list_empty(&event->rb_entry)) - goto unlock; - - list_add(&event->rb_entry, &rb->event_list); -unlock: + if (list_empty(&event->rb_entry)) + list_add(&event->rb_entry, &rb->event_list); spin_unlock_irqrestore(&rb->event_lock, flags); } -static void ring_buffer_detach(struct perf_event *event, - struct ring_buffer *rb) +static void ring_buffer_detach(struct perf_event *event, struct ring_buffer *rb) { unsigned long flags; @@ -3549,13 +3541,10 @@ static void ring_buffer_wakeup(struct perf_event *event) rcu_read_lock(); rb = rcu_dereference(event->rb); - if (!rb) - goto unlock; - - list_for_each_entry_rcu(event, &rb->event_list, rb_entry) - wake_up_all(&event->waitq); - -unlock: + if (rb) { + list_for_each_entry_rcu(event, &rb->event_list, rb_entry) + wake_up_all(&event->waitq); + } rcu_read_unlock(); } @@ -3582,23 +3571,14 @@ static struct ring_buffer *ring_buffer_get(struct perf_event *event) return rb; } -static bool ring_buffer_put(struct ring_buffer *rb) +static void ring_buffer_put(struct ring_buffer *rb) { - struct perf_event *event, *n; - unsigned long flags; - if (!atomic_dec_and_test(&rb->refcount)) - return false; + return; - spin_lock_irqsave(&rb->event_lock, flags); - list_for_each_entry_safe(event, n, &rb->event_list, rb_entry) { - list_del_init(&event->rb_entry); - wake_up_all(&event->waitq); - } - spin_unlock_irqrestore(&rb->event_lock, flags); + WARN_ON_ONCE(!list_empty(&rb->event_list)); call_rcu(&rb->rcu_head, rb_free_rcu); - return true; } static void perf_mmap_open(struct vm_area_struct *vma) @@ -3606,28 +3586,100 @@ static void perf_mmap_open(struct vm_area_struct *vma) struct perf_event *event = vma->vm_file->private_data; atomic_inc(&event->mmap_count); + atomic_inc(&event->rb->mmap_count); } +/* + * A buffer can be mmap()ed multiple times; either directly through the same + * event, or through other events by use of perf_event_set_output(). + * + * In order to undo the VM accounting done by perf_mmap() we need to destroy + * the buffer here, where we still have a VM context. This means we need + * to detach all events redirecting to us. + */ static void perf_mmap_close(struct vm_area_struct *vma) { struct perf_event *event = vma->vm_file->private_data; - if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) { - struct ring_buffer *rb = event->rb; - struct user_struct *mmap_user = rb->mmap_user; - int mmap_locked = rb->mmap_locked; - unsigned long size = perf_data_size(rb); + struct ring_buffer *rb = event->rb; + struct user_struct *mmap_user = rb->mmap_user; + int mmap_locked = rb->mmap_locked; + unsigned long size = perf_data_size(rb); - rcu_assign_pointer(event->rb, NULL); - ring_buffer_detach(event, rb); - mutex_unlock(&event->mmap_mutex); + atomic_dec(&rb->mmap_count); + + if (!atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) + return; + + /* Detach current event from the buffer. */ + rcu_assign_pointer(event->rb, NULL); + ring_buffer_detach(event, rb); + mutex_unlock(&event->mmap_mutex); + + /* If there's still other mmap()s of this buffer, we're done. */ + if (atomic_read(&rb->mmap_count)) { + ring_buffer_put(rb); /* can't be last */ + return; + } - if (ring_buffer_put(rb)) { - atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm); - vma->vm_mm->pinned_vm -= mmap_locked; - free_uid(mmap_user); + /* + * No other mmap()s, detach from all other events that might redirect + * into the now unreachable buffer. Somewhat complicated by the + * fact that rb::event_lock otherwise nests inside mmap_mutex. + */ +again: + rcu_read_lock(); + list_for_each_entry_rcu(event, &rb->event_list, rb_entry) { + if (!atomic_long_inc_not_zero(&event->refcount)) { + /* + * This event is en-route to free_event() which will + * detach it and remove it from the list. + */ + continue; } + rcu_read_unlock(); + + mutex_lock(&event->mmap_mutex); + /* + * Check we didn't race with perf_event_set_output() which can + * swizzle the rb from under us while we were waiting to + * acquire mmap_mutex. + * + * If we find a different rb; ignore this event, a next + * iteration will no longer find it on the list. We have to + * still restart the iteration to make sure we're not now + * iterating the wrong list. + */ + if (event->rb == rb) { + rcu_assign_pointer(event->rb, NULL); + ring_buffer_detach(event, rb); + ring_buffer_put(rb); /* can't be last, we still have one */ + } + mutex_unlock(&event->mmap_mutex); + put_event(event); + + /* + * Restart the iteration; either we're on the wrong list or + * destroyed its integrity by doing a deletion. + */ + goto again; } + rcu_read_unlock(); + + /* + * It could be there's still a few 0-ref events on the list; they'll + * get cleaned up by free_event() -- they'll also still have their + * ref on the rb and will free it whenever they are done with it. + * + * Aside from that, this buffer is 'fully' detached and unmapped, + * undo the VM accounting. + */ + + atomic_long_sub((size >> PAGE_SHIFT) + 1, &mmap_user->locked_vm); + vma->vm_mm->pinned_vm -= mmap_locked; + free_uid(mmap_user); + + ring_buffer_put(rb); /* could be last */ } static const struct vm_operations_struct perf_mmap_vmops = { @@ -3677,10 +3729,24 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) return -EINVAL; WARN_ON_ONCE(event->ctx->parent_ctx); +again: mutex_lock(&event->mmap_mutex); if (event->rb) { - if (event->rb->nr_pages != nr_pages) + if (event->rb->nr_pages != nr_pages) { ret = -EINVAL; + goto unlock; + } + + if (!atomic_inc_not_zero(&event->rb->mmap_count)) { + /* + * Raced against perf_mmap_close() through + * perf_event_set_output(). Try again, hope for better + * luck. + */ + mutex_unlock(&event->mmap_mutex); + goto again; + } + goto unlock; } @@ -3722,12 +3788,14 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) goto unlock; } + atomic_set(&rb->mmap_count, 1); rb->mmap_locked = extra; rb->mmap_user = get_current_user(); atomic_long_add(user_extra, &user->locked_vm); vma->vm_mm->pinned_vm += extra; + ring_buffer_attach(event, rb); rcu_assign_pointer(event->rb, rb); perf_event_update_userpage(event); @@ -3737,6 +3805,10 @@ unlock: atomic_inc(&event->mmap_count); mutex_unlock(&event->mmap_mutex); + /* + * Since pinned accounting is per vm we cannot allow fork() to copy our + * vma. + */ vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_DONTDUMP; vma->vm_ops = &perf_mmap_vmops; @@ -6415,6 +6487,8 @@ set: if (atomic_read(&event->mmap_count)) goto unlock; + old_rb = event->rb; + if (output_event) { /* get the rb we want to redirect to */ rb = ring_buffer_get(output_event); @@ -6422,16 +6496,28 @@ set: goto unlock; } - old_rb = event->rb; - rcu_assign_pointer(event->rb, rb); if (old_rb) ring_buffer_detach(event, old_rb); + + if (rb) + ring_buffer_attach(event, rb); + + rcu_assign_pointer(event->rb, rb); + + if (old_rb) { + ring_buffer_put(old_rb); + /* + * Since we detached before setting the new rb, so that we + * could attach the new rb, we could have missed a wakeup. + * Provide it now. + */ + wake_up_all(&event->waitq); + } + ret = 0; unlock: mutex_unlock(&event->mmap_mutex); - if (old_rb) - ring_buffer_put(old_rb); out: return ret; } diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 5bc6c8e9b851..ca6599723be5 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -31,7 +31,8 @@ struct ring_buffer { spinlock_t event_lock; struct list_head event_list; - int mmap_locked; + atomic_t mmap_count; + unsigned long mmap_locked; struct user_struct *mmap_user; struct perf_event_mmap_page *user_page; -- cgit v1.2.3 From 873b4c65b519fd769940eb281f77848227d4e5c1 Mon Sep 17 00:00:00 2001 From: Vincent Guittot Date: Wed, 5 Jun 2013 10:13:11 +0200 Subject: sched: Fix clear NOHZ_BALANCE_KICK I have faced a sequence where the Idle Load Balance was sometime not triggered for a while on my platform, in the following scenario: CPU 0 and CPU 1 are running tasks and CPU 2 is idle CPU 1 kicks the Idle Load Balance CPU 1 selects CPU 2 as the new Idle Load Balancer CPU 2 sets NOHZ_BALANCE_KICK for CPU 2 CPU 2 sends a reschedule IPI to CPU 2 While CPU 3 wakes up, CPU 0 or CPU 1 migrates a waking up task A on CPU 2 CPU 2 finally wakes up, runs task A and discards the Idle Load Balance task A quickly goes back to sleep (before a tick occurs on CPU 2) CPU 2 goes back to idle with NOHZ_BALANCE_KICK set Whenever CPU 2 will be selected as the ILB, no reschedule IPI will be sent because NOHZ_BALANCE_KICK is already set and no Idle Load Balance will be performed. We must wait for the sched softirq to be raised on CPU 2 thanks to another part the kernel to come back to clear NOHZ_BALANCE_KICK. The proposed solution clears NOHZ_BALANCE_KICK in schedule_ipi if we can't raise the sched_softirq for the Idle Load Balance. Change since V1: - move the clear of NOHZ_BALANCE_KICK in got_nohz_idle_kick if the ILB can't run on this CPU (as suggested by Peter) Signed-off-by: Vincent Guittot Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1370419991-13870-1-git-send-email-vincent.guittot@linaro.org Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 58453b8272fd..919bee68032b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -633,7 +633,19 @@ void wake_up_nohz_cpu(int cpu) static inline bool got_nohz_idle_kick(void) { int cpu = smp_processor_id(); - return idle_cpu(cpu) && test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); + + if (!test_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu))) + return false; + + if (idle_cpu(cpu) && !need_resched()) + return true; + + /* + * We can't run Idle Load Balance on this CPU for this time so we + * cancel it and clear NOHZ_BALANCE_KICK + */ + clear_bit(NOHZ_BALANCE_KICK, nohz_flags(cpu)); + return false; } #else /* CONFIG_NO_HZ_COMMON */ @@ -1393,8 +1405,9 @@ static void sched_ttwu_pending(void) void scheduler_ipi(void) { - if (llist_empty(&this_rq()->wake_list) && !got_nohz_idle_kick() - && !tick_nohz_full_cpu(smp_processor_id())) + if (llist_empty(&this_rq()->wake_list) + && !tick_nohz_full_cpu(smp_processor_id()) + && !got_nohz_idle_kick()) return; /* @@ -1417,7 +1430,7 @@ void scheduler_ipi(void) /* * Check if someone kicked us for doing the nohz idle load balance. */ - if (unlikely(got_nohz_idle_kick() && !need_resched())) { + if (unlikely(got_nohz_idle_kick())) { this_rq()->idle_balance = 1; raise_softirq_irqoff(SCHED_SOFTIRQ); } -- cgit v1.2.3 From 29bb9e5a75684106a37593ad75ec75ff8312731b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 24 May 2013 15:23:40 -0400 Subject: tracing/context-tracking: Add preempt_schedule_context() for tracing Dave Jones hit the following bug report: =============================== [ INFO: suspicious RCU usage. ] 3.10.0-rc2+ #1 Not tainted ------------------------------- include/linux/rcupdate.h:771 rcu_read_lock() used illegally while idle! other info that might help us debug this: RCU used illegally from idle CPU! rcu_scheduler_active = 1, debug_locks = 0 RCU used illegally from extended quiescent state! 2 locks held by cc1/63645: #0: (&rq->lock){-.-.-.}, at: [] __schedule+0xed/0x9b0 #1: (rcu_read_lock){.+.+..}, at: [] cpuacct_charge+0x5/0x1f0 CPU: 1 PID: 63645 Comm: cc1 Not tainted 3.10.0-rc2+ #1 [loadavg: 40.57 27.55 13.39 25/277 64369] Hardware name: Gigabyte Technology Co., Ltd. GA-MA78GM-S2H/GA-MA78GM-S2H, BIOS F12a 04/23/2010 0000000000000000 ffff88010f78fcf8 ffffffff816ae383 ffff88010f78fd28 ffffffff810b698d ffff88011c092548 000000000023d073 ffff88011c092500 0000000000000001 ffff88010f78fd60 ffffffff8109d7c5 ffffffff8109d645 Call Trace: [] dump_stack+0x19/0x1b [] lockdep_rcu_suspicious+0xfd/0x130 [] cpuacct_charge+0x185/0x1f0 [] ? cpuacct_charge+0x5/0x1f0 [] update_curr+0xec/0x240 [] put_prev_task_fair+0x228/0x480 [] __schedule+0x161/0x9b0 [] preempt_schedule+0x51/0x80 [] ? __cond_resched_softirq+0x60/0x60 [] ? retint_careful+0x12/0x2e [] ftrace_ops_control_func+0x1dc/0x210 [] ftrace_call+0x5/0x2f [] ? retint_careful+0xb/0x2e [] ? schedule_user+0x5/0x70 [] ? schedule_user+0x5/0x70 [] ? retint_careful+0x12/0x2e ------------[ cut here ]------------ What happened was that the function tracer traced the schedule_user() code that tells RCU that the system is coming back from userspace, and to add the CPU back to the RCU monitoring. Because the function tracer does a preempt_disable/enable_notrace() calls the preempt_enable_notrace() checks the NEED_RESCHED flag. If it is set, then preempt_schedule() is called. But this is called before the user_exit() function can inform the kernel that the CPU is no longer in user mode and needs to be accounted for by RCU. The fix is to create a new preempt_schedule_context() that checks if the kernel is still in user mode and if so to switch it to kernel mode before calling schedule. It also switches back to user mode coming back from schedule in need be. The only user of this currently is the preempt_enable_notrace(), which is only used by the tracing subsystem. Signed-off-by: Steven Rostedt Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1369423420.6828.226.camel@gandalf.local.home Signed-off-by: Ingo Molnar --- include/linux/preempt.h | 18 +++++++++++++++++- kernel/context_tracking.c | 40 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 57 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 87a03c746f17..f5d4723cdb3d 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -33,9 +33,25 @@ do { \ preempt_schedule(); \ } while (0) +#ifdef CONFIG_CONTEXT_TRACKING + +void preempt_schedule_context(void); + +#define preempt_check_resched_context() \ +do { \ + if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) \ + preempt_schedule_context(); \ +} while (0) +#else + +#define preempt_check_resched_context() preempt_check_resched() + +#endif /* CONFIG_CONTEXT_TRACKING */ + #else /* !CONFIG_PREEMPT */ #define preempt_check_resched() do { } while (0) +#define preempt_check_resched_context() do { } while (0) #endif /* CONFIG_PREEMPT */ @@ -88,7 +104,7 @@ do { \ do { \ preempt_enable_no_resched_notrace(); \ barrier(); \ - preempt_check_resched(); \ + preempt_check_resched_context(); \ } while (0) #else /* !CONFIG_PREEMPT_COUNT */ diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index 65349f07b878..66677003e223 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -71,6 +71,46 @@ void user_enter(void) local_irq_restore(flags); } +#ifdef CONFIG_PREEMPT +/** + * preempt_schedule_context - preempt_schedule called by tracing + * + * The tracing infrastructure uses preempt_enable_notrace to prevent + * recursion and tracing preempt enabling caused by the tracing + * infrastructure itself. But as tracing can happen in areas coming + * from userspace or just about to enter userspace, a preempt enable + * can occur before user_exit() is called. This will cause the scheduler + * to be called when the system is still in usermode. + * + * To prevent this, the preempt_enable_notrace will use this function + * instead of preempt_schedule() to exit user context if needed before + * calling the scheduler. + */ +void __sched notrace preempt_schedule_context(void) +{ + struct thread_info *ti = current_thread_info(); + enum ctx_state prev_ctx; + + if (likely(ti->preempt_count || irqs_disabled())) + return; + + /* + * Need to disable preemption in case user_exit() is traced + * and the tracer calls preempt_enable_notrace() causing + * an infinite recursion. + */ + preempt_disable_notrace(); + prev_ctx = exception_enter(); + preempt_enable_no_resched_notrace(); + + preempt_schedule(); + + preempt_disable_notrace(); + exception_exit(prev_ctx); + preempt_enable_notrace(); +} +EXPORT_SYMBOL_GPL(preempt_schedule_context); +#endif /* CONFIG_PREEMPT */ /** * user_exit - Inform the context tracking that the CPU is -- cgit v1.2.3 From 8b4d801b2b123b6c09742f861fe44a8527b84d47 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 20 Jun 2013 17:50:06 +0200 Subject: hw_breakpoint: Fix cpu check in task_bp_pinned(cpu) trinity fuzzer triggered WARN_ONCE("Can't find any breakpoint slot") in arch_install_hw_breakpoint() but the problem is not arch-specific. The problem is, task_bp_pinned(cpu) checks "cpu == iter->cpu" but this doesn't account the "all cpus" events with iter->cpu < 0. This means that, say, register_user_hw_breakpoint(tsk) can happily create the arbitrary number > HBP_NUM of breakpoints which can not be activated. toggle_bp_task_slot() is equally wrong by the same reason and nr_task_bp_pinned[] can have negative entries. Simple test: # perl -e 'sleep 1 while 1' & # perf record -e mem:0x10,mem:0x10,mem:0x10,mem:0x10,mem:0x10 -p `pidof perl` Before this patch this triggers the same problem/WARN_ON(), after the patch it correctly fails with -ENOSPC. Reported-by: Vince Weaver Signed-off-by: Oleg Nesterov Acked-by: Frederic Weisbecker Cc: Link: http://lkml.kernel.org/r/20130620155006.GA6324@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/hw_breakpoint.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index a64f8aeb5c1f..a853deabe6cf 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -120,7 +120,7 @@ static int task_bp_pinned(int cpu, struct perf_event *bp, enum bp_type_idx type) list_for_each_entry(iter, &bp_task_head, hw.bp_list) { if (iter->hw.bp_target == tsk && find_slot_idx(iter) == type && - cpu == iter->cpu) + (iter->cpu < 0 || cpu == iter->cpu)) count += hw_breakpoint_weight(iter); } -- cgit v1.2.3 From c790b0ad23f427c7522ffed264706238c57c007e Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 20 Jun 2013 17:50:09 +0200 Subject: hw_breakpoint: Use cpu_possible_mask in {reserve,release}_bp_slot() fetch_bp_busy_slots() and toggle_bp_slot() use for_each_online_cpu(), this is obviously wrong wrt cpu_up() or cpu_down(), we can over/under account the per-cpu numbers. For example: # echo 0 >> /sys/devices/system/cpu/cpu1/online # perf record -e mem:0x10 -p 1 & # echo 1 >> /sys/devices/system/cpu/cpu1/online # perf record -e mem:0x10,mem:0x10,mem:0x10,mem:0x10 -C1 -a & # taskset -p 0x2 1 triggers the same WARN_ONCE("Can't find any breakpoint slot") in arch_install_hw_breakpoint(). Reported-by: Vince Weaver Signed-off-by: Oleg Nesterov Acked-by: Frederic Weisbecker Cc: Link: http://lkml.kernel.org/r/20130620155009.GA6327@redhat.com Signed-off-by: Ingo Molnar --- kernel/events/hw_breakpoint.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index a853deabe6cf..20185ea64aa6 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -149,7 +149,7 @@ fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp, return; } - for_each_online_cpu(cpu) { + for_each_possible_cpu(cpu) { unsigned int nr; nr = per_cpu(nr_cpu_bp_pinned[type], cpu); @@ -235,7 +235,7 @@ toggle_bp_slot(struct perf_event *bp, bool enable, enum bp_type_idx type, if (cpu >= 0) { toggle_bp_task_slot(bp, cpu, enable, type, weight); } else { - for_each_online_cpu(cpu) + for_each_possible_cpu(cpu) toggle_bp_task_slot(bp, cpu, enable, type, weight); } -- cgit v1.2.3 From ea8deb8dfa6b0e8d1b3d1051585706739b46656c Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 17 Jun 2013 18:15:35 +0200 Subject: tick: Fix tick_broadcast_pending_mask not cleared The recent modification in the cpuidle framework consolidated the timer broadcast code across the different drivers by setting a new flag in the idle state. It tells the cpuidle core code to enter/exit the broadcast mode for the cpu when entering a deep idle state. The broadcast timer enter/exit is no longer handled by the back-end driver. This change made the local interrupt to be enabled *before* calling CLOCK_EVENT_NOTIFY_EXIT. On a tegra114, a four cores system, when the flag has been introduced in the driver, the following warning appeared: WARNING: at kernel/time/tick-broadcast.c:578 tick_broadcast_oneshot_control CPU: 2 PID: 0 Comm: swapper/2 Not tainted 3.10.0-rc3-next-20130529+ #15 [] (tick_broadcast_oneshot_control+0x1a4/0x1d0) from [] (tick_notify+0x240/0x40c) [] (tick_notify+0x240/0x40c) from [] (notifier_call_chain+0x44/0x84) [] (notifier_call_chain+0x44/0x84) from [] (raw_notifier_call_chain+0x18/0x20) [] (raw_notifier_call_chain+0x18/0x20) from [] (clockevents_notify+0x28/0x170) [] (clockevents_notify+0x28/0x170) from [] (cpuidle_idle_call+0x11c/0x168) [] (cpuidle_idle_call+0x11c/0x168) from [] (arch_cpu_idle+0x8/0x38) [] (arch_cpu_idle+0x8/0x38) from [] (cpu_startup_entry+0x60/0x134) [] (cpu_startup_entry+0x60/0x134) from [<804fe9a4>] (0x804fe9a4) I don't have the hardware, so I wasn't able to reproduce the warning but after looking a while at the code, I deduced the following: 1. the CPU2 enters a deep idle state and sets the broadcast timer 2. the timer expires, the tick_handle_oneshot_broadcast function is called, setting the tick_broadcast_pending_mask and waking up the idle cpu CPU2 3. the CPU2 exits idle handles the interrupt and then invokes tick_broadcast_oneshot_control with CLOCK_EVENT_NOTIFY_EXIT which runs the following code: [...] if (dev->next_event.tv64 == KTIME_MAX) goto out; if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_pending_mask)) goto out; [...] So if there is no next event scheduled for CPU2, we fulfil the first condition and jump out without clearing the tick_broadcast_pending_mask. 4. CPU2 goes to deep idle again and calls tick_broadcast_oneshot_control with CLOCK_NOTIFY_EVENT_ENTER but with the tick_broadcast_pending_mask set for CPU2, triggering the warning. The issue only surfaced due to the modifications of the cpuidle framework, which resulted in interrupts being enabled before the call to the clockevents code. If the call happens before interrupts have been enabled, the warning cannot trigger, because there is still the event pending which caused the broadcast timer expiry. Move the check for the next event below the check for the pending bit, so the pending bit gets cleared whether an event is scheduled on the cpu or not. [ tglx: Massaged changelog ] Signed-off-by: Daniel Lezcano Reported-and-tested-by: Joseph Lo Cc: Stephen Warren Cc: linux-arm-kernel@lists.infradead.org Cc: linaro-kernel@lists.linaro.org Link: http://lkml.kernel.org/r/1371485735-31249-1-git-send-email-daniel.lezcano@linaro.org Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index b4c245580b79..20d6fba70652 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -599,8 +599,6 @@ void tick_broadcast_oneshot_control(unsigned long reason) } else { if (cpumask_test_and_clear_cpu(cpu, tick_broadcast_oneshot_mask)) { clockevents_set_mode(dev, CLOCK_EVT_MODE_ONESHOT); - if (dev->next_event.tv64 == KTIME_MAX) - goto out; /* * The cpu which was handling the broadcast * timer marked this cpu in the broadcast @@ -614,6 +612,11 @@ void tick_broadcast_oneshot_control(unsigned long reason) tick_broadcast_pending_mask)) goto out; + /* + * Bail out if there is no next event. + */ + if (dev->next_event.tv64 == KTIME_MAX) + goto out; /* * If the pending bit is not set, then we are * either the CPU handling the broadcast -- cgit v1.2.3 From 706b23bde27a391f0974df2a8351661770fa2e07 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Fri, 28 Jun 2013 09:49:46 -0400 Subject: Fix: kernel/ptrace.c: ptrace_peek_siginfo() missing __put_user() validation This __put_user() could be used by unprivileged processes to write into kernel memory. The issue here is that even if copy_siginfo_to_user() fails, the error code is not checked before __put_user() is executed. Luckily, ptrace_peek_siginfo() has been added within the 3.10-rc cycle, so it has not hit a stable release yet. Signed-off-by: Mathieu Desnoyers Acked-by: Oleg Nesterov Cc: Andrey Vagin Cc: Roland McGrath Cc: Paul McKenney Cc: David Howells Cc: Dave Jones Cc: Pavel Emelyanov Cc: Pedro Alves Cc: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/ptrace.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/ptrace.c b/kernel/ptrace.c index aed981a3f69c..335a7ae697f5 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -665,20 +665,22 @@ static int ptrace_peek_siginfo(struct task_struct *child, if (unlikely(is_compat_task())) { compat_siginfo_t __user *uinfo = compat_ptr(data); - ret = copy_siginfo_to_user32(uinfo, &info); - ret |= __put_user(info.si_code, &uinfo->si_code); + if (copy_siginfo_to_user32(uinfo, &info) || + __put_user(info.si_code, &uinfo->si_code)) { + ret = -EFAULT; + break; + } + } else #endif { siginfo_t __user *uinfo = (siginfo_t __user *) data; - ret = copy_siginfo_to_user(uinfo, &info); - ret |= __put_user(info.si_code, &uinfo->si_code); - } - - if (ret) { - ret = -EFAULT; - break; + if (copy_siginfo_to_user(uinfo, &info) || + __put_user(info.si_code, &uinfo->si_code)) { + ret = -EFAULT; + break; + } } data += sizeof(siginfo_t); -- cgit v1.2.3