diff options
Diffstat (limited to 'kernel')
48 files changed, 1979 insertions, 713 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index 864ff75d65f2..6aebdeb2aa34 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -100,6 +100,7 @@ obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o obj-$(CONFIG_PERF_EVENTS) += perf_event.o obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o obj-$(CONFIG_USER_RETURN_NOTIFIER) += user-return-notifier.o +obj-$(CONFIG_PADATA) += padata.o ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y) # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is diff --git a/kernel/cgroup.c b/kernel/cgroup.c index 1fbcc748044a..4fd90e129772 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -23,6 +23,7 @@ */ #include <linux/cgroup.h> +#include <linux/module.h> #include <linux/ctype.h> #include <linux/errno.h> #include <linux/fs.h> @@ -166,6 +167,20 @@ static DEFINE_SPINLOCK(hierarchy_id_lock); */ static int need_forkexit_callback __read_mostly; +#ifdef CONFIG_PROVE_LOCKING +int cgroup_lock_is_held(void) +{ + return lockdep_is_held(&cgroup_mutex); +} +#else /* #ifdef CONFIG_PROVE_LOCKING */ +int cgroup_lock_is_held(void) +{ + return mutex_is_locked(&cgroup_mutex); +} +#endif /* #else #ifdef CONFIG_PROVE_LOCKING */ + +EXPORT_SYMBOL_GPL(cgroup_lock_is_held); + /* convenient tests for these bits */ inline int cgroup_is_removed(const struct cgroup *cgrp) { @@ -2936,14 +2951,17 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry, for_each_subsys(root, ss) { struct cgroup_subsys_state *css = ss->create(ss, cgrp); + if (IS_ERR(css)) { err = PTR_ERR(css); goto err_destroy; } init_cgroup_css(css, ss, cgrp); - if (ss->use_id) - if (alloc_css_id(ss, parent, cgrp)) + if (ss->use_id) { + err = alloc_css_id(ss, parent, cgrp); + if (err) goto err_destroy; + } /* At error, ->destroy() callback has to free assigned ID. */ } diff --git a/kernel/cpu.c b/kernel/cpu.c index 1c8ddd6ee940..677f25376a38 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -151,13 +151,13 @@ static inline void check_for_tasks(int cpu) write_lock_irq(&tasklist_lock); for_each_process(p) { - if (task_cpu(p) == cpu && + if (task_cpu(p) == cpu && p->state == TASK_RUNNING && (!cputime_eq(p->utime, cputime_zero) || !cputime_eq(p->stime, cputime_zero))) - printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d\ - (state = %ld, flags = %x) \n", - p->comm, task_pid_nr(p), cpu, - p->state, p->flags); + printk(KERN_WARNING "Task %s (pid = %d) is on cpu %d " + "(state = %ld, flags = %x)\n", + p->comm, task_pid_nr(p), cpu, + p->state, p->flags); } write_unlock_irq(&tasklist_lock); } diff --git a/kernel/cred.c b/kernel/cred.c index dd76cfe5f5b0..1ed8ca18790c 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -224,7 +224,7 @@ struct cred *cred_alloc_blank(void) #ifdef CONFIG_KEYS new->tgcred = kzalloc(sizeof(*new->tgcred), GFP_KERNEL); if (!new->tgcred) { - kfree(new); + kmem_cache_free(cred_jar, new); return NULL; } atomic_set(&new->tgcred->usage, 1); diff --git a/kernel/exit.c b/kernel/exit.c index 546774a31a66..45ed043b8bf5 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -85,7 +85,9 @@ static void __exit_signal(struct task_struct *tsk) BUG_ON(!sig); BUG_ON(!atomic_read(&sig->count)); - sighand = rcu_dereference(tsk->sighand); + sighand = rcu_dereference_check(tsk->sighand, + rcu_read_lock_held() || + lockdep_is_held(&tasklist_lock)); spin_lock(&sighand->siglock); posix_cpu_timers_exit(tsk); @@ -170,8 +172,10 @@ void release_task(struct task_struct * p) repeat: tracehook_prepare_release_task(p); /* don't need to get the RCU readlock here - the process is dead and - * can't be modifying its own credentials */ + * can't be modifying its own credentials. But shut RCU-lockdep up */ + rcu_read_lock(); atomic_dec(&__task_cred(p)->user->processes); + rcu_read_unlock(); proc_flush_task(p); @@ -473,9 +477,11 @@ static void close_files(struct files_struct * files) /* * It is safe to dereference the fd table without RCU or * ->file_lock because this is the last reference to the - * files structure. + * files structure. But use RCU to shut RCU-lockdep up. */ + rcu_read_lock(); fdt = files_fdtable(files); + rcu_read_unlock(); for (;;) { unsigned long set; i = j * __NFDBITS; @@ -521,10 +527,12 @@ void put_files_struct(struct files_struct *files) * at the end of the RCU grace period. Otherwise, * you can free files immediately. */ + rcu_read_lock(); fdt = files_fdtable(files); if (fdt != &files->fdtab) kmem_cache_free(files_cachep, files); free_fdtable(fdt); + rcu_read_unlock(); } } diff --git a/kernel/fork.c b/kernel/fork.c index 5b2959b3ffc2..17bbf093356d 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -86,6 +86,7 @@ int max_threads; /* tunable limit on nr_threads */ DEFINE_PER_CPU(unsigned long, process_counts) = 0; __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ +EXPORT_SYMBOL_GPL(tasklist_lock); int nr_processes(void) { @@ -1241,21 +1242,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, /* Need tasklist lock for parent etc handling! */ write_lock_irq(&tasklist_lock); - /* - * The task hasn't been attached yet, so its cpus_allowed mask will - * not be changed, nor will its assigned CPU. - * - * The cpus_allowed mask of the parent may have changed after it was - * copied first time - so re-copy it here, then check the child's CPU - * to ensure it is on a valid CPU (and if not, just force it back to - * parent's CPU). This avoids alot of nasty races. - */ - p->cpus_allowed = current->cpus_allowed; - p->rt.nr_cpus_allowed = current->rt.nr_cpus_allowed; - if (unlikely(!cpu_isset(task_cpu(p), p->cpus_allowed) || - !cpu_online(task_cpu(p)))) - set_task_cpu(p, smp_processor_id()); - /* CLONE_PARENT re-uses the old parent */ if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { p->real_parent = current->real_parent; diff --git a/kernel/futex.c b/kernel/futex.c index d9b3a2228f9d..e7a35f1039e7 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -530,8 +530,25 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, return -EINVAL; WARN_ON(!atomic_read(&pi_state->refcount)); - WARN_ON(pid && pi_state->owner && - pi_state->owner->pid != pid); + + /* + * When pi_state->owner is NULL then the owner died + * and another waiter is on the fly. pi_state->owner + * is fixed up by the task which acquires + * pi_state->rt_mutex. + * + * We do not check for pid == 0 which can happen when + * the owner died and robust_list_exit() cleared the + * TID. + */ + if (pid && pi_state->owner) { + /* + * Bail out if user space manipulated the + * futex value. + */ + if (pid != task_pid_vnr(pi_state->owner)) + return -EINVAL; + } atomic_inc(&pi_state->refcount); *ps = pi_state; @@ -758,6 +775,13 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) if (!pi_state) return -EINVAL; + /* + * If current does not own the pi_state then the futex is + * inconsistent and user space fiddled with the futex value. + */ + if (pi_state->owner != current) + return -EINVAL; + raw_spin_lock(&pi_state->pi_mutex.wait_lock); new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); @@ -1971,7 +1995,7 @@ retry_private: /* Unqueue and drop the lock */ unqueue_me_pi(&q); - goto out; + goto out_put_key; out_unlock_put_key: queue_unlock(&q, hb); diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index c030ae657f20..967e66143e11 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -243,38 +243,70 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable) * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *)) * + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM */ -int reserve_bp_slot(struct perf_event *bp) +static int __reserve_bp_slot(struct perf_event *bp) { struct bp_busy_slots slots = {0}; - int ret = 0; - - mutex_lock(&nr_bp_mutex); fetch_bp_busy_slots(&slots, bp); /* Flexible counters need to keep at least one slot */ - if (slots.pinned + (!!slots.flexible) == HBP_NUM) { - ret = -ENOSPC; - goto end; - } + if (slots.pinned + (!!slots.flexible) == HBP_NUM) + return -ENOSPC; toggle_bp_slot(bp, true); -end: + return 0; +} + +int reserve_bp_slot(struct perf_event *bp) +{ + int ret; + + mutex_lock(&nr_bp_mutex); + + ret = __reserve_bp_slot(bp); + mutex_unlock(&nr_bp_mutex); return ret; } +static void __release_bp_slot(struct perf_event *bp) +{ + toggle_bp_slot(bp, false); +} + void release_bp_slot(struct perf_event *bp) { mutex_lock(&nr_bp_mutex); - toggle_bp_slot(bp, false); + __release_bp_slot(bp); mutex_unlock(&nr_bp_mutex); } +/* + * Allow the kernel debugger to reserve breakpoint slots without + * taking a lock using the dbg_* variant of for the reserve and + * release breakpoint slots. + */ +int dbg_reserve_bp_slot(struct perf_event *bp) +{ + if (mutex_is_locked(&nr_bp_mutex)) + return -1; + + return __reserve_bp_slot(bp); +} + +int dbg_release_bp_slot(struct perf_event *bp) +{ + if (mutex_is_locked(&nr_bp_mutex)) + return -1; + + __release_bp_slot(bp); + + return 0; +} int register_perf_hw_breakpoint(struct perf_event *bp) { @@ -328,8 +360,8 @@ EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) { u64 old_addr = bp->attr.bp_addr; + u64 old_len = bp->attr.bp_len; int old_type = bp->attr.bp_type; - int old_len = bp->attr.bp_len; int err = 0; perf_event_disable(bp); diff --git a/kernel/kfifo.c b/kernel/kfifo.c index 32c5c15d750d..35edbe22e9a9 100644 --- a/kernel/kfifo.c +++ b/kernel/kfifo.c @@ -80,7 +80,7 @@ int kfifo_alloc(struct kfifo *fifo, unsigned int size, gfp_t gfp_mask) buffer = kmalloc(size, gfp_mask); if (!buffer) { - _kfifo_init(fifo, 0, 0); + _kfifo_init(fifo, NULL, 0); return -ENOMEM; } @@ -97,6 +97,7 @@ EXPORT_SYMBOL(kfifo_alloc); void kfifo_free(struct kfifo *fifo) { kfree(fifo->buffer); + _kfifo_init(fifo, NULL, 0); } EXPORT_SYMBOL(kfifo_free); @@ -349,6 +350,7 @@ EXPORT_SYMBOL(__kfifo_from_user_n); * @fifo: the fifo to be used. * @from: pointer to the data to be added. * @len: the length of the data to be added. + * @total: the actual returned data length. * * This function copies at most @len bytes from the @from into the * FIFO depending and returns -EFAULT/0. @@ -399,7 +401,7 @@ EXPORT_SYMBOL(__kfifo_to_user_n); * @fifo: the fifo to be used. * @to: where the data must be copied. * @len: the size of the destination buffer. - @ @lenout: pointer to output variable with copied data + * @lenout: pointer to output variable with copied data * * This function copies at most @len bytes from the FIFO into the * @to buffer and 0 or -EFAULT. diff --git a/kernel/kgdb.c b/kernel/kgdb.c index 2eb517e23514..761fdd2b3034 100644 --- a/kernel/kgdb.c +++ b/kernel/kgdb.c @@ -583,6 +583,9 @@ static void kgdb_wait(struct pt_regs *regs) smp_wmb(); atomic_set(&cpu_in_kgdb[cpu], 1); + /* Disable any cpu specific hw breakpoints */ + kgdb_disable_hw_debug(regs); + /* Wait till primary CPU is done with debugging */ while (atomic_read(&passive_cpu_wait[cpu])) cpu_relax(); @@ -596,7 +599,7 @@ static void kgdb_wait(struct pt_regs *regs) /* Signal the primary CPU that we are done: */ atomic_set(&cpu_in_kgdb[cpu], 0); - touch_softlockup_watchdog(); + touch_softlockup_watchdog_sync(); clocksource_touch_watchdog(); local_irq_restore(flags); } @@ -1450,7 +1453,7 @@ acquirelock: (kgdb_info[cpu].task && kgdb_info[cpu].task->pid != kgdb_sstep_pid) && --sstep_tries) { atomic_set(&kgdb_active, -1); - touch_softlockup_watchdog(); + touch_softlockup_watchdog_sync(); clocksource_touch_watchdog(); local_irq_restore(flags); @@ -1550,7 +1553,7 @@ kgdb_restore: } /* Free kgdb_active */ atomic_set(&kgdb_active, -1); - touch_softlockup_watchdog(); + touch_softlockup_watchdog_sync(); clocksource_touch_watchdog(); local_irq_restore(flags); diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 5feaddcdbe49..0c30d0455de1 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -2147,7 +2147,7 @@ check_usage_backwards(struct task_struct *curr, struct held_lock *this, return ret; return print_irq_inversion_bug(curr, &root, target_entry, - this, 1, irqclass); + this, 0, irqclass); } void print_irqtrace_events(struct task_struct *curr) @@ -3809,3 +3809,21 @@ void lockdep_sys_exit(void) lockdep_print_held_locks(curr); } } + +void lockdep_rcu_dereference(const char *file, const int line) +{ + struct task_struct *curr = current; + + if (!debug_locks_off()) + return; + printk("\n===================================================\n"); + printk( "[ INFO: suspicious rcu_dereference_check() usage. ]\n"); + printk( "---------------------------------------------------\n"); + printk("%s:%d invoked rcu_dereference_check() without protection!\n", + file, line); + printk("\nother info that might help us debug this:\n\n"); + lockdep_print_held_locks(curr); + printk("\nstack backtrace:\n"); + dump_stack(); +} +EXPORT_SYMBOL_GPL(lockdep_rcu_dereference); diff --git a/kernel/notifier.c b/kernel/notifier.c index acd24e7643eb..2488ba7eb568 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -78,10 +78,10 @@ static int __kprobes notifier_call_chain(struct notifier_block **nl, int ret = NOTIFY_DONE; struct notifier_block *nb, *next_nb; - nb = rcu_dereference(*nl); + nb = rcu_dereference_raw(*nl); while (nb && nr_to_call) { - next_nb = rcu_dereference(nb->next); + next_nb = rcu_dereference_raw(nb->next); #ifdef CONFIG_DEBUG_NOTIFIERS if (unlikely(!func_ptr_is_kernel_text(nb->notifier_call))) { @@ -309,7 +309,7 @@ int __blocking_notifier_call_chain(struct blocking_notifier_head *nh, * racy then it does not matter what the result of the test * is, we re-check the list after having taken the lock anyway: */ - if (rcu_dereference(nh->head)) { + if (rcu_dereference_raw(nh->head)) { down_read(&nh->rwsem); ret = notifier_call_chain(&nh->head, val, v, nr_to_call, nr_calls); diff --git a/kernel/padata.c b/kernel/padata.c new file mode 100644 index 000000000000..6f9bcb8313d6 --- /dev/null +++ b/kernel/padata.c @@ -0,0 +1,690 @@ +/* + * padata.c - generic interface to process data streams in parallel + * + * Copyright (C) 2008, 2009 secunet Security Networks AG + * Copyright (C) 2008, 2009 Steffen Klassert <steffen.klassert@secunet.com> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc., + * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA. + */ + +#include <linux/module.h> +#include <linux/cpumask.h> +#include <linux/err.h> +#include <linux/cpu.h> +#include <linux/padata.h> +#include <linux/mutex.h> +#include <linux/sched.h> +#include <linux/rcupdate.h> + +#define MAX_SEQ_NR INT_MAX - NR_CPUS +#define MAX_OBJ_NUM 10000 * NR_CPUS + +static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index) +{ + int cpu, target_cpu; + + target_cpu = cpumask_first(pd->cpumask); + for (cpu = 0; cpu < cpu_index; cpu++) + target_cpu = cpumask_next(target_cpu, pd->cpumask); + + return target_cpu; +} + +static int padata_cpu_hash(struct padata_priv *padata) +{ + int cpu_index; + struct parallel_data *pd; + + pd = padata->pd; + + /* + * Hash the sequence numbers to the cpus by taking + * seq_nr mod. number of cpus in use. + */ + cpu_index = padata->seq_nr % cpumask_weight(pd->cpumask); + + return padata_index_to_cpu(pd, cpu_index); +} + +static void padata_parallel_worker(struct work_struct *work) +{ + struct padata_queue *queue; + struct parallel_data *pd; + struct padata_instance *pinst; + LIST_HEAD(local_list); + + local_bh_disable(); + queue = container_of(work, struct padata_queue, pwork); + pd = queue->pd; + pinst = pd->pinst; + + spin_lock(&queue->parallel.lock); + list_replace_init(&queue->parallel.list, &local_list); + spin_unlock(&queue->parallel.lock); + + while (!list_empty(&local_list)) { + struct padata_priv *padata; + + padata = list_entry(local_list.next, + struct padata_priv, list); + + list_del_init(&padata->list); + + padata->parallel(padata); + } + + local_bh_enable(); +} + +/* + * padata_do_parallel - padata parallelization function + * + * @pinst: padata instance + * @padata: object to be parallelized + * @cb_cpu: cpu the serialization callback function will run on, + * must be in the cpumask of padata. + * + * The parallelization callback function will run with BHs off. + * Note: Every object which is parallelized by padata_do_parallel + * must be seen by padata_do_serial. + */ +int padata_do_parallel(struct padata_instance *pinst, + struct padata_priv *padata, int cb_cpu) +{ + int target_cpu, err; + struct padata_queue *queue; + struct parallel_data *pd; + + rcu_read_lock_bh(); + + pd = rcu_dereference(pinst->pd); + + err = 0; + if (!(pinst->flags & PADATA_INIT)) + goto out; + + err = -EBUSY; + if ((pinst->flags & PADATA_RESET)) + goto out; + + if (atomic_read(&pd->refcnt) >= MAX_OBJ_NUM) + goto out; + + err = -EINVAL; + if (!cpumask_test_cpu(cb_cpu, pd->cpumask)) + goto out; + + err = -EINPROGRESS; + atomic_inc(&pd->refcnt); + padata->pd = pd; + padata->cb_cpu = cb_cpu; + + if (unlikely(atomic_read(&pd->seq_nr) == pd->max_seq_nr)) + atomic_set(&pd->seq_nr, -1); + + padata->seq_nr = atomic_inc_return(&pd->seq_nr); + + target_cpu = padata_cpu_hash(padata); + queue = per_cpu_ptr(pd->queue, target_cpu); + + spin_lock(&queue->parallel.lock); + list_add_tail(&padata->list, &queue->parallel.list); + spin_unlock(&queue->parallel.lock); + + queue_work_on(target_cpu, pinst->wq, &queue->pwork); + +out: + rcu_read_unlock_bh(); + + return err; +} +EXPORT_SYMBOL(padata_do_parallel); + +static struct padata_priv *padata_get_next(struct parallel_data *pd) +{ + int cpu, num_cpus, empty, calc_seq_nr; + int seq_nr, next_nr, overrun, next_overrun; + struct padata_queue *queue, *next_queue; + struct padata_priv *padata; + struct padata_list *reorder; + + empty = 0; + next_nr = -1; + next_overrun = 0; + next_queue = NULL; + + num_cpus = cpumask_weight(pd->cpumask); + + for_each_cpu(cpu, pd->cpumask) { + queue = per_cpu_ptr(pd->queue, cpu); + reorder = &queue->reorder; + + /* + * Calculate the seq_nr of the object that should be + * next in this queue. + */ + overrun = 0; + calc_seq_nr = (atomic_read(&queue->num_obj) * num_cpus) + + queue->cpu_index; + + if (unlikely(calc_seq_nr > pd->max_seq_nr)) { + calc_seq_nr = calc_seq_nr - pd->max_seq_nr - 1; + overrun = 1; + } + + if (!list_empty(&reorder->list)) { + padata = list_entry(reorder->list.next, + struct padata_priv, list); + + seq_nr = padata->seq_nr; + BUG_ON(calc_seq_nr != seq_nr); + } else { + seq_nr = calc_seq_nr; + empty++; + } + + if (next_nr < 0 || seq_nr < next_nr + || (next_overrun && !overrun)) { + next_nr = seq_nr; + next_overrun = overrun; + next_queue = queue; + } + } + + padata = NULL; + + if (empty == num_cpus) + goto out; + + reorder = &next_queue->reorder; + + if (!list_empty(&reorder->list)) { + padata = list_entry(reorder->list.next, + struct padata_priv, list); + + if (unlikely(next_overrun)) { + for_each_cpu(cpu, pd->cpumask) { + queue = per_cpu_ptr(pd->queue, cpu); + atomic_set(&queue->num_obj, 0); + } + } + + spin_lock(&reorder->lock); + list_del_init(&padata->list); + atomic_dec(&pd->reorder_objects); + spin_unlock(&reorder->lock); + + atomic_inc(&next_queue->num_obj); + + goto out; + } + + if (next_nr % num_cpus == next_queue->cpu_index) { + padata = ERR_PTR(-ENODATA); + goto out; + } + + padata = ERR_PTR(-EINPROGRESS); +out: + return padata; +} + +static void padata_reorder(struct parallel_data *pd) +{ + struct padata_priv *padata; + struct padata_queue *queue; + struct padata_instance *pinst = pd->pinst; + +try_again: + if (!spin_trylock_bh(&pd->lock)) + goto out; + + while (1) { + padata = padata_get_next(pd); + + if (!padata || PTR_ERR(padata) == -EINPROGRESS) + break; + + if (PTR_ERR(padata) == -ENODATA) { + spin_unlock_bh(&pd->lock); + goto out; + } + + queue = per_cpu_ptr(pd->queue, padata->cb_cpu); + + spin_lock(&queue->serial.lock); + list_add_tail(&padata->list, &queue->serial.list); + spin_unlock(&queue->serial.lock); + + queue_work_on(padata->cb_cpu, pinst->wq, &queue->swork); + } + + spin_unlock_bh(&pd->lock); + + if (atomic_read(&pd->reorder_objects)) + goto try_again; + +out: + return; +} + +static void padata_serial_worker(struct work_struct *work) +{ + struct padata_queue *queue; + struct parallel_data *pd; + LIST_HEAD(local_list); + + local_bh_disable(); + queue = container_of(work, struct padata_queue, swork); + pd = queue->pd; + + spin_lock(&queue->serial.lock); + list_replace_init(&queue->serial.list, &local_list); + spin_unlock(&queue->serial.lock); + + while (!list_empty(&local_list)) { + struct padata_priv *padata; + + padata = list_entry(local_list.next, + struct padata_priv, list); + + list_del_init(&padata->list); + + padata->serial(padata); + atomic_dec(&pd->refcnt); + } + local_bh_enable(); +} + +/* + * padata_do_serial - padata serialization function + * + * @padata: object to be serialized. + * + * padata_do_serial must be called for every parallelized object. + * The serialization callback function will run with BHs off. + */ +void padata_do_serial(struct padata_priv *padata) +{ + int cpu; + struct padata_queue *queue; + struct parallel_data *pd; + + pd = padata->pd; + + cpu = get_cpu(); + queue = per_cpu_ptr(pd->queue, cpu); + + spin_lock(&queue->reorder.lock); + atomic_inc(&pd->reorder_objects); + list_add_tail(&padata->list, &queue->reorder.list); + spin_unlock(&queue->reorder.lock); + + put_cpu(); + + padata_reorder(pd); +} +EXPORT_SYMBOL(padata_do_serial); + +static struct parallel_data *padata_alloc_pd(struct padata_instance *pinst, + const struct cpumask *cpumask) +{ + int cpu, cpu_index, num_cpus; + struct padata_queue *queue; + struct parallel_data *pd; + + cpu_index = 0; + + pd = kzalloc(sizeof(struct parallel_data), GFP_KERNEL); + if (!pd) + goto err; + + pd->queue = alloc_percpu(struct padata_queue); + if (!pd->queue) + goto err_free_pd; + + if (!alloc_cpumask_var(&pd->cpumask, GFP_KERNEL)) + goto err_free_queue; + + for_each_possible_cpu(cpu) { + queue = per_cpu_ptr(pd->queue, cpu); + + queue->pd = pd; + + if (cpumask_test_cpu(cpu, cpumask) + && cpumask_test_cpu(cpu, cpu_active_mask)) { + queue->cpu_index = cpu_index; + cpu_index++; + } else + queue->cpu_index = -1; + + INIT_LIST_HEAD(&queue->reorder.list); + INIT_LIST_HEAD(&queue->parallel.list); + INIT_LIST_HEAD(&queue->serial.list); + spin_lock_init(&queue->reorder.lock); + spin_lock_init(&queue->parallel.lock); + spin_lock_init(&queue->serial.lock); + + INIT_WORK(&queue->pwork, padata_parallel_worker); + INIT_WORK(&queue->swork, padata_serial_worker); + atomic_set(&queue->num_obj, 0); + } + + cpumask_and(pd->cpumask, cpumask, cpu_active_mask); + + num_cpus = cpumask_weight(pd->cpumask); + pd->max_seq_nr = (MAX_SEQ_NR / num_cpus) * num_cpus - 1; + + atomic_set(&pd->seq_nr, -1); + atomic_set(&pd->reorder_objects, 0); + atomic_set(&pd->refcnt, 0); + pd->pinst = pinst; + spin_lock_init(&pd->lock); + + return pd; + +err_free_queue: + free_percpu(pd->queue); +err_free_pd: + kfree(pd); +err: + return NULL; +} + +static void padata_free_pd(struct parallel_data *pd) +{ + free_cpumask_var(pd->cpumask); + free_percpu(pd->queue); + kfree(pd); +} + +static void padata_replace(struct padata_instance *pinst, + struct parallel_data *pd_new) +{ + struct parallel_data *pd_old = pinst->pd; + + pinst->flags |= PADATA_RESET; + + rcu_assign_pointer(pinst->pd, pd_new); + + synchronize_rcu(); + + while (atomic_read(&pd_old->refcnt) != 0) + yield(); + + flush_workqueue(pinst->wq); + + padata_free_pd(pd_old); + + pinst->flags &= ~PADATA_RESET; +} + +/* + * padata_set_cpumask - set the cpumask that padata should use + * + * @pinst: padata instance + * @cpumask: the cpumask to use + */ +int padata_set_cpumask(struct padata_instance *pinst, + cpumask_var_t cpumask) +{ + struct parallel_data *pd; + int err = 0; + + might_sleep(); + + mutex_lock(&pinst->lock); + + pd = padata_alloc_pd(pinst, cpumask); + if (!pd) { + err = -ENOMEM; + goto out; + } + + cpumask_copy(pinst->cpumask, cpumask); + + padata_replace(pinst, pd); + +out: + mutex_unlock(&pinst->lock); + + return err; +} +EXPORT_SYMBOL(padata_set_cpumask); + +static int __padata_add_cpu(struct padata_instance *pinst, int cpu) +{ + struct parallel_data *pd; + + if (cpumask_test_cpu(cpu, cpu_active_mask)) { + pd = padata_alloc_pd(pinst, pinst->cpumask); + if (!pd) + return -ENOMEM; + + padata_replace(pinst, pd); + } + + return 0; +} + +/* + * padata_add_cpu - add a cpu to the padata cpumask + * + * @pinst: padata instance + * @cpu: cpu to add + */ +int padata_add_cpu(struct padata_instance *pinst, int cpu) +{ + int err; + + might_sleep(); + + mutex_lock(&pinst->lock); + + cpumask_set_cpu(cpu, pinst->cpumask); + err = __padata_add_cpu(pinst, cpu); + + mutex_unlock(&pinst->lock); + + return err; +} +EXPORT_SYMBOL(padata_add_cpu); + +static int __padata_remove_cpu(struct padata_instance *pinst, int cpu) +{ + struct parallel_data *pd; + + if (cpumask_test_cpu(cpu, cpu_online_mask)) { + pd = padata_alloc_pd(pinst, pinst->cpumask); + if (!pd) + return -ENOMEM; + + padata_replace(pinst, pd); + } + + return 0; +} + +/* + * padata_remove_cpu - remove a cpu from the padata cpumask + * + * @pinst: padata instance + * @cpu: cpu to remove + */ +int padata_remove_cpu(struct padata_instance *pinst, int cpu) +{ + int err; + + might_sleep(); + + mutex_lock(&pinst->lock); + + cpumask_clear_cpu(cpu, pinst->cpumask); + err = __padata_remove_cpu(pinst, cpu); + + mutex_unlock(&pinst->lock); + + return err; +} +EXPORT_SYMBOL(padata_remove_cpu); + +/* + * padata_start - start the parallel processing + * + * @pinst: padata instance to start + */ +void padata_start(struct padata_instance *pinst) +{ + might_sleep(); + + mutex_lock(&pinst->lock); + pinst->flags |= PADATA_INIT; + mutex_unlock(&pinst->lock); +} +EXPORT_SYMBOL(padata_start); + +/* + * padata_stop - stop the parallel processing + * + * @pinst: padata instance to stop + */ +void padata_stop(struct padata_instance *pinst) +{ + might_sleep(); + + mutex_lock(&pinst->lock); + pinst->flags &= ~PADATA_INIT; + mutex_unlock(&pinst->lock); +} +EXPORT_SYMBOL(padata_stop); + +static int __cpuinit padata_cpu_callback(struct notifier_block *nfb, + unsigned long action, void *hcpu) +{ + int err; + struct padata_instance *pinst; + int cpu = (unsigned long)hcpu; + + pinst = container_of(nfb, struct padata_instance, cpu_notifier); + + switch (action) { + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: + if (!cpumask_test_cpu(cpu, pinst->cpumask)) + break; + mutex_lock(&pinst->lock); + err = __padata_add_cpu(pinst, cpu); + mutex_unlock(&pinst->lock); + if (err) + return NOTIFY_BAD; + break; + + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + if (!cpumask_test_cpu(cpu, pinst->cpumask)) + break; + mutex_lock(&pinst->lock); + err = __padata_remove_cpu(pinst, cpu); + mutex_unlock(&pinst->lock); + if (err) + return NOTIFY_BAD; + break; + + case CPU_UP_CANCELED: + case CPU_UP_CANCELED_FROZEN: + if (!cpumask_test_cpu(cpu, pinst->cpumask)) + break; + mutex_lock(&pinst->lock); + __padata_remove_cpu(pinst, cpu); + mutex_unlock(&pinst->lock); + + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: + if (!cpumask_test_cpu(cpu, pinst->cpumask)) + break; + mutex_lock(&pinst->lock); + __padata_add_cpu(pinst, cpu); + mutex_unlock(&pinst->lock); + } + + return NOTIFY_OK; +} + +/* + * padata_alloc - allocate and initialize a padata instance + * + * @cpumask: cpumask that padata uses for parallelization + * @wq: workqueue to use for the allocated padata instance + */ +struct padata_instance *padata_alloc(const struct cpumask *cpumask, + struct workqueue_struct *wq) +{ + int err; + struct padata_instance *pinst; + struct parallel_data *pd; + + pinst = kzalloc(sizeof(struct padata_instance), GFP_KERNEL); + if (!pinst) + goto err; + + pd = padata_alloc_pd(pinst, cpumask); + if (!pd) + goto err_free_inst; + + rcu_assign_pointer(pinst->pd, pd); + + pinst->wq = wq; + + cpumask_copy(pinst->cpumask, cpumask); + + pinst->flags = 0; + + pinst->cpu_notifier.notifier_call = padata_cpu_callback; + pinst->cpu_notifier.priority = 0; + err = register_hotcpu_notifier(&pinst->cpu_notifier); + if (err) + goto err_free_pd; + + mutex_init(&pinst->lock); + + return pinst; + +err_free_pd: + padata_free_pd(pd); +err_free_inst: + kfree(pinst); +err: + return NULL; +} +EXPORT_SYMBOL(padata_alloc); + +/* + * padata_free - free a padata instance + * + * @ padata_inst: padata instance to free + */ +void padata_free(struct padata_instance *pinst) +{ + padata_stop(pinst); + + synchronize_rcu(); + + while (atomic_read(&pinst->pd->refcnt) != 0) + yield(); + + unregister_hotcpu_notifier(&pinst->cpu_notifier); + padata_free_pd(pinst->pd); + kfree(pinst); +} +EXPORT_SYMBOL(padata_free); diff --git a/kernel/perf_event.c b/kernel/perf_event.c index aa6155b5e24c..a661e7991865 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -3407,8 +3407,6 @@ static void perf_event_task_output(struct perf_event *event, task_event->event_id.tid = perf_event_tid(event, task); task_event->event_id.ptid = perf_event_tid(event, current); - task_event->event_id.time = perf_clock(); - perf_output_put(&handle, task_event->event_id); perf_output_end(&handle); @@ -3416,7 +3414,7 @@ static void perf_event_task_output(struct perf_event *event, static int perf_event_task_match(struct perf_event *event) { - if (event->state != PERF_EVENT_STATE_ACTIVE) + if (event->state < PERF_EVENT_STATE_INACTIVE) return 0; if (event->cpu != -1 && event->cpu != smp_processor_id()) @@ -3448,7 +3446,7 @@ static void perf_event_task_event(struct perf_task_event *task_event) cpuctx = &get_cpu_var(perf_cpu_context); perf_event_task_ctx(&cpuctx->ctx, task_event); if (!ctx) - ctx = rcu_dereference(task_event->task->perf_event_ctxp); + ctx = rcu_dereference(current->perf_event_ctxp); if (ctx) perf_event_task_ctx(ctx, task_event); put_cpu_var(perf_cpu_context); @@ -3479,6 +3477,7 @@ static void perf_event_task(struct task_struct *task, /* .ppid */ /* .tid */ /* .ptid */ + .time = perf_clock(), }, }; @@ -3528,7 +3527,7 @@ static void perf_event_comm_output(struct perf_event *event, static int perf_event_comm_match(struct perf_event *event) { - if (event->state != PERF_EVENT_STATE_ACTIVE) + if (event->state < PERF_EVENT_STATE_INACTIVE) return 0; if (event->cpu != -1 && event->cpu != smp_processor_id()) @@ -3648,7 +3647,7 @@ static void perf_event_mmap_output(struct perf_event *event, static int perf_event_mmap_match(struct perf_event *event, struct perf_mmap_event *mmap_event) { - if (event->state != PERF_EVENT_STATE_ACTIVE) + if (event->state < PERF_EVENT_STATE_INACTIVE) return 0; if (event->cpu != -1 && event->cpu != smp_processor_id()) @@ -4728,7 +4727,7 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr, if (attr->type >= PERF_TYPE_MAX) return -EINVAL; - if (attr->__reserved_1 || attr->__reserved_2) + if (attr->__reserved_1) return -EINVAL; if (attr->sample_type & ~(PERF_SAMPLE_MAX-1)) diff --git a/kernel/pid.c b/kernel/pid.c index 2e17c9c92cbe..b08e697cd83f 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -367,7 +367,7 @@ struct task_struct *pid_task(struct pid *pid, enum pid_type type) struct task_struct *result = NULL; if (pid) { struct hlist_node *first; - first = rcu_dereference(pid->tasks[type].first); + first = rcu_dereference_check(pid->tasks[type].first, rcu_read_lock_held() || lockdep_is_held(&tasklist_lock)); if (first) result = hlist_entry(first, struct task_struct, pids[(type)].node); } diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig index 91e09d3b2eb2..5c36ea9d55d2 100644 --- a/kernel/power/Kconfig +++ b/kernel/power/Kconfig @@ -27,6 +27,15 @@ config PM_DEBUG code. This is helpful when debugging and reporting PM bugs, like suspend support. +config PM_ADVANCED_DEBUG + bool "Extra PM attributes in sysfs for low-level debugging/testing" + depends on PM_DEBUG + default n + ---help--- + Add extra sysfs attributes allowing one to access some Power Management + fields of device objects from user space. If you are not a kernel + developer interested in debugging/testing Power Management, say "no". + config PM_VERBOSE bool "Verbose Power Management debugging" depends on PM_DEBUG @@ -85,6 +94,11 @@ config PM_SLEEP depends on SUSPEND || HIBERNATION || XEN_SAVE_RESTORE default y +config PM_SLEEP_ADVANCED_DEBUG + bool + depends on PM_ADVANCED_DEBUG + default n + config SUSPEND bool "Suspend to RAM and standby" depends on PM && ARCH_SUSPEND_POSSIBLE @@ -222,3 +236,8 @@ config PM_RUNTIME and the bus type drivers of the buses the devices are on are responsible for the actual handling of the autosuspend requests and wake-up events. + +config PM_OPS + bool + depends on PM_SLEEP || PM_RUNTIME + default y diff --git a/kernel/power/main.c b/kernel/power/main.c index 0998c7139053..b58800b21fc0 100644 --- a/kernel/power/main.c +++ b/kernel/power/main.c @@ -44,6 +44,32 @@ int pm_notifier_call_chain(unsigned long val) == NOTIFY_BAD) ? -EINVAL : 0; } +/* If set, devices may be suspended and resumed asynchronously. */ +int pm_async_enabled = 1; + +static ssize_t pm_async_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + return sprintf(buf, "%d\n", pm_async_enabled); +} + +static ssize_t pm_async_store(struct kobject *kobj, struct kobj_attribute *attr, + const char *buf, size_t n) +{ + unsigned long val; + + if (strict_strtoul(buf, 10, &val)) + return -EINVAL; + + if (val > 1) + return -EINVAL; + + pm_async_enabled = val; + return n; +} + +power_attr(pm_async); + #ifdef CONFIG_PM_DEBUG int pm_test_level = TEST_NONE; @@ -208,9 +234,12 @@ static struct attribute * g[] = { #ifdef CONFIG_PM_TRACE &pm_trace_attr.attr, #endif -#if defined(CONFIG_PM_SLEEP) && defined(CONFIG_PM_DEBUG) +#ifdef CONFIG_PM_SLEEP + &pm_async_attr.attr, +#ifdef CONFIG_PM_DEBUG &pm_test_attr.attr, #endif +#endif NULL, }; diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 36cb168e4330..830cadecbdfc 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -1181,7 +1181,7 @@ static void free_unnecessary_pages(void) memory_bm_position_reset(©_bm); - while (to_free_normal > 0 && to_free_highmem > 0) { + while (to_free_normal > 0 || to_free_highmem > 0) { unsigned long pfn = memory_bm_next_pfn(©_bm); struct page *page = pfn_to_page(pfn); @@ -1500,7 +1500,7 @@ asmlinkage int swsusp_save(void) { unsigned int nr_pages, nr_highmem; - printk(KERN_INFO "PM: Creating hibernation image: \n"); + printk(KERN_INFO "PM: Creating hibernation image:\n"); drain_local_pages(NULL); nr_pages = count_data_pages(); diff --git a/kernel/power/swap.c b/kernel/power/swap.c index 09b2b0ae9e9d..1d575733d4e1 100644 --- a/kernel/power/swap.c +++ b/kernel/power/swap.c @@ -657,10 +657,6 @@ int swsusp_read(unsigned int *flags_p) struct swsusp_info *header; *flags_p = swsusp_header->flags; - if (IS_ERR(resume_bdev)) { - pr_debug("PM: Image device not initialised\n"); - return PTR_ERR(resume_bdev); - } memset(&snapshot, 0, sizeof(struct snapshot_handle)); error = snapshot_write_next(&snapshot, PAGE_SIZE); diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c deleted file mode 100644 index 5b3601bd1893..000000000000 --- a/kernel/power/swsusp.c +++ /dev/null @@ -1,58 +0,0 @@ -/* - * linux/kernel/power/swsusp.c - * - * This file provides code to write suspend image to swap and read it back. - * - * Copyright (C) 1998-2001 Gabor Kuti <seasons@fornax.hu> - * Copyright (C) 1998,2001-2005 Pavel Machek <pavel@suse.cz> - * - * This file is released under the GPLv2. - * - * I'd like to thank the following people for their work: - * - * Pavel Machek <pavel@ucw.cz>: - * Modifications, defectiveness pointing, being with me at the very beginning, - * suspend to swap space, stop all tasks. Port to 2.4.18-ac and 2.5.17. - * - * Steve Doddi <dirk@loth.demon.co.uk>: - * Support the possibility of hardware state restoring. - * - * Raph <grey.havens@earthling.net>: - * Support for preserving states of network devices and virtual console - * (including X and svgatextmode) - * - * Kurt Garloff <garloff@suse.de>: - * Straightened the critical function in order to prevent compilers from - * playing tricks with local variables. - * - * Andreas Mohr <a.mohr@mailto.de> - * - * Alex Badea <vampire@go.ro>: - * Fixed runaway init - * - * Rafael J. Wysocki <rjw@sisk.pl> - * Reworked the freeing of memory and the handling of swap - * - * More state savers are welcome. Especially for the scsi layer... - * - * For TODOs,FIXMEs also look in Documentation/power/swsusp.txt - */ - -#include <linux/mm.h> -#include <linux/suspend.h> -#include <linux/spinlock.h> -#include <linux/kernel.h> -#include <linux/major.h> -#include <linux/swap.h> -#include <linux/pm.h> -#include <linux/swapops.h> -#include <linux/bootmem.h> -#include <linux/syscalls.h> -#include <linux/highmem.h> -#include <linux/time.h> -#include <linux/rbtree.h> -#include <linux/io.h> - -#include "power.h" - -int in_suspend __nosavedata = 0; diff --git a/kernel/power/user.c b/kernel/power/user.c index bf0014d6a5f0..4d2289626a84 100644 --- a/kernel/power/user.c +++ b/kernel/power/user.c @@ -195,6 +195,15 @@ static ssize_t snapshot_write(struct file *filp, const char __user *buf, return res; } +static void snapshot_deprecated_ioctl(unsigned int cmd) +{ + if (printk_ratelimit()) + printk(KERN_NOTICE "%pf: ioctl '%.8x' is deprecated and will " + "be removed soon, update your suspend-to-disk " + "utilities\n", + __builtin_return_address(0), cmd); +} + static long snapshot_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { @@ -246,8 +255,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, data->frozen = 0; break; - case SNAPSHOT_CREATE_IMAGE: case SNAPSHOT_ATOMIC_SNAPSHOT: + snapshot_deprecated_ioctl(cmd); + case SNAPSHOT_CREATE_IMAGE: if (data->mode != O_RDONLY || !data->frozen || data->ready) { error = -EPERM; break; @@ -275,8 +285,9 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, data->ready = 0; break; - case SNAPSHOT_PREF_IMAGE_SIZE: case SNAPSHOT_SET_IMAGE_SIZE: + snapshot_deprecated_ioctl(cmd); + case SNAPSHOT_PREF_IMAGE_SIZE: image_size = arg; break; @@ -290,15 +301,17 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, error = put_user(size, (loff_t __user *)arg); break; - case SNAPSHOT_AVAIL_SWAP_SIZE: case SNAPSHOT_AVAIL_SWAP: + snapshot_deprecated_ioctl(cmd); + case SNAPSHOT_AVAIL_SWAP_SIZE: size = count_swap_pages(data->swap, 1); size <<= PAGE_SHIFT; error = put_user(size, (loff_t __user *)arg); break; - case SNAPSHOT_ALLOC_SWAP_PAGE: case SNAPSHOT_GET_SWAP_PAGE: + snapshot_deprecated_ioctl(cmd); + case SNAPSHOT_ALLOC_SWAP_PAGE: if (data->swap < 0 || data->swap >= MAX_SWAPFILES) { error = -ENODEV; break; @@ -321,6 +334,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, break; case SNAPSHOT_SET_SWAP_FILE: /* This ioctl is deprecated */ + snapshot_deprecated_ioctl(cmd); if (!swsusp_swap_in_use()) { /* * User space encodes device types as two-byte values, @@ -362,6 +376,7 @@ static long snapshot_ioctl(struct file *filp, unsigned int cmd, break; case SNAPSHOT_PMOPS: /* This ioctl is deprecated */ + snapshot_deprecated_ioctl(cmd); error = -EINVAL; switch (arg) { diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c index 9b7fd4723878..f1125c1a6321 100644 --- a/kernel/rcupdate.c +++ b/kernel/rcupdate.c @@ -44,14 +44,43 @@ #include <linux/cpu.h> #include <linux/mutex.h> #include <linux/module.h> +#include <linux/kernel_stat.h> #ifdef CONFIG_DEBUG_LOCK_ALLOC static struct lock_class_key rcu_lock_key; struct lockdep_map rcu_lock_map = STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key); EXPORT_SYMBOL_GPL(rcu_lock_map); + +static struct lock_class_key rcu_bh_lock_key; +struct lockdep_map rcu_bh_lock_map = + STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_bh", &rcu_bh_lock_key); +EXPORT_SYMBOL_GPL(rcu_bh_lock_map); + +static struct lock_class_key rcu_sched_lock_key; +struct lockdep_map rcu_sched_lock_map = + STATIC_LOCKDEP_MAP_INIT("rcu_read_lock_sched", &rcu_sched_lock_key); +EXPORT_SYMBOL_GPL(rcu_sched_lock_map); #endif +int rcu_scheduler_active __read_mostly; +EXPORT_SYMBOL_GPL(rcu_scheduler_active); + +/* + * This function is invoked towards the end of the scheduler's initialization + * process. Before this is called, the idle task might contain + * RCU read-side critical sections (during which time, this idle + * task is booting the system). After this function is called, the + * idle tasks are prohibited from containing RCU read-side critical + * sections. + */ +void rcu_scheduler_starting(void) +{ + WARN_ON(num_online_cpus() != 1); + WARN_ON(nr_context_switches() > 0); + rcu_scheduler_active = 1; +} + /* * Awaken the corresponding synchronize_rcu() instance now that a * grace period has elapsed. diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c index 9bb52177af02..258cdf0a91eb 100644 --- a/kernel/rcutorture.c +++ b/kernel/rcutorture.c @@ -61,6 +61,9 @@ static int test_no_idle_hz; /* Test RCU's support for tickless idle CPUs. */ static int shuffle_interval = 3; /* Interval between shuffles (in sec)*/ static int stutter = 5; /* Start/stop testing interval (in sec) */ static int irqreader = 1; /* RCU readers from irq (timers). */ +static int fqs_duration = 0; /* Duration of bursts (us), 0 to disable. */ +static int fqs_holdoff = 0; /* Hold time within burst (us). */ +static int fqs_stutter = 3; /* Wait time between bursts (s). */ static char *torture_type = "rcu"; /* What RCU implementation to torture. */ module_param(nreaders, int, 0444); @@ -79,6 +82,12 @@ module_param(stutter, int, 0444); MODULE_PARM_DESC(stutter, "Number of seconds to run/halt test"); module_param(irqreader, int, 0444); MODULE_PARM_DESC(irqreader, "Allow RCU readers from irq handlers"); +module_param(fqs_duration, int, 0444); +MODULE_PARM_DESC(fqs_duration, "Duration of fqs bursts (us)"); +module_param(fqs_holdoff, int, 0444); +MODULE_PARM_DESC(fqs_holdoff, "Holdoff time within fqs bursts (us)"); +module_param(fqs_stutter, int, 0444); +MODULE_PARM_DESC(fqs_stutter, "Wait time between fqs bursts (s)"); module_param(torture_type, charp, 0444); MODULE_PARM_DESC(torture_type, "Type of RCU to torture (rcu, rcu_bh, srcu)"); @@ -99,6 +108,7 @@ static struct task_struct **reader_tasks; static struct task_struct *stats_task; static struct task_struct *shuffler_task; static struct task_struct *stutter_task; +static struct task_struct *fqs_task; #define RCU_TORTURE_PIPE_LEN 10 @@ -263,6 +273,7 @@ struct rcu_torture_ops { void (*deferred_free)(struct rcu_torture *p); void (*sync)(void); void (*cb_barrier)(void); + void (*fqs)(void); int (*stats)(char *page); int irq_capable; char *name; @@ -347,6 +358,7 @@ static struct rcu_torture_ops rcu_ops = { .deferred_free = rcu_torture_deferred_free, .sync = synchronize_rcu, .cb_barrier = rcu_barrier, + .fqs = rcu_force_quiescent_state, .stats = NULL, .irq_capable = 1, .name = "rcu" @@ -388,6 +400,7 @@ static struct rcu_torture_ops rcu_sync_ops = { .deferred_free = rcu_sync_torture_deferred_free, .sync = synchronize_rcu, .cb_barrier = NULL, + .fqs = rcu_force_quiescent_state, .stats = NULL, .irq_capable = 1, .name = "rcu_sync" @@ -403,6 +416,7 @@ static struct rcu_torture_ops rcu_expedited_ops = { .deferred_free = rcu_sync_torture_deferred_free, .sync = synchronize_rcu_expedited, .cb_barrier = NULL, + .fqs = rcu_force_quiescent_state, .stats = NULL, .irq_capable = 1, .name = "rcu_expedited" @@ -465,6 +479,7 @@ static struct rcu_torture_ops rcu_bh_ops = { .deferred_free = rcu_bh_torture_deferred_free, .sync = rcu_bh_torture_synchronize, .cb_barrier = rcu_barrier_bh, + .fqs = rcu_bh_force_quiescent_state, .stats = NULL, .irq_capable = 1, .name = "rcu_bh" @@ -480,6 +495,7 @@ static struct rcu_torture_ops rcu_bh_sync_ops = { .deferred_free = rcu_sync_torture_deferred_free, .sync = rcu_bh_torture_synchronize, .cb_barrier = NULL, + .fqs = rcu_bh_force_quiescent_state, .stats = NULL, .irq_capable = 1, .name = "rcu_bh_sync" @@ -621,6 +637,7 @@ static struct rcu_torture_ops sched_ops = { .deferred_free = rcu_sched_torture_deferred_free, .sync = sched_torture_synchronize, .cb_barrier = rcu_barrier_sched, + .fqs = rcu_sched_force_quiescent_state, .stats = NULL, .irq_capable = 1, .name = "sched" @@ -636,6 +653,7 @@ static struct rcu_torture_ops sched_sync_ops = { .deferred_free = rcu_sync_torture_deferred_free, .sync = sched_torture_synchronize, .cb_barrier = NULL, + .fqs = rcu_sched_force_quiescent_state, .stats = NULL, .name = "sched_sync" }; @@ -650,12 +668,45 @@ static struct rcu_torture_ops sched_expedited_ops = { .deferred_free = rcu_sync_torture_deferred_free, .sync = synchronize_sched_expedited, .cb_barrier = NULL, + .fqs = rcu_sched_force_quiescent_state, .stats = rcu_expedited_torture_stats, .irq_capable = 1, .name = "sched_expedited" }; /* + * RCU torture force-quiescent-state kthread. Repeatedly induces + * bursts of calls to force_quiescent_state(), increasing the probability + * of occurrence of some important types of race conditions. + */ +static int +rcu_torture_fqs(void *arg) +{ + unsigned long fqs_resume_time; + int fqs_burst_remaining; + + VERBOSE_PRINTK_STRING("rcu_torture_fqs task started"); + do { + fqs_resume_time = jiffies + fqs_stutter * HZ; + while (jiffies - fqs_resume_time > LONG_MAX) { + schedule_timeout_interruptible(1); + } + fqs_burst_remaining = fqs_duration; + while (fqs_burst_remaining > 0) { + cur_ops->fqs(); + udelay(fqs_holdoff); + fqs_burst_remaining -= fqs_holdoff; + } + rcu_stutter_wait("rcu_torture_fqs"); + } while (!kthread_should_stop() && fullstop == FULLSTOP_DONTSTOP); + VERBOSE_PRINTK_STRING("rcu_torture_fqs task stopping"); + rcutorture_shutdown_absorb("rcu_torture_fqs"); + while (!kthread_should_stop()) + schedule_timeout_uninterruptible(1); + return 0; +} + +/* * RCU torture writer kthread. Repeatedly substitutes a new structure * for that pointed to by rcu_torture_current, freeing the old structure * after a series of grace periods (the "pipeline"). @@ -745,7 +796,11 @@ static void rcu_torture_timer(unsigned long unused) idx = cur_ops->readlock(); completed = cur_ops->completed(); - p = rcu_dereference(rcu_torture_current); + p = rcu_dereference_check(rcu_torture_current, + rcu_read_lock_held() || + rcu_read_lock_bh_held() || + rcu_read_lock_sched_held() || + srcu_read_lock_held(&srcu_ctl)); if (p == NULL) { /* Leave because rcu_torture_writer is not yet underway */ cur_ops->readunlock(idx); @@ -798,11 +853,15 @@ rcu_torture_reader(void *arg) do { if (irqreader && cur_ops->irq_capable) { if (!timer_pending(&t)) - mod_timer(&t, 1); + mod_timer(&t, jiffies + 1); } idx = cur_ops->readlock(); completed = cur_ops->completed(); - p = rcu_dereference(rcu_torture_current); + p = rcu_dereference_check(rcu_torture_current, + rcu_read_lock_held() || + rcu_read_lock_bh_held() || + rcu_read_lock_sched_held() || + srcu_read_lock_held(&srcu_ctl)); if (p == NULL) { /* Wait for rcu_torture_writer to get underway */ cur_ops->readunlock(idx); @@ -1030,10 +1089,11 @@ rcu_torture_print_module_parms(char *tag) printk(KERN_ALERT "%s" TORTURE_FLAG "--- %s: nreaders=%d nfakewriters=%d " "stat_interval=%d verbose=%d test_no_idle_hz=%d " - "shuffle_interval=%d stutter=%d irqreader=%d\n", + "shuffle_interval=%d stutter=%d irqreader=%d " + "fqs_duration=%d fqs_holdoff=%d fqs_stutter=%d\n", torture_type, tag, nrealreaders, nfakewriters, stat_interval, verbose, test_no_idle_hz, shuffle_interval, - stutter, irqreader); + stutter, irqreader, fqs_duration, fqs_holdoff, fqs_stutter); } static struct notifier_block rcutorture_nb = { @@ -1109,6 +1169,12 @@ rcu_torture_cleanup(void) } stats_task = NULL; + if (fqs_task) { + VERBOSE_PRINTK_STRING("Stopping rcu_torture_fqs task"); + kthread_stop(fqs_task); + } + fqs_task = NULL; + /* Wait for all RCU callbacks to fire. */ if (cur_ops->cb_barrier != NULL) @@ -1154,6 +1220,11 @@ rcu_torture_init(void) mutex_unlock(&fullstop_mutex); return -EINVAL; } + if (cur_ops->fqs == NULL && fqs_duration != 0) { + printk(KERN_ALERT "rcu-torture: ->fqs NULL and non-zero " + "fqs_duration, fqs disabled.\n"); + fqs_duration = 0; + } if (cur_ops->init) cur_ops->init(); /* no "goto unwind" prior to this point!!! */ @@ -1282,6 +1353,19 @@ rcu_torture_init(void) goto unwind; } } + if (fqs_duration < 0) + fqs_duration = 0; + if (fqs_duration) { + /* Create the stutter thread */ + fqs_task = kthread_run(rcu_torture_fqs, NULL, + "rcu_torture_fqs"); + if (IS_ERR(fqs_task)) { + firsterr = PTR_ERR(fqs_task); + VERBOSE_PRINTK_ERRSTRING("Failed to create fqs"); + fqs_task = NULL; + goto unwind; + } + } register_reboot_notifier(&rcutorture_nb); mutex_unlock(&fullstop_mutex); return 0; diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 53ae9598f798..3ec8160fc75f 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -46,7 +46,6 @@ #include <linux/cpu.h> #include <linux/mutex.h> #include <linux/time.h> -#include <linux/kernel_stat.h> #include "rcutree.h" @@ -66,11 +65,11 @@ static struct lock_class_key rcu_node_class[NUM_RCU_LVLS]; .signaled = RCU_GP_IDLE, \ .gpnum = -300, \ .completed = -300, \ - .onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \ + .onofflock = __RAW_SPIN_LOCK_UNLOCKED(&name.onofflock), \ .orphan_cbs_list = NULL, \ .orphan_cbs_tail = &name.orphan_cbs_list, \ .orphan_qlen = 0, \ - .fqslock = __SPIN_LOCK_UNLOCKED(&name.fqslock), \ + .fqslock = __RAW_SPIN_LOCK_UNLOCKED(&name.fqslock), \ .n_force_qs = 0, \ .n_force_qs_ngp = 0, \ } @@ -81,9 +80,6 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data); struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state); DEFINE_PER_CPU(struct rcu_data, rcu_bh_data); -static int rcu_scheduler_active __read_mostly; - - /* * Return true if an RCU grace period is in progress. The ACCESS_ONCE()s * permit this function to be invoked without holding the root rcu_node @@ -157,6 +153,24 @@ long rcu_batches_completed_bh(void) EXPORT_SYMBOL_GPL(rcu_batches_completed_bh); /* + * Force a quiescent state for RCU BH. + */ +void rcu_bh_force_quiescent_state(void) +{ + force_quiescent_state(&rcu_bh_state, 0); +} +EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state); + +/* + * Force a quiescent state for RCU-sched. + */ +void rcu_sched_force_quiescent_state(void) +{ + force_quiescent_state(&rcu_sched_state, 0); +} +EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state); + +/* * Does the CPU have callbacks ready to be invoked? */ static int @@ -439,10 +453,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp) /* Only let one CPU complain about others per time interval. */ - spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave(&rnp->lock, flags); delta = jiffies - rsp->jiffies_stall; if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) { - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; @@ -452,13 +466,15 @@ static void print_other_cpu_stall(struct rcu_state *rsp) * due to CPU offlining. */ rcu_print_task_stall(rnp); - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); /* OK, time to rat on our buddy... */ printk(KERN_ERR "INFO: RCU detected CPU stalls:"); rcu_for_each_leaf_node(rsp, rnp) { + raw_spin_lock_irqsave(&rnp->lock, flags); rcu_print_task_stall(rnp); + raw_spin_unlock_irqrestore(&rnp->lock, flags); if (rnp->qsmask == 0) continue; for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++) @@ -469,6 +485,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp) smp_processor_id(), (long)(jiffies - rsp->gp_start)); trigger_all_cpu_backtrace(); + /* If so configured, complain about tasks blocking the grace period. */ + + rcu_print_detail_task_stall(rsp); + force_quiescent_state(rsp, 0); /* Kick them all. */ } @@ -481,11 +501,11 @@ static void print_cpu_stall(struct rcu_state *rsp) smp_processor_id(), jiffies - rsp->gp_start); trigger_all_cpu_backtrace(); - spin_lock_irqsave(&rnp->lock, flags); - if ((long)(jiffies - rsp->jiffies_stall) >= 0) + raw_spin_lock_irqsave(&rnp->lock, flags); + if (ULONG_CMP_GE(jiffies, rsp->jiffies_stall)) rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK; - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); set_need_resched(); /* kick ourselves to get things going. */ } @@ -545,12 +565,12 @@ static void note_new_gpnum(struct rcu_state *rsp, struct rcu_data *rdp) local_irq_save(flags); rnp = rdp->mynode; if (rdp->gpnum == ACCESS_ONCE(rnp->gpnum) || /* outside lock. */ - !spin_trylock(&rnp->lock)) { /* irqs already off, retry later. */ + !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ local_irq_restore(flags); return; } __note_new_gpnum(rsp, rnp, rdp); - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); } /* @@ -609,12 +629,12 @@ rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp) local_irq_save(flags); rnp = rdp->mynode; if (rdp->completed == ACCESS_ONCE(rnp->completed) || /* outside lock. */ - !spin_trylock(&rnp->lock)) { /* irqs already off, retry later. */ + !raw_spin_trylock(&rnp->lock)) { /* irqs already off, so later. */ local_irq_restore(flags); return; } __rcu_process_gp_end(rsp, rnp, rdp); - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); } /* @@ -659,12 +679,14 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) struct rcu_data *rdp = rsp->rda[smp_processor_id()]; struct rcu_node *rnp = rcu_get_root(rsp); - if (!cpu_needs_another_gp(rsp, rdp)) { + if (!cpu_needs_another_gp(rsp, rdp) || rsp->fqs_active) { + if (cpu_needs_another_gp(rsp, rdp)) + rsp->fqs_need_gp = 1; if (rnp->completed == rsp->completed) { - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } - spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ /* * Propagate new ->completed value to rcu_node structures @@ -672,9 +694,9 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) * of the next grace period to process their callbacks. */ rcu_for_each_node_breadth_first(rsp, rnp) { - spin_lock(&rnp->lock); /* irqs already disabled. */ + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ rnp->completed = rsp->completed; - spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ } local_irq_restore(flags); return; @@ -695,15 +717,15 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) rnp->completed = rsp->completed; rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state OK. */ rcu_start_gp_per_cpu(rsp, rnp, rdp); - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } - spin_unlock(&rnp->lock); /* leave irqs disabled. */ + raw_spin_unlock(&rnp->lock); /* leave irqs disabled. */ /* Exclude any concurrent CPU-hotplug operations. */ - spin_lock(&rsp->onofflock); /* irqs already disabled. */ + raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ /* * Set the quiescent-state-needed bits in all the rcu_node @@ -723,21 +745,21 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags) * irqs disabled. */ rcu_for_each_node_breadth_first(rsp, rnp) { - spin_lock(&rnp->lock); /* irqs already disabled. */ + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ rcu_preempt_check_blocked_tasks(rnp); rnp->qsmask = rnp->qsmaskinit; rnp->gpnum = rsp->gpnum; rnp->completed = rsp->completed; if (rnp == rdp->mynode) rcu_start_gp_per_cpu(rsp, rnp, rdp); - spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ } rnp = rcu_get_root(rsp); - spin_lock(&rnp->lock); /* irqs already disabled. */ + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */ - spin_unlock(&rnp->lock); /* irqs remain disabled. */ - spin_unlock_irqrestore(&rsp->onofflock, flags); + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock_irqrestore(&rsp->onofflock, flags); } /* @@ -776,14 +798,14 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, if (!(rnp->qsmask & mask)) { /* Our bit has already been cleared, so done. */ - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } rnp->qsmask &= ~mask; if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { /* Other bits still set at this level, so done. */ - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } mask = rnp->grpmask; @@ -793,10 +815,10 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp, break; } - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); rnp_c = rnp; rnp = rnp->parent; - spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave(&rnp->lock, flags); WARN_ON_ONCE(rnp_c->qsmask); } @@ -825,7 +847,7 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long las struct rcu_node *rnp; rnp = rdp->mynode; - spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave(&rnp->lock, flags); if (lastcomp != rnp->completed) { /* @@ -837,12 +859,12 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp, long las * race occurred. */ rdp->passed_quiesc = 0; /* try again later! */ - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); return; } mask = rdp->grpmask; if ((rnp->qsmask & mask) == 0) { - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); } else { rdp->qs_pending = 0; @@ -906,7 +928,7 @@ static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) if (rdp->nxtlist == NULL) return; /* irqs disabled, so comparison is stable. */ - spin_lock(&rsp->onofflock); /* irqs already disabled. */ + raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ *rsp->orphan_cbs_tail = rdp->nxtlist; rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL]; rdp->nxtlist = NULL; @@ -914,7 +936,7 @@ static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp) rdp->nxttail[i] = &rdp->nxtlist; rsp->orphan_qlen += rdp->qlen; rdp->qlen = 0; - spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ + raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ } /* @@ -925,10 +947,10 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) unsigned long flags; struct rcu_data *rdp; - spin_lock_irqsave(&rsp->onofflock, flags); + raw_spin_lock_irqsave(&rsp->onofflock, flags); rdp = rsp->rda[smp_processor_id()]; if (rsp->orphan_cbs_list == NULL) { - spin_unlock_irqrestore(&rsp->onofflock, flags); + raw_spin_unlock_irqrestore(&rsp->onofflock, flags); return; } *rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list; @@ -937,7 +959,7 @@ static void rcu_adopt_orphan_cbs(struct rcu_state *rsp) rsp->orphan_cbs_list = NULL; rsp->orphan_cbs_tail = &rsp->orphan_cbs_list; rsp->orphan_qlen = 0; - spin_unlock_irqrestore(&rsp->onofflock, flags); + raw_spin_unlock_irqrestore(&rsp->onofflock, flags); } /* @@ -953,23 +975,23 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) struct rcu_node *rnp; /* Exclude any attempts to start a new grace period. */ - spin_lock_irqsave(&rsp->onofflock, flags); + raw_spin_lock_irqsave(&rsp->onofflock, flags); /* Remove the outgoing CPU from the masks in the rcu_node hierarchy. */ rnp = rdp->mynode; /* this is the outgoing CPU's rnp. */ mask = rdp->grpmask; /* rnp->grplo is constant. */ do { - spin_lock(&rnp->lock); /* irqs already disabled. */ + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ rnp->qsmaskinit &= ~mask; if (rnp->qsmaskinit != 0) { if (rnp != rdp->mynode) - spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ break; } if (rnp == rdp->mynode) need_report = rcu_preempt_offline_tasks(rsp, rnp, rdp); else - spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ mask = rnp->grpmask; rnp = rnp->parent; } while (rnp != NULL); @@ -980,12 +1002,12 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp) * because invoking rcu_report_unblock_qs_rnp() with ->onofflock * held leads to deadlock. */ - spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ + raw_spin_unlock(&rsp->onofflock); /* irqs remain disabled. */ rnp = rdp->mynode; if (need_report & RCU_OFL_TASKS_NORM_GP) rcu_report_unblock_qs_rnp(rnp, flags); else - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); if (need_report & RCU_OFL_TASKS_EXP_GP) rcu_report_exp_rnp(rsp, rnp); @@ -1144,11 +1166,9 @@ void rcu_check_callbacks(int cpu, int user) /* * Scan the leaf rcu_node structures, processing dyntick state for any that * have not yet encountered a quiescent state, using the function specified. - * Returns 1 if the current grace period ends while scanning (possibly - * because we made it end). + * The caller must have suppressed start of new grace periods. */ -static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp, - int (*f)(struct rcu_data *)) +static void force_qs_rnp(struct rcu_state *rsp, int (*f)(struct rcu_data *)) { unsigned long bit; int cpu; @@ -1158,13 +1178,13 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp, rcu_for_each_leaf_node(rsp, rnp) { mask = 0; - spin_lock_irqsave(&rnp->lock, flags); - if (rnp->completed != lastcomp) { - spin_unlock_irqrestore(&rnp->lock, flags); - return 1; + raw_spin_lock_irqsave(&rnp->lock, flags); + if (!rcu_gp_in_progress(rsp)) { + raw_spin_unlock_irqrestore(&rnp->lock, flags); + return; } if (rnp->qsmask == 0) { - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); continue; } cpu = rnp->grplo; @@ -1173,15 +1193,14 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp, if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu])) mask |= bit; } - if (mask != 0 && rnp->completed == lastcomp) { + if (mask != 0) { /* rcu_report_qs_rnp() releases rnp->lock. */ rcu_report_qs_rnp(mask, rsp, rnp, flags); continue; } - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); } - return 0; } /* @@ -1191,32 +1210,26 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp, static void force_quiescent_state(struct rcu_state *rsp, int relaxed) { unsigned long flags; - long lastcomp; struct rcu_node *rnp = rcu_get_root(rsp); - u8 signaled; - u8 forcenow; if (!rcu_gp_in_progress(rsp)) return; /* No grace period in progress, nothing to force. */ - if (!spin_trylock_irqsave(&rsp->fqslock, flags)) { + if (!raw_spin_trylock_irqsave(&rsp->fqslock, flags)) { rsp->n_force_qs_lh++; /* Inexact, can lose counts. Tough! */ return; /* Someone else is already on the job. */ } - if (relaxed && - (long)(rsp->jiffies_force_qs - jiffies) >= 0) - goto unlock_ret; /* no emergency and done recently. */ + if (relaxed && ULONG_CMP_GE(rsp->jiffies_force_qs, jiffies)) + goto unlock_fqs_ret; /* no emergency and done recently. */ rsp->n_force_qs++; - spin_lock(&rnp->lock); - lastcomp = rsp->gpnum - 1; - signaled = rsp->signaled; + raw_spin_lock(&rnp->lock); /* irqs already disabled */ rsp->jiffies_force_qs = jiffies + RCU_JIFFIES_TILL_FORCE_QS; if(!rcu_gp_in_progress(rsp)) { rsp->n_force_qs_ngp++; - spin_unlock(&rnp->lock); - goto unlock_ret; /* no GP in progress, time updated. */ + raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ + goto unlock_fqs_ret; /* no GP in progress, time updated. */ } - spin_unlock(&rnp->lock); - switch (signaled) { + rsp->fqs_active = 1; + switch (rsp->signaled) { case RCU_GP_IDLE: case RCU_GP_INIT: @@ -1224,45 +1237,38 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed) case RCU_SAVE_DYNTICK: + raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ if (RCU_SIGNAL_INIT != RCU_SAVE_DYNTICK) break; /* So gcc recognizes the dead code. */ /* Record dyntick-idle state. */ - if (rcu_process_dyntick(rsp, lastcomp, - dyntick_save_progress_counter)) - goto unlock_ret; - /* fall into next case. */ - - case RCU_SAVE_COMPLETED: - - /* Update state, record completion counter. */ - forcenow = 0; - spin_lock(&rnp->lock); - if (lastcomp + 1 == rsp->gpnum && - lastcomp == rsp->completed && - rsp->signaled == signaled) { + force_qs_rnp(rsp, dyntick_save_progress_counter); + raw_spin_lock(&rnp->lock); /* irqs already disabled */ + if (rcu_gp_in_progress(rsp)) rsp->signaled = RCU_FORCE_QS; - rsp->completed_fqs = lastcomp; - forcenow = signaled == RCU_SAVE_COMPLETED; - } - spin_unlock(&rnp->lock); - if (!forcenow) - break; - /* fall into next case. */ + break; case RCU_FORCE_QS: /* Check dyntick-idle state, send IPI to laggarts. */ - if (rcu_process_dyntick(rsp, rsp->completed_fqs, - rcu_implicit_dynticks_qs)) - goto unlock_ret; + raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ + force_qs_rnp(rsp, rcu_implicit_dynticks_qs); /* Leave state in case more forcing is required. */ + raw_spin_lock(&rnp->lock); /* irqs already disabled */ break; } -unlock_ret: - spin_unlock_irqrestore(&rsp->fqslock, flags); + rsp->fqs_active = 0; + if (rsp->fqs_need_gp) { + raw_spin_unlock(&rsp->fqslock); /* irqs remain disabled */ + rsp->fqs_need_gp = 0; + rcu_start_gp(rsp, flags); /* releases rnp->lock */ + return; + } + raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ +unlock_fqs_ret: + raw_spin_unlock_irqrestore(&rsp->fqslock, flags); } #else /* #ifdef CONFIG_SMP */ @@ -1290,7 +1296,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) * If an RCU GP has gone long enough, go check for dyntick * idle CPUs and, if needed, send resched IPIs. */ - if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0) + if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) force_quiescent_state(rsp, 1); /* @@ -1304,7 +1310,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp) /* Does this CPU require a not-yet-started grace period? */ if (cpu_needs_another_gp(rsp, rdp)) { - spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags); + raw_spin_lock_irqsave(&rcu_get_root(rsp)->lock, flags); rcu_start_gp(rsp, flags); /* releases above lock */ } @@ -1335,6 +1341,9 @@ static void rcu_process_callbacks(struct softirq_action *unused) * grace-period manipulations above. */ smp_mb(); /* See above block comment. */ + + /* If we are last CPU on way to dyntick-idle mode, accelerate it. */ + rcu_needs_cpu_flush(); } static void @@ -1369,7 +1378,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), unsigned long nestflag; struct rcu_node *rnp_root = rcu_get_root(rsp); - spin_lock_irqsave(&rnp_root->lock, nestflag); + raw_spin_lock_irqsave(&rnp_root->lock, nestflag); rcu_start_gp(rsp, nestflag); /* releases rnp_root->lock. */ } @@ -1387,7 +1396,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu), force_quiescent_state(rsp, 0); rdp->n_force_qs_snap = rsp->n_force_qs; rdp->qlen_last_fqs_check = rdp->qlen; - } else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0) + } else if (ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) force_quiescent_state(rsp, 1); local_irq_restore(flags); } @@ -1520,7 +1529,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) /* Has an RCU GP gone long enough to send resched IPIs &c? */ if (rcu_gp_in_progress(rsp) && - ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) { + ULONG_CMP_LT(ACCESS_ONCE(rsp->jiffies_force_qs), jiffies)) { rdp->n_rp_need_fqs++; return 1; } @@ -1545,10 +1554,9 @@ static int rcu_pending(int cpu) /* * Check to see if any future RCU-related work will need to be done * by the current CPU, even if none need be done immediately, returning - * 1 if so. This function is part of the RCU implementation; it is -not- - * an exported member of the RCU API. + * 1 if so. */ -int rcu_needs_cpu(int cpu) +static int rcu_needs_cpu_quick_check(int cpu) { /* RCU callbacks either ready or pending? */ return per_cpu(rcu_sched_data, cpu).nxtlist || @@ -1556,21 +1564,6 @@ int rcu_needs_cpu(int cpu) rcu_preempt_needs_cpu(cpu); } -/* - * This function is invoked towards the end of the scheduler's initialization - * process. Before this is called, the idle task might contain - * RCU read-side critical sections (during which time, this idle - * task is booting the system). After this function is called, the - * idle tasks are prohibited from containing RCU read-side critical - * sections. - */ -void rcu_scheduler_starting(void) -{ - WARN_ON(num_online_cpus() != 1); - WARN_ON(nr_context_switches() > 0); - rcu_scheduler_active = 1; -} - static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL}; static atomic_t rcu_barrier_cpu_count; static DEFINE_MUTEX(rcu_barrier_mutex); @@ -1659,7 +1652,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) struct rcu_node *rnp = rcu_get_root(rsp); /* Set up local state, ensuring consistent view of global state. */ - spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave(&rnp->lock, flags); rdp->grpmask = 1UL << (cpu - rdp->mynode->grplo); rdp->nxtlist = NULL; for (i = 0; i < RCU_NEXT_SIZE; i++) @@ -1669,7 +1662,7 @@ rcu_boot_init_percpu_data(int cpu, struct rcu_state *rsp) rdp->dynticks = &per_cpu(rcu_dynticks, cpu); #endif /* #ifdef CONFIG_NO_HZ */ rdp->cpu = cpu; - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); } /* @@ -1687,7 +1680,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) struct rcu_node *rnp = rcu_get_root(rsp); /* Set up local state, ensuring consistent view of global state. */ - spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave(&rnp->lock, flags); rdp->passed_quiesc = 0; /* We could be racing with new GP, */ rdp->qs_pending = 1; /* so set up to respond to current GP. */ rdp->beenonline = 1; /* We have now been online. */ @@ -1695,7 +1688,7 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) rdp->qlen_last_fqs_check = 0; rdp->n_force_qs_snap = rsp->n_force_qs; rdp->blimit = blimit; - spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ /* * A new grace period might start here. If so, we won't be part @@ -1703,14 +1696,14 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) */ /* Exclude any attempts to start a new GP on large systems. */ - spin_lock(&rsp->onofflock); /* irqs already disabled. */ + raw_spin_lock(&rsp->onofflock); /* irqs already disabled. */ /* Add CPU to rcu_node bitmasks. */ rnp = rdp->mynode; mask = rdp->grpmask; do { /* Exclude any attempts to start a new GP on small systems. */ - spin_lock(&rnp->lock); /* irqs already disabled. */ + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ rnp->qsmaskinit |= mask; mask = rnp->grpmask; if (rnp == rdp->mynode) { @@ -1718,11 +1711,11 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable) rdp->completed = rnp->completed; rdp->passed_quiesc_completed = rnp->completed - 1; } - spin_unlock(&rnp->lock); /* irqs already disabled. */ + raw_spin_unlock(&rnp->lock); /* irqs already disabled. */ rnp = rnp->parent; } while (rnp != NULL && !(rnp->qsmaskinit & mask)); - spin_unlock_irqrestore(&rsp->onofflock, flags); + raw_spin_unlock_irqrestore(&rsp->onofflock, flags); } static void __cpuinit rcu_online_cpu(int cpu) @@ -1806,11 +1799,17 @@ static void __init rcu_init_levelspread(struct rcu_state *rsp) */ static void __init rcu_init_one(struct rcu_state *rsp) { + static char *buf[] = { "rcu_node_level_0", + "rcu_node_level_1", + "rcu_node_level_2", + "rcu_node_level_3" }; /* Match MAX_RCU_LVLS */ int cpustride = 1; int i; int j; struct rcu_node *rnp; + BUILD_BUG_ON(MAX_RCU_LVLS > ARRAY_SIZE(buf)); /* Fix buf[] init! */ + /* Initialize the level-tracking arrays. */ for (i = 1; i < NUM_RCU_LVLS; i++) @@ -1823,8 +1822,9 @@ static void __init rcu_init_one(struct rcu_state *rsp) cpustride *= rsp->levelspread[i]; rnp = rsp->level[i]; for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) { - spin_lock_init(&rnp->lock); - lockdep_set_class(&rnp->lock, &rcu_node_class[i]); + raw_spin_lock_init(&rnp->lock); + lockdep_set_class_and_name(&rnp->lock, + &rcu_node_class[i], buf[i]); rnp->gpnum = 0; rnp->qsmask = 0; rnp->qsmaskinit = 0; @@ -1876,7 +1876,7 @@ do { \ void __init rcu_init(void) { - int i; + int cpu; rcu_bootup_announce(); #ifdef CONFIG_RCU_CPU_STALL_DETECTOR @@ -1896,8 +1896,8 @@ void __init rcu_init(void) * or the scheduler are operational. */ cpu_notifier(rcu_cpu_notify, 0); - for_each_online_cpu(i) - rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)i); + for_each_online_cpu(cpu) + rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu); } #include "rcutree_plugin.h" diff --git a/kernel/rcutree.h b/kernel/rcutree.h index d2a0046f63b2..1439eb504c22 100644 --- a/kernel/rcutree.h +++ b/kernel/rcutree.h @@ -90,12 +90,12 @@ struct rcu_dynticks { * Definition for node within the RCU grace-period-detection hierarchy. */ struct rcu_node { - spinlock_t lock; /* Root rcu_node's lock protects some */ + raw_spinlock_t lock; /* Root rcu_node's lock protects some */ /* rcu_state fields as well as following. */ - long gpnum; /* Current grace period for this node. */ + unsigned long gpnum; /* Current grace period for this node. */ /* This will either be equal to or one */ /* behind the root rcu_node's gpnum. */ - long completed; /* Last grace period completed for this node. */ + unsigned long completed; /* Last GP completed for this node. */ /* This will either be equal to or one */ /* behind the root rcu_node's gpnum. */ unsigned long qsmask; /* CPUs or groups that need to switch in */ @@ -161,11 +161,11 @@ struct rcu_node { /* Per-CPU data for read-copy update. */ struct rcu_data { /* 1) quiescent-state and grace-period handling : */ - long completed; /* Track rsp->completed gp number */ + unsigned long completed; /* Track rsp->completed gp number */ /* in order to detect GP end. */ - long gpnum; /* Highest gp number that this CPU */ + unsigned long gpnum; /* Highest gp number that this CPU */ /* is aware of having started. */ - long passed_quiesc_completed; + unsigned long passed_quiesc_completed; /* Value of completed at time of qs. */ bool passed_quiesc; /* User-mode/idle loop etc. */ bool qs_pending; /* Core waits for quiesc state. */ @@ -221,14 +221,14 @@ struct rcu_data { unsigned long resched_ipi; /* Sent a resched IPI. */ /* 5) __rcu_pending() statistics. */ - long n_rcu_pending; /* rcu_pending() calls since boot. */ - long n_rp_qs_pending; - long n_rp_cb_ready; - long n_rp_cpu_needs_gp; - long n_rp_gp_completed; - long n_rp_gp_started; - long n_rp_need_fqs; - long n_rp_need_nothing; + unsigned long n_rcu_pending; /* rcu_pending() calls since boot. */ + unsigned long n_rp_qs_pending; + unsigned long n_rp_cb_ready; + unsigned long n_rp_cpu_needs_gp; + unsigned long n_rp_gp_completed; + unsigned long n_rp_gp_started; + unsigned long n_rp_need_fqs; + unsigned long n_rp_need_nothing; int cpu; }; @@ -237,12 +237,11 @@ struct rcu_data { #define RCU_GP_IDLE 0 /* No grace period in progress. */ #define RCU_GP_INIT 1 /* Grace period being initialized. */ #define RCU_SAVE_DYNTICK 2 /* Need to scan dyntick state. */ -#define RCU_SAVE_COMPLETED 3 /* Need to save rsp->completed. */ -#define RCU_FORCE_QS 4 /* Need to force quiescent state. */ +#define RCU_FORCE_QS 3 /* Need to force quiescent state. */ #ifdef CONFIG_NO_HZ #define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK #else /* #ifdef CONFIG_NO_HZ */ -#define RCU_SIGNAL_INIT RCU_SAVE_COMPLETED +#define RCU_SIGNAL_INIT RCU_FORCE_QS #endif /* #else #ifdef CONFIG_NO_HZ */ #define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ @@ -256,6 +255,9 @@ struct rcu_data { #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ +#define ULONG_CMP_GE(a, b) (ULONG_MAX / 2 >= (a) - (b)) +#define ULONG_CMP_LT(a, b) (ULONG_MAX / 2 < (a) - (b)) + /* * RCU global state, including node hierarchy. This hierarchy is * represented in "heap" form in a dense array. The root (first level) @@ -277,12 +279,19 @@ struct rcu_state { u8 signaled ____cacheline_internodealigned_in_smp; /* Force QS state. */ - long gpnum; /* Current gp number. */ - long completed; /* # of last completed gp. */ + u8 fqs_active; /* force_quiescent_state() */ + /* is running. */ + u8 fqs_need_gp; /* A CPU was prevented from */ + /* starting a new grace */ + /* period because */ + /* force_quiescent_state() */ + /* was running. */ + unsigned long gpnum; /* Current gp number. */ + unsigned long completed; /* # of last completed gp. */ /* End of fields guarded by root rcu_node's lock. */ - spinlock_t onofflock; /* exclude on/offline and */ + raw_spinlock_t onofflock; /* exclude on/offline and */ /* starting new GP. Also */ /* protects the following */ /* orphan_cbs fields. */ @@ -292,10 +301,8 @@ struct rcu_state { /* going offline. */ struct rcu_head **orphan_cbs_tail; /* And tail pointer. */ long orphan_qlen; /* Number of orphaned cbs. */ - spinlock_t fqslock; /* Only one task forcing */ + raw_spinlock_t fqslock; /* Only one task forcing */ /* quiescent states. */ - long completed_fqs; /* Value of completed @ snap. */ - /* Protected by fqslock. */ unsigned long jiffies_force_qs; /* Time at which to invoke */ /* force_quiescent_state(). */ unsigned long n_force_qs; /* Number of calls to */ @@ -319,8 +326,6 @@ struct rcu_state { #define RCU_OFL_TASKS_EXP_GP 0x2 /* Tasks blocking expedited */ /* GP were moved to root. */ -#ifdef RCU_TREE_NONCORE - /* * RCU implementation internal declarations: */ @@ -335,7 +340,7 @@ extern struct rcu_state rcu_preempt_state; DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data); #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ -#else /* #ifdef RCU_TREE_NONCORE */ +#ifndef RCU_TREE_NONCORE /* Forward declarations for rcutree_plugin.h */ static void rcu_bootup_announce(void); @@ -347,6 +352,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags); #endif /* #ifdef CONFIG_HOTPLUG_CPU */ #ifdef CONFIG_RCU_CPU_STALL_DETECTOR +static void rcu_print_detail_task_stall(struct rcu_state *rsp); static void rcu_print_task_stall(struct rcu_node *rnp); #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp); @@ -367,5 +373,6 @@ static int rcu_preempt_needs_cpu(int cpu); static void __cpuinit rcu_preempt_init_percpu_data(int cpu); static void rcu_preempt_send_cbs_to_orphanage(void); static void __init __rcu_init_preempt(void); +static void rcu_needs_cpu_flush(void); -#endif /* #else #ifdef RCU_TREE_NONCORE */ +#endif /* #ifndef RCU_TREE_NONCORE */ diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index 37fbccdf41d5..464ad2cdee00 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -62,6 +62,15 @@ long rcu_batches_completed(void) EXPORT_SYMBOL_GPL(rcu_batches_completed); /* + * Force a quiescent state for preemptible RCU. + */ +void rcu_force_quiescent_state(void) +{ + force_quiescent_state(&rcu_preempt_state, 0); +} +EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); + +/* * Record a preemptable-RCU quiescent state for the specified CPU. Note * that this just means that the task currently running on the CPU is * not in a quiescent state. There might be any number of tasks blocked @@ -102,7 +111,7 @@ static void rcu_preempt_note_context_switch(int cpu) /* Possibly blocking in an RCU read-side critical section. */ rdp = rcu_preempt_state.rda[cpu]; rnp = rdp->mynode; - spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave(&rnp->lock, flags); t->rcu_read_unlock_special |= RCU_READ_UNLOCK_BLOCKED; t->rcu_blocked_node = rnp; @@ -123,7 +132,7 @@ static void rcu_preempt_note_context_switch(int cpu) WARN_ON_ONCE(!list_empty(&t->rcu_node_entry)); phase = (rnp->gpnum + !(rnp->qsmask & rdp->grpmask)) & 0x1; list_add(&t->rcu_node_entry, &rnp->blocked_tasks[phase]); - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); } /* @@ -180,7 +189,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) struct rcu_node *rnp_p; if (rnp->qsmask != 0 || rcu_preempted_readers(rnp)) { - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); return; /* Still need more quiescent states! */ } @@ -197,8 +206,8 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) /* Report up the rest of the hierarchy. */ mask = rnp->grpmask; - spin_unlock(&rnp->lock); /* irqs remain disabled. */ - spin_lock(&rnp_p->lock); /* irqs already disabled. */ + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_lock(&rnp_p->lock); /* irqs already disabled. */ rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags); } @@ -248,10 +257,10 @@ static void rcu_read_unlock_special(struct task_struct *t) */ for (;;) { rnp = t->rcu_blocked_node; - spin_lock(&rnp->lock); /* irqs already disabled. */ + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ if (rnp == t->rcu_blocked_node) break; - spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ } empty = !rcu_preempted_readers(rnp); empty_exp = !rcu_preempted_readers_exp(rnp); @@ -265,7 +274,7 @@ static void rcu_read_unlock_special(struct task_struct *t) * Note that rcu_report_unblock_qs_rnp() releases rnp->lock. */ if (empty) - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); else rcu_report_unblock_qs_rnp(rnp, flags); @@ -295,29 +304,73 @@ void __rcu_read_unlock(void) if (--ACCESS_ONCE(t->rcu_read_lock_nesting) == 0 && unlikely(ACCESS_ONCE(t->rcu_read_unlock_special))) rcu_read_unlock_special(t); +#ifdef CONFIG_PROVE_LOCKING + WARN_ON_ONCE(ACCESS_ONCE(t->rcu_read_lock_nesting) < 0); +#endif /* #ifdef CONFIG_PROVE_LOCKING */ } EXPORT_SYMBOL_GPL(__rcu_read_unlock); #ifdef CONFIG_RCU_CPU_STALL_DETECTOR +#ifdef CONFIG_RCU_CPU_STALL_VERBOSE + +/* + * Dump detailed information for all tasks blocking the current RCU + * grace period on the specified rcu_node structure. + */ +static void rcu_print_detail_task_stall_rnp(struct rcu_node *rnp) +{ + unsigned long flags; + struct list_head *lp; + int phase; + struct task_struct *t; + + if (rcu_preempted_readers(rnp)) { + raw_spin_lock_irqsave(&rnp->lock, flags); + phase = rnp->gpnum & 0x1; + lp = &rnp->blocked_tasks[phase]; + list_for_each_entry(t, lp, rcu_node_entry) + sched_show_task(t); + raw_spin_unlock_irqrestore(&rnp->lock, flags); + } +} + +/* + * Dump detailed information for all tasks blocking the current RCU + * grace period. + */ +static void rcu_print_detail_task_stall(struct rcu_state *rsp) +{ + struct rcu_node *rnp = rcu_get_root(rsp); + + rcu_print_detail_task_stall_rnp(rnp); + rcu_for_each_leaf_node(rsp, rnp) + rcu_print_detail_task_stall_rnp(rnp); +} + +#else /* #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */ + +static void rcu_print_detail_task_stall(struct rcu_state *rsp) +{ +} + +#endif /* #else #ifdef CONFIG_RCU_CPU_STALL_VERBOSE */ + /* * Scan the current list of tasks blocked within RCU read-side critical * sections, printing out the tid of each. */ static void rcu_print_task_stall(struct rcu_node *rnp) { - unsigned long flags; struct list_head *lp; int phase; struct task_struct *t; if (rcu_preempted_readers(rnp)) { - spin_lock_irqsave(&rnp->lock, flags); phase = rnp->gpnum & 0x1; lp = &rnp->blocked_tasks[phase]; list_for_each_entry(t, lp, rcu_node_entry) printk(" P%d", t->pid); - spin_unlock_irqrestore(&rnp->lock, flags); } } @@ -388,11 +441,11 @@ static int rcu_preempt_offline_tasks(struct rcu_state *rsp, lp_root = &rnp_root->blocked_tasks[i]; while (!list_empty(lp)) { tp = list_entry(lp->next, typeof(*tp), rcu_node_entry); - spin_lock(&rnp_root->lock); /* irqs already disabled */ + raw_spin_lock(&rnp_root->lock); /* irqs already disabled */ list_del(&tp->rcu_node_entry); tp->rcu_blocked_node = rnp_root; list_add(&tp->rcu_node_entry, lp_root); - spin_unlock(&rnp_root->lock); /* irqs remain disabled */ + raw_spin_unlock(&rnp_root->lock); /* irqs remain disabled */ } } return retval; @@ -516,7 +569,7 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) unsigned long flags; unsigned long mask; - spin_lock_irqsave(&rnp->lock, flags); + raw_spin_lock_irqsave(&rnp->lock, flags); for (;;) { if (!sync_rcu_preempt_exp_done(rnp)) break; @@ -525,12 +578,12 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp) break; } mask = rnp->grpmask; - spin_unlock(&rnp->lock); /* irqs remain disabled */ + raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ rnp = rnp->parent; - spin_lock(&rnp->lock); /* irqs already disabled */ + raw_spin_lock(&rnp->lock); /* irqs already disabled */ rnp->expmask &= ~mask; } - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); } /* @@ -545,11 +598,11 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp) { int must_wait; - spin_lock(&rnp->lock); /* irqs already disabled */ + raw_spin_lock(&rnp->lock); /* irqs already disabled */ list_splice_init(&rnp->blocked_tasks[0], &rnp->blocked_tasks[2]); list_splice_init(&rnp->blocked_tasks[1], &rnp->blocked_tasks[3]); must_wait = rcu_preempted_readers_exp(rnp); - spin_unlock(&rnp->lock); /* irqs remain disabled */ + raw_spin_unlock(&rnp->lock); /* irqs remain disabled */ if (!must_wait) rcu_report_exp_rnp(rsp, rnp); } @@ -594,13 +647,13 @@ void synchronize_rcu_expedited(void) /* force all RCU readers onto blocked_tasks[]. */ synchronize_sched_expedited(); - spin_lock_irqsave(&rsp->onofflock, flags); + raw_spin_lock_irqsave(&rsp->onofflock, flags); /* Initialize ->expmask for all non-leaf rcu_node structures. */ rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) { - spin_lock(&rnp->lock); /* irqs already disabled. */ + raw_spin_lock(&rnp->lock); /* irqs already disabled. */ rnp->expmask = rnp->qsmaskinit; - spin_unlock(&rnp->lock); /* irqs remain disabled. */ + raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */ } /* Snapshot current state of ->blocked_tasks[] lists. */ @@ -609,7 +662,7 @@ void synchronize_rcu_expedited(void) if (NUM_RCU_NODES > 1) sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp)); - spin_unlock_irqrestore(&rsp->onofflock, flags); + raw_spin_unlock_irqrestore(&rsp->onofflock, flags); /* Wait for snapshotted ->blocked_tasks[] lists to drain. */ rnp = rcu_get_root(rsp); @@ -713,6 +766,16 @@ long rcu_batches_completed(void) EXPORT_SYMBOL_GPL(rcu_batches_completed); /* + * Force a quiescent state for RCU, which, because there is no preemptible + * RCU, becomes the same as rcu-sched. + */ +void rcu_force_quiescent_state(void) +{ + rcu_sched_force_quiescent_state(); +} +EXPORT_SYMBOL_GPL(rcu_force_quiescent_state); + +/* * Because preemptable RCU does not exist, we never have to check for * CPUs being in quiescent states. */ @@ -734,7 +797,7 @@ static int rcu_preempted_readers(struct rcu_node *rnp) /* Because preemptible RCU does not exist, no quieting of tasks. */ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) { - spin_unlock_irqrestore(&rnp->lock, flags); + raw_spin_unlock_irqrestore(&rnp->lock, flags); } #endif /* #ifdef CONFIG_HOTPLUG_CPU */ @@ -745,6 +808,14 @@ static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags) * Because preemptable RCU does not exist, we never have to check for * tasks blocked within RCU read-side critical sections. */ +static void rcu_print_detail_task_stall(struct rcu_state *rsp) +{ +} + +/* + * Because preemptable RCU does not exist, we never have to check for + * tasks blocked within RCU read-side critical sections. + */ static void rcu_print_task_stall(struct rcu_node *rnp) { } @@ -884,3 +955,113 @@ static void __init __rcu_init_preempt(void) } #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */ + +#if !defined(CONFIG_RCU_FAST_NO_HZ) + +/* + * Check to see if any future RCU-related work will need to be done + * by the current CPU, even if none need be done immediately, returning + * 1 if so. This function is part of the RCU implementation; it is -not- + * an exported member of the RCU API. + * + * Because we have preemptible RCU, just check whether this CPU needs + * any flavor of RCU. Do not chew up lots of CPU cycles with preemption + * disabled in a most-likely vain attempt to cause RCU not to need this CPU. + */ +int rcu_needs_cpu(int cpu) +{ + return rcu_needs_cpu_quick_check(cpu); +} + +/* + * Check to see if we need to continue a callback-flush operations to + * allow the last CPU to enter dyntick-idle mode. But fast dyntick-idle + * entry is not configured, so we never do need to. + */ +static void rcu_needs_cpu_flush(void) +{ +} + +#else /* #if !defined(CONFIG_RCU_FAST_NO_HZ) */ + +#define RCU_NEEDS_CPU_FLUSHES 5 +static DEFINE_PER_CPU(int, rcu_dyntick_drain); +static DEFINE_PER_CPU(unsigned long, rcu_dyntick_holdoff); + +/* + * Check to see if any future RCU-related work will need to be done + * by the current CPU, even if none need be done immediately, returning + * 1 if so. This function is part of the RCU implementation; it is -not- + * an exported member of the RCU API. + * + * Because we are not supporting preemptible RCU, attempt to accelerate + * any current grace periods so that RCU no longer needs this CPU, but + * only if all other CPUs are already in dynticks-idle mode. This will + * allow the CPU cores to be powered down immediately, as opposed to after + * waiting many milliseconds for grace periods to elapse. + * + * Because it is not legal to invoke rcu_process_callbacks() with irqs + * disabled, we do one pass of force_quiescent_state(), then do a + * raise_softirq() to cause rcu_process_callbacks() to be invoked later. + * The per-cpu rcu_dyntick_drain variable controls the sequencing. + */ +int rcu_needs_cpu(int cpu) +{ + int c = 0; + int thatcpu; + + /* Don't bother unless we are the last non-dyntick-idle CPU. */ + for_each_cpu_not(thatcpu, nohz_cpu_mask) + if (thatcpu != cpu) { + per_cpu(rcu_dyntick_drain, cpu) = 0; + per_cpu(rcu_dyntick_holdoff, cpu) = jiffies - 1; + return rcu_needs_cpu_quick_check(cpu); + } + + /* Check and update the rcu_dyntick_drain sequencing. */ + if (per_cpu(rcu_dyntick_drain, cpu) <= 0) { + /* First time through, initialize the counter. */ + per_cpu(rcu_dyntick_drain, cpu) = RCU_NEEDS_CPU_FLUSHES; + } else if (--per_cpu(rcu_dyntick_drain, cpu) <= 0) { + /* We have hit the limit, so time to give up. */ + per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; + return rcu_needs_cpu_quick_check(cpu); + } + + /* Do one step pushing remaining RCU callbacks through. */ + if (per_cpu(rcu_sched_data, cpu).nxtlist) { + rcu_sched_qs(cpu); + force_quiescent_state(&rcu_sched_state, 0); + c = c || per_cpu(rcu_sched_data, cpu).nxtlist; + } + if (per_cpu(rcu_bh_data, cpu).nxtlist) { + rcu_bh_qs(cpu); + force_quiescent_state(&rcu_bh_state, 0); + c = c || per_cpu(rcu_bh_data, cpu).nxtlist; + } + + /* If RCU callbacks are still pending, RCU still needs this CPU. */ + if (c) { + raise_softirq(RCU_SOFTIRQ); + per_cpu(rcu_dyntick_holdoff, cpu) = jiffies; + } + return c; +} + +/* + * Check to see if we need to continue a callback-flush operations to + * allow the last CPU to enter dyntick-idle mode. + */ +static void rcu_needs_cpu_flush(void) +{ + int cpu = smp_processor_id(); + unsigned long flags; + + if (per_cpu(rcu_dyntick_drain, cpu) <= 0) + return; + local_irq_save(flags); + (void)rcu_needs_cpu(cpu); + local_irq_restore(flags); +} + +#endif /* #else #if !defined(CONFIG_RCU_FAST_NO_HZ) */ diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 9d2c88423b31..d45db2e35d27 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -50,7 +50,7 @@ static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp) { if (!rdp->beenonline) return; - seq_printf(m, "%3d%cc=%ld g=%ld pq=%d pqc=%ld qp=%d", + seq_printf(m, "%3d%cc=%lu g=%lu pq=%d pqc=%lu qp=%d", rdp->cpu, cpu_is_offline(rdp->cpu) ? '!' : ' ', rdp->completed, rdp->gpnum, @@ -105,7 +105,7 @@ static void print_one_rcu_data_csv(struct seq_file *m, struct rcu_data *rdp) { if (!rdp->beenonline) return; - seq_printf(m, "%d,%s,%ld,%ld,%d,%ld,%d", + seq_printf(m, "%d,%s,%lu,%lu,%d,%lu,%d", rdp->cpu, cpu_is_offline(rdp->cpu) ? "\"N\"" : "\"Y\"", rdp->completed, rdp->gpnum, @@ -155,13 +155,13 @@ static const struct file_operations rcudata_csv_fops = { static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp) { - long gpnum; + unsigned long gpnum; int level = 0; int phase; struct rcu_node *rnp; gpnum = rsp->gpnum; - seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x " + seq_printf(m, "c=%lu g=%lu s=%d jfq=%ld j=%x " "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n", rsp->completed, gpnum, rsp->signaled, (long)(rsp->jiffies_force_qs - jiffies), @@ -215,12 +215,12 @@ static const struct file_operations rcuhier_fops = { static int show_rcugp(struct seq_file *m, void *unused) { #ifdef CONFIG_TREE_PREEMPT_RCU - seq_printf(m, "rcu_preempt: completed=%ld gpnum=%ld\n", + seq_printf(m, "rcu_preempt: completed=%ld gpnum=%lu\n", rcu_preempt_state.completed, rcu_preempt_state.gpnum); #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */ - seq_printf(m, "rcu_sched: completed=%ld gpnum=%ld\n", + seq_printf(m, "rcu_sched: completed=%ld gpnum=%lu\n", rcu_sched_state.completed, rcu_sched_state.gpnum); - seq_printf(m, "rcu_bh: completed=%ld gpnum=%ld\n", + seq_printf(m, "rcu_bh: completed=%ld gpnum=%lu\n", rcu_bh_state.completed, rcu_bh_state.gpnum); return 0; } diff --git a/kernel/resource.c b/kernel/resource.c index af96c1e4b54b..24e9e60c1459 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -188,6 +188,36 @@ static int __release_resource(struct resource *old) return -EINVAL; } +static void __release_child_resources(struct resource *r) +{ + struct resource *tmp, *p; + resource_size_t size; + + p = r->child; + r->child = NULL; + while (p) { + tmp = p; + p = p->sibling; + + tmp->parent = NULL; + tmp->sibling = NULL; + __release_child_resources(tmp); + + printk(KERN_DEBUG "release child resource %pR\n", tmp); + /* need to restore size, and keep flags */ + size = resource_size(tmp); + tmp->start = 0; + tmp->end = size - 1; + } +} + +void release_child_resources(struct resource *r) +{ + write_lock(&resource_lock); + __release_child_resources(r); + write_unlock(&resource_lock); +} + /** * request_resource - request and reserve an I/O or memory resource * @root: root resource descriptor @@ -303,8 +333,10 @@ int walk_system_ram_range(unsigned long start_pfn, unsigned long nr_pages, static int find_resource(struct resource *root, struct resource *new, resource_size_t size, resource_size_t min, resource_size_t max, resource_size_t align, - void (*alignf)(void *, struct resource *, - resource_size_t, resource_size_t), + resource_size_t (*alignf)(void *, + const struct resource *, + resource_size_t, + resource_size_t), void *alignf_data) { struct resource *this = root->child; @@ -330,7 +362,7 @@ static int find_resource(struct resource *root, struct resource *new, tmp.end = max; tmp.start = ALIGN(tmp.start, align); if (alignf) - alignf(alignf_data, &tmp, size, align); + tmp.start = alignf(alignf_data, &tmp, size, align); if (tmp.start < tmp.end && tmp.end - tmp.start >= size - 1) { new->start = tmp.start; new->end = tmp.start + size - 1; @@ -358,8 +390,10 @@ static int find_resource(struct resource *root, struct resource *new, int allocate_resource(struct resource *root, struct resource *new, resource_size_t size, resource_size_t min, resource_size_t max, resource_size_t align, - void (*alignf)(void *, struct resource *, - resource_size_t, resource_size_t), + resource_size_t (*alignf)(void *, + const struct resource *, + resource_size_t, + resource_size_t), void *alignf_data) { int err; diff --git a/kernel/sched.c b/kernel/sched.c index 7266b912139f..9d163f83e5c3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -645,6 +645,11 @@ static inline int cpu_of(struct rq *rq) #endif } +#define rcu_dereference_check_sched_domain(p) \ + rcu_dereference_check((p), \ + rcu_read_lock_sched_held() || \ + lockdep_is_held(&sched_domains_mutex)) + /* * The domain tree (rq->sd) is protected by RCU's quiescent state transition. * See detach_destroy_domains: synchronize_sched for details. @@ -653,7 +658,7 @@ static inline int cpu_of(struct rq *rq) * preempt-disabled sections. */ #define for_each_domain(cpu, __sd) \ - for (__sd = rcu_dereference(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) + for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); __sd; __sd = __sd->parent) #define cpu_rq(cpu) (&per_cpu(runqueues, (cpu))) #define this_rq() (&__get_cpu_var(runqueues)) @@ -1531,7 +1536,7 @@ static unsigned long target_load(int cpu, int type) static struct sched_group *group_of(int cpu) { - struct sched_domain *sd = rcu_dereference(cpu_rq(cpu)->sd); + struct sched_domain *sd = rcu_dereference_sched(cpu_rq(cpu)->sd); if (!sd) return NULL; @@ -2320,14 +2325,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p) } /* - * Called from: - * - * - fork, @p is stable because it isn't on the tasklist yet - * - * - exec, @p is unstable, retry loop + * Gets called from 3 sites (exec, fork, wakeup), since it is called without + * holding rq->lock we need to ensure ->cpus_allowed is stable, this is done + * by: * - * - wake-up, we serialize ->cpus_allowed against TASK_WAKING so - * we should be good. + * exec: is unstable, retry loop + * fork & wake-up: serialize ->cpus_allowed against TASK_WAKING */ static inline int select_task_rq(struct task_struct *p, int sd_flags, int wake_flags) @@ -2620,9 +2623,6 @@ void sched_fork(struct task_struct *p, int clone_flags) if (p->sched_class->task_fork) p->sched_class->task_fork(p); -#ifdef CONFIG_SMP - cpu = select_task_rq(p, SD_BALANCE_FORK, 0); -#endif set_task_cpu(p, cpu); #if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) @@ -2652,6 +2652,21 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) { unsigned long flags; struct rq *rq; + int cpu = get_cpu(); + +#ifdef CONFIG_SMP + /* + * Fork balancing, do it here and not earlier because: + * - cpus_allowed can change in the fork path + * - any previously selected cpu might disappear through hotplug + * + * We still have TASK_WAKING but PF_STARTING is gone now, meaning + * ->cpus_allowed is stable, we have preemption disabled, meaning + * cpu_online_mask is stable. + */ + cpu = select_task_rq(p, SD_BALANCE_FORK, 0); + set_task_cpu(p, cpu); +#endif rq = task_rq_lock(p, &flags); BUG_ON(p->state != TASK_WAKING); @@ -2665,6 +2680,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags) p->sched_class->task_woken(rq, p); #endif task_rq_unlock(rq, &flags); + put_cpu(); } #ifdef CONFIG_PREEMPT_NOTIFIERS @@ -4883,7 +4899,7 @@ static void run_rebalance_domains(struct softirq_action *h) static inline int on_null_domain(int cpu) { - return !rcu_dereference(cpu_rq(cpu)->sd); + return !rcu_dereference_sched(cpu_rq(cpu)->sd); } /* @@ -7145,14 +7161,18 @@ int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) * the ->cpus_allowed mask from under waking tasks, which would be * possible when we change rq->lock in ttwu(), so synchronize against * TASK_WAKING to avoid that. + * + * Make an exception for freshly cloned tasks, since cpuset namespaces + * might move the task about, we have to validate the target in + * wake_up_new_task() anyway since the cpu might have gone away. */ again: - while (p->state == TASK_WAKING) + while (p->state == TASK_WAKING && !(p->flags & PF_STARTING)) cpu_relax(); rq = task_rq_lock(p, &flags); - if (p->state == TASK_WAKING) { + if (p->state == TASK_WAKING && !(p->flags & PF_STARTING)) { task_rq_unlock(rq, &flags); goto again; } diff --git a/kernel/smp.c b/kernel/smp.c index f10408422444..9867b6bfefce 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -12,8 +12,6 @@ #include <linux/smp.h> #include <linux/cpu.h> -static DEFINE_PER_CPU(struct call_single_queue, call_single_queue); - static struct { struct list_head queue; raw_spinlock_t lock; @@ -33,12 +31,14 @@ struct call_function_data { cpumask_var_t cpumask; }; +static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_function_data, cfd_data); + struct call_single_queue { struct list_head list; raw_spinlock_t lock; }; -static DEFINE_PER_CPU(struct call_function_data, cfd_data); +static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_queue, call_single_queue); static int hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu) @@ -256,7 +256,7 @@ void generic_smp_call_function_single_interrupt(void) } } -static DEFINE_PER_CPU(struct call_single_data, csd_data); +static DEFINE_PER_CPU_SHARED_ALIGNED(struct call_single_data, csd_data); /* * smp_call_function_single - Run a function on a specific CPU diff --git a/kernel/softirq.c b/kernel/softirq.c index a09502e2ef75..7c1a67ef0274 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -500,22 +500,17 @@ EXPORT_SYMBOL(tasklet_kill); */ /* - * The trampoline is called when the hrtimer expires. If this is - * called from the hrtimer interrupt then we schedule the tasklet as - * the timer callback function expects to run in softirq context. If - * it's called in softirq context anyway (i.e. high resolution timers - * disabled) then the hrtimer callback is called right away. + * The trampoline is called when the hrtimer expires. It schedules a tasklet + * to run __tasklet_hrtimer_trampoline() which in turn will call the intended + * hrtimer callback, but from softirq context. */ static enum hrtimer_restart __hrtimer_tasklet_trampoline(struct hrtimer *timer) { struct tasklet_hrtimer *ttimer = container_of(timer, struct tasklet_hrtimer, timer); - if (hrtimer_is_hres_active(timer)) { - tasklet_hi_schedule(&ttimer->tasklet); - return HRTIMER_NORESTART; - } - return ttimer->function(timer); + tasklet_hi_schedule(&ttimer->tasklet); + return HRTIMER_NORESTART; } /* diff --git a/kernel/softlockup.c b/kernel/softlockup.c index d22579087e27..0d4c7898ab80 100644 --- a/kernel/softlockup.c +++ b/kernel/softlockup.c @@ -25,6 +25,7 @@ static DEFINE_SPINLOCK(print_lock); static DEFINE_PER_CPU(unsigned long, softlockup_touch_ts); /* touch timestamp */ static DEFINE_PER_CPU(unsigned long, softlockup_print_ts); /* print timestamp */ static DEFINE_PER_CPU(struct task_struct *, softlockup_watchdog); +static DEFINE_PER_CPU(bool, softlock_touch_sync); static int __read_mostly did_panic; int __read_mostly softlockup_thresh = 60; @@ -79,6 +80,12 @@ void touch_softlockup_watchdog(void) } EXPORT_SYMBOL(touch_softlockup_watchdog); +void touch_softlockup_watchdog_sync(void) +{ + __raw_get_cpu_var(softlock_touch_sync) = true; + __raw_get_cpu_var(softlockup_touch_ts) = 0; +} + void touch_all_softlockup_watchdogs(void) { int cpu; @@ -118,6 +125,14 @@ void softlockup_tick(void) } if (touch_ts == 0) { + if (unlikely(per_cpu(softlock_touch_sync, this_cpu))) { + /* + * If the time stamp was touched atomically + * make sure the scheduler tick is up to date. + */ + per_cpu(softlock_touch_sync, this_cpu) = false; + sched_clock_tick(); + } __touch_softlockup_watchdog(); return; } diff --git a/kernel/srcu.c b/kernel/srcu.c index 818d7d9aa03c..bde4295774c8 100644 --- a/kernel/srcu.c +++ b/kernel/srcu.c @@ -34,6 +34,30 @@ #include <linux/smp.h> #include <linux/srcu.h> +static int init_srcu_struct_fields(struct srcu_struct *sp) +{ + sp->completed = 0; + mutex_init(&sp->mutex); + sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); + return sp->per_cpu_ref ? 0 : -ENOMEM; +} + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + +int __init_srcu_struct(struct srcu_struct *sp, const char *name, + struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* Don't re-initialize a lock while it is held. */ + debug_check_no_locks_freed((void *)sp, sizeof(*sp)); + lockdep_init_map(&sp->dep_map, name, key, 0); +#endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + return init_srcu_struct_fields(sp); +} +EXPORT_SYMBOL_GPL(__init_srcu_struct); + +#else /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + /** * init_srcu_struct - initialize a sleep-RCU structure * @sp: structure to initialize. @@ -44,13 +68,12 @@ */ int init_srcu_struct(struct srcu_struct *sp) { - sp->completed = 0; - mutex_init(&sp->mutex); - sp->per_cpu_ref = alloc_percpu(struct srcu_struct_array); - return (sp->per_cpu_ref ? 0 : -ENOMEM); + return init_srcu_struct_fields(sp); } EXPORT_SYMBOL_GPL(init_srcu_struct); +#endif /* #else #ifdef CONFIG_DEBUG_LOCK_ALLOC */ + /* * srcu_readers_active_idx -- returns approximate number of readers * active on the specified rank of per-CPU counters. @@ -100,15 +123,12 @@ void cleanup_srcu_struct(struct srcu_struct *sp) } EXPORT_SYMBOL_GPL(cleanup_srcu_struct); -/** - * srcu_read_lock - register a new reader for an SRCU-protected structure. - * @sp: srcu_struct in which to register the new reader. - * +/* * Counts the new reader in the appropriate per-CPU element of the * srcu_struct. Must be called from process context. * Returns an index that must be passed to the matching srcu_read_unlock(). */ -int srcu_read_lock(struct srcu_struct *sp) +int __srcu_read_lock(struct srcu_struct *sp) { int idx; @@ -120,31 +140,27 @@ int srcu_read_lock(struct srcu_struct *sp) preempt_enable(); return idx; } -EXPORT_SYMBOL_GPL(srcu_read_lock); +EXPORT_SYMBOL_GPL(__srcu_read_lock); -/** - * srcu_read_unlock - unregister a old reader from an SRCU-protected structure. - * @sp: srcu_struct in which to unregister the old reader. - * @idx: return value from corresponding srcu_read_lock(). - * +/* * Removes the count for the old reader from the appropriate per-CPU * element of the srcu_struct. Note that this may well be a different * CPU than that which was incremented by the corresponding srcu_read_lock(). * Must be called from process context. */ -void srcu_read_unlock(struct srcu_struct *sp, int idx) +void __srcu_read_unlock(struct srcu_struct *sp, int idx) { preempt_disable(); srcu_barrier(); /* ensure compiler won't misorder critical section. */ per_cpu_ptr(sp->per_cpu_ref, smp_processor_id())->c[idx]--; preempt_enable(); } -EXPORT_SYMBOL_GPL(srcu_read_unlock); +EXPORT_SYMBOL_GPL(__srcu_read_unlock); /* * Helper function for synchronize_srcu() and synchronize_srcu_expedited(). */ -void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) +static void __synchronize_srcu(struct srcu_struct *sp, void (*sync_func)(void)) { int idx; diff --git a/kernel/sys.c b/kernel/sys.c index 26a6b73a6b85..18bde979f346 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -222,6 +222,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) if (which > PRIO_USER || which < PRIO_PROCESS) return -EINVAL; + rcu_read_lock(); read_lock(&tasklist_lock); switch (which) { case PRIO_PROCESS: @@ -267,6 +268,7 @@ SYSCALL_DEFINE2(getpriority, int, which, int, who) } out_unlock: read_unlock(&tasklist_lock); + rcu_read_unlock(); return retval; } diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index e85c23404d34..13700833c181 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -343,7 +343,19 @@ static void clocksource_resume_watchdog(void) { unsigned long flags; - spin_lock_irqsave(&watchdog_lock, flags); + /* + * We use trylock here to avoid a potential dead lock when + * kgdb calls this code after the kernel has been stopped with + * watchdog_lock held. When watchdog_lock is held we just + * return and accept, that the watchdog might trigger and mark + * the monitored clock source (usually TSC) unstable. + * + * This does not affect the other caller clocksource_resume() + * because at this point the kernel is UP, interrupts are + * disabled and nothing can hold watchdog_lock. + */ + if (!spin_trylock_irqsave(&watchdog_lock, flags)) + return; clocksource_reset_watchdog(); spin_unlock_irqrestore(&watchdog_lock, flags); } @@ -458,8 +470,8 @@ void clocksource_resume(void) * clocksource_touch_watchdog - Update watchdog * * Update the watchdog after exception contexts such as kgdb so as not - * to incorrectly trip the watchdog. - * + * to incorrectly trip the watchdog. This might fail when the kernel + * was stopped in code which holds watchdog_lock. */ void clocksource_touch_watchdog(void) { diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 7faaa32fbf4f..e2ab064c6d41 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -880,6 +880,7 @@ void getboottime(struct timespec *ts) set_normalized_timespec(ts, -boottime.tv_sec, -boottime.tv_nsec); } +EXPORT_SYMBOL_GPL(getboottime); /** * monotonic_to_bootbased - Convert the monotonic time to boot based. @@ -889,6 +890,7 @@ void monotonic_to_bootbased(struct timespec *ts) { *ts = timespec_add_safe(*ts, total_sleep_time); } +EXPORT_SYMBOL_GPL(monotonic_to_bootbased); unsigned long get_seconds(void) { diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 6c22d8a2f289..13e13d428cd3 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -27,9 +27,7 @@ config HAVE_FUNCTION_GRAPH_TRACER config HAVE_FUNCTION_GRAPH_FP_TEST bool help - An arch may pass in a unique value (frame pointer) to both the - entering and exiting of a function. On exit, the value is compared - and if it does not match, then it will panic the kernel. + See Documentation/trace/ftrace-design.txt config HAVE_FUNCTION_TRACE_MCOUNT_TEST bool @@ -330,15 +328,6 @@ config BRANCH_TRACER Say N if unsure. -config POWER_TRACER - bool "Trace power consumption behavior" - depends on X86 - select GENERIC_TRACER - help - This tracer helps developers to analyze and optimize the kernel's - power management decisions, specifically the C-state and P-state - behavior. - config KSYM_TRACER bool "Trace read and write access on kernel memory locations" depends on HAVE_HW_BREAKPOINT @@ -451,7 +440,7 @@ config BLK_DEV_IO_TRACE config KPROBE_EVENT depends on KPROBES - depends on X86 + depends on HAVE_REGS_AND_STACK_ACCESS_API bool "Enable kprobes-based dynamic events" select TRACING default y diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 1904797f4a8a..83783579378f 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2402,6 +2402,7 @@ static const struct file_operations ftrace_notrace_fops = { static DEFINE_MUTEX(graph_lock); int ftrace_graph_count; +int ftrace_graph_filter_enabled; unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS] __read_mostly; static void * @@ -2424,7 +2425,7 @@ static void *g_start(struct seq_file *m, loff_t *pos) mutex_lock(&graph_lock); /* Nothing, tell g_show to print all functions are enabled */ - if (!ftrace_graph_count && !*pos) + if (!ftrace_graph_filter_enabled && !*pos) return (void *)1; return __g_next(m, pos); @@ -2470,6 +2471,7 @@ ftrace_graph_open(struct inode *inode, struct file *file) mutex_lock(&graph_lock); if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { + ftrace_graph_filter_enabled = 0; ftrace_graph_count = 0; memset(ftrace_graph_funcs, 0, sizeof(ftrace_graph_funcs)); } @@ -2495,7 +2497,7 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer) struct dyn_ftrace *rec; struct ftrace_page *pg; int search_len; - int found = 0; + int fail = 1; int type, not; char *search; bool exists; @@ -2506,37 +2508,51 @@ ftrace_set_func(unsigned long *array, int *idx, char *buffer) /* decode regex */ type = filter_parse_regex(buffer, strlen(buffer), &search, ¬); - if (not) - return -EINVAL; + if (!not && *idx >= FTRACE_GRAPH_MAX_FUNCS) + return -EBUSY; search_len = strlen(search); mutex_lock(&ftrace_lock); do_for_each_ftrace_rec(pg, rec) { - if (*idx >= FTRACE_GRAPH_MAX_FUNCS) - break; - if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE)) continue; if (ftrace_match_record(rec, search, search_len, type)) { - /* ensure it is not already in the array */ + /* if it is in the array */ exists = false; - for (i = 0; i < *idx; i++) + for (i = 0; i < *idx; i++) { if (array[i] == rec->ip) { exists = true; break; } - if (!exists) - array[(*idx)++] = rec->ip; - found = 1; + } + + if (!not) { + fail = 0; + if (!exists) { + array[(*idx)++] = rec->ip; + if (*idx >= FTRACE_GRAPH_MAX_FUNCS) + goto out; + } + } else { + if (exists) { + array[i] = array[--(*idx)]; + array[*idx] = 0; + fail = 0; + } + } } } while_for_each_ftrace_rec(); - +out: mutex_unlock(&ftrace_lock); - return found ? 0 : -EINVAL; + if (fail) + return -EINVAL; + + ftrace_graph_filter_enabled = 1; + return 0; } static ssize_t @@ -2546,16 +2562,11 @@ ftrace_graph_write(struct file *file, const char __user *ubuf, struct trace_parser parser; ssize_t read, ret; - if (!cnt || cnt < 0) + if (!cnt) return 0; mutex_lock(&graph_lock); - if (ftrace_graph_count >= FTRACE_GRAPH_MAX_FUNCS) { - ret = -EBUSY; - goto out_unlock; - } - if (trace_parser_get_init(&parser, FTRACE_BUFF_MAX)) { ret = -ENOMEM; goto out_unlock; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index edefe3b2801b..8c1b2d290718 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -464,6 +464,8 @@ struct ring_buffer_iter { struct ring_buffer_per_cpu *cpu_buffer; unsigned long head; struct buffer_page *head_page; + struct buffer_page *cache_reader_page; + unsigned long cache_read; u64 read_stamp; }; @@ -2716,6 +2718,8 @@ static void rb_iter_reset(struct ring_buffer_iter *iter) iter->read_stamp = cpu_buffer->read_stamp; else iter->read_stamp = iter->head_page->page->time_stamp; + iter->cache_reader_page = cpu_buffer->reader_page; + iter->cache_read = cpu_buffer->read; } /** @@ -3060,13 +3064,22 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) struct ring_buffer_event *event; int nr_loops = 0; - if (ring_buffer_iter_empty(iter)) - return NULL; - cpu_buffer = iter->cpu_buffer; buffer = cpu_buffer->buffer; + /* + * Check if someone performed a consuming read to + * the buffer. A consuming read invalidates the iterator + * and we need to reset the iterator in this case. + */ + if (unlikely(iter->cache_read != cpu_buffer->read || + iter->cache_reader_page != cpu_buffer->reader_page)) + rb_iter_reset(iter); + again: + if (ring_buffer_iter_empty(iter)) + return NULL; + /* * We repeat when a timestamp is encountered. * We can get multiple timestamps by nested interrupts or also @@ -3081,6 +3094,11 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) if (rb_per_cpu_empty(cpu_buffer)) return NULL; + if (iter->head >= local_read(&iter->head_page->page->commit)) { + rb_inc_iter(iter); + goto again; + } + event = rb_iter_head_event(iter); switch (event->type_len) { diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0df1b0f2cb9e..032c57ca6502 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -32,6 +32,7 @@ #include <linux/splice.h> #include <linux/kdebug.h> #include <linux/string.h> +#include <linux/rwsem.h> #include <linux/ctype.h> #include <linux/init.h> #include <linux/poll.h> @@ -102,9 +103,6 @@ static inline void ftrace_enable_cpu(void) static cpumask_var_t __read_mostly tracing_buffer_mask; -/* Define which cpu buffers are currently read in trace_pipe */ -static cpumask_var_t tracing_reader_cpumask; - #define for_each_tracing_cpu(cpu) \ for_each_cpu(cpu, tracing_buffer_mask) @@ -243,12 +241,91 @@ static struct tracer *current_trace __read_mostly; /* * trace_types_lock is used to protect the trace_types list. - * This lock is also used to keep user access serialized. - * Accesses from userspace will grab this lock while userspace - * activities happen inside the kernel. */ static DEFINE_MUTEX(trace_types_lock); +/* + * serialize the access of the ring buffer + * + * ring buffer serializes readers, but it is low level protection. + * The validity of the events (which returns by ring_buffer_peek() ..etc) + * are not protected by ring buffer. + * + * The content of events may become garbage if we allow other process consumes + * these events concurrently: + * A) the page of the consumed events may become a normal page + * (not reader page) in ring buffer, and this page will be rewrited + * by events producer. + * B) The page of the consumed events may become a page for splice_read, + * and this page will be returned to system. + * + * These primitives allow multi process access to different cpu ring buffer + * concurrently. + * + * These primitives don't distinguish read-only and read-consume access. + * Multi read-only access are also serialized. + */ + +#ifdef CONFIG_SMP +static DECLARE_RWSEM(all_cpu_access_lock); +static DEFINE_PER_CPU(struct mutex, cpu_access_lock); + +static inline void trace_access_lock(int cpu) +{ + if (cpu == TRACE_PIPE_ALL_CPU) { + /* gain it for accessing the whole ring buffer. */ + down_write(&all_cpu_access_lock); + } else { + /* gain it for accessing a cpu ring buffer. */ + + /* Firstly block other trace_access_lock(TRACE_PIPE_ALL_CPU). */ + down_read(&all_cpu_access_lock); + + /* Secondly block other access to this @cpu ring buffer. */ + mutex_lock(&per_cpu(cpu_access_lock, cpu)); + } +} + +static inline void trace_access_unlock(int cpu) +{ + if (cpu == TRACE_PIPE_ALL_CPU) { + up_write(&all_cpu_access_lock); + } else { + mutex_unlock(&per_cpu(cpu_access_lock, cpu)); + up_read(&all_cpu_access_lock); + } +} + +static inline void trace_access_lock_init(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + mutex_init(&per_cpu(cpu_access_lock, cpu)); +} + +#else + +static DEFINE_MUTEX(access_lock); + +static inline void trace_access_lock(int cpu) +{ + (void)cpu; + mutex_lock(&access_lock); +} + +static inline void trace_access_unlock(int cpu) +{ + (void)cpu; + mutex_unlock(&access_lock); +} + +static inline void trace_access_lock_init(void) +{ +} + +#endif + /* trace_wait is a waitqueue for tasks blocked on trace_poll */ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); @@ -951,6 +1028,11 @@ void trace_find_cmdline(int pid, char comm[]) return; } + if (WARN_ON_ONCE(pid < 0)) { + strcpy(comm, "<XXX>"); + return; + } + if (pid > PID_MAX_DEFAULT) { strcpy(comm, "<...>"); return; @@ -1315,8 +1397,10 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) entry->fmt = fmt; memcpy(entry->buf, trace_buf, sizeof(u32) * len); - if (!filter_check_discard(call, entry, buffer, event)) + if (!filter_check_discard(call, entry, buffer, event)) { ring_buffer_unlock_commit(buffer, event); + ftrace_trace_stack(buffer, flags, 6, pc); + } out_unlock: arch_spin_unlock(&trace_buf_lock); @@ -1389,8 +1473,10 @@ int trace_array_vprintk(struct trace_array *tr, memcpy(&entry->buf, trace_buf, len); entry->buf[len] = '\0'; - if (!filter_check_discard(call, entry, buffer, event)) + if (!filter_check_discard(call, entry, buffer, event)) { ring_buffer_unlock_commit(buffer, event); + ftrace_trace_stack(buffer, irq_flags, 6, pc); + } out_unlock: arch_spin_unlock(&trace_buf_lock); @@ -1580,12 +1666,6 @@ static void tracing_iter_reset(struct trace_iterator *iter, int cpu) } /* - * No necessary locking here. The worst thing which can - * happen is loosing events consumed at the same time - * by a trace_pipe reader. - * Other than that, we don't risk to crash the ring buffer - * because it serializes the readers. - * * The current tracer is copied to avoid a global locking * all around. */ @@ -1640,12 +1720,16 @@ static void *s_start(struct seq_file *m, loff_t *pos) } trace_event_read_lock(); + trace_access_lock(cpu_file); return p; } static void s_stop(struct seq_file *m, void *p) { + struct trace_iterator *iter = m->private; + atomic_dec(&trace_record_cmdline_disabled); + trace_access_unlock(iter->cpu_file); trace_event_read_unlock(); } @@ -2836,22 +2920,6 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp) mutex_lock(&trace_types_lock); - /* We only allow one reader per cpu */ - if (cpu_file == TRACE_PIPE_ALL_CPU) { - if (!cpumask_empty(tracing_reader_cpumask)) { - ret = -EBUSY; - goto out; - } - cpumask_setall(tracing_reader_cpumask); - } else { - if (!cpumask_test_cpu(cpu_file, tracing_reader_cpumask)) - cpumask_set_cpu(cpu_file, tracing_reader_cpumask); - else { - ret = -EBUSY; - goto out; - } - } - /* create a buffer to store the information to pass to userspace */ iter = kzalloc(sizeof(*iter), GFP_KERNEL); if (!iter) { @@ -2907,12 +2975,6 @@ static int tracing_release_pipe(struct inode *inode, struct file *file) mutex_lock(&trace_types_lock); - if (iter->cpu_file == TRACE_PIPE_ALL_CPU) - cpumask_clear(tracing_reader_cpumask); - else - cpumask_clear_cpu(iter->cpu_file, tracing_reader_cpumask); - - if (iter->trace->pipe_close) iter->trace->pipe_close(iter); @@ -3074,6 +3136,7 @@ waitagain: iter->pos = -1; trace_event_read_lock(); + trace_access_lock(iter->cpu_file); while (find_next_entry_inc(iter) != NULL) { enum print_line_t ret; int len = iter->seq.len; @@ -3090,6 +3153,7 @@ waitagain: if (iter->seq.len >= cnt) break; } + trace_access_unlock(iter->cpu_file); trace_event_read_unlock(); /* Now copy what we have to the user */ @@ -3215,6 +3279,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, } trace_event_read_lock(); + trace_access_lock(iter->cpu_file); /* Fill as many pages as possible. */ for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) { @@ -3238,6 +3303,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, trace_seq_init(&iter->seq); } + trace_access_unlock(iter->cpu_file); trace_event_read_unlock(); mutex_unlock(&iter->mutex); @@ -3539,10 +3605,12 @@ tracing_buffers_read(struct file *filp, char __user *ubuf, info->read = 0; + trace_access_lock(info->cpu); ret = ring_buffer_read_page(info->tr->buffer, &info->spare, count, info->cpu, 0); + trace_access_unlock(info->cpu); if (ret < 0) return 0; @@ -3670,6 +3738,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, len &= PAGE_MASK; } + trace_access_lock(info->cpu); entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) { @@ -3717,6 +3786,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); } + trace_access_unlock(info->cpu); spd.nr_pages = i; /* did we read anything? */ @@ -4153,6 +4223,8 @@ static __init int tracer_init_debugfs(void) struct dentry *d_tracer; int cpu; + trace_access_lock_init(); + d_tracer = tracing_init_dentry(); trace_create_file("tracing_enabled", 0644, d_tracer, @@ -4387,9 +4459,6 @@ __init static int tracer_alloc_buffers(void) if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL)) goto out_free_buffer_mask; - if (!zalloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL)) - goto out_free_tracing_cpumask; - /* To save memory, keep the ring buffer size to its minimum */ if (ring_buffer_expanded) ring_buf_size = trace_buf_size; @@ -4447,8 +4516,6 @@ __init static int tracer_alloc_buffers(void) return 0; out_free_cpumask: - free_cpumask_var(tracing_reader_cpumask); -out_free_tracing_cpumask: free_cpumask_var(tracing_cpumask); out_free_buffer_mask: free_cpumask_var(tracing_buffer_mask); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 4df6a77eb196..fd05bcaf91b0 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -497,6 +497,7 @@ trace_print_graph_duration(unsigned long long duration, struct trace_seq *s); #ifdef CONFIG_DYNAMIC_FTRACE /* TODO: make this variable */ #define FTRACE_GRAPH_MAX_FUNCS 32 +extern int ftrace_graph_filter_enabled; extern int ftrace_graph_count; extern unsigned long ftrace_graph_funcs[FTRACE_GRAPH_MAX_FUNCS]; @@ -504,7 +505,7 @@ static inline int ftrace_graph_addr(unsigned long addr) { int i; - if (!ftrace_graph_count || test_tsk_trace_graph(current)) + if (!ftrace_graph_filter_enabled) return 1; for (i = 0; i < ftrace_graph_count; i++) { @@ -791,7 +792,8 @@ extern const char *__stop___trace_bprintk_fmt[]; #undef FTRACE_ENTRY #define FTRACE_ENTRY(call, struct_name, id, tstruct, print) \ - extern struct ftrace_event_call event_##call; + extern struct ftrace_event_call \ + __attribute__((__aligned__(4))) event_##call; #undef FTRACE_ENTRY_DUP #define FTRACE_ENTRY_DUP(call, struct_name, id, tstruct, print) \ FTRACE_ENTRY(call, struct_name, id, PARAMS(tstruct), PARAMS(print)) diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index 4a194f08f88c..b9bc4d470177 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -307,8 +307,23 @@ static int annotated_branch_stat_cmp(void *p1, void *p2) return -1; if (percent_a > percent_b) return 1; - else - return 0; + + if (a->incorrect < b->incorrect) + return -1; + if (a->incorrect > b->incorrect) + return 1; + + /* + * Since the above shows worse (incorrect) cases + * first, we continue that by showing best (correct) + * cases last. + */ + if (a->correct > b->correct) + return -1; + if (a->correct < b->correct) + return 1; + + return 0; } static struct tracer_stat annotated_branch_stats = { diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 189b09baf4fb..3f972ad98d04 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -60,10 +60,8 @@ int trace_define_field(struct ftrace_event_call *call, const char *type, return 0; err: - if (field) { + if (field) kfree(field->name); - kfree(field->type); - } kfree(field); return -ENOMEM; @@ -520,41 +518,16 @@ out: return ret; } -extern char *__bad_type_size(void); - -#undef FIELD -#define FIELD(type, name) \ - sizeof(type) != sizeof(field.name) ? __bad_type_size() : \ - #type, "common_" #name, offsetof(typeof(field), name), \ - sizeof(field.name), is_signed_type(type) - -static int trace_write_header(struct trace_seq *s) -{ - struct trace_entry field; - - /* struct trace_entry */ - return trace_seq_printf(s, - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\tsigned:%u;\n" - "\n", - FIELD(unsigned short, type), - FIELD(unsigned char, flags), - FIELD(unsigned char, preempt_count), - FIELD(int, pid), - FIELD(int, lock_depth)); -} - static ssize_t event_format_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { struct ftrace_event_call *call = filp->private_data; + struct ftrace_event_field *field; struct trace_seq *s; + int common_field_count = 5; char *buf; - int r; + int r = 0; if (*ppos) return 0; @@ -565,14 +538,48 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt, trace_seq_init(s); - /* If any of the first writes fail, so will the show_format. */ - trace_seq_printf(s, "name: %s\n", call->name); trace_seq_printf(s, "ID: %d\n", call->id); trace_seq_printf(s, "format:\n"); - trace_write_header(s); - r = call->show_format(call, s); + list_for_each_entry_reverse(field, &call->fields, link) { + /* + * Smartly shows the array type(except dynamic array). + * Normal: + * field:TYPE VAR + * If TYPE := TYPE[LEN], it is shown: + * field:TYPE VAR[LEN] + */ + const char *array_descriptor = strchr(field->type, '['); + + if (!strncmp(field->type, "__data_loc", 10)) + array_descriptor = NULL; + + if (!array_descriptor) { + r = trace_seq_printf(s, "\tfield:%s %s;\toffset:%u;" + "\tsize:%u;\tsigned:%d;\n", + field->type, field->name, field->offset, + field->size, !!field->is_signed); + } else { + r = trace_seq_printf(s, "\tfield:%.*s %s%s;\toffset:%u;" + "\tsize:%u;\tsigned:%d;\n", + (int)(array_descriptor - field->type), + field->type, field->name, + array_descriptor, field->offset, + field->size, !!field->is_signed); + } + + if (--common_field_count == 0) + r = trace_seq_printf(s, "\n"); + + if (!r) + break; + } + + if (r) + r = trace_seq_printf(s, "\nprint fmt: %s\n", + call->print_fmt); + if (!r) { /* * ug! The format output is bigger than a PAGE!! @@ -948,10 +955,6 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, filter); } - /* A trace may not want to export its format */ - if (!call->show_format) - return 0; - trace_create_file("format", 0444, call->dir, call, format); diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index d4fa5dc1ee4e..e091f64ba6ce 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -62,78 +62,6 @@ static void __always_unused ____ftrace_check_##name(void) \ #include "trace_entries.h" - -#undef __field -#define __field(type, item) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ - offsetof(typeof(field), item), \ - sizeof(field.item), is_signed_type(type)); \ - if (!ret) \ - return 0; - -#undef __field_desc -#define __field_desc(type, container, item) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ - offsetof(typeof(field), container.item), \ - sizeof(field.container.item), \ - is_signed_type(type)); \ - if (!ret) \ - return 0; - -#undef __array -#define __array(type, item, len) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ - "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ - offsetof(typeof(field), item), \ - sizeof(field.item), is_signed_type(type)); \ - if (!ret) \ - return 0; - -#undef __array_desc -#define __array_desc(type, container, item, len) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ - "offset:%zu;\tsize:%zu;\tsigned:%u;\n", \ - offsetof(typeof(field), container.item), \ - sizeof(field.container.item), \ - is_signed_type(type)); \ - if (!ret) \ - return 0; - -#undef __dynamic_array -#define __dynamic_array(type, item) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%zu;\tsize:0;\tsigned:%u;\n", \ - offsetof(typeof(field), item), \ - is_signed_type(type)); \ - if (!ret) \ - return 0; - -#undef F_printk -#define F_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args) - -#undef __entry -#define __entry REC - -#undef FTRACE_ENTRY -#define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ -static int \ -ftrace_format_##name(struct ftrace_event_call *unused, \ - struct trace_seq *s) \ -{ \ - struct struct_name field __attribute__((unused)); \ - int ret = 0; \ - \ - tstruct; \ - \ - trace_seq_printf(s, "\nprint fmt: " print); \ - \ - return ret; \ -} - -#include "trace_entries.h" - #undef __field #define __field(type, item) \ ret = trace_define_field(event_call, #type, #item, \ @@ -175,7 +103,12 @@ ftrace_format_##name(struct ftrace_event_call *unused, \ return ret; #undef __dynamic_array -#define __dynamic_array(type, item) +#define __dynamic_array(type, item) \ + ret = trace_define_field(event_call, #type, #item, \ + offsetof(typeof(field), item), \ + 0, is_signed_type(type), FILTER_OTHER);\ + if (ret) \ + return ret; #undef FTRACE_ENTRY #define FTRACE_ENTRY(name, struct_name, id, tstruct, print) \ @@ -198,6 +131,9 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call) return 0; } +#undef __entry +#define __entry REC + #undef __field #define __field(type, item) @@ -213,6 +149,9 @@ static int ftrace_raw_init_event(struct ftrace_event_call *call) #undef __dynamic_array #define __dynamic_array(type, item) +#undef F_printk +#define F_printk(fmt, args...) #fmt ", " __stringify(args) + #undef FTRACE_ENTRY #define FTRACE_ENTRY(call, struct_name, type, tstruct, print) \ \ @@ -223,7 +162,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ .id = type, \ .system = __stringify(TRACE_SYSTEM), \ .raw_init = ftrace_raw_init_event, \ - .show_format = ftrace_format_##call, \ + .print_fmt = print, \ .define_fields = ftrace_define_fields_##call, \ }; \ diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index b1342c5d37cf..e998a824e9db 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -18,6 +18,7 @@ struct fgraph_cpu_data { pid_t last_pid; int depth; int ignore; + unsigned long enter_funcs[FTRACE_RETFUNC_DEPTH]; }; struct fgraph_data { @@ -212,13 +213,11 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) int cpu; int pc; - if (unlikely(!tr)) - return 0; - if (!ftrace_trace_task(current)) return 0; - if (!ftrace_graph_addr(trace->func)) + /* trace it when it is-nested-in or is a function enabled. */ + if (!(trace->depth || ftrace_graph_addr(trace->func))) return 0; local_irq_save(flags); @@ -231,9 +230,6 @@ int trace_graph_entry(struct ftrace_graph_ent *trace) } else { ret = 0; } - /* Only do the atomic if it is not already set */ - if (!test_tsk_trace_graph(current)) - set_tsk_trace_graph(current); atomic_dec(&data->disabled); local_irq_restore(flags); @@ -281,17 +277,24 @@ void trace_graph_return(struct ftrace_graph_ret *trace) pc = preempt_count(); __trace_graph_return(tr, trace, flags, pc); } - if (!trace->depth) - clear_tsk_trace_graph(current); atomic_dec(&data->disabled); local_irq_restore(flags); } +void set_graph_array(struct trace_array *tr) +{ + graph_array = tr; + + /* Make graph_array visible before we start tracing */ + + smp_mb(); +} + static int graph_trace_init(struct trace_array *tr) { int ret; - graph_array = tr; + set_graph_array(tr); ret = register_ftrace_graph(&trace_graph_return, &trace_graph_entry); if (ret) @@ -301,11 +304,6 @@ static int graph_trace_init(struct trace_array *tr) return 0; } -void set_graph_array(struct trace_array *tr) -{ - graph_array = tr; -} - static void graph_trace_reset(struct trace_array *tr) { tracing_stop_cmdline_record(); @@ -673,15 +671,21 @@ print_graph_entry_leaf(struct trace_iterator *iter, duration = graph_ret->rettime - graph_ret->calltime; if (data) { + struct fgraph_cpu_data *cpu_data; int cpu = iter->cpu; - int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); + + cpu_data = per_cpu_ptr(data->cpu_data, cpu); /* * Comments display at + 1 to depth. Since * this is a leaf function, keep the comments * equal to this depth. */ - *depth = call->depth - 1; + cpu_data->depth = call->depth - 1; + + /* No need to keep this function around for this depth */ + if (call->depth < FTRACE_RETFUNC_DEPTH) + cpu_data->enter_funcs[call->depth] = 0; } /* Overhead */ @@ -721,10 +725,15 @@ print_graph_entry_nested(struct trace_iterator *iter, int i; if (data) { + struct fgraph_cpu_data *cpu_data; int cpu = iter->cpu; - int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); - *depth = call->depth; + cpu_data = per_cpu_ptr(data->cpu_data, cpu); + cpu_data->depth = call->depth; + + /* Save this function pointer to see if the exit matches */ + if (call->depth < FTRACE_RETFUNC_DEPTH) + cpu_data->enter_funcs[call->depth] = call->func; } /* No overhead */ @@ -854,19 +863,28 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, struct fgraph_data *data = iter->private; pid_t pid = ent->pid; int cpu = iter->cpu; + int func_match = 1; int ret; int i; if (data) { + struct fgraph_cpu_data *cpu_data; int cpu = iter->cpu; - int *depth = &(per_cpu_ptr(data->cpu_data, cpu)->depth); + + cpu_data = per_cpu_ptr(data->cpu_data, cpu); /* * Comments display at + 1 to depth. This is the * return from a function, we now want the comments * to display at the same level of the bracket. */ - *depth = trace->depth - 1; + cpu_data->depth = trace->depth - 1; + + if (trace->depth < FTRACE_RETFUNC_DEPTH) { + if (cpu_data->enter_funcs[trace->depth] != trace->func) + func_match = 0; + cpu_data->enter_funcs[trace->depth] = 0; + } } if (print_graph_prologue(iter, s, 0, 0)) @@ -891,9 +909,21 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s, return TRACE_TYPE_PARTIAL_LINE; } - ret = trace_seq_printf(s, "}\n"); - if (!ret) - return TRACE_TYPE_PARTIAL_LINE; + /* + * If the return function does not have a matching entry, + * then the entry was lost. Instead of just printing + * the '}' and letting the user guess what function this + * belongs to, write out the function name. + */ + if (func_match) { + ret = trace_seq_printf(s, "}\n"); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } else { + ret = trace_seq_printf(s, "} (%ps)\n", (void *)trace->func); + if (!ret) + return TRACE_TYPE_PARTIAL_LINE; + } /* Overrun */ if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERRUN) { diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 6178abf3637e..505c92273b1a 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -635,12 +635,12 @@ static int create_trace_probe(int argc, char **argv) event = strchr(group, '/') + 1; event[-1] = '\0'; if (strlen(group) == 0) { - pr_info("Group name is not specifiled\n"); + pr_info("Group name is not specified\n"); return -EINVAL; } } if (strlen(event) == 0) { - pr_info("Event name is not specifiled\n"); + pr_info("Event name is not specified\n"); return -EINVAL; } } @@ -673,7 +673,7 @@ static int create_trace_probe(int argc, char **argv) return -EINVAL; } /* an address specified */ - ret = strict_strtoul(&argv[0][2], 0, (unsigned long *)&addr); + ret = strict_strtoul(&argv[1][0], 0, (unsigned long *)&addr); if (ret) { pr_info("Failed to parse address.\n"); return ret; @@ -1155,80 +1155,60 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) return 0; } -static int __probe_event_show_format(struct trace_seq *s, - struct trace_probe *tp, const char *fmt, - const char *arg) +static int __set_print_fmt(struct trace_probe *tp, char *buf, int len) { int i; + int pos = 0; - /* Show format */ - if (!trace_seq_printf(s, "\nprint fmt: \"%s", fmt)) - return 0; + const char *fmt, *arg; - for (i = 0; i < tp->nr_args; i++) - if (!trace_seq_printf(s, " %s=%%lx", tp->args[i].name)) - return 0; + if (!probe_is_return(tp)) { + fmt = "(%lx)"; + arg = "REC->" FIELD_STRING_IP; + } else { + fmt = "(%lx <- %lx)"; + arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP; + } - if (!trace_seq_printf(s, "\", %s", arg)) - return 0; + /* When len=0, we just calculate the needed length */ +#define LEN_OR_ZERO (len ? len - pos : 0) - for (i = 0; i < tp->nr_args; i++) - if (!trace_seq_printf(s, ", REC->%s", tp->args[i].name)) - return 0; + pos += snprintf(buf + pos, LEN_OR_ZERO, "\"%s", fmt); - return trace_seq_puts(s, "\n"); -} + for (i = 0; i < tp->nr_args; i++) { + pos += snprintf(buf + pos, LEN_OR_ZERO, " %s=%%lx", + tp->args[i].name); + } -#undef SHOW_FIELD -#define SHOW_FIELD(type, item, name) \ - do { \ - ret = trace_seq_printf(s, "\tfield:" #type " %s;\t" \ - "offset:%u;\tsize:%u;\tsigned:%d;\n", name,\ - (unsigned int)offsetof(typeof(field), item),\ - (unsigned int)sizeof(type), \ - is_signed_type(type)); \ - if (!ret) \ - return 0; \ - } while (0) + pos += snprintf(buf + pos, LEN_OR_ZERO, "\", %s", arg); -static int kprobe_event_show_format(struct ftrace_event_call *call, - struct trace_seq *s) -{ - struct kprobe_trace_entry field __attribute__((unused)); - int ret, i; - struct trace_probe *tp = (struct trace_probe *)call->data; - - SHOW_FIELD(unsigned long, ip, FIELD_STRING_IP); - SHOW_FIELD(int, nargs, FIELD_STRING_NARGS); + for (i = 0; i < tp->nr_args; i++) { + pos += snprintf(buf + pos, LEN_OR_ZERO, ", REC->%s", + tp->args[i].name); + } - /* Show fields */ - for (i = 0; i < tp->nr_args; i++) - SHOW_FIELD(unsigned long, args[i], tp->args[i].name); - trace_seq_puts(s, "\n"); +#undef LEN_OR_ZERO - return __probe_event_show_format(s, tp, "(%lx)", - "REC->" FIELD_STRING_IP); + /* return the length of print_fmt */ + return pos; } -static int kretprobe_event_show_format(struct ftrace_event_call *call, - struct trace_seq *s) +static int set_print_fmt(struct trace_probe *tp) { - struct kretprobe_trace_entry field __attribute__((unused)); - int ret, i; - struct trace_probe *tp = (struct trace_probe *)call->data; + int len; + char *print_fmt; - SHOW_FIELD(unsigned long, func, FIELD_STRING_FUNC); - SHOW_FIELD(unsigned long, ret_ip, FIELD_STRING_RETIP); - SHOW_FIELD(int, nargs, FIELD_STRING_NARGS); + /* First: called with 0 length to calculate the needed length */ + len = __set_print_fmt(tp, NULL, 0); + print_fmt = kmalloc(len + 1, GFP_KERNEL); + if (!print_fmt) + return -ENOMEM; - /* Show fields */ - for (i = 0; i < tp->nr_args; i++) - SHOW_FIELD(unsigned long, args[i], tp->args[i].name); - trace_seq_puts(s, "\n"); + /* Second: actually write the @print_fmt */ + __set_print_fmt(tp, print_fmt, len + 1); + tp->call.print_fmt = print_fmt; - return __probe_event_show_format(s, tp, "(%lx <- %lx)", - "REC->" FIELD_STRING_FUNC - ", REC->" FIELD_STRING_RETIP); + return 0; } #ifdef CONFIG_PERF_EVENTS @@ -1359,18 +1339,20 @@ static int register_probe_event(struct trace_probe *tp) if (probe_is_return(tp)) { tp->event.trace = print_kretprobe_event; call->raw_init = probe_event_raw_init; - call->show_format = kretprobe_event_show_format; call->define_fields = kretprobe_event_define_fields; } else { tp->event.trace = print_kprobe_event; call->raw_init = probe_event_raw_init; - call->show_format = kprobe_event_show_format; call->define_fields = kprobe_event_define_fields; } + if (set_print_fmt(tp) < 0) + return -ENOMEM; call->event = &tp->event; call->id = register_ftrace_event(&tp->event); - if (!call->id) + if (!call->id) { + kfree(call->print_fmt); return -ENODEV; + } call->enabled = 0; call->regfunc = probe_event_enable; call->unregfunc = probe_event_disable; @@ -1383,6 +1365,7 @@ static int register_probe_event(struct trace_probe *tp) ret = trace_add_event_call(call); if (ret) { pr_info("Failed to register kprobe event: %s\n", call->name); + kfree(call->print_fmt); unregister_ftrace_event(&tp->event); } return ret; @@ -1392,6 +1375,7 @@ static void unregister_probe_event(struct trace_probe *tp) { /* tp->event is unregistered in trace_remove_event_call() */ trace_remove_event_call(&tp->call); + kfree(tp->call.print_fmt); } /* Make a debugfs interface for controling probe points */ diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index 678a5120ee30..f4bc9b27de5f 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -157,6 +157,7 @@ stack_max_size_write(struct file *filp, const char __user *ubuf, unsigned long val, flags; char buf[64]; int ret; + int cpu; if (count >= sizeof(buf)) return -EINVAL; @@ -171,9 +172,20 @@ stack_max_size_write(struct file *filp, const char __user *ubuf, return ret; local_irq_save(flags); + + /* + * In case we trace inside arch_spin_lock() or after (NMI), + * we will cause circular lock, so we also need to increase + * the percpu trace_active here. + */ + cpu = smp_processor_id(); + per_cpu(trace_active, cpu)++; + arch_spin_lock(&max_stack_lock); *ptr = val; arch_spin_unlock(&max_stack_lock); + + per_cpu(trace_active, cpu)--; local_irq_restore(flags); return count; @@ -206,7 +218,13 @@ t_next(struct seq_file *m, void *v, loff_t *pos) static void *t_start(struct seq_file *m, loff_t *pos) { + int cpu; + local_irq_disable(); + + cpu = smp_processor_id(); + per_cpu(trace_active, cpu)++; + arch_spin_lock(&max_stack_lock); if (*pos == 0) @@ -217,7 +235,13 @@ static void *t_start(struct seq_file *m, loff_t *pos) static void t_stop(struct seq_file *m, void *p) { + int cpu; + arch_spin_unlock(&max_stack_lock); + + cpu = smp_processor_id(); + per_cpu(trace_active, cpu)--; + local_irq_enable(); } diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c index 4e332b9e449c..cba47d7935cc 100644 --- a/kernel/trace/trace_syscalls.c +++ b/kernel/trace/trace_syscalls.c @@ -143,70 +143,65 @@ extern char *__bad_type_size(void); #type, #name, offsetof(typeof(trace), name), \ sizeof(trace.name), is_signed_type(type) -int syscall_enter_format(struct ftrace_event_call *call, struct trace_seq *s) +static +int __set_enter_print_fmt(struct syscall_metadata *entry, char *buf, int len) { int i; - int ret; - struct syscall_metadata *entry = call->data; - struct syscall_trace_enter trace; - int offset = offsetof(struct syscall_trace_enter, args); + int pos = 0; - ret = trace_seq_printf(s, "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" - "\tsigned:%u;\n", - SYSCALL_FIELD(int, nr)); - if (!ret) - return 0; + /* When len=0, we just calculate the needed length */ +#define LEN_OR_ZERO (len ? len - pos : 0) + pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); for (i = 0; i < entry->nb_args; i++) { - ret = trace_seq_printf(s, "\tfield:%s %s;", entry->types[i], - entry->args[i]); - if (!ret) - return 0; - ret = trace_seq_printf(s, "\toffset:%d;\tsize:%zu;" - "\tsigned:%u;\n", offset, - sizeof(unsigned long), - is_signed_type(unsigned long)); - if (!ret) - return 0; - offset += sizeof(unsigned long); + pos += snprintf(buf + pos, LEN_OR_ZERO, "%s: 0x%%0%zulx%s", + entry->args[i], sizeof(unsigned long), + i == entry->nb_args - 1 ? "" : ", "); } + pos += snprintf(buf + pos, LEN_OR_ZERO, "\""); - trace_seq_puts(s, "\nprint fmt: \""); for (i = 0; i < entry->nb_args; i++) { - ret = trace_seq_printf(s, "%s: 0x%%0%zulx%s", entry->args[i], - sizeof(unsigned long), - i == entry->nb_args - 1 ? "" : ", "); - if (!ret) - return 0; + pos += snprintf(buf + pos, LEN_OR_ZERO, + ", ((unsigned long)(REC->%s))", entry->args[i]); } - trace_seq_putc(s, '"'); - for (i = 0; i < entry->nb_args; i++) { - ret = trace_seq_printf(s, ", ((unsigned long)(REC->%s))", - entry->args[i]); - if (!ret) - return 0; - } +#undef LEN_OR_ZERO - return trace_seq_putc(s, '\n'); + /* return the length of print_fmt */ + return pos; } -int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s) +static int set_syscall_print_fmt(struct ftrace_event_call *call) { - int ret; - struct syscall_trace_exit trace; + char *print_fmt; + int len; + struct syscall_metadata *entry = call->data; - ret = trace_seq_printf(s, - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" - "\tsigned:%u;\n" - "\tfield:%s %s;\toffset:%zu;\tsize:%zu;" - "\tsigned:%u;\n", - SYSCALL_FIELD(int, nr), - SYSCALL_FIELD(long, ret)); - if (!ret) + if (entry->enter_event != call) { + call->print_fmt = "\"0x%lx\", REC->ret"; return 0; + } - return trace_seq_printf(s, "\nprint fmt: \"0x%%lx\", REC->ret\n"); + /* First: called with 0 length to calculate the needed length */ + len = __set_enter_print_fmt(entry, NULL, 0); + + print_fmt = kmalloc(len + 1, GFP_KERNEL); + if (!print_fmt) + return -ENOMEM; + + /* Second: actually write the @print_fmt */ + __set_enter_print_fmt(entry, print_fmt, len + 1); + call->print_fmt = print_fmt; + + return 0; +} + +static void free_syscall_print_fmt(struct ftrace_event_call *call) +{ + struct syscall_metadata *entry = call->data; + + if (entry->enter_event == call) + kfree(call->print_fmt); } int syscall_enter_define_fields(struct ftrace_event_call *call) @@ -386,12 +381,22 @@ int init_syscall_trace(struct ftrace_event_call *call) { int id; - id = register_ftrace_event(call->event); - if (!id) - return -ENODEV; - call->id = id; - INIT_LIST_HEAD(&call->fields); - return 0; + if (set_syscall_print_fmt(call) < 0) + return -ENOMEM; + + id = trace_event_raw_init(call); + + if (id < 0) { + free_syscall_print_fmt(call); + return id; + } + + return id; +} + +unsigned long __init arch_syscall_addr(int nr) +{ + return (unsigned long)sys_call_table[nr]; } int __init init_ftrace_syscalls(void) @@ -552,7 +557,7 @@ int prof_sysexit_enable(struct ftrace_event_call *call) ret = register_trace_sys_exit(prof_syscall_exit); if (ret) { pr_info("event trace: Could not activate" - "syscall entry trace point"); + "syscall exit trace point"); } else { set_bit(num, enabled_prof_exit_syscalls); sys_prof_refcount_exit++; |