summaryrefslogtreecommitdiffstats
path: root/kernel
diff options
context:
space:
mode:
authorIngo Molnar <mingo@elte.hu>2009-02-11 09:22:04 +0100
committerIngo Molnar <mingo@elte.hu>2009-02-11 09:22:04 +0100
commit95fd4845ed0ffcab305b4f30ce1c12dc34f1b56c (patch)
treeaa2aac22a5b329b778a6771a87bbf1945ad49bbd /kernel
parentd278c48435625cb6b7edcf6a547620768b175709 (diff)
parent8e4921515c1a379539607eb443d51c30f4f7f338 (diff)
downloadlinux-95fd4845ed0ffcab305b4f30ce1c12dc34f1b56c.tar.bz2
Merge commit 'v2.6.29-rc4' into perfcounters/core
Conflicts: arch/x86/kernel/setup_percpu.c arch/x86/mm/fault.c drivers/acpi/processor_idle.c kernel/irq/handle.c
Diffstat (limited to 'kernel')
-rw-r--r--kernel/async.c94
-rw-r--r--kernel/cgroup.c28
-rw-r--r--kernel/cpuset.c13
-rw-r--r--kernel/dma-coherent.c47
-rw-r--r--kernel/fork.c17
-rw-r--r--kernel/hrtimer.c45
-rw-r--r--kernel/irq/chip.c2
-rw-r--r--kernel/irq/handle.c16
-rw-r--r--kernel/irq/manage.c10
-rw-r--r--kernel/irq/numa_migrate.c7
-rw-r--r--kernel/kallsyms.c16
-rw-r--r--kernel/module.c35
-rw-r--r--kernel/posix-cpu-timers.c70
-rw-r--r--kernel/power/disk.c10
-rw-r--r--kernel/power/main.c26
-rw-r--r--kernel/rcuclassic.c2
-rw-r--r--kernel/rcutree.c2
-rw-r--r--kernel/relay.c4
-rw-r--r--kernel/sched.c14
-rw-r--r--kernel/sched_fair.c32
-rw-r--r--kernel/sched_stats.h33
-rw-r--r--kernel/signal.c2
-rw-r--r--kernel/smp.c36
-rw-r--r--kernel/softlockup.c9
-rw-r--r--kernel/sys.c16
-rw-r--r--kernel/sysctl.c2
-rw-r--r--kernel/time/tick-common.c26
-rw-r--r--kernel/time/tick-sched.c2
-rw-r--r--kernel/trace/ftrace.c32
-rw-r--r--kernel/trace/ring_buffer.c15
-rw-r--r--kernel/trace/trace.c5
-rw-r--r--kernel/trace/trace_irqsoff.c1
-rw-r--r--kernel/trace/trace_sched_wakeup.c1
-rw-r--r--kernel/wait.c59
34 files changed, 485 insertions, 244 deletions
diff --git a/kernel/async.c b/kernel/async.c
index 608b32b42812..f565891f2c9b 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -54,6 +54,7 @@ asynchronous and synchronous parts of the kernel.
#include <linux/sched.h>
#include <linux/init.h>
#include <linux/kthread.h>
+#include <linux/delay.h>
#include <asm/atomic.h>
static async_cookie_t next_cookie = 1;
@@ -132,21 +133,23 @@ static void run_one_entry(void)
entry = list_first_entry(&async_pending, struct async_entry, list);
/* 2) move it to the running queue */
- list_del(&entry->list);
- list_add_tail(&entry->list, &async_running);
+ list_move_tail(&entry->list, entry->running);
spin_unlock_irqrestore(&async_lock, flags);
/* 3) run it (and print duration)*/
if (initcall_debug && system_state == SYSTEM_BOOTING) {
- printk("calling %lli_%pF @ %i\n", entry->cookie, entry->func, task_pid_nr(current));
+ printk("calling %lli_%pF @ %i\n", (long long)entry->cookie,
+ entry->func, task_pid_nr(current));
calltime = ktime_get();
}
entry->func(entry->data, entry->cookie);
if (initcall_debug && system_state == SYSTEM_BOOTING) {
rettime = ktime_get();
delta = ktime_sub(rettime, calltime);
- printk("initcall %lli_%pF returned 0 after %lld usecs\n", entry->cookie,
- entry->func, ktime_to_ns(delta) >> 10);
+ printk("initcall %lli_%pF returned 0 after %lld usecs\n",
+ (long long)entry->cookie,
+ entry->func,
+ (long long)ktime_to_ns(delta) >> 10);
}
/* 4) remove it from the running queue */
@@ -205,18 +208,44 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l
return newcookie;
}
+/**
+ * async_schedule - schedule a function for asynchronous execution
+ * @ptr: function to execute asynchronously
+ * @data: data pointer to pass to the function
+ *
+ * Returns an async_cookie_t that may be used for checkpointing later.
+ * Note: This function may be called from atomic or non-atomic contexts.
+ */
async_cookie_t async_schedule(async_func_ptr *ptr, void *data)
{
- return __async_schedule(ptr, data, &async_pending);
+ return __async_schedule(ptr, data, &async_running);
}
EXPORT_SYMBOL_GPL(async_schedule);
-async_cookie_t async_schedule_special(async_func_ptr *ptr, void *data, struct list_head *running)
+/**
+ * async_schedule_domain - schedule a function for asynchronous execution within a certain domain
+ * @ptr: function to execute asynchronously
+ * @data: data pointer to pass to the function
+ * @running: running list for the domain
+ *
+ * Returns an async_cookie_t that may be used for checkpointing later.
+ * @running may be used in the async_synchronize_*_domain() functions
+ * to wait within a certain synchronization domain rather than globally.
+ * A synchronization domain is specified via the running queue @running to use.
+ * Note: This function may be called from atomic or non-atomic contexts.
+ */
+async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data,
+ struct list_head *running)
{
return __async_schedule(ptr, data, running);
}
-EXPORT_SYMBOL_GPL(async_schedule_special);
+EXPORT_SYMBOL_GPL(async_schedule_domain);
+/**
+ * async_synchronize_full - synchronize all asynchronous function calls
+ *
+ * This function waits until all asynchronous function calls have been done.
+ */
void async_synchronize_full(void)
{
do {
@@ -225,13 +254,30 @@ void async_synchronize_full(void)
}
EXPORT_SYMBOL_GPL(async_synchronize_full);
-void async_synchronize_full_special(struct list_head *list)
+/**
+ * async_synchronize_full_domain - synchronize all asynchronous function within a certain domain
+ * @list: running list to synchronize on
+ *
+ * This function waits until all asynchronous function calls for the
+ * synchronization domain specified by the running list @list have been done.
+ */
+void async_synchronize_full_domain(struct list_head *list)
{
- async_synchronize_cookie_special(next_cookie, list);
+ async_synchronize_cookie_domain(next_cookie, list);
}
-EXPORT_SYMBOL_GPL(async_synchronize_full_special);
+EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
-void async_synchronize_cookie_special(async_cookie_t cookie, struct list_head *running)
+/**
+ * async_synchronize_cookie_domain - synchronize asynchronous function calls within a certain domain with cookie checkpointing
+ * @cookie: async_cookie_t to use as checkpoint
+ * @running: running list to synchronize on
+ *
+ * This function waits until all asynchronous function calls for the
+ * synchronization domain specified by the running list @list submitted
+ * prior to @cookie have been done.
+ */
+void async_synchronize_cookie_domain(async_cookie_t cookie,
+ struct list_head *running)
{
ktime_t starttime, delta, endtime;
@@ -247,14 +293,22 @@ void async_synchronize_cookie_special(async_cookie_t cookie, struct list_head *r
delta = ktime_sub(endtime, starttime);
printk("async_continuing @ %i after %lli usec\n",
- task_pid_nr(current), ktime_to_ns(delta) >> 10);
+ task_pid_nr(current),
+ (long long)ktime_to_ns(delta) >> 10);
}
}
-EXPORT_SYMBOL_GPL(async_synchronize_cookie_special);
+EXPORT_SYMBOL_GPL(async_synchronize_cookie_domain);
+/**
+ * async_synchronize_cookie - synchronize asynchronous function calls with cookie checkpointing
+ * @cookie: async_cookie_t to use as checkpoint
+ *
+ * This function waits until all asynchronous function calls prior to @cookie
+ * have been done.
+ */
void async_synchronize_cookie(async_cookie_t cookie)
{
- async_synchronize_cookie_special(cookie, &async_running);
+ async_synchronize_cookie_domain(cookie, &async_running);
}
EXPORT_SYMBOL_GPL(async_synchronize_cookie);
@@ -315,7 +369,11 @@ static int async_manager_thread(void *unused)
ec = atomic_read(&entry_count);
while (tc < ec && tc < MAX_THREADS) {
- kthread_run(async_thread, NULL, "async/%i", tc);
+ if (IS_ERR(kthread_run(async_thread, NULL, "async/%i",
+ tc))) {
+ msleep(100);
+ continue;
+ }
atomic_inc(&thread_count);
tc++;
}
@@ -330,7 +388,9 @@ static int async_manager_thread(void *unused)
static int __init async_init(void)
{
if (async_enabled)
- kthread_run(async_manager_thread, NULL, "async/mgr");
+ if (IS_ERR(kthread_run(async_manager_thread, NULL,
+ "async/mgr")))
+ async_enabled = 0;
return 0;
}
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index c29831076e7a..5a54ff42874e 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1115,8 +1115,10 @@ static void cgroup_kill_sb(struct super_block *sb) {
}
write_unlock(&css_set_lock);
- list_del(&root->root_list);
- root_count--;
+ if (!list_empty(&root->root_list)) {
+ list_del(&root->root_list);
+ root_count--;
+ }
mutex_unlock(&cgroup_mutex);
@@ -2434,7 +2436,9 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
err_remove:
+ cgroup_lock_hierarchy(root);
list_del(&cgrp->sibling);
+ cgroup_unlock_hierarchy(root);
root->number_of_cgroups--;
err_destroy:
@@ -2507,7 +2511,7 @@ static int cgroup_clear_css_refs(struct cgroup *cgrp)
for_each_subsys(cgrp->root, ss) {
struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
int refcnt;
- do {
+ while (1) {
/* We can only remove a CSS with a refcnt==1 */
refcnt = atomic_read(&css->refcnt);
if (refcnt > 1) {
@@ -2521,7 +2525,10 @@ static int cgroup_clear_css_refs(struct cgroup *cgrp)
* css_tryget() to spin until we set the
* CSS_REMOVED bits or abort
*/
- } while (atomic_cmpxchg(&css->refcnt, refcnt, 0) != refcnt);
+ if (atomic_cmpxchg(&css->refcnt, refcnt, 0) == refcnt)
+ break;
+ cpu_relax();
+ }
}
done:
for_each_subsys(cgrp->root, ss) {
@@ -2991,20 +2998,21 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
mutex_unlock(&cgroup_mutex);
return 0;
}
- task_lock(tsk);
- cg = tsk->cgroups;
- parent = task_cgroup(tsk, subsys->subsys_id);
/* Pin the hierarchy */
- if (!atomic_inc_not_zero(&parent->root->sb->s_active)) {
+ if (!atomic_inc_not_zero(&root->sb->s_active)) {
/* We race with the final deactivate_super() */
mutex_unlock(&cgroup_mutex);
return 0;
}
/* Keep the cgroup alive */
+ task_lock(tsk);
+ parent = task_cgroup(tsk, subsys->subsys_id);
+ cg = tsk->cgroups;
get_css_set(cg);
task_unlock(tsk);
+
mutex_unlock(&cgroup_mutex);
/* Now do the VFS work to create a cgroup */
@@ -3043,7 +3051,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
mutex_unlock(&inode->i_mutex);
put_css_set(cg);
- deactivate_super(parent->root->sb);
+ deactivate_super(root->sb);
/* The cgroup is still accessible in the VFS, but
* we're not going to try to rmdir() it at this
* point. */
@@ -3069,7 +3077,7 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
mutex_lock(&cgroup_mutex);
put_css_set(cg);
mutex_unlock(&cgroup_mutex);
- deactivate_super(parent->root->sb);
+ deactivate_super(root->sb);
return ret;
}
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index a85678865c5e..f76db9dcaa05 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -61,6 +61,14 @@
#include <linux/cgroup.h>
/*
+ * Workqueue for cpuset related tasks.
+ *
+ * Using kevent workqueue may cause deadlock when memory_migrate
+ * is set. So we create a separate workqueue thread for cpuset.
+ */
+static struct workqueue_struct *cpuset_wq;
+
+/*
* Tracks how many cpusets are currently defined in system.
* When there is only one cpuset (the root cpuset) we can
* short circuit some hooks.
@@ -831,7 +839,7 @@ static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
*/
static void async_rebuild_sched_domains(void)
{
- schedule_work(&rebuild_sched_domains_work);
+ queue_work(cpuset_wq, &rebuild_sched_domains_work);
}
/*
@@ -2111,6 +2119,9 @@ void __init cpuset_init_smp(void)
hotcpu_notifier(cpuset_track_online_cpus, 0);
hotplug_memory_notifier(cpuset_track_online_nodes, 10);
+
+ cpuset_wq = create_singlethread_workqueue("cpuset");
+ BUG_ON(!cpuset_wq);
}
/**
diff --git a/kernel/dma-coherent.c b/kernel/dma-coherent.c
index 038707404b76..962a3b574f21 100644
--- a/kernel/dma-coherent.c
+++ b/kernel/dma-coherent.c
@@ -98,7 +98,7 @@ EXPORT_SYMBOL(dma_mark_declared_memory_occupied);
* @size: size of requested memory area
* @dma_handle: This will be filled with the correct dma handle
* @ret: This pointer will be filled with the virtual address
- * to allocated area.
+ * to allocated area.
*
* This function should be only called from per-arch dma_alloc_coherent()
* to support allocation from per-device coherent memory pools.
@@ -118,31 +118,32 @@ int dma_alloc_from_coherent(struct device *dev, ssize_t size,
mem = dev->dma_mem;
if (!mem)
return 0;
- if (unlikely(size > mem->size))
- return 0;
+
+ *ret = NULL;
+
+ if (unlikely(size > (mem->size << PAGE_SHIFT)))
+ goto err;
pageno = bitmap_find_free_region(mem->bitmap, mem->size, order);
- if (pageno >= 0) {
- /*
- * Memory was found in the per-device arena.
- */
- *dma_handle = mem->device_base + (pageno << PAGE_SHIFT);
- *ret = mem->virt_base + (pageno << PAGE_SHIFT);
- memset(*ret, 0, size);
- } else if (mem->flags & DMA_MEMORY_EXCLUSIVE) {
- /*
- * The per-device arena is exhausted and we are not
- * permitted to fall back to generic memory.
- */
- *ret = NULL;
- } else {
- /*
- * The per-device arena is exhausted and we are
- * permitted to fall back to generic memory.
- */
- return 0;
- }
+ if (unlikely(pageno < 0))
+ goto err;
+
+ /*
+ * Memory was found in the per-device area.
+ */
+ *dma_handle = mem->device_base + (pageno << PAGE_SHIFT);
+ *ret = mem->virt_base + (pageno << PAGE_SHIFT);
+ memset(*ret, 0, size);
+
return 1;
+
+err:
+ /*
+ * In the case where the allocation can not be satisfied from the
+ * per-device area, try to fall back to generic memory if the
+ * constraints allow it.
+ */
+ return mem->flags & DMA_MEMORY_EXCLUSIVE;
}
EXPORT_SYMBOL(dma_alloc_from_coherent);
diff --git a/kernel/fork.c b/kernel/fork.c
index 0181b7b2281a..b01c5b04bcff 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -822,17 +822,17 @@ static void posix_cpu_timers_init_group(struct signal_struct *sig)
static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
{
struct signal_struct *sig;
- int ret;
if (clone_flags & CLONE_THREAD) {
- ret = thread_group_cputime_clone_thread(current);
- if (likely(!ret)) {
- atomic_inc(&current->signal->count);
- atomic_inc(&current->signal->live);
- }
- return ret;
+ atomic_inc(&current->signal->count);
+ atomic_inc(&current->signal->live);
+ return 0;
}
sig = kmem_cache_alloc(signal_cachep, GFP_KERNEL);
+
+ if (sig)
+ posix_cpu_timers_init_group(sig);
+
tsk->signal = sig;
if (!sig)
return -ENOMEM;
@@ -869,8 +869,6 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
memcpy(sig->rlim, current->signal->rlim, sizeof sig->rlim);
task_unlock(current->group_leader);
- posix_cpu_timers_init_group(sig);
-
acct_init_pacct(&sig->pacct);
tty_audit_fork(sig);
@@ -1013,6 +1011,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
* triggers too late. This doesn't hurt, the check is only there
* to stop root fork bombs.
*/
+ retval = -EAGAIN;
if (nr_threads >= max_threads)
goto bad_fork_cleanup_count;
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 2dc30c59c5fd..f394d2a42ca3 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -501,6 +501,13 @@ static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base)
continue;
timer = rb_entry(base->first, struct hrtimer, node);
expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
+ /*
+ * clock_was_set() has changed base->offset so the
+ * result might be negative. Fix it up to prevent a
+ * false positive in clockevents_program_event()
+ */
+ if (expires.tv64 < 0)
+ expires.tv64 = 0;
if (expires.tv64 < cpu_base->expires_next.tv64)
cpu_base->expires_next = expires;
}
@@ -614,7 +621,9 @@ void clock_was_set(void)
*/
void hres_timers_resume(void)
{
- /* Retrigger the CPU local events: */
+ WARN_ONCE(!irqs_disabled(),
+ KERN_INFO "hres_timers_resume() called with IRQs enabled!");
+
retrigger_next_event(NULL);
}
@@ -1156,6 +1165,29 @@ static void __run_hrtimer(struct hrtimer *timer)
#ifdef CONFIG_HIGH_RES_TIMERS
+static int force_clock_reprogram;
+
+/*
+ * After 5 iteration's attempts, we consider that hrtimer_interrupt()
+ * is hanging, which could happen with something that slows the interrupt
+ * such as the tracing. Then we force the clock reprogramming for each future
+ * hrtimer interrupts to avoid infinite loops and use the min_delta_ns
+ * threshold that we will overwrite.
+ * The next tick event will be scheduled to 3 times we currently spend on
+ * hrtimer_interrupt(). This gives a good compromise, the cpus will spend
+ * 1/4 of their time to process the hrtimer interrupts. This is enough to
+ * let it running without serious starvation.
+ */
+
+static inline void
+hrtimer_interrupt_hanging(struct clock_event_device *dev,
+ ktime_t try_time)
+{
+ force_clock_reprogram = 1;
+ dev->min_delta_ns = (unsigned long)try_time.tv64 * 3;
+ printk(KERN_WARNING "hrtimer: interrupt too slow, "
+ "forcing clock min delta to %lu ns\n", dev->min_delta_ns);
+}
/*
* High resolution timer interrupt
* Called with interrupts disabled
@@ -1165,6 +1197,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
struct hrtimer_clock_base *base;
ktime_t expires_next, now;
+ int nr_retries = 0;
int i;
BUG_ON(!cpu_base->hres_active);
@@ -1172,6 +1205,10 @@ void hrtimer_interrupt(struct clock_event_device *dev)
dev->next_event.tv64 = KTIME_MAX;
retry:
+ /* 5 retries is enough to notice a hang */
+ if (!(++nr_retries % 5))
+ hrtimer_interrupt_hanging(dev, ktime_sub(ktime_get(), now));
+
now = ktime_get();
expires_next.tv64 = KTIME_MAX;
@@ -1224,7 +1261,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
/* Reprogramming necessary ? */
if (expires_next.tv64 != KTIME_MAX) {
- if (tick_program_event(expires_next, 0))
+ if (tick_program_event(expires_next, force_clock_reprogram))
goto retry;
}
}
@@ -1578,6 +1615,10 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self,
break;
#ifdef CONFIG_HOTPLUG_CPU
+ case CPU_DYING:
+ case CPU_DYING_FROZEN:
+ clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DYING, &scpu);
+ break;
case CPU_DEAD:
case CPU_DEAD_FROZEN:
{
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index c248eba98b43..122fef4b0bd3 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -386,6 +386,7 @@ handle_level_irq(unsigned int irq, struct irq_desc *desc)
out_unlock:
spin_unlock(&desc->lock);
}
+EXPORT_SYMBOL_GPL(handle_level_irq);
/**
* handle_fasteoi_irq - irq handler for transparent controllers
@@ -596,6 +597,7 @@ __set_irq_handler(unsigned int irq, irq_flow_handler_t handle, int is_chained,
}
spin_unlock_irqrestore(&desc->lock, flags);
}
+EXPORT_SYMBOL_GPL(__set_irq_handler);
void
set_irq_chip_and_handler(unsigned int irq, struct irq_chip *chip,
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 375d68cd5bf0..f51eaee921b6 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -40,6 +40,18 @@ void handle_bad_irq(unsigned int irq, struct irq_desc *desc)
ack_bad_irq(irq);
}
+#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
+static void __init init_irq_default_affinity(void)
+{
+ alloc_bootmem_cpumask_var(&irq_default_affinity);
+ cpumask_setall(irq_default_affinity);
+}
+#else
+static void __init init_irq_default_affinity(void)
+{
+}
+#endif
+
/*
* Linux has a controller-independent interrupt architecture.
* Every controller has a 'controller-template', that is used
@@ -133,6 +145,8 @@ int __init early_irq_init(void)
int legacy_count;
int i;
+ init_irq_default_affinity();
+
/* initialize nr_irqs based on nr_cpu_ids */
arch_probe_nr_irqs();
printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d\n", NR_IRQS, nr_irqs);
@@ -229,6 +243,8 @@ int __init early_irq_init(void)
int count;
int i;
+ init_irq_default_affinity();
+
printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS);
desc = irq_desc;
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index b98739af4558..a3a5dc9ef346 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -15,17 +15,9 @@
#include "internals.h"
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
cpumask_var_t irq_default_affinity;
-static int init_irq_default_affinity(void)
-{
- alloc_cpumask_var(&irq_default_affinity, GFP_KERNEL);
- cpumask_setall(irq_default_affinity);
- return 0;
-}
-core_initcall(init_irq_default_affinity);
-
/**
* synchronize_irq - wait for pending IRQ handlers (on other CPUs)
* @irq: interrupt number to wait for
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index 666260e4c065..7f9b80434e32 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -78,7 +78,7 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
desc = irq_desc_ptrs[irq];
if (desc && old_desc != desc)
- goto out_unlock;
+ goto out_unlock;
node = cpu_to_node(cpu);
desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
@@ -97,10 +97,15 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
}
irq_desc_ptrs[irq] = desc;
+ spin_unlock_irqrestore(&sparse_irq_lock, flags);
/* free the old one */
free_one_irq_desc(old_desc, desc);
+ spin_unlock(&old_desc->lock);
kfree(old_desc);
+ spin_lock(&desc->lock);
+
+ return desc;
out_unlock:
spin_unlock_irqrestore(&sparse_irq_lock, flags);
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index e694afa0eb8c..7b8b0f21a5b1 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -30,19 +30,20 @@
#define all_var 0
#endif
-extern const unsigned long kallsyms_addresses[];
-extern const u8 kallsyms_names[];
+/* These will be re-linked against their real values during the second link stage */
+extern const unsigned long kallsyms_addresses[] __attribute__((weak));
+extern const u8 kallsyms_names[] __attribute__((weak));
/* tell the compiler that the count isn't in the small data section if the arch
* has one (eg: FRV)
*/
extern const unsigned long kallsyms_num_syms
- __attribute__((__section__(".rodata")));
+__attribute__((weak, section(".rodata")));
-extern const u8 kallsyms_token_table[];
-extern const u16 kallsyms_token_index[];
+extern const u8 kallsyms_token_table[] __attribute__((weak));
+extern const u16 kallsyms_token_index[] __attribute__((weak));
-extern const unsigned long kallsyms_markers[];
+extern const unsigned long kallsyms_markers[] __attribute__((weak));
static inline int is_kernel_inittext(unsigned long addr)
{
@@ -167,6 +168,9 @@ static unsigned long get_symbol_pos(unsigned long addr,
unsigned long symbol_start = 0, symbol_end = 0;
unsigned long i, low, high, mid;
+ /* This kernel should never had been booted. */
+ BUG_ON(!kallsyms_addresses);
+
/* do a binary search on the sorted kallsyms_addresses array */
low = 0;
high = kallsyms_num_syms;
diff --git a/kernel/module.c b/kernel/module.c
index e8b51d41dd72..ba22484a987e 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -573,13 +573,13 @@ static char last_unloaded_module[MODULE_NAME_LEN+1];
/* Init the unload section of the module. */
static void module_unload_init(struct module *mod)
{
- unsigned int i;
+ int cpu;
INIT_LIST_HEAD(&mod->modules_which_use_me);
- for (i = 0; i < NR_CPUS; i++)
- local_set(&mod->ref[i].count, 0);
+ for_each_possible_cpu(cpu)
+ local_set(__module_ref_addr(mod, cpu), 0);
/* Hold reference count during initialization. */
- local_set(&mod->ref[raw_smp_processor_id()].count, 1);
+ local_set(__module_ref_addr(mod, raw_smp_processor_id()), 1);
/* Backwards compatibility macros put refcount during init. */
mod->waiter = current;
}
@@ -717,10 +717,11 @@ static int try_stop_module(struct module *mod, int flags, int *forced)
unsigned int module_refcount(struct module *mod)
{
- unsigned int i, total = 0;
+ unsigned int total = 0;
+ int cpu;
- for (i = 0; i < NR_CPUS; i++)
- total += local_read(&mod->ref[i].count);
+ for_each_possible_cpu(cpu)
+ total += local_read(__module_ref_addr(mod, cpu));
return total;
}
EXPORT_SYMBOL(module_refcount);
@@ -894,7 +895,7 @@ void module_put(struct module *module)
{
if (module) {
unsigned int cpu = get_cpu();
- local_dec(&module->ref[cpu].count);
+ local_dec(__module_ref_addr(module, cpu));
/* Maybe they're waiting for us to drop reference? */
if (unlikely(!module_is_live(module)))
wake_up_process(module->waiter);
@@ -1464,7 +1465,10 @@ static void free_module(struct module *mod)
kfree(mod->args);
if (mod->percpu)
percpu_modfree(mod->percpu);
-
+#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
+ if (mod->refptr)
+ percpu_modfree(mod->refptr);
+#endif
/* Free lock-classes: */
lockdep_free_key_range(mod->module_core, mod->core_size);
@@ -2011,6 +2015,14 @@ static noinline struct module *load_module(void __user *umod,
if (err < 0)
goto free_mod;
+#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
+ mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t),
+ mod->name);
+ if (!mod->refptr) {
+ err = -ENOMEM;
+ goto free_mod;
+ }
+#endif
if (pcpuindex) {
/* We have a special allocation for this section. */
percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size,
@@ -2018,7 +2030,7 @@ static noinline struct module *load_module(void __user *umod,
mod->name);
if (!percpu) {
err = -ENOMEM;
- goto free_mod;
+ goto free_percpu;
}
sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
mod->percpu = percpu;
@@ -2282,6 +2294,9 @@ static noinline struct module *load_module(void __user *umod,
free_percpu:
if (percpu)
percpu_modfree(percpu);
+#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
+ percpu_modfree(mod->refptr);
+#endif
free_mod:
kfree(args);
free_hdr:
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 157de3a47832..fa07da94d7be 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -10,76 +10,6 @@
#include <linux/kernel_stat.h>
/*
- * Allocate the thread_group_cputime structure appropriately and fill in the
- * current values of the fields. Called from copy_signal() via
- * thread_group_cputime_clone_thread() when adding a second or subsequent
- * thread to a thread group. Assumes interrupts are enabled when called.
- */
-int thread_group_cputime_alloc(struct task_struct *tsk)
-{
- struct signal_struct *sig = tsk->signal;
- struct task_cputime *cputime;
-
- /*
- * If we have multiple threads and we don't already have a
- * per-CPU task_cputime struct (checked in the caller), allocate
- * one and fill it in with the times accumulated so far. We may
- * race with another thread so recheck after we pick up the sighand
- * lock.
- */
- cputime = alloc_percpu(struct task_cputime);
- if (cputime == NULL)
- return -ENOMEM;
- spin_lock_irq(&tsk->sighand->siglock);
- if (sig->cputime.totals) {
- spin_unlock_irq(&tsk->sighand->siglock);
- free_percpu(cputime);
- return 0;
- }
- sig->cputime.totals = cputime;
- cputime = per_cpu_ptr(sig->cputime.totals, smp_processor_id());
- cputime->utime = tsk->utime;
- cputime->stime = tsk->stime;
- cputime->sum_exec_runtime = tsk->se.sum_exec_runtime;
- spin_unlock_irq(&tsk->sighand->siglock);
- return 0;
-}
-
-/**
- * thread_group_cputime - Sum the thread group time fields across all CPUs.
- *
- * @tsk: The task we use to identify the thread group.
- * @times: task_cputime structure in which we return the summed fields.
- *
- * Walk the list of CPUs to sum the per-CPU time fields in the thread group
- * time structure.
- */
-void thread_group_cputime(
- struct task_struct *tsk,
- struct task_cputime *times)
-{
- struct task_cputime *totals, *tot;
- int i;
-
- totals = tsk->signal->cputime.totals;
- if (!totals) {
- times->utime = tsk->utime;
- times->stime = tsk->stime;
- times->sum_exec_runtime = tsk->se.sum_exec_runtime;
- return;
- }
-
- times->stime = times->utime = cputime_zero;
- times->sum_exec_runtime = 0;
- for_each_possible_cpu(i) {
- tot = per_cpu_ptr(totals, i);
- times->utime = cputime_add(times->utime, tot->utime);
- times->stime = cputime_add(times->stime, tot->stime);
- times->sum_exec_runtime += tot->sum_exec_runtime;
- }
-}
-
-/*
* Called after updating RLIMIT_CPU to set timer expiration if necessary.
*/
void update_rlimit_cpu(unsigned long rlim_new)
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 45e8541ab7e3..432ee575c9ee 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -71,6 +71,14 @@ void hibernation_set_ops(struct platform_hibernation_ops *ops)
mutex_unlock(&pm_mutex);
}
+static bool entering_platform_hibernation;
+
+bool system_entering_hibernation(void)
+{
+ return entering_platform_hibernation;
+}
+EXPORT_SYMBOL(system_entering_hibernation);
+
#ifdef CONFIG_PM_DEBUG
static void hibernation_debug_sleep(void)
{
@@ -411,6 +419,7 @@ int hibernation_platform_enter(void)
if (error)
goto Close;
+ entering_platform_hibernation = true;
suspend_console();
error = device_suspend(PMSG_HIBERNATE);
if (error) {
@@ -445,6 +454,7 @@ int hibernation_platform_enter(void)
Finish:
hibernation_ops->finish();
Resume_devices:
+ entering_platform_hibernation = false;
device_resume(PMSG_RESTORE);
resume_console();
Close:
diff --git a/kernel/power/main.c b/kernel/power/main.c
index 239988873971..b4d219016b6c 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -57,16 +57,6 @@ int pm_notifier_call_chain(unsigned long val)
#ifdef CONFIG_PM_DEBUG
int pm_test_level = TEST_NONE;
-static int suspend_test(int level)
-{
- if (pm_test_level == level) {
- printk(KERN_INFO "suspend debug: Waiting for 5 seconds.\n");
- mdelay(5000);
- return 1;
- }
- return 0;
-}
-
static const char * const pm_tests[__TEST_AFTER_LAST] = {
[TEST_NONE] = "none",
[TEST_CORE] = "core",
@@ -125,14 +115,24 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
}
power_attr(pm_test);
-#else /* !CONFIG_PM_DEBUG */
-static inline int suspend_test(int level) { return 0; }
-#endif /* !CONFIG_PM_DEBUG */
+#endif /* CONFIG_PM_DEBUG */
#endif /* CONFIG_PM_SLEEP */
#ifdef CONFIG_SUSPEND
+static int suspend_test(int level)
+{
+#ifdef CONFIG_PM_DEBUG
+ if (pm_test_level == level) {
+ printk(KERN_INFO "suspend debug: Waiting for 5 seconds.\n");
+ mdelay(5000);
+ return 1;
+ }
+#endif /* !CONFIG_PM_DEBUG */
+ return 0;
+}
+
#ifdef CONFIG_PM_TEST_SUSPEND
/*
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index 490934fc7ac3..bd5a9003497c 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -716,7 +716,7 @@ void rcu_check_callbacks(int cpu, int user)
raise_rcu_softirq();
}
-static void rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
+static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_ctrlblk *rcp,
struct rcu_data *rdp)
{
unsigned long flags;
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index f2d8638e6c60..b2fd602a6f6f 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1314,7 +1314,7 @@ int rcu_needs_cpu(int cpu)
* access due to the fact that this CPU cannot possibly have any RCU
* callbacks in flight yet.
*/
-static void
+static void __cpuinit
rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
{
unsigned long flags;
diff --git a/kernel/relay.c b/kernel/relay.c
index 09ac2008f77b..9d79b7854fa6 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -663,8 +663,10 @@ int relay_late_setup_files(struct rchan *chan,
mutex_lock(&relay_channels_mutex);
/* Is chan already set up? */
- if (unlikely(chan->has_base_filename))
+ if (unlikely(chan->has_base_filename)) {
+ mutex_unlock(&relay_channels_mutex);
return -EEXIST;
+ }
chan->has_base_filename = 1;
chan->parent = parent;
curr_cpu = get_cpu();
diff --git a/kernel/sched.c b/kernel/sched.c
index 173768f142ad..0952931ca7f6 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2311,6 +2311,16 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
if (!sched_feat(SYNC_WAKEUPS))
sync = 0;
+ if (!sync) {
+ if (current->se.avg_overlap < sysctl_sched_migration_cost &&
+ p->se.avg_overlap < sysctl_sched_migration_cost)
+ sync = 1;
+ } else {
+ if (current->se.avg_overlap >= sysctl_sched_migration_cost ||
+ p->se.avg_overlap >= sysctl_sched_migration_cost)
+ sync = 0;
+ }
+
#ifdef CONFIG_SMP
if (sched_feat(LB_WAKEUP_UPDATE)) {
struct sched_domain *sd;
@@ -4774,8 +4784,8 @@ EXPORT_SYMBOL(default_wake_function);
* started to run but is not in state TASK_RUNNING. try_to_wake_up() returns
* zero in this (rare) case, and we handle it by continuing to scan the queue.
*/
-static void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
- int nr_exclusive, int sync, void *key)
+void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
+ int nr_exclusive, int sync, void *key)
{
wait_queue_t *curr, *next;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 5cc1c162044f..a7e50ba185ac 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -719,7 +719,7 @@ enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int wakeup)
__enqueue_entity(cfs_rq, se);
}
-static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
+static void __clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
if (cfs_rq->last == se)
cfs_rq->last = NULL;
@@ -728,6 +728,12 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
cfs_rq->next = NULL;
}
+static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se)
+{
+ for_each_sched_entity(se)
+ __clear_buddies(cfs_rq_of(se), se);
+}
+
static void
dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int sleep)
{
@@ -768,8 +774,14 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
ideal_runtime = sched_slice(cfs_rq, curr);
delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
- if (delta_exec > ideal_runtime)
+ if (delta_exec > ideal_runtime) {
resched_task(rq_of(cfs_rq)->curr);
+ /*
+ * The current task ran long enough, ensure it doesn't get
+ * re-elected due to buddy favours.
+ */
+ clear_buddies(cfs_rq, curr);
+ }
}
static void
@@ -1179,20 +1191,15 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
int idx, unsigned long load, unsigned long this_load,
unsigned int imbalance)
{
- struct task_struct *curr = this_rq->curr;
- struct task_group *tg;
unsigned long tl = this_load;
unsigned long tl_per_task;
+ struct task_group *tg;
unsigned long weight;
int balanced;
if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
return 0;
- if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost ||
- p->se.avg_overlap > sysctl_sched_migration_cost))
- sync = 0;
-
/*
* If sync wakeup then subtract the (maximum possible)
* effect of the currently running task from the load
@@ -1419,9 +1426,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
if (!sched_feat(WAKEUP_PREEMPT))
return;
- if (sched_feat(WAKEUP_OVERLAP) && (sync ||
- (se->avg_overlap < sysctl_sched_migration_cost &&
- pse->avg_overlap < sysctl_sched_migration_cost))) {
+ if (sched_feat(WAKEUP_OVERLAP) && sync) {
resched_task(curr);
return;
}
@@ -1452,6 +1457,11 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
do {
se = pick_next_entity(cfs_rq);
+ /*
+ * If se was a buddy, clear it so that it will have to earn
+ * the favour again.
+ */
+ __clear_buddies(cfs_rq, se);
set_next_entity(cfs_rq, se);
cfs_rq = group_cfs_rq(se);
} while (cfs_rq);
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index f2773b5d1226..8ab0cef8ecab 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -296,6 +296,7 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
static inline void account_group_user_time(struct task_struct *tsk,
cputime_t cputime)
{
+ struct task_cputime *times;
struct signal_struct *sig;
/* tsk == current, ensure it is safe to use ->signal */
@@ -303,13 +304,11 @@ static inline void account_group_user_time(struct task_struct *tsk,
return;
sig = tsk->signal;
- if (sig->cputime.totals) {
- struct task_cputime *times;
+ times = &sig->cputime.totals;
- times = per_cpu_ptr(sig->cputime.totals, get_cpu());
- times->utime = cputime_add(times->utime, cputime);
- put_cpu_no_resched();
- }
+ spin_lock(&times->lock);
+ times->utime = cputime_add(times->utime, cputime);
+ spin_unlock(&times->lock);
}
/**
@@ -325,6 +324,7 @@ static inline void account_group_user_time(struct task_struct *tsk,
static inline void account_group_system_time(struct task_struct *tsk,
cputime_t cputime)
{
+ struct task_cputime *times;
struct signal_struct *sig;
/* tsk == current, ensure it is safe to use ->signal */
@@ -332,13 +332,11 @@ static inline void account_group_system_time(struct task_struct *tsk,
return;
sig = tsk->signal;
- if (sig->cputime.totals) {
- struct task_cputime *times;
+ times = &sig->cputime.totals;
- times = per_cpu_ptr(sig->cputime.totals, get_cpu());
- times->stime = cputime_add(times->stime, cputime);
- put_cpu_no_resched();
- }
+ spin_lock(&times->lock);
+ times->stime = cputime_add(times->stime, cputime);
+ spin_unlock(&times->lock);
}
/**
@@ -354,6 +352,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
static inline void account_group_exec_runtime(struct task_struct *tsk,
unsigned long long ns)
{
+ struct task_cputime *times;
struct signal_struct *sig;
sig = tsk->signal;
@@ -362,11 +361,9 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
if (unlikely(!sig))
return;
- if (sig->cputime.totals) {
- struct task_cputime *times;
+ times = &sig->cputime.totals;
- times = per_cpu_ptr(sig->cputime.totals, get_cpu());
- times->sum_exec_runtime += ns;
- put_cpu_no_resched();
- }
+ spin_lock(&times->lock);
+ times->sum_exec_runtime += ns;
+ spin_unlock(&times->lock);
}
diff --git a/kernel/signal.c b/kernel/signal.c
index e73759783dc8..b6b36768b758 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -909,7 +909,9 @@ static void print_fatal_signal(struct pt_regs *regs, int signr)
}
#endif
printk("\n");
+ preempt_disable();
show_regs(regs);
+ preempt_enable();
}
static int __init setup_print_fatal_signals(char *str)
diff --git a/kernel/smp.c b/kernel/smp.c
index 5cfa0e5e3e88..bbedbb7efe32 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -18,6 +18,7 @@ __cacheline_aligned_in_smp DEFINE_SPINLOCK(call_function_lock);
enum {
CSD_FLAG_WAIT = 0x01,
CSD_FLAG_ALLOC = 0x02,
+ CSD_FLAG_LOCK = 0x04,
};
struct call_function_data {
@@ -186,6 +187,9 @@ void generic_smp_call_function_single_interrupt(void)
if (data_flags & CSD_FLAG_WAIT) {
smp_wmb();
data->flags &= ~CSD_FLAG_WAIT;
+ } else if (data_flags & CSD_FLAG_LOCK) {
+ smp_wmb();
+ data->flags &= ~CSD_FLAG_LOCK;
} else if (data_flags & CSD_FLAG_ALLOC)
kfree(data);
}
@@ -196,6 +200,8 @@ void generic_smp_call_function_single_interrupt(void)
}
}
+static DEFINE_PER_CPU(struct call_single_data, csd_data);
+
/*
* smp_call_function_single - Run a function on a specific CPU
* @func: The function to run. This must be fast and non-blocking.
@@ -224,14 +230,38 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
func(info);
local_irq_restore(flags);
} else if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) {
- struct call_single_data *data = NULL;
+ struct call_single_data *data;
if (!wait) {
+ /*
+ * We are calling a function on a single CPU
+ * and we are not going to wait for it to finish.
+ * We first try to allocate the data, but if we
+ * fail, we fall back to use a per cpu data to pass
+ * the information to that CPU. Since all callers
+ * of this code will use the same data, we must
+ * synchronize the callers to prevent a new caller
+ * from corrupting the data before the callee
+ * can access it.
+ *
+ * The CSD_FLAG_LOCK is used to let us know when
+ * the IPI handler is done with the data.
+ * The first caller will set it, and the callee
+ * will clear it. The next caller must wait for
+ * it to clear before we set it again. This
+ * will make sure the callee is done with the
+ * data before a new caller will use it.
+ */
data = kmalloc(sizeof(*data), GFP_ATOMIC);
if (data)
data->flags = CSD_FLAG_ALLOC;
- }
- if (!data) {
+ else {
+ data = &per_cpu(csd_data, me);
+ while (data->flags & CSD_FLAG_LOCK)
+ cpu_relax();
+ data->flags = CSD_FLAG_LOCK;
+ }
+ } else {
data = &d;
data->flags = CSD_FLAG_WAIT;
}
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index d9188c66278a..85d5a2455103 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -16,6 +16,7 @@
#include <linux/lockdep.h>
#include <linux/notifier.h>
#include <linux/module.h>
+#include <linux/sysctl.h>
#include <asm/irq_regs.h>
@@ -88,6 +89,14 @@ void touch_all_softlockup_watchdogs(void)
}
EXPORT_SYMBOL(touch_all_softlockup_watchdogs);
+int proc_dosoftlockup_thresh(struct ctl_table *table, int write,
+ struct file *filp, void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ touch_all_softlockup_watchdogs();
+ return proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+}
+
/*
* This callback runs from the timer interrupt, and checks
* whether the watchdog thread has hung or not:
diff --git a/kernel/sys.c b/kernel/sys.c
index 87ca037dc03a..c5e7dec4966e 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1526,22 +1526,14 @@ SYSCALL_DEFINE2(setrlimit, unsigned int, resource, struct rlimit __user *, rlim)
return -EINVAL;
if (copy_from_user(&new_rlim, rlim, sizeof(*rlim)))
return -EFAULT;
+ if (new_rlim.rlim_cur > new_rlim.rlim_max)
+ return -EINVAL;
old_rlim = current->signal->rlim + resource;
if ((new_rlim.rlim_max > old_rlim->rlim_max) &&
!capable(CAP_SYS_RESOURCE))
return -EPERM;
-
- if (resource == RLIMIT_NOFILE) {
- if (new_rlim.rlim_max == RLIM_INFINITY)
- new_rlim.rlim_max = sysctl_nr_open;
- if (new_rlim.rlim_cur == RLIM_INFINITY)
- new_rlim.rlim_cur = sysctl_nr_open;
- if (new_rlim.rlim_max > sysctl_nr_open)
- return -EPERM;
- }
-
- if (new_rlim.rlim_cur > new_rlim.rlim_max)
- return -EINVAL;
+ if (resource == RLIMIT_NOFILE && new_rlim.rlim_max > sysctl_nr_open)
+ return -EPERM;
retval = security_task_setrlimit(resource, &new_rlim);
if (retval)
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 368d1638ee78..790f9d785663 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -809,7 +809,7 @@ static struct ctl_table kern_table[] = {
.data = &softlockup_thresh,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = &proc_dointvec_minmax,
+ .proc_handler = &proc_dosoftlockup_thresh,
.strategy = &sysctl_intvec,
.extra1 = &neg_one,
.extra2 = &sixty,
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index 63e05d423a09..21a5ca849514 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -274,6 +274,21 @@ out_bc:
}
/*
+ * Transfer the do_timer job away from a dying cpu.
+ *
+ * Called with interrupts disabled.
+ */
+static void tick_handover_do_timer(int *cpup)
+{
+ if (*cpup == tick_do_timer_cpu) {
+ int cpu = cpumask_first(cpu_online_mask);
+
+ tick_do_timer_cpu = (cpu < nr_cpu_ids) ? cpu :
+ TICK_DO_TIMER_NONE;
+ }
+}
+
+/*
* Shutdown an event device on a given cpu:
*
* This is called on a life CPU, when a CPU is dead. So we cannot
@@ -297,13 +312,6 @@ static void tick_shutdown(unsigned int *cpup)
clockevents_exchange_device(dev, NULL);
td->evtdev = NULL;
}
- /* Transfer the do_timer job away from this cpu */
- if (*cpup == tick_do_timer_cpu) {
- int cpu = cpumask_first(cpu_online_mask);
-
- tick_do_timer_cpu = (cpu < nr_cpu_ids) ? cpu :
- TICK_DO_TIMER_NONE;
- }
spin_unlock_irqrestore(&tick_device_lock, flags);
}
@@ -357,6 +365,10 @@ static int tick_notify(struct notifier_block *nb, unsigned long reason,
tick_broadcast_oneshot_control(reason);
break;
+ case CLOCK_EVT_NOTIFY_CPU_DYING:
+ tick_handover_do_timer(dev);
+ break;
+
case CLOCK_EVT_NOTIFY_CPU_DEAD:
tick_shutdown_broadcast_oneshot(dev);
tick_shutdown_broadcast(dev);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index 1b6c05bd0d0a..d3f1ef4d5cbe 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -134,7 +134,7 @@ __setup("nohz=", setup_tick_nohz);
* value. We do this unconditionally on any cpu, as we don't know whether the
* cpu, which has the update task assigned is in a long sleep.
*/
-void tick_nohz_update_jiffies(void)
+static void tick_nohz_update_jiffies(void)
{
int cpu = smp_processor_id();
struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 2f32969c09df..9a236ffe2aa4 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -17,6 +17,7 @@
#include <linux/clocksource.h>
#include <linux/kallsyms.h>
#include <linux/seq_file.h>
+#include <linux/suspend.h>
#include <linux/debugfs.h>
#include <linux/hardirq.h>
#include <linux/kthread.h>
@@ -1736,9 +1737,12 @@ static void clear_ftrace_pid(struct pid *pid)
{
struct task_struct *p;
+ rcu_read_lock();
do_each_pid_task(pid, PIDTYPE_PID, p) {
clear_tsk_trace_trace(p);
} while_each_pid_task(pid, PIDTYPE_PID, p);
+ rcu_read_unlock();
+
put_pid(pid);
}
@@ -1746,9 +1750,11 @@ static void set_ftrace_pid(struct pid *pid)
{
struct task_struct *p;
+ rcu_read_lock();
do_each_pid_task(pid, PIDTYPE_PID, p) {
set_tsk_trace_trace(p);
} while_each_pid_task(pid, PIDTYPE_PID, p);
+ rcu_read_unlock();
}
static void clear_ftrace_pid_task(struct pid **pid)
@@ -1965,6 +1971,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
static atomic_t ftrace_graph_active;
+static struct notifier_block ftrace_suspend_notifier;
int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
{
@@ -2043,6 +2050,27 @@ static int start_graph_tracing(void)
return ret;
}
+/*
+ * Hibernation protection.
+ * The state of the current task is too much unstable during
+ * suspend/restore to disk. We want to protect against that.
+ */
+static int
+ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state,
+ void *unused)
+{
+ switch (state) {
+ case PM_HIBERNATION_PREPARE:
+ pause_graph_tracing();
+ break;
+
+ case PM_POST_HIBERNATION:
+ unpause_graph_tracing();
+ break;
+ }
+ return NOTIFY_DONE;
+}
+
int register_ftrace_graph(trace_func_graph_ret_t retfunc,
trace_func_graph_ent_t entryfunc)
{
@@ -2050,6 +2078,9 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
mutex_lock(&ftrace_sysctl_lock);
+ ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call;
+ register_pm_notifier(&ftrace_suspend_notifier);
+
atomic_inc(&ftrace_graph_active);
ret = start_graph_tracing();
if (ret) {
@@ -2075,6 +2106,7 @@ void unregister_ftrace_graph(void)
ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
ftrace_graph_entry = ftrace_graph_entry_stub;
ftrace_shutdown(FTRACE_STOP_FUNC_RET);
+ unregister_pm_notifier(&ftrace_suspend_notifier);
mutex_unlock(&ftrace_sysctl_lock);
}
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 8b0daf0662ef..bd38c5cfd8ad 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -246,7 +246,7 @@ static inline int test_time_stamp(u64 delta)
return 0;
}
-#define BUF_PAGE_SIZE (PAGE_SIZE - sizeof(struct buffer_data_page))
+#define BUF_PAGE_SIZE (PAGE_SIZE - offsetof(struct buffer_data_page, data))
/*
* head_page == tail_page && head == tail then buffer is empty.
@@ -1025,12 +1025,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
}
if (next_page == head_page) {
- if (!(buffer->flags & RB_FL_OVERWRITE)) {
- /* reset write */
- if (tail <= BUF_PAGE_SIZE)
- local_set(&tail_page->write, tail);
+ if (!(buffer->flags & RB_FL_OVERWRITE))
goto out_unlock;
- }
/* tail_page has not moved yet? */
if (tail_page == cpu_buffer->tail_page) {
@@ -1105,6 +1101,10 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
return event;
out_unlock:
+ /* reset write */
+ if (tail <= BUF_PAGE_SIZE)
+ local_set(&tail_page->write, tail);
+
__raw_spin_unlock(&cpu_buffer->lock);
local_irq_restore(flags);
return NULL;
@@ -2174,6 +2174,9 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
cpu_buffer->overrun = 0;
cpu_buffer->entries = 0;
+
+ cpu_buffer->write_stamp = 0;
+ cpu_buffer->read_stamp = 0;
}
/**
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c580233add95..17bb88d86ac2 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -40,7 +40,7 @@
#define TRACE_BUFFER_FLAGS (RB_FL_OVERWRITE)
-unsigned long __read_mostly tracing_max_latency = (cycle_t)ULONG_MAX;
+unsigned long __read_mostly tracing_max_latency;
unsigned long __read_mostly tracing_thresh;
/*
@@ -3736,7 +3736,7 @@ static struct notifier_block trace_die_notifier = {
* it if we decide to change what log level the ftrace dump
* should be at.
*/
-#define KERN_TRACE KERN_INFO
+#define KERN_TRACE KERN_EMERG
static void
trace_printk_seq(struct trace_seq *s)
@@ -3770,6 +3770,7 @@ void ftrace_dump(void)
dump_ran = 1;
/* No turning back! */
+ tracing_off();
ftrace_kill();
for_each_tracing_cpu(cpu) {
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 7c2e326bbc8b..62a78d943534 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -380,6 +380,7 @@ static void stop_irqsoff_tracer(struct trace_array *tr)
static void __irqsoff_tracer_init(struct trace_array *tr)
{
+ tracing_max_latency = 0;
irqsoff_trace = tr;
/* make sure that the tracer is visible */
smp_wmb();
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 43586b689e31..42ae1e77b6b3 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -333,6 +333,7 @@ static void stop_wakeup_tracer(struct trace_array *tr)
static int wakeup_tracer_init(struct trace_array *tr)
{
+ tracing_max_latency = 0;
wakeup_trace = tr;
start_wakeup_tracer(tr);
return 0;
diff --git a/kernel/wait.c b/kernel/wait.c
index cd87131f2fc2..42a2dbc181c8 100644
--- a/kernel/wait.c
+++ b/kernel/wait.c
@@ -91,6 +91,15 @@ prepare_to_wait_exclusive(wait_queue_head_t *q, wait_queue_t *wait, int state)
}
EXPORT_SYMBOL(prepare_to_wait_exclusive);
+/*
+ * finish_wait - clean up after waiting in a queue
+ * @q: waitqueue waited on
+ * @wait: wait descriptor
+ *
+ * Sets current thread back to running state and removes
+ * the wait descriptor from the given waitqueue if still
+ * queued.
+ */
void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
{
unsigned long flags;
@@ -117,6 +126,39 @@ void finish_wait(wait_queue_head_t *q, wait_queue_t *wait)
}
EXPORT_SYMBOL(finish_wait);
+/*
+ * abort_exclusive_wait - abort exclusive waiting in a queue
+ * @q: waitqueue waited on
+ * @wait: wait descriptor
+ * @state: runstate of the waiter to be woken
+ * @key: key to identify a wait bit queue or %NULL
+ *
+ * Sets current thread back to running state and removes
+ * the wait descriptor from the given waitqueue if still
+ * queued.
+ *
+ * Wakes up the next waiter if the caller is concurrently
+ * woken up through the queue.
+ *
+ * This prevents waiter starvation where an exclusive waiter
+ * aborts and is woken up concurrently and noone wakes up
+ * the next waiter.
+ */
+void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait,
+ unsigned int mode, void *key)
+{
+ unsigned long flags;
+
+ __set_current_state(TASK_RUNNING);
+ spin_lock_irqsave(&q->lock, flags);
+ if (!list_empty(&wait->task_list))
+ list_del_init(&wait->task_list);
+ else if (waitqueue_active(q))
+ __wake_up_common(q, mode, 1, 0, key);
+ spin_unlock_irqrestore(&q->lock, flags);
+}
+EXPORT_SYMBOL(abort_exclusive_wait);
+
int autoremove_wake_function(wait_queue_t *wait, unsigned mode, int sync, void *key)
{
int ret = default_wake_function(wait, mode, sync, key);
@@ -177,17 +219,20 @@ int __sched
__wait_on_bit_lock(wait_queue_head_t *wq, struct wait_bit_queue *q,
int (*action)(void *), unsigned mode)
{
- int ret = 0;
-
do {
+ int ret;
+
prepare_to_wait_exclusive(wq, &q->wait, mode);
- if (test_bit(q->key.bit_nr, q->key.flags)) {
- if ((ret = (*action)(q->key.flags)))
- break;
- }
+ if (!test_bit(q->key.bit_nr, q->key.flags))
+ continue;
+ ret = action(q->key.flags);
+ if (!ret)
+ continue;
+ abort_exclusive_wait(wq, &q->wait, mode, &q->key);
+ return ret;
} while (test_and_set_bit(q->key.bit_nr, q->key.flags));
finish_wait(wq, &q->wait);
- return ret;
+ return 0;
}
EXPORT_SYMBOL(__wait_on_bit_lock);