From fc2219d49ef1606e7fd2c88af2b423b01ff3d319 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 23 Sep 2009 09:50:41 -0700
Subject: rcu: Clean up code based on review feedback from Josh Triplett

These issues identified during an old-fashioned face-to-face code
review extended over many hours.

o	Bury various forms of the "rsp->completed == rsp->gpnum"
	comparison into an rcu_gp_in_progress() function, which has
	the beneficial side-effect of forcing consistent use of
	ACCESS_ONCE().

o	Replace hand-coded arithmetic with DIV_ROUND_UP().

o	Bury several "!list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x01])"
	instances into an rcu_preempted_readers() function, as this
	expression indicates that there are no readers blocked
	within RCU read-side critical sections blocking the current
	grace period.  (Though there might well be similar readers
	blocking the next grace period.)

o	Remove a dangling rcu_restart_cpu() declaration that has
	been dangling for almost 20 minor releases of the kernel.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: akpm@linux-foundation.org
Cc: mathieu.desnoyers@polymtl.ca
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: dhowells@redhat.com
LKML-Reference: <12537246442687-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcutree.c        | 29 +++++++++++++++++-----------
 kernel/rcutree.h        |  6 +++---
 kernel/rcutree_plugin.h | 50 ++++++++++++++++++++++++-------------------------
 3 files changed, 46 insertions(+), 39 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 52b06f6e158c..f85b6842d1e1 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -100,6 +100,16 @@ static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_state *rsp,
 
 #include "rcutree_plugin.h"
 
+/*
+ * Return true if an RCU grace period is in progress.  The ACCESS_ONCE()s
+ * permit this function to be invoked without holding the root rcu_node
+ * structure's ->lock, but of course results can be subject to change.
+ */
+static int rcu_gp_in_progress(struct rcu_state *rsp)
+{
+	return ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum);
+}
+
 /*
  * Note a quiescent state.  Because we do not need to know
  * how many quiescent states passed, just if there was at least
@@ -173,9 +183,7 @@ cpu_has_callbacks_ready_to_invoke(struct rcu_data *rdp)
 static int
 cpu_needs_another_gp(struct rcu_state *rsp, struct rcu_data *rdp)
 {
-	/* ACCESS_ONCE() because we are accessing outside of lock. */
-	return *rdp->nxttail[RCU_DONE_TAIL] &&
-	       ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum);
+	return *rdp->nxttail[RCU_DONE_TAIL] && !rcu_gp_in_progress(rsp);
 }
 
 /*
@@ -482,7 +490,7 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
 
 	spin_lock_irqsave(&rnp->lock, flags);
 	delta = jiffies - rsp->jiffies_stall;
-	if (delta < RCU_STALL_RAT_DELAY || rsp->gpnum == rsp->completed) {
+	if (delta < RCU_STALL_RAT_DELAY || !rcu_gp_in_progress(rsp)) {
 		spin_unlock_irqrestore(&rnp->lock, flags);
 		return;
 	}
@@ -537,8 +545,7 @@ static void check_cpu_stall(struct rcu_state *rsp, struct rcu_data *rdp)
 		/* We haven't checked in, so go dump stack. */
 		print_cpu_stall(rsp);
 
-	} else if (rsp->gpnum != rsp->completed &&
-		   delta >= RCU_STALL_RAT_DELAY) {
+	} else if (rcu_gp_in_progress(rsp) && delta >= RCU_STALL_RAT_DELAY) {
 
 		/* They had two time units to dump stack, so complain. */
 		print_other_cpu_stall(rsp);
@@ -703,9 +710,9 @@ rcu_process_gp_end(struct rcu_state *rsp, struct rcu_data *rdp)
  * hold rnp->lock, as required by rcu_start_gp(), which will release it.
  */
 static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags)
-	__releases(rnp->lock)
+	__releases(rcu_get_root(rsp)->lock)
 {
-	WARN_ON_ONCE(rsp->completed == rsp->gpnum);
+	WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
 	rsp->completed = rsp->gpnum;
 	rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]);
 	rcu_start_gp(rsp, flags);  /* releases root node's rnp->lock. */
@@ -1092,7 +1099,7 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
 	struct rcu_node *rnp = rcu_get_root(rsp);
 	u8 signaled;
 
-	if (ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum))
+	if (!rcu_gp_in_progress(rsp))
 		return;  /* No grace period in progress, nothing to force. */
 	if (!spin_trylock_irqsave(&rsp->fqslock, flags)) {
 		rsp->n_force_qs_lh++; /* Inexact, can lose counts.  Tough! */
@@ -1251,7 +1258,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
 	rdp->nxttail[RCU_NEXT_TAIL] = &head->next;
 
 	/* Start a new grace period if one not already started. */
-	if (ACCESS_ONCE(rsp->completed) == ACCESS_ONCE(rsp->gpnum)) {
+	if (!rcu_gp_in_progress(rsp)) {
 		unsigned long nestflag;
 		struct rcu_node *rnp_root = rcu_get_root(rsp);
 
@@ -1331,7 +1338,7 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp)
 	}
 
 	/* Has an RCU GP gone long enough to send resched IPIs &c? */
-	if (ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum) &&
+	if (rcu_gp_in_progress(rsp) &&
 	    ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) {
 		rdp->n_rp_need_fqs++;
 		return 1;
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 8e8287a983c2..9aa8c8a160d8 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -48,14 +48,14 @@
 #elif NR_CPUS <= RCU_FANOUT_SQ
 #  define NUM_RCU_LVLS	      2
 #  define NUM_RCU_LVL_0	      1
-#  define NUM_RCU_LVL_1	      (((NR_CPUS) + RCU_FANOUT - 1) / RCU_FANOUT)
+#  define NUM_RCU_LVL_1	      DIV_ROUND_UP(NR_CPUS, RCU_FANOUT)
 #  define NUM_RCU_LVL_2	      (NR_CPUS)
 #  define NUM_RCU_LVL_3	      0
 #elif NR_CPUS <= RCU_FANOUT_CUBE
 #  define NUM_RCU_LVLS	      3
 #  define NUM_RCU_LVL_0	      1
-#  define NUM_RCU_LVL_1	      (((NR_CPUS) + RCU_FANOUT_SQ - 1) / RCU_FANOUT_SQ)
-#  define NUM_RCU_LVL_2	      (((NR_CPUS) + (RCU_FANOUT) - 1) / (RCU_FANOUT))
+#  define NUM_RCU_LVL_1	      DIV_ROUND_UP(NR_CPUS, RCU_FANOUT_SQ)
+#  define NUM_RCU_LVL_2	      DIV_ROUND_UP(NR_CPUS, RCU_FANOUT)
 #  define NUM_RCU_LVL_3	      NR_CPUS
 #else
 # error "CONFIG_RCU_FANOUT insufficient for NR_CPUS"
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 1cee04f627eb..8ff1ba7b3c43 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -150,6 +150,16 @@ void __rcu_read_lock(void)
 }
 EXPORT_SYMBOL_GPL(__rcu_read_lock);
 
+/*
+ * Check for preempted RCU readers blocking the current grace period
+ * for the specified rcu_node structure.  If the caller needs a reliable
+ * answer, it must hold the rcu_node's ->lock.
+ */
+static int rcu_preempted_readers(struct rcu_node *rnp)
+{
+	return !list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]);
+}
+
 static void rcu_read_unlock_special(struct task_struct *t)
 {
 	int empty;
@@ -196,7 +206,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
 				break;
 			spin_unlock(&rnp->lock);  /* irqs remain disabled. */
 		}
-		empty = list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]);
+		empty = !rcu_preempted_readers(rnp);
 		list_del_init(&t->rcu_node_entry);
 		t->rcu_blocked_node = NULL;
 
@@ -207,7 +217,7 @@ static void rcu_read_unlock_special(struct task_struct *t)
 		 * drop rnp->lock and restore irq.
 		 */
 		if (!empty && rnp->qsmask == 0 &&
-		    list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1])) {
+		    !rcu_preempted_readers(rnp)) {
 			struct rcu_node *rnp_p;
 
 			if (rnp->parent == NULL) {
@@ -257,12 +267,12 @@ static void rcu_print_task_stall(struct rcu_node *rnp)
 {
 	unsigned long flags;
 	struct list_head *lp;
-	int phase = rnp->gpnum & 0x1;
+	int phase;
 	struct task_struct *t;
 
-	if (!list_empty(&rnp->blocked_tasks[phase])) {
+	if (rcu_preempted_readers(rnp)) {
 		spin_lock_irqsave(&rnp->lock, flags);
-		phase = rnp->gpnum & 0x1; /* re-read under lock. */
+		phase = rnp->gpnum & 0x1;
 		lp = &rnp->blocked_tasks[phase];
 		list_for_each_entry(t, lp, rcu_node_entry)
 			printk(" P%d", t->pid);
@@ -281,20 +291,10 @@ static void rcu_print_task_stall(struct rcu_node *rnp)
  */
 static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
 {
-	WARN_ON_ONCE(!list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]));
+	WARN_ON_ONCE(rcu_preempted_readers(rnp));
 	WARN_ON_ONCE(rnp->qsmask);
 }
 
-/*
- * Check for preempted RCU readers for the specified rcu_node structure.
- * If the caller needs a reliable answer, it must hold the rcu_node's
- * >lock.
- */
-static int rcu_preempted_readers(struct rcu_node *rnp)
-{
-	return !list_empty(&rnp->blocked_tasks[rnp->gpnum & 0x1]);
-}
-
 #ifdef CONFIG_HOTPLUG_CPU
 
 /*
@@ -461,6 +461,15 @@ static void rcu_preempt_note_context_switch(int cpu)
 {
 }
 
+/*
+ * Because preemptable RCU does not exist, there are never any preempted
+ * RCU readers.
+ */
+static int rcu_preempted_readers(struct rcu_node *rnp)
+{
+	return 0;
+}
+
 #ifdef CONFIG_RCU_CPU_STALL_DETECTOR
 
 /*
@@ -483,15 +492,6 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
 	WARN_ON_ONCE(rnp->qsmask);
 }
 
-/*
- * Because preemptable RCU does not exist, there are never any preempted
- * RCU readers.
- */
-static int rcu_preempted_readers(struct rcu_node *rnp)
-{
-	return 0;
-}
-
 #ifdef CONFIG_HOTPLUG_CPU
 
 /*
-- 
cgit v1.2.3


From 1eba8f84380bede3c602bd7758dea96925cead01 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 23 Sep 2009 09:50:42 -0700
Subject: rcu: Clean up code based on review feedback from Josh Triplett, part
 2

These issues identified during an old-fashioned face-to-face code
review extending over many hours.

o	Add comments for tricky parts of code, and correct comments
	that have passed their sell-by date.

o	Get rid of the vestiges of rcu_init_sched(), which is no
	longer needed now that PREEMPT_RCU is gone.

o	Move the #include of rcutree_plugin.h to the end of
	rcutree.c, which means that, rather than having a random
	collection of forward declarations, the new set of forward
	declarations document the set of plugins.  The new home for
	this #include also allows __rcu_init_preempt() to move into
	rcutree_plugin.h.

o	Fix rcu_preempt_check_callbacks() to be static.

Suggested-by: Josh Triplett <josh@joshtriplett.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: akpm@linux-foundation.org
Cc: mathieu.desnoyers@polymtl.ca
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: dhowells@redhat.com
LKML-Reference: <12537246443924-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Peter Zijlstra <peterz@infradead.org>
---
 include/linux/rcupdate.h |  4 +++
 include/linux/rcutree.h  |  6 +---
 init/main.c              |  1 -
 kernel/rcutree.c         | 72 ++++++++++++++++++++++--------------------------
 kernel/rcutree.h         | 31 +++++++++++++++------
 kernel/rcutree_plugin.h  | 23 ++++++++++++++--
 6 files changed, 82 insertions(+), 55 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 6fe0363724e9..70331218e4b4 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -196,6 +196,8 @@ static inline void rcu_read_lock_sched(void)
 	__acquire(RCU_SCHED);
 	rcu_read_acquire();
 }
+
+/* Used by lockdep and tracing: cannot be traced, cannot call lockdep. */
 static inline notrace void rcu_read_lock_sched_notrace(void)
 {
 	preempt_disable_notrace();
@@ -213,6 +215,8 @@ static inline void rcu_read_unlock_sched(void)
 	__release(RCU_SCHED);
 	preempt_enable();
 }
+
+/* Used by lockdep and tracing: cannot be traced, cannot call lockdep. */
 static inline notrace void rcu_read_unlock_sched_notrace(void)
 {
 	__release(RCU_SCHED);
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 88109c87f29c..19a3b06943e0 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -90,10 +90,6 @@ extern long rcu_batches_completed(void);
 extern long rcu_batches_completed_bh(void);
 extern long rcu_batches_completed_sched(void);
 
-static inline void rcu_init_sched(void)
-{
-}
-
 #ifdef CONFIG_NO_HZ
 void rcu_enter_nohz(void);
 void rcu_exit_nohz(void);
@@ -106,7 +102,7 @@ static inline void rcu_exit_nohz(void)
 }
 #endif /* CONFIG_NO_HZ */
 
-/* A context switch is a grace period for rcutree. */
+/* A context switch is a grace period for RCU-sched and RCU-bh. */
 static inline int rcu_blocking_is_gp(void)
 {
 	return num_online_cpus() == 1;
diff --git a/init/main.c b/init/main.c
index 34971becbd3c..833d675677d1 100644
--- a/init/main.c
+++ b/init/main.c
@@ -782,7 +782,6 @@ static void __init do_initcalls(void)
  */
 static void __init do_basic_setup(void)
 {
-	rcu_init_sched(); /* needed by module_init stage. */
 	init_workqueues();
 	cpuset_init_smp();
 	usermodehelper_init();
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index f85b6842d1e1..53a5ef0ca911 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -81,24 +81,29 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
 struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
 DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
 
-extern long rcu_batches_completed_sched(void);
-static struct rcu_node *rcu_get_root(struct rcu_state *rsp);
-static void cpu_quiet_msk(unsigned long mask, struct rcu_state *rsp,
-			  struct rcu_node *rnp, unsigned long flags);
-static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags);
+/* Forward declarations for rcutree_plugin.h */
+static inline void rcu_bootup_announce(void);
+long rcu_batches_completed(void);
+static void rcu_preempt_note_context_switch(int cpu);
+static int rcu_preempted_readers(struct rcu_node *rnp);
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+static void rcu_print_task_stall(struct rcu_node *rnp);
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
 #ifdef CONFIG_HOTPLUG_CPU
-static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp);
+static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
+				      struct rcu_node *rnp,
+				      struct rcu_data *rdp);
+static void rcu_preempt_offline_cpu(int cpu);
 #endif /* #ifdef CONFIG_HOTPLUG_CPU */
-static void __rcu_process_callbacks(struct rcu_state *rsp,
-				    struct rcu_data *rdp);
-static void __call_rcu(struct rcu_head *head,
-		       void (*func)(struct rcu_head *rcu),
-		       struct rcu_state *rsp);
-static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp);
-static void __cpuinit rcu_init_percpu_data(int cpu, struct rcu_state *rsp,
-					   int preemptable);
+static void rcu_preempt_check_callbacks(int cpu);
+static void rcu_preempt_process_callbacks(void);
+void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
+static int rcu_preempt_pending(int cpu);
+static int rcu_preempt_needs_cpu(int cpu);
+static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
+static void __init __rcu_init_preempt(void);
 
-#include "rcutree_plugin.h"
 
 /*
  * Return true if an RCU grace period is in progress.  The ACCESS_ONCE()s
@@ -377,7 +382,7 @@ static long dyntick_recall_completed(struct rcu_state *rsp)
 /*
  * Snapshot the specified CPU's dynticks counter so that we can later
  * credit them with an implicit quiescent state.  Return 1 if this CPU
- * is already in a quiescent state courtesy of dynticks idle mode.
+ * is in dynticks idle mode, which is an extended quiescent state.
  */
 static int dyntick_save_progress_counter(struct rcu_data *rdp)
 {
@@ -624,9 +629,15 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
 	note_new_gpnum(rsp, rdp);
 
 	/*
-	 * Because we are first, we know that all our callbacks will
-	 * be covered by this upcoming grace period, even the ones
-	 * that were registered arbitrarily recently.
+	 * Because this CPU just now started the new grace period, we know
+	 * that all of its callbacks will be covered by this upcoming grace
+	 * period, even the ones that were registered arbitrarily recently.
+	 * Therefore, advance all outstanding callbacks to RCU_WAIT_TAIL.
+	 *
+	 * Other CPUs cannot be sure exactly when the grace period started.
+	 * Therefore, their recently registered callbacks must pass through
+	 * an additional RCU_NEXT_READY stage, so that they will be handled
+	 * by the next RCU grace period.
 	 */
 	rdp->nxttail[RCU_NEXT_READY_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
 	rdp->nxttail[RCU_WAIT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
@@ -886,7 +897,7 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
 
 	/*
 	 * Move callbacks from the outgoing CPU to the running CPU.
-	 * Note that the outgoing CPU is now quiscent, so it is now
+	 * Note that the outgoing CPU is now quiescent, so it is now
 	 * (uncharacteristically) safe to access its rcu_data structure.
 	 * Note also that we must carefully retain the order of the
 	 * outgoing CPU's callbacks in order for rcu_barrier() to work
@@ -1577,25 +1588,6 @@ do { \
 	} \
 } while (0)
 
-#ifdef CONFIG_TREE_PREEMPT_RCU
-
-void __init __rcu_init_preempt(void)
-{
-	int i;			/* All used by RCU_INIT_FLAVOR(). */
-	int j;
-	struct rcu_node *rnp;
-
-	RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data);
-}
-
-#else /* #ifdef CONFIG_TREE_PREEMPT_RCU */
-
-void __init __rcu_init_preempt(void)
-{
-}
-
-#endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
-
 void __init __rcu_init(void)
 {
 	int i;			/* All used by RCU_INIT_FLAVOR(). */
@@ -1612,6 +1604,8 @@ void __init __rcu_init(void)
 	open_softirq(RCU_SOFTIRQ, rcu_process_callbacks);
 }
 
+#include "rcutree_plugin.h"
+
 module_param(blimit, int, 0);
 module_param(qhimark, int, 0);
 module_param(qlowmark, int, 0);
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 9aa8c8a160d8..a48d11f37b4c 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -79,15 +79,21 @@ struct rcu_dynticks {
  * Definition for node within the RCU grace-period-detection hierarchy.
  */
 struct rcu_node {
-	spinlock_t lock;
+	spinlock_t lock;	/* Root rcu_node's lock protects some */
+				/*  rcu_state fields as well as following. */
 	long	gpnum;		/* Current grace period for this node. */
 				/*  This will either be equal to or one */
 				/*  behind the root rcu_node's gpnum. */
 	unsigned long qsmask;	/* CPUs or groups that need to switch in */
 				/*  order for current grace period to proceed.*/
+				/*  In leaf rcu_node, each bit corresponds to */
+				/*  an rcu_data structure, otherwise, each */
+				/*  bit corresponds to a child rcu_node */
+				/*  structure. */
 	unsigned long qsmaskinit;
 				/* Per-GP initialization for qsmask. */
 	unsigned long grpmask;	/* Mask to apply to parent qsmask. */
+				/*  Only one bit will be set in this mask. */
 	int	grplo;		/* lowest-numbered CPU or group here. */
 	int	grphi;		/* highest-numbered CPU or group here. */
 	u8	grpnum;		/* CPU/group number for next level up. */
@@ -95,6 +101,9 @@ struct rcu_node {
 	struct rcu_node *parent;
 	struct list_head blocked_tasks[2];
 				/* Tasks blocked in RCU read-side critsect. */
+				/*  Grace period number (->gpnum) x blocked */
+				/*  by tasks on the (x & 0x1) element of the */
+				/*  blocked_tasks[] array. */
 } ____cacheline_internodealigned_in_smp;
 
 /* Index values for nxttail array in struct rcu_data. */
@@ -126,19 +135,22 @@ struct rcu_data {
 	 * Any of the partitions might be empty, in which case the
 	 * pointer to that partition will be equal to the pointer for
 	 * the following partition.  When the list is empty, all of
-	 * the nxttail elements point to nxtlist, which is NULL.
+	 * the nxttail elements point to the ->nxtlist pointer itself,
+	 * which in that case is NULL.
 	 *
-	 * [*nxttail[RCU_NEXT_READY_TAIL], NULL = *nxttail[RCU_NEXT_TAIL]):
-	 *	Entries that might have arrived after current GP ended
-	 * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]):
-	 *	Entries known to have arrived before current GP ended
-	 * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]):
-	 *	Entries that batch # <= ->completed - 1: waiting for current GP
 	 * [nxtlist, *nxttail[RCU_DONE_TAIL]):
 	 *	Entries that batch # <= ->completed
 	 *	The grace period for these entries has completed, and
 	 *	the other grace-period-completed entries may be moved
 	 *	here temporarily in rcu_process_callbacks().
+	 * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]):
+	 *	Entries that batch # <= ->completed - 1: waiting for current GP
+	 * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]):
+	 *	Entries known to have arrived before current GP ended
+	 * [*nxttail[RCU_NEXT_READY_TAIL], *nxttail[RCU_NEXT_TAIL]):
+	 *	Entries that might have arrived after current GP ended
+	 *	Note that the value of *nxttail[RCU_NEXT_TAIL] will
+	 *	always be NULL, as this is the end of the list.
 	 */
 	struct rcu_head *nxtlist;
 	struct rcu_head **nxttail[RCU_NEXT_SIZE];
@@ -216,6 +228,9 @@ struct rcu_state {
 						/* Force QS state. */
 	long	gpnum;				/* Current gp number. */
 	long	completed;			/* # of last completed gp. */
+
+	/* End  of fields guarded by root rcu_node's lock. */
+
 	spinlock_t onofflock;			/* exclude on/offline and */
 						/*  starting new GP. */
 	spinlock_t fqslock;			/* Only one task forcing */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 8ff1ba7b3c43..65250219ab6d 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -418,6 +418,18 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
 	rcu_init_percpu_data(cpu, &rcu_preempt_state, 1);
 }
 
+/*
+ * Initialize preemptable RCU's state structures.
+ */
+static void __init __rcu_init_preempt(void)
+{
+	int i;			/* All used by RCU_INIT_FLAVOR(). */
+	int j;
+	struct rcu_node *rnp;
+
+	RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data);
+}
+
 /*
  * Check for a task exiting while in a preemptable-RCU read-side
  * critical section, clean up if so.  No need to issue warnings,
@@ -518,7 +530,7 @@ static void rcu_preempt_offline_cpu(int cpu)
  * Because preemptable RCU does not exist, it never has any callbacks
  * to check.
  */
-void rcu_preempt_check_callbacks(int cpu)
+static void rcu_preempt_check_callbacks(int cpu)
 {
 }
 
@@ -526,7 +538,7 @@ void rcu_preempt_check_callbacks(int cpu)
  * Because preemptable RCU does not exist, it never has any callbacks
  * to process.
  */
-void rcu_preempt_process_callbacks(void)
+static void rcu_preempt_process_callbacks(void)
 {
 }
 
@@ -563,4 +575,11 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
 {
 }
 
+/*
+ * Because preemptable RCU does not exist, it need not be initialized.
+ */
+static void __init __rcu_init_preempt(void)
+{
+}
+
 #endif /* #else #ifdef CONFIG_TREE_PREEMPT_RCU */
-- 
cgit v1.2.3


From 9b2619aff0332e95ea5eb7a0d75b0208818d871c Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 23 Sep 2009 09:50:43 -0700
Subject: rcu: Clean up code to address Ingo's checkpatch feedback

Move declarations and update storage classes to make checkpatch happy.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: akpm@linux-foundation.org
Cc: mathieu.desnoyers@polymtl.ca
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: dhowells@redhat.com
LKML-Reference: <12537246441701-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/rcutree.h |  6 +++++-
 kernel/rcupdate.c       |  3 ---
 kernel/rcutorture.c     |  4 +---
 kernel/rcutree.c        | 23 -----------------------
 kernel/rcutree.h        | 26 +++++++++++++++++++++++++-
 kernel/rcutree_trace.c  | 12 ++++++------
 6 files changed, 37 insertions(+), 37 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 19a3b06943e0..46e9ab3ee6e1 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -30,10 +30,14 @@
 #ifndef __LINUX_RCUTREE_H
 #define __LINUX_RCUTREE_H
 
+struct notifier_block;
+
 extern void rcu_sched_qs(int cpu);
 extern void rcu_bh_qs(int cpu);
-
+extern int rcu_cpu_notify(struct notifier_block *self,
+			  unsigned long action, void *hcpu);
 extern int rcu_needs_cpu(int cpu);
+extern int rcu_expedited_torture_stats(char *page);
 
 #ifdef CONFIG_TREE_PREEMPT_RCU
 
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 37ac45483082..8e795133b33d 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -259,9 +259,6 @@ static void rcu_migrate_callback(struct rcu_head *notused)
 		wake_up(&rcu_migrate_wq);
 }
 
-extern int rcu_cpu_notify(struct notifier_block *self,
-			  unsigned long action, void *hcpu);
-
 static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self,
 		unsigned long action, void *hcpu)
 {
diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 233768f21f97..697c0a0229d4 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -606,8 +606,6 @@ static struct rcu_torture_ops sched_ops_sync = {
 	.name		= "sched_sync"
 };
 
-extern int rcu_expedited_torture_stats(char *page);
-
 static struct rcu_torture_ops sched_expedited_ops = {
 	.init		= rcu_sync_torture_init,
 	.cleanup	= NULL,
@@ -650,7 +648,7 @@ rcu_torture_writer(void *arg)
 		old_rp = rcu_torture_current;
 		rp->rtort_mbtest = 1;
 		rcu_assign_pointer(rcu_torture_current, rp);
-		smp_wmb();
+		smp_wmb(); /* Mods to old_rp must follow rcu_assign_pointer() */
 		if (old_rp) {
 			i = old_rp->rtort_pipe_count;
 			if (i > RCU_TORTURE_PIPE_LEN)
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 53a5ef0ca911..8e52cde7b8f7 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -81,29 +81,6 @@ DEFINE_PER_CPU(struct rcu_data, rcu_sched_data);
 struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
 DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
 
-/* Forward declarations for rcutree_plugin.h */
-static inline void rcu_bootup_announce(void);
-long rcu_batches_completed(void);
-static void rcu_preempt_note_context_switch(int cpu);
-static int rcu_preempted_readers(struct rcu_node *rnp);
-#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
-static void rcu_print_task_stall(struct rcu_node *rnp);
-#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
-static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
-#ifdef CONFIG_HOTPLUG_CPU
-static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
-				      struct rcu_node *rnp,
-				      struct rcu_data *rdp);
-static void rcu_preempt_offline_cpu(int cpu);
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
-static void rcu_preempt_check_callbacks(int cpu);
-static void rcu_preempt_process_callbacks(void);
-void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
-static int rcu_preempt_pending(int cpu);
-static int rcu_preempt_needs_cpu(int cpu);
-static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
-static void __init __rcu_init_preempt(void);
-
 
 /*
  * Return true if an RCU grace period is in progress.  The ACCESS_ONCE()s
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index a48d11f37b4c..e6ab31cc28ba 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -270,5 +270,29 @@ extern struct rcu_state rcu_preempt_state;
 DECLARE_PER_CPU(struct rcu_data, rcu_preempt_data);
 #endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
 
-#endif /* #ifdef RCU_TREE_NONCORE */
+#else /* #ifdef RCU_TREE_NONCORE */
 
+/* Forward declarations for rcutree_plugin.h */
+static inline void rcu_bootup_announce(void);
+long rcu_batches_completed(void);
+static void rcu_preempt_note_context_switch(int cpu);
+static int rcu_preempted_readers(struct rcu_node *rnp);
+#ifdef CONFIG_RCU_CPU_STALL_DETECTOR
+static void rcu_print_task_stall(struct rcu_node *rnp);
+#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
+static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
+#ifdef CONFIG_HOTPLUG_CPU
+static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
+				      struct rcu_node *rnp,
+				      struct rcu_data *rdp);
+static void rcu_preempt_offline_cpu(int cpu);
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
+static void rcu_preempt_check_callbacks(int cpu);
+static void rcu_preempt_process_callbacks(void);
+void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
+static int rcu_preempt_pending(int cpu);
+static int rcu_preempt_needs_cpu(int cpu);
+static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
+static void __init __rcu_init_preempt(void);
+
+#endif /* #else #ifdef RCU_TREE_NONCORE */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index c89f5e9fd173..f09af28b8262 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -93,7 +93,7 @@ static int rcudata_open(struct inode *inode, struct file *file)
 	return single_open(file, show_rcudata, NULL);
 }
 
-static struct file_operations rcudata_fops = {
+static const struct file_operations rcudata_fops = {
 	.owner = THIS_MODULE,
 	.open = rcudata_open,
 	.read = seq_read,
@@ -145,7 +145,7 @@ static int rcudata_csv_open(struct inode *inode, struct file *file)
 	return single_open(file, show_rcudata_csv, NULL);
 }
 
-static struct file_operations rcudata_csv_fops = {
+static const struct file_operations rcudata_csv_fops = {
 	.owner = THIS_MODULE,
 	.open = rcudata_csv_open,
 	.read = seq_read,
@@ -159,7 +159,7 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
 	struct rcu_node *rnp;
 
 	seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x "
-	              "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n",
+		      "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n",
 		   rsp->completed, rsp->gpnum, rsp->signaled,
 		   (long)(rsp->jiffies_force_qs - jiffies),
 		   (int)(jiffies & 0xffff),
@@ -196,7 +196,7 @@ static int rcuhier_open(struct inode *inode, struct file *file)
 	return single_open(file, show_rcuhier, NULL);
 }
 
-static struct file_operations rcuhier_fops = {
+static const struct file_operations rcuhier_fops = {
 	.owner = THIS_MODULE,
 	.open = rcuhier_open,
 	.read = seq_read,
@@ -222,7 +222,7 @@ static int rcugp_open(struct inode *inode, struct file *file)
 	return single_open(file, show_rcugp, NULL);
 }
 
-static struct file_operations rcugp_fops = {
+static const struct file_operations rcugp_fops = {
 	.owner = THIS_MODULE,
 	.open = rcugp_open,
 	.read = seq_read,
@@ -276,7 +276,7 @@ static int rcu_pending_open(struct inode *inode, struct file *file)
 	return single_open(file, show_rcu_pending, NULL);
 }
 
-static struct file_operations rcu_pending_fops = {
+static const struct file_operations rcu_pending_fops = {
 	.owner = THIS_MODULE,
 	.open = rcu_pending_open,
 	.read = seq_read,
-- 
cgit v1.2.3


From d3f6302e7e51b41af86c6496ffb2f95e8f2179df Mon Sep 17 00:00:00 2001
From: Roland Dreier <rdreier@cisco.com>
Date: Thu, 24 Sep 2009 14:07:55 -0700
Subject: hrtimer: Remove overly verbose "switch to high res mode" message

On big systems, printing <number of CPUs> copies of

    Switched to high resolution mode on CPU nnn

clutters up the kernel log for minimal gain.  Just get rid of them.

Signed-off-by: Roland Dreier <rolandd@cisco.com>
LKML-Reference: <ada1vlw126s.fsf_-_@cisco.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/hrtimer.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index e5d98ce50f89..42672790ee13 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -720,8 +720,6 @@ static int hrtimer_switch_to_hres(void)
 	/* "Retrigger" the interrupt to get things going */
 	retrigger_next_event(NULL);
 	local_irq_restore(flags);
-	printk(KERN_DEBUG "Switched to high resolution mode on CPU %d\n",
-	       smp_processor_id());
 	return 1;
 }
 
-- 
cgit v1.2.3


From 152f9d0710a62708710161bce1b29fa8292c8c11 Mon Sep 17 00:00:00 2001
From: Eric Dumazet <eric.dumazet@gmail.com>
Date: Wed, 30 Sep 2009 20:36:19 +0200
Subject: sched_clock: Fix atomicity/continuity bug by using cmpxchg64()

Commit def0a9b2573 (sched_clock: Make it NMI safe) assumed
cmpxchg() of 64bit values was available on X86_32.

That is not so - and causes some subtle scheduler misbehavior due
to incorrect timestamps off to up by ~4 seconds.

Two symptoms are known right now:

 - interactivity problems seen by Arjan: up to 600 msecs
   latencies instead of the expected 20-40 msecs. These
   latencies are very visible on the desktop.

 - incorrect CPU stats: occasionally too high percentages in 'top',
   and crazy CPU usage stats.

Reported-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Cc: John Stultz <johnstul@us.ibm.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20090930170754.0886ff2e@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_clock.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index ac2e1dc708bd..479ce5682d7c 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -127,7 +127,7 @@ again:
 	clock = wrap_max(clock, min_clock);
 	clock = wrap_min(clock, max_clock);
 
-	if (cmpxchg(&scd->clock, old_clock, clock) != old_clock)
+	if (cmpxchg64(&scd->clock, old_clock, clock) != old_clock)
 		goto again;
 
 	return clock;
@@ -163,7 +163,7 @@ again:
 		val = remote_clock;
 	}
 
-	if (cmpxchg(ptr, old_val, val) != old_val)
+	if (cmpxchg64(ptr, old_val, val) != old_val)
 		goto again;
 
 	return val;
-- 
cgit v1.2.3


From 33974093c024f08caadd2fc71a83bd811ed1831d Mon Sep 17 00:00:00 2001
From: Matt Fleming <matthew.fleming@imgtec.com>
Date: Mon, 28 Sep 2009 16:43:01 +0100
Subject: tracing: Fix infinite recursion in ftrace_update_pid_func()

When CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST is enabled
__ftrace_trace_function contains the current trace function, not
ftrace_trace_function.

In ftrace_update_pid_func() we currently incorrectly assign the
value of ftrace_trace_function to __ftrace_trace_funcion before
returning.

Without this patch it is possible to execute an infinite recursion
whereby ftrace_test_stop_func() calls __ftrace_trace_function,
which was assigned ftrace_test_stop_func() in
ftrace_update_pid_func().

Signed-off-by: Matt Fleming <matthew.fleming@imgtec.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1254152581-18347-1-git-send-email-matt@console-pimps.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 46592feab5a6..3724756e41ca 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -225,7 +225,11 @@ static void ftrace_update_pid_func(void)
 	if (ftrace_trace_function == ftrace_stub)
 		return;
 
+#ifdef CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST
 	func = ftrace_trace_function;
+#else
+	func = __ftrace_trace_function;
+#endif
 
 	if (ftrace_pid_trace) {
 		set_ftrace_pid_function(func);
-- 
cgit v1.2.3


From 8c9ed8e14c342ec5e7f27e7e498f62409a10eb29 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Fri, 25 Sep 2009 13:51:17 +0800
Subject: perf_event: Fix event group handling in __perf_event_sched_*()

Paul Mackerras says:

 "Actually, looking at this more closely, it has to be a group
 leader anyway since it's at the top level of ctx->group_list.  In
 fact I see four places where we do:

  list_for_each_entry(event, &ctx->group_list, group_entry) {
	if (event == event->group_leader)
		...

 or the equivalent, three of which appear to have been introduced
 by afedadf2 ("perf_counter: Optimize sched in/out of counters")
 back in May by Peter Z.

 As far as I can see the if () is superfluous in each case (a
 singleton event will be a group of 1 and will have its
 group_leader pointing to itself)."

 [ See: http://marc.info/?l=linux-kernel&m=125361238901442&w=2 ]

And Peter Zijlstra points out this is a bugfix:

 "The intent was to call event_sched_{in,out}() for single event
  groups because that's cheaper than group_sched_{in,out}(),
  however..

  - as you noticed, I got the condition wrong, it should have read:

      list_empty(&event->sibling_list)

  - it failed to call group_can_go_on() which deals with ->exclusive.

  - it also doesn't call hw_perf_group_sched_in() which might break
    power."

 [ See: http://marc.info/?l=linux-kernel&m=125369523318583&w=2 ]

Changelog v1->v2:

 - Fix the title name according to Peter Zijlstra's suggestion

 - Remove the comments and WARN_ON_ONCE() as Peter Zijlstra's
   suggestion

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <4ABC5A55.7000208@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 30 ++++++++----------------------
 1 file changed, 8 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 0f86feb6db0c..e50543db642a 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1030,14 +1030,10 @@ void __perf_event_sched_out(struct perf_event_context *ctx,
 	update_context_time(ctx);
 
 	perf_disable();
-	if (ctx->nr_active) {
-		list_for_each_entry(event, &ctx->group_list, group_entry) {
-			if (event != event->group_leader)
-				event_sched_out(event, cpuctx, ctx);
-			else
-				group_sched_out(event, cpuctx, ctx);
-		}
-	}
+	if (ctx->nr_active)
+		list_for_each_entry(event, &ctx->group_list, group_entry)
+			group_sched_out(event, cpuctx, ctx);
+
 	perf_enable();
  out:
 	spin_unlock(&ctx->lock);
@@ -1258,12 +1254,8 @@ __perf_event_sched_in(struct perf_event_context *ctx,
 		if (event->cpu != -1 && event->cpu != cpu)
 			continue;
 
-		if (event != event->group_leader)
-			event_sched_in(event, cpuctx, ctx, cpu);
-		else {
-			if (group_can_go_on(event, cpuctx, 1))
-				group_sched_in(event, cpuctx, ctx, cpu);
-		}
+		if (group_can_go_on(event, cpuctx, 1))
+			group_sched_in(event, cpuctx, ctx, cpu);
 
 		/*
 		 * If this pinned group hasn't been scheduled,
@@ -1291,15 +1283,9 @@ __perf_event_sched_in(struct perf_event_context *ctx,
 		if (event->cpu != -1 && event->cpu != cpu)
 			continue;
 
-		if (event != event->group_leader) {
-			if (event_sched_in(event, cpuctx, ctx, cpu))
+		if (group_can_go_on(event, cpuctx, can_add_hw))
+			if (group_sched_in(event, cpuctx, ctx, cpu))
 				can_add_hw = 0;
-		} else {
-			if (group_can_go_on(event, cpuctx, can_add_hw)) {
-				if (group_sched_in(event, cpuctx, ctx, cpu))
-					can_add_hw = 0;
-			}
-		}
 	}
 	perf_enable();
  out:
-- 
cgit v1.2.3


From 27f9994c50e95f3a5a81fe4c7491a9f9cffe6ec0 Mon Sep 17 00:00:00 2001
From: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Date: Fri, 25 Sep 2009 13:54:01 +0800
Subject: perf_event: Clean up perf_event_init_task()

While at it: we can traverse ctx->group_list to get all
group leader, it should be safe since we hold ctx->mutex.

Changlog v1->v2:

  - remove WARN_ON_ONCE() according to Peter Zijlstra's suggestion

Signed-off-by: Xiao Guangrong <xiaoguangrong@cn.fujitsu.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <4ABC5AF9.6060808@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index e50543db642a..e491fb087939 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -4767,9 +4767,7 @@ int perf_event_init_task(struct task_struct *child)
 	 * We dont have to disable NMIs - we are only looking at
 	 * the list, not manipulating it:
 	 */
-	list_for_each_entry_rcu(event, &parent_ctx->event_list, event_entry) {
-		if (event != event->group_leader)
-			continue;
+	list_for_each_entry(event, &parent_ctx->group_list, group_entry) {
 
 		if (!event->attr.inherit) {
 			inherited_all = 0;
-- 
cgit v1.2.3


From f9ac5a69edee0ee7e06a05727226e3f275306c8d Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Mon, 28 Sep 2009 16:55:40 +0900
Subject: kmemtrace: Fix up tracer registration

Commit ddc1637af217dbd8bc51f30e6d24e84476a869a6 ("kmemtrace: Print
binary output only if 'bin' option is set") ended up inverting the
error detection logic. register_tracer() returns 0 on success,
which this change caused to treat as an error, resulting in:

[    0.132000] Warning: could not register the kmem tracer

as well as bailing out of the initcall with an error value. This
restores the old logic.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <20090928075540.GD6668@linux-sh.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/kmemtrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index 81b1645c8549..a91da69f153a 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -501,7 +501,7 @@ static int __init init_kmem_tracer(void)
 		return 1;
 	}
 
-	if (!register_tracer(&kmem_tracer)) {
+	if (register_tracer(&kmem_tracer) != 0) {
 		pr_warning("Warning: could not register the kmem tracer\n");
 		return 1;
 	}
-- 
cgit v1.2.3


From 48c0d4d4c04dd520c55e0fd756fa4e7c83de3d13 Mon Sep 17 00:00:00 2001
From: Zdenek Kabelac <zdenek.kabelac@gmail.com>
Date: Fri, 25 Sep 2009 06:19:26 +0200
Subject: Add missing blk_trace_remove_sysfs to be in pair with
 blk_trace_init_sysfs

Add missing blk_trace_remove_sysfs to be in pair with blk_trace_init_sysfs
introduced in commit 1d54ad6da9192fed5dd3b60224d9f2dfea0dcd82.
Release kobject also in case the request_fn is NULL.

Problem was noticed via kmemleak backtrace when some sysfs entries were
note properly destroyed during  device removal:

unreferenced object 0xffff88001aa76640 (size 80):
  comm "lvcreate", pid 2120, jiffies 4294885144
  hex dump (first 32 bytes):
    01 00 00 00 00 00 00 00 f0 65 a7 1a 00 88 ff ff  .........e......
    90 66 a7 1a 00 88 ff ff 86 1d 53 81 ff ff ff ff  .f........S.....
  backtrace:
    [<ffffffff813f9cc6>] kmemleak_alloc+0x26/0x60
    [<ffffffff8111d693>] kmem_cache_alloc+0x133/0x1c0
    [<ffffffff81195891>] sysfs_new_dirent+0x41/0x120
    [<ffffffff81194b0c>] sysfs_add_file_mode+0x3c/0xb0
    [<ffffffff81197c81>] internal_create_group+0xc1/0x1a0
    [<ffffffff81197d93>] sysfs_create_group+0x13/0x20
    [<ffffffff810d8004>] blk_trace_init_sysfs+0x14/0x20
    [<ffffffff8123f45c>] blk_register_queue+0x3c/0xf0
    [<ffffffff812447e4>] add_disk+0x94/0x160
    [<ffffffffa00d8b08>] dm_create+0x598/0x6e0 [dm_mod]
    [<ffffffffa00de951>] dev_create+0x51/0x350 [dm_mod]
    [<ffffffffa00de823>] ctl_ioctl+0x1a3/0x240 [dm_mod]
    [<ffffffffa00de8f2>] dm_compat_ctl_ioctl+0x12/0x20 [dm_mod]
    [<ffffffff81177bfd>] compat_sys_ioctl+0xcd/0x4f0
    [<ffffffff81036ed8>] sysenter_dispatch+0x7/0x2c
    [<ffffffffffffffff>] 0xffffffffffffffff

Signed-off-by: Zdenek Kabelac <zkabelac@redhat.com>
Reviewed-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-sysfs.c            | 11 ++++++-----
 include/linux/blktrace_api.h |  2 ++
 kernel/trace/blktrace.c      |  5 +++++
 3 files changed, 13 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index b78c9c3e2670..8a6d81afb284 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -452,6 +452,7 @@ int blk_register_queue(struct gendisk *disk)
 	if (ret) {
 		kobject_uevent(&q->kobj, KOBJ_REMOVE);
 		kobject_del(&q->kobj);
+		blk_trace_remove_sysfs(disk_to_dev(disk));
 		return ret;
 	}
 
@@ -465,11 +466,11 @@ void blk_unregister_queue(struct gendisk *disk)
 	if (WARN_ON(!q))
 		return;
 
-	if (q->request_fn) {
+	if (q->request_fn)
 		elv_unregister_queue(q);
 
-		kobject_uevent(&q->kobj, KOBJ_REMOVE);
-		kobject_del(&q->kobj);
-		kobject_put(&disk_to_dev(disk)->kobj);
-	}
+	kobject_uevent(&q->kobj, KOBJ_REMOVE);
+	kobject_del(&q->kobj);
+	blk_trace_remove_sysfs(disk_to_dev(disk));
+	kobject_put(&disk_to_dev(disk)->kobj);
 }
diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index 7e4350ece0f8..622939a23299 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -198,6 +198,7 @@ extern int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 			   char __user *arg);
 extern int blk_trace_startstop(struct request_queue *q, int start);
 extern int blk_trace_remove(struct request_queue *q);
+extern void blk_trace_remove_sysfs(struct device *dev);
 extern int blk_trace_init_sysfs(struct device *dev);
 
 extern struct attribute_group blk_trace_attr_group;
@@ -211,6 +212,7 @@ extern struct attribute_group blk_trace_attr_group;
 # define blk_trace_startstop(q, start)			(-ENOTTY)
 # define blk_trace_remove(q)				(-ENOTTY)
 # define blk_add_trace_msg(q, fmt, ...)			do { } while (0)
+# define blk_trace_remove_sysfs(struct device *dev)	do { } while (0)
 static inline int blk_trace_init_sysfs(struct device *dev)
 {
 	return 0;
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 3eb159c277c8..60b5c5a3d4b4 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1657,6 +1657,11 @@ int blk_trace_init_sysfs(struct device *dev)
 	return sysfs_create_group(&dev->kobj, &blk_trace_attr_group);
 }
 
+void blk_trace_remove_sysfs(struct device *dev)
+{
+	sysfs_remove_group(&dev->kobj, &blk_trace_attr_group);
+}
+
 #endif /* CONFIG_BLK_DEV_IO_TRACE */
 
 #ifdef CONFIG_EVENT_TRACING
-- 
cgit v1.2.3


From b0da3f0dada78832c9da03ad2152ae76bd9a2496 Mon Sep 17 00:00:00 2001
From: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Date: Thu, 1 Oct 2009 21:16:13 +0200
Subject: Add a tracepoint for block request remapping

Since 2.6.31 now has request-based device-mapper, it's useful to have
a tracepoint for request-remapping as well as bio-remapping.
This patch adds a tracepoint for request-remapping, trace_block_rq_remap().

Signed-off-by: Kiyoshi Ueda <k-ueda@ct.jp.nec.com>
Signed-off-by: Jun'ichi Nomura <j-nomura@ce.jp.nec.com>
Cc: Alasdair G Kergon <agk@redhat.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blk-core.c             |  1 +
 include/linux/blktrace_api.h |  2 +-
 include/trace/events/block.h | 33 +++++++++++++++++++++++++++++++++
 kernel/trace/blktrace.c      | 34 ++++++++++++++++++++++++++++++++++
 4 files changed, 69 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/block/blk-core.c b/block/blk-core.c
index 34504f309728..ddaaea4fdffc 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -34,6 +34,7 @@
 #include "blk.h"
 
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_remap);
+EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
 EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
 
 static int __make_request(struct request_queue *q, struct bio *bio);
diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index 622939a23299..3b73b9992b26 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -212,7 +212,7 @@ extern struct attribute_group blk_trace_attr_group;
 # define blk_trace_startstop(q, start)			(-ENOTTY)
 # define blk_trace_remove(q)				(-ENOTTY)
 # define blk_add_trace_msg(q, fmt, ...)			do { } while (0)
-# define blk_trace_remove_sysfs(struct device *dev)	do { } while (0)
+# define blk_trace_remove_sysfs(dev)			do { } while (0)
 static inline int blk_trace_init_sysfs(struct device *dev)
 {
 	return 0;
diff --git a/include/trace/events/block.h b/include/trace/events/block.h
index d86af94691c2..00405b5f624a 100644
--- a/include/trace/events/block.h
+++ b/include/trace/events/block.h
@@ -488,6 +488,39 @@ TRACE_EVENT(block_remap,
 		  (unsigned long long)__entry->old_sector)
 );
 
+TRACE_EVENT(block_rq_remap,
+
+	TP_PROTO(struct request_queue *q, struct request *rq, dev_t dev,
+		 sector_t from),
+
+	TP_ARGS(q, rq, dev, from),
+
+	TP_STRUCT__entry(
+		__field( dev_t,		dev		)
+		__field( sector_t,	sector		)
+		__field( unsigned int,	nr_sector	)
+		__field( dev_t,		old_dev		)
+		__field( sector_t,	old_sector	)
+		__array( char,		rwbs,	6	)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= disk_devt(rq->rq_disk);
+		__entry->sector		= blk_rq_pos(rq);
+		__entry->nr_sector	= blk_rq_sectors(rq);
+		__entry->old_dev	= dev;
+		__entry->old_sector	= from;
+		blk_fill_rwbs_rq(__entry->rwbs, rq);
+	),
+
+	TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu",
+		  MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs,
+		  (unsigned long long)__entry->sector,
+		  __entry->nr_sector,
+		  MAJOR(__entry->old_dev), MINOR(__entry->old_dev),
+		  (unsigned long long)__entry->old_sector)
+);
+
 #endif /* _TRACE_BLOCK_H */
 
 /* This part must be outside protection */
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 60b5c5a3d4b4..d9d6206e0b14 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -855,6 +855,37 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
 			sizeof(r), &r);
 }
 
+/**
+ * blk_add_trace_rq_remap - Add a trace for a request-remap operation
+ * @q:		queue the io is for
+ * @rq:		the source request
+ * @dev:	target device
+ * @from:	source sector
+ *
+ * Description:
+ *     Device mapper remaps request to other devices.
+ *     Add a trace for that action.
+ *
+ **/
+static void blk_add_trace_rq_remap(struct request_queue *q,
+				   struct request *rq, dev_t dev,
+				   sector_t from)
+{
+	struct blk_trace *bt = q->blk_trace;
+	struct blk_io_trace_remap r;
+
+	if (likely(!bt))
+		return;
+
+	r.device_from = cpu_to_be32(dev);
+	r.device_to   = cpu_to_be32(disk_devt(rq->rq_disk));
+	r.sector_from = cpu_to_be64(from);
+
+	__blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq),
+			rq_data_dir(rq), BLK_TA_REMAP, !!rq->errors,
+			sizeof(r), &r);
+}
+
 /**
  * blk_add_driver_data - Add binary message with driver-specific data
  * @q:		queue the io is for
@@ -922,10 +953,13 @@ static void blk_register_tracepoints(void)
 	WARN_ON(ret);
 	ret = register_trace_block_remap(blk_add_trace_remap);
 	WARN_ON(ret);
+	ret = register_trace_block_rq_remap(blk_add_trace_rq_remap);
+	WARN_ON(ret);
 }
 
 static void blk_unregister_tracepoints(void)
 {
+	unregister_trace_block_rq_remap(blk_add_trace_rq_remap);
 	unregister_trace_block_remap(blk_add_trace_remap);
 	unregister_trace_block_split(blk_add_trace_split);
 	unregister_trace_block_unplug_io(blk_add_trace_unplug_io);
-- 
cgit v1.2.3


From 3ae91c21dd29e413f4111978152c14061f0984b0 Mon Sep 17 00:00:00 2001
From: Paul Mundt <lethal@linux-sh.org>
Date: Thu, 1 Oct 2009 15:43:54 -0700
Subject: module: fix up CONFIG_KALLSYMS=n build.

Starting from commit 4a4962263f07d14660849ec134ee42b63e95ea9a "reduce
symbol table for loaded modules (v2)", the kernel/module.c build is broken
with CONFIG_KALLSYMS disabled.

  CC      kernel/module.o
kernel/module.c:1995: warning: type defaults to 'int' in declaration of 'Elf_Hdr'
kernel/module.c:1995: error: expected ';', ',' or ')' before '*' token
kernel/module.c: In function 'load_module':
kernel/module.c:2203: error: 'strmap' undeclared (first use in this function)
kernel/module.c:2203: error: (Each undeclared identifier is reported only once
kernel/module.c:2203: error: for each function it appears in.)
kernel/module.c:2239: error: 'symoffs' undeclared (first use in this function)
kernel/module.c:2239: error: implicit declaration of function 'layout_symtab'
kernel/module.c:2240: error: 'stroffs' undeclared (first use in this function)
make[1]: *** [kernel/module.o] Error 1
make: *** [kernel/module.o] Error 2

There are three different issues:

    - layout_symtab() takes a const Elf_Ehdr

    - layout_symtab() needs to return a value

    - symoffs/stroffs/strmap are referenced by the load_module() code
      despite being ifdefed out, which seems unnecessary given the noop
      behaviour of layout_symtab()/add_kallsyms() in the case of
      CONFIG_KALLSYMS=n.

Signed-off-by: Paul Mundt <lethal@linux-sh.org>
Acked-by: Jan Beulich <jbeulich@novell.com>
Acked-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/module.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index fe748a86d452..8b7d8805819d 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1992,12 +1992,14 @@ static inline unsigned long layout_symtab(struct module *mod,
 					  Elf_Shdr *sechdrs,
 					  unsigned int symindex,
 					  unsigned int strindex,
-					  const Elf_Hdr *hdr,
+					  const Elf_Ehdr *hdr,
 					  const char *secstrings,
 					  unsigned long *pstroffs,
 					  unsigned long *strmap)
 {
+	return 0;
 }
+
 static inline void add_kallsyms(struct module *mod,
 				Elf_Shdr *sechdrs,
 				unsigned int shnum,
@@ -2081,9 +2083,8 @@ static noinline struct module *load_module(void __user *umod,
 	struct module *mod;
 	long err = 0;
 	void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
-#ifdef CONFIG_KALLSYMS
 	unsigned long symoffs, stroffs, *strmap;
-#endif
+
 	mm_segment_t old_fs;
 
 	DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n",
-- 
cgit v1.2.3


From 828c09509b9695271bcbdc53e9fc9a6a737148d2 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 1 Oct 2009 15:43:56 -0700
Subject: const: constify remaining file_operations

[akpm@linux-foundation.org: fix KVM]
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Acked-by: Mike Frysinger <vapier@gentoo.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/arm/mach-ns9xxx/clock.c               |  2 +-
 arch/blackfin/mach-bf561/coreb.c           |  2 +-
 arch/cris/arch-v10/drivers/sync_serial.c   |  2 +-
 arch/cris/arch-v32/drivers/mach-fs/gpio.c  |  2 +-
 arch/powerpc/kvm/timing.c                  |  2 +-
 arch/powerpc/platforms/cell/spufs/file.c   |  2 +-
 arch/powerpc/platforms/pseries/dtl.c       |  2 +-
 arch/x86/xen/debugfs.c                     |  2 +-
 drivers/acpi/video.c                       |  2 +-
 drivers/block/cciss.c                      |  2 +-
 drivers/char/apm-emulation.c               |  2 +-
 drivers/char/bfin-otp.c                    |  2 +-
 drivers/char/xilinx_hwicap/xilinx_hwicap.c |  2 +-
 drivers/gpio/gpiolib.c                     |  2 +-
 drivers/hwmon/fschmd.c                     |  2 +-
 drivers/lguest/lguest_user.c               |  2 +-
 drivers/media/dvb/dvb-core/dmxdev.c        |  2 +-
 drivers/media/dvb/firewire/firedtv-ci.c    |  2 +-
 drivers/misc/phantom.c                     |  2 +-
 drivers/misc/sgi-gru/grufile.c             |  3 +--
 drivers/mmc/core/debugfs.c                 |  2 +-
 drivers/s390/cio/qdio_debug.c              |  2 +-
 drivers/s390/cio/qdio_perf.c               |  2 +-
 drivers/scsi/sg.c                          | 43 +++++++++++++++++++++---------
 drivers/spi/spidev.c                       |  2 +-
 drivers/usb/class/usbtmc.c                 |  2 +-
 drivers/usb/gadget/printer.c               |  2 +-
 drivers/usb/host/whci/debug.c              |  6 ++---
 drivers/usb/misc/rio500.c                  |  3 +--
 drivers/uwb/uwb-debug.c                    |  6 ++---
 fs/btrfs/ctree.h                           |  2 +-
 fs/btrfs/file.c                            |  2 +-
 fs/btrfs/inode.c                           |  4 +--
 fs/jbd2/journal.c                          |  2 +-
 fs/nfsd/nfsctl.c                           |  2 +-
 fs/nilfs2/dir.c                            |  2 +-
 fs/nilfs2/file.c                           |  2 +-
 fs/nilfs2/mdt.c                            |  2 +-
 fs/nilfs2/nilfs.h                          |  4 +--
 fs/ocfs2/cluster/heartbeat.c               |  2 +-
 fs/ocfs2/cluster/netdebug.c                |  4 +--
 fs/ocfs2/dlm/dlmdebug.c                    |  8 +++---
 fs/ocfs2/super.c                           |  2 +-
 fs/omfs/dir.c                              |  2 +-
 fs/omfs/file.c                             |  2 +-
 fs/omfs/omfs.h                             |  4 +--
 include/linux/cgroup.h                     |  2 +-
 include/linux/fs.h                         |  2 +-
 kernel/cgroup.c                            | 10 +++----
 kernel/kprobes.c                           |  4 +--
 kernel/rcutree_trace.c                     | 10 +++----
 kernel/sched.c                             |  2 +-
 kernel/time/timer_list.c                   |  2 +-
 kernel/time/timer_stats.c                  |  2 +-
 samples/tracepoints/tracepoint-sample.c    |  2 +-
 security/integrity/ima/ima_fs.c            | 10 +++----
 virt/kvm/kvm_main.c                        |  2 +-
 57 files changed, 110 insertions(+), 95 deletions(-)

(limited to 'kernel')

diff --git a/arch/arm/mach-ns9xxx/clock.c b/arch/arm/mach-ns9xxx/clock.c
index 44ed20d4a388..cf81cbc57544 100644
--- a/arch/arm/mach-ns9xxx/clock.c
+++ b/arch/arm/mach-ns9xxx/clock.c
@@ -195,7 +195,7 @@ static int clk_debugfs_open(struct inode *inode, struct file *file)
 	return single_open(file, clk_debugfs_show, NULL);
 }
 
-static struct file_operations clk_debugfs_operations = {
+static const struct file_operations clk_debugfs_operations = {
 	.open = clk_debugfs_open,
 	.read = seq_read,
 	.llseek = seq_lseek,
diff --git a/arch/blackfin/mach-bf561/coreb.c b/arch/blackfin/mach-bf561/coreb.c
index 93635a766f9c..1e60a92dd602 100644
--- a/arch/blackfin/mach-bf561/coreb.c
+++ b/arch/blackfin/mach-bf561/coreb.c
@@ -48,7 +48,7 @@ coreb_ioctl(struct inode *inode, struct file *file, unsigned int cmd, unsigned l
 	return ret;
 }
 
-static struct file_operations coreb_fops = {
+static const struct file_operations coreb_fops = {
 	.owner   = THIS_MODULE,
 	.ioctl   = coreb_ioctl,
 };
diff --git a/arch/cris/arch-v10/drivers/sync_serial.c b/arch/cris/arch-v10/drivers/sync_serial.c
index 6cc1a0319a5d..562b9a7feae7 100644
--- a/arch/cris/arch-v10/drivers/sync_serial.c
+++ b/arch/cris/arch-v10/drivers/sync_serial.c
@@ -244,7 +244,7 @@ static unsigned sync_serial_prescale_shadow;
 
 #define NUMBER_OF_PORTS 2
 
-static struct file_operations sync_serial_fops = {
+static const struct file_operations sync_serial_fops = {
 	.owner   = THIS_MODULE,
 	.write   = sync_serial_write,
 	.read    = sync_serial_read,
diff --git a/arch/cris/arch-v32/drivers/mach-fs/gpio.c b/arch/cris/arch-v32/drivers/mach-fs/gpio.c
index fe1fde893887..d89ab80498ed 100644
--- a/arch/cris/arch-v32/drivers/mach-fs/gpio.c
+++ b/arch/cris/arch-v32/drivers/mach-fs/gpio.c
@@ -855,7 +855,7 @@ gpio_leds_ioctl(unsigned int cmd, unsigned long arg)
 	return 0;
 }
 
-struct file_operations gpio_fops = {
+static const struct file_operations gpio_fops = {
 	.owner       = THIS_MODULE,
 	.poll        = gpio_poll,
 	.ioctl       = gpio_ioctl,
diff --git a/arch/powerpc/kvm/timing.c b/arch/powerpc/kvm/timing.c
index 47ee603f558e..2aa371e30079 100644
--- a/arch/powerpc/kvm/timing.c
+++ b/arch/powerpc/kvm/timing.c
@@ -201,7 +201,7 @@ static int kvmppc_exit_timing_open(struct inode *inode, struct file *file)
 	return single_open(file, kvmppc_exit_timing_show, inode->i_private);
 }
 
-static struct file_operations kvmppc_exit_timing_fops = {
+static const struct file_operations kvmppc_exit_timing_fops = {
 	.owner   = THIS_MODULE,
 	.open    = kvmppc_exit_timing_open,
 	.read    = seq_read,
diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c
index 961309446170..884e8bcec499 100644
--- a/arch/powerpc/platforms/cell/spufs/file.c
+++ b/arch/powerpc/platforms/cell/spufs/file.c
@@ -147,7 +147,7 @@ static int __fops ## _open(struct inode *inode, struct file *file)	\
 	__simple_attr_check_format(__fmt, 0ull);			\
 	return spufs_attr_open(inode, file, __get, __set, __fmt);	\
 }									\
-static struct file_operations __fops = {				\
+static const struct file_operations __fops = {				\
 	.owner	 = THIS_MODULE,						\
 	.open	 = __fops ## _open,					\
 	.release = spufs_attr_release,					\
diff --git a/arch/powerpc/platforms/pseries/dtl.c b/arch/powerpc/platforms/pseries/dtl.c
index ab69925d579b..937a544a236d 100644
--- a/arch/powerpc/platforms/pseries/dtl.c
+++ b/arch/powerpc/platforms/pseries/dtl.c
@@ -209,7 +209,7 @@ static ssize_t dtl_file_read(struct file *filp, char __user *buf, size_t len,
 	return n_read * sizeof(struct dtl_entry);
 }
 
-static struct file_operations dtl_fops = {
+static const struct file_operations dtl_fops = {
 	.open		= dtl_file_open,
 	.release	= dtl_file_release,
 	.read		= dtl_file_read,
diff --git a/arch/x86/xen/debugfs.c b/arch/x86/xen/debugfs.c
index b53225d2cac3..e133ce25e290 100644
--- a/arch/x86/xen/debugfs.c
+++ b/arch/x86/xen/debugfs.c
@@ -100,7 +100,7 @@ static int xen_array_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
-static struct file_operations u32_array_fops = {
+static const struct file_operations u32_array_fops = {
 	.owner	= THIS_MODULE,
 	.open	= u32_array_open,
 	.release= xen_array_release,
diff --git a/drivers/acpi/video.c b/drivers/acpi/video.c
index a4fddb24476f..f6e54bf8dd96 100644
--- a/drivers/acpi/video.c
+++ b/drivers/acpi/video.c
@@ -285,7 +285,7 @@ static int acpi_video_device_brightness_open_fs(struct inode *inode,
 						struct file *file);
 static ssize_t acpi_video_device_write_brightness(struct file *file,
 	const char __user *buffer, size_t count, loff_t *data);
-static struct file_operations acpi_video_device_brightness_fops = {
+static const struct file_operations acpi_video_device_brightness_fops = {
 	.owner = THIS_MODULE,
 	.open = acpi_video_device_brightness_open_fs,
 	.read = seq_read,
diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c
index 24c3e21ab263..1ece0b47b581 100644
--- a/drivers/block/cciss.c
+++ b/drivers/block/cciss.c
@@ -426,7 +426,7 @@ out:
 	return err;
 }
 
-static struct file_operations cciss_proc_fops = {
+static const struct file_operations cciss_proc_fops = {
 	.owner	 = THIS_MODULE,
 	.open    = cciss_seq_open,
 	.read    = seq_read,
diff --git a/drivers/char/apm-emulation.c b/drivers/char/apm-emulation.c
index aaca40283be9..4f568cb9af3f 100644
--- a/drivers/char/apm-emulation.c
+++ b/drivers/char/apm-emulation.c
@@ -393,7 +393,7 @@ static int apm_open(struct inode * inode, struct file * filp)
 	return as ? 0 : -ENOMEM;
 }
 
-static struct file_operations apm_bios_fops = {
+static const struct file_operations apm_bios_fops = {
 	.owner		= THIS_MODULE,
 	.read		= apm_read,
 	.poll		= apm_poll,
diff --git a/drivers/char/bfin-otp.c b/drivers/char/bfin-otp.c
index e3dd24bff514..836d4f0a876f 100644
--- a/drivers/char/bfin-otp.c
+++ b/drivers/char/bfin-otp.c
@@ -217,7 +217,7 @@ static long bfin_otp_ioctl(struct file *filp, unsigned cmd, unsigned long arg)
 # define bfin_otp_ioctl NULL
 #endif
 
-static struct file_operations bfin_otp_fops = {
+static const struct file_operations bfin_otp_fops = {
 	.owner          = THIS_MODULE,
 	.unlocked_ioctl = bfin_otp_ioctl,
 	.read           = bfin_otp_read,
diff --git a/drivers/char/xilinx_hwicap/xilinx_hwicap.c b/drivers/char/xilinx_hwicap/xilinx_hwicap.c
index f40ab699860f..4846d50199f3 100644
--- a/drivers/char/xilinx_hwicap/xilinx_hwicap.c
+++ b/drivers/char/xilinx_hwicap/xilinx_hwicap.c
@@ -559,7 +559,7 @@ static int hwicap_release(struct inode *inode, struct file *file)
 	return status;
 }
 
-static struct file_operations hwicap_fops = {
+static const struct file_operations hwicap_fops = {
 	.owner = THIS_MODULE,
 	.write = hwicap_write,
 	.read = hwicap_read,
diff --git a/drivers/gpio/gpiolib.c b/drivers/gpio/gpiolib.c
index bb11a429394a..662ed923d9eb 100644
--- a/drivers/gpio/gpiolib.c
+++ b/drivers/gpio/gpiolib.c
@@ -1487,7 +1487,7 @@ static int gpiolib_open(struct inode *inode, struct file *file)
 	return single_open(file, gpiolib_show, NULL);
 }
 
-static struct file_operations gpiolib_operations = {
+static const struct file_operations gpiolib_operations = {
 	.open		= gpiolib_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
diff --git a/drivers/hwmon/fschmd.c b/drivers/hwmon/fschmd.c
index ea955edde87e..2a7a85a6dc36 100644
--- a/drivers/hwmon/fschmd.c
+++ b/drivers/hwmon/fschmd.c
@@ -915,7 +915,7 @@ static int watchdog_ioctl(struct inode *inode, struct file *filp,
 	return ret;
 }
 
-static struct file_operations watchdog_fops = {
+static const struct file_operations watchdog_fops = {
 	.owner = THIS_MODULE,
 	.llseek = no_llseek,
 	.open = watchdog_open,
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
index b4d3f7ca554f..bd1632388e4a 100644
--- a/drivers/lguest/lguest_user.c
+++ b/drivers/lguest/lguest_user.c
@@ -508,7 +508,7 @@ static int close(struct inode *inode, struct file *file)
  * uses: reading and writing a character device called /dev/lguest.  All the
  * work happens in the read(), write() and close() routines:
  */
-static struct file_operations lguest_fops = {
+static const struct file_operations lguest_fops = {
 	.owner	 = THIS_MODULE,
 	.release = close,
 	.write	 = write,
diff --git a/drivers/media/dvb/dvb-core/dmxdev.c b/drivers/media/dvb/dvb-core/dmxdev.c
index 3750ff48cba1..516414983593 100644
--- a/drivers/media/dvb/dvb-core/dmxdev.c
+++ b/drivers/media/dvb/dvb-core/dmxdev.c
@@ -1203,7 +1203,7 @@ static unsigned int dvb_dvr_poll(struct file *file, poll_table *wait)
 	return mask;
 }
 
-static struct file_operations dvb_dvr_fops = {
+static const struct file_operations dvb_dvr_fops = {
 	.owner = THIS_MODULE,
 	.read = dvb_dvr_read,
 	.write = dvb_dvr_write,
diff --git a/drivers/media/dvb/firewire/firedtv-ci.c b/drivers/media/dvb/firewire/firedtv-ci.c
index eeb80d0ea3ff..853e04b7cb36 100644
--- a/drivers/media/dvb/firewire/firedtv-ci.c
+++ b/drivers/media/dvb/firewire/firedtv-ci.c
@@ -215,7 +215,7 @@ static unsigned int fdtv_ca_io_poll(struct file *file, poll_table *wait)
 	return POLLIN;
 }
 
-static struct file_operations fdtv_ca_fops = {
+static const struct file_operations fdtv_ca_fops = {
 	.owner		= THIS_MODULE,
 	.ioctl		= dvb_generic_ioctl,
 	.open		= dvb_generic_open,
diff --git a/drivers/misc/phantom.c b/drivers/misc/phantom.c
index fa57b67593ae..90a95ce8dc34 100644
--- a/drivers/misc/phantom.c
+++ b/drivers/misc/phantom.c
@@ -271,7 +271,7 @@ static unsigned int phantom_poll(struct file *file, poll_table *wait)
 	return mask;
 }
 
-static struct file_operations phantom_file_ops = {
+static const struct file_operations phantom_file_ops = {
 	.open = phantom_open,
 	.release = phantom_release,
 	.unlocked_ioctl = phantom_ioctl,
diff --git a/drivers/misc/sgi-gru/grufile.c b/drivers/misc/sgi-gru/grufile.c
index 300e7ba391a0..41c8fe2a928c 100644
--- a/drivers/misc/sgi-gru/grufile.c
+++ b/drivers/misc/sgi-gru/grufile.c
@@ -53,7 +53,6 @@ struct gru_stats_s gru_stats;
 /* Guaranteed user available resources on each node */
 static int max_user_cbrs, max_user_dsr_bytes;
 
-static struct file_operations gru_fops;
 static struct miscdevice gru_miscdev;
 
 
@@ -426,7 +425,7 @@ static void __exit gru_exit(void)
 	gru_proc_exit();
 }
 
-static struct file_operations gru_fops = {
+static const struct file_operations gru_fops = {
 	.owner		= THIS_MODULE,
 	.unlocked_ioctl	= gru_file_unlocked_ioctl,
 	.mmap		= gru_file_mmap,
diff --git a/drivers/mmc/core/debugfs.c b/drivers/mmc/core/debugfs.c
index 610dbd1fcc82..96d10f40fb23 100644
--- a/drivers/mmc/core/debugfs.c
+++ b/drivers/mmc/core/debugfs.c
@@ -240,7 +240,7 @@ static int mmc_ext_csd_release(struct inode *inode, struct file *file)
 	return 0;
 }
 
-static struct file_operations mmc_dbg_ext_csd_fops = {
+static const struct file_operations mmc_dbg_ext_csd_fops = {
 	.open		= mmc_ext_csd_open,
 	.read		= mmc_ext_csd_read,
 	.release	= mmc_ext_csd_release,
diff --git a/drivers/s390/cio/qdio_debug.c b/drivers/s390/cio/qdio_debug.c
index 1b78f639ead3..76769978285f 100644
--- a/drivers/s390/cio/qdio_debug.c
+++ b/drivers/s390/cio/qdio_debug.c
@@ -125,7 +125,7 @@ static int qstat_seq_open(struct inode *inode, struct file *filp)
 			   filp->f_path.dentry->d_inode->i_private);
 }
 
-static struct file_operations debugfs_fops = {
+static const struct file_operations debugfs_fops = {
 	.owner	 = THIS_MODULE,
 	.open	 = qstat_seq_open,
 	.read	 = seq_read,
diff --git a/drivers/s390/cio/qdio_perf.c b/drivers/s390/cio/qdio_perf.c
index eff943923c6f..968e3c7c2632 100644
--- a/drivers/s390/cio/qdio_perf.c
+++ b/drivers/s390/cio/qdio_perf.c
@@ -84,7 +84,7 @@ static int qdio_perf_seq_open(struct inode *inode, struct file *filp)
 	return single_open(filp, qdio_perf_proc_show, NULL);
 }
 
-static struct file_operations qdio_perf_proc_fops = {
+static const struct file_operations qdio_perf_proc_fops = {
 	.owner	 = THIS_MODULE,
 	.open	 = qdio_perf_seq_open,
 	.read	 = seq_read,
diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c
index 0cb049f5cc56..747a5e5c1276 100644
--- a/drivers/scsi/sg.c
+++ b/drivers/scsi/sg.c
@@ -1317,7 +1317,7 @@ static void sg_rq_end_io(struct request *rq, int uptodate)
 	}
 }
 
-static struct file_operations sg_fops = {
+static const struct file_operations sg_fops = {
 	.owner = THIS_MODULE,
 	.read = sg_read,
 	.write = sg_write,
@@ -2194,9 +2194,11 @@ static int sg_proc_seq_show_int(struct seq_file *s, void *v);
 static int sg_proc_single_open_adio(struct inode *inode, struct file *file);
 static ssize_t sg_proc_write_adio(struct file *filp, const char __user *buffer,
 			          size_t count, loff_t *off);
-static struct file_operations adio_fops = {
-	/* .owner, .read and .llseek added in sg_proc_init() */
+static const struct file_operations adio_fops = {
+	.owner = THIS_MODULE,
 	.open = sg_proc_single_open_adio,
+	.read = seq_read,
+	.llseek = seq_lseek,
 	.write = sg_proc_write_adio,
 	.release = single_release,
 };
@@ -2204,23 +2206,32 @@ static struct file_operations adio_fops = {
 static int sg_proc_single_open_dressz(struct inode *inode, struct file *file);
 static ssize_t sg_proc_write_dressz(struct file *filp, 
 		const char __user *buffer, size_t count, loff_t *off);
-static struct file_operations dressz_fops = {
+static const struct file_operations dressz_fops = {
+	.owner = THIS_MODULE,
 	.open = sg_proc_single_open_dressz,
+	.read = seq_read,
+	.llseek = seq_lseek,
 	.write = sg_proc_write_dressz,
 	.release = single_release,
 };
 
 static int sg_proc_seq_show_version(struct seq_file *s, void *v);
 static int sg_proc_single_open_version(struct inode *inode, struct file *file);
-static struct file_operations version_fops = {
+static const struct file_operations version_fops = {
+	.owner = THIS_MODULE,
 	.open = sg_proc_single_open_version,
+	.read = seq_read,
+	.llseek = seq_lseek,
 	.release = single_release,
 };
 
 static int sg_proc_seq_show_devhdr(struct seq_file *s, void *v);
 static int sg_proc_single_open_devhdr(struct inode *inode, struct file *file);
-static struct file_operations devhdr_fops = {
+static const struct file_operations devhdr_fops = {
+	.owner = THIS_MODULE,
 	.open = sg_proc_single_open_devhdr,
+	.read = seq_read,
+	.llseek = seq_lseek,
 	.release = single_release,
 };
 
@@ -2229,8 +2240,11 @@ static int sg_proc_open_dev(struct inode *inode, struct file *file);
 static void * dev_seq_start(struct seq_file *s, loff_t *pos);
 static void * dev_seq_next(struct seq_file *s, void *v, loff_t *pos);
 static void dev_seq_stop(struct seq_file *s, void *v);
-static struct file_operations dev_fops = {
+static const struct file_operations dev_fops = {
+	.owner = THIS_MODULE,
 	.open = sg_proc_open_dev,
+	.read = seq_read,
+	.llseek = seq_lseek,
 	.release = seq_release,
 };
 static const struct seq_operations dev_seq_ops = {
@@ -2242,8 +2256,11 @@ static const struct seq_operations dev_seq_ops = {
 
 static int sg_proc_seq_show_devstrs(struct seq_file *s, void *v);
 static int sg_proc_open_devstrs(struct inode *inode, struct file *file);
-static struct file_operations devstrs_fops = {
+static const struct file_operations devstrs_fops = {
+	.owner = THIS_MODULE,
 	.open = sg_proc_open_devstrs,
+	.read = seq_read,
+	.llseek = seq_lseek,
 	.release = seq_release,
 };
 static const struct seq_operations devstrs_seq_ops = {
@@ -2255,8 +2272,11 @@ static const struct seq_operations devstrs_seq_ops = {
 
 static int sg_proc_seq_show_debug(struct seq_file *s, void *v);
 static int sg_proc_open_debug(struct inode *inode, struct file *file);
-static struct file_operations debug_fops = {
+static const struct file_operations debug_fops = {
+	.owner = THIS_MODULE,
 	.open = sg_proc_open_debug,
+	.read = seq_read,
+	.llseek = seq_lseek,
 	.release = seq_release,
 };
 static const struct seq_operations debug_seq_ops = {
@@ -2269,7 +2289,7 @@ static const struct seq_operations debug_seq_ops = {
 
 struct sg_proc_leaf {
 	const char * name;
-	struct file_operations * fops;
+	const struct file_operations * fops;
 };
 
 static struct sg_proc_leaf sg_proc_leaf_arr[] = {
@@ -2295,9 +2315,6 @@ sg_proc_init(void)
 	for (k = 0; k < num_leaves; ++k) {
 		leaf = &sg_proc_leaf_arr[k];
 		mask = leaf->fops->write ? S_IRUGO | S_IWUSR : S_IRUGO;
-		leaf->fops->owner = THIS_MODULE;
-		leaf->fops->read = seq_read;
-		leaf->fops->llseek = seq_lseek;
 		proc_create(leaf->name, mask, sg_proc_sgp, leaf->fops);
 	}
 	return 0;
diff --git a/drivers/spi/spidev.c b/drivers/spi/spidev.c
index f921bd1109e1..5d23983f02fc 100644
--- a/drivers/spi/spidev.c
+++ b/drivers/spi/spidev.c
@@ -537,7 +537,7 @@ static int spidev_release(struct inode *inode, struct file *filp)
 	return status;
 }
 
-static struct file_operations spidev_fops = {
+static const struct file_operations spidev_fops = {
 	.owner =	THIS_MODULE,
 	/* REVISIT switch to aio primitives, so that userspace
 	 * gets more complete API coverage.  It'll simplify things
diff --git a/drivers/usb/class/usbtmc.c b/drivers/usb/class/usbtmc.c
index 333ee02e7b2b..864f0ba6a344 100644
--- a/drivers/usb/class/usbtmc.c
+++ b/drivers/usb/class/usbtmc.c
@@ -993,7 +993,7 @@ skip_io_on_zombie:
 	return retval;
 }
 
-static struct file_operations fops = {
+static const struct file_operations fops = {
 	.owner		= THIS_MODULE,
 	.read		= usbtmc_read,
 	.write		= usbtmc_write,
diff --git a/drivers/usb/gadget/printer.c b/drivers/usb/gadget/printer.c
index 29500154d00c..2d867fd22413 100644
--- a/drivers/usb/gadget/printer.c
+++ b/drivers/usb/gadget/printer.c
@@ -875,7 +875,7 @@ printer_ioctl(struct file *fd, unsigned int code, unsigned long arg)
 }
 
 /* used after endpoint configuration */
-static struct file_operations printer_io_operations = {
+static const struct file_operations printer_io_operations = {
 	.owner =	THIS_MODULE,
 	.open =		printer_open,
 	.read =		printer_read,
diff --git a/drivers/usb/host/whci/debug.c b/drivers/usb/host/whci/debug.c
index cf2d45946c57..2273c815941f 100644
--- a/drivers/usb/host/whci/debug.c
+++ b/drivers/usb/host/whci/debug.c
@@ -134,7 +134,7 @@ static int pzl_open(struct inode *inode, struct file *file)
 	return single_open(file, pzl_print, inode->i_private);
 }
 
-static struct file_operations di_fops = {
+static const struct file_operations di_fops = {
 	.open    = di_open,
 	.read    = seq_read,
 	.llseek  = seq_lseek,
@@ -142,7 +142,7 @@ static struct file_operations di_fops = {
 	.owner   = THIS_MODULE,
 };
 
-static struct file_operations asl_fops = {
+static const struct file_operations asl_fops = {
 	.open    = asl_open,
 	.read    = seq_read,
 	.llseek  = seq_lseek,
@@ -150,7 +150,7 @@ static struct file_operations asl_fops = {
 	.owner   = THIS_MODULE,
 };
 
-static struct file_operations pzl_fops = {
+static const struct file_operations pzl_fops = {
 	.open    = pzl_open,
 	.read    = seq_read,
 	.llseek  = seq_lseek,
diff --git a/drivers/usb/misc/rio500.c b/drivers/usb/misc/rio500.c
index d645f3899fe1..32d0199d0c32 100644
--- a/drivers/usb/misc/rio500.c
+++ b/drivers/usb/misc/rio500.c
@@ -429,8 +429,7 @@ read_rio(struct file *file, char __user *buffer, size_t count, loff_t * ppos)
 	return read_count;
 }
 
-static struct
-file_operations usb_rio_fops = {
+static const struct file_operations usb_rio_fops = {
 	.owner =	THIS_MODULE,
 	.read =		read_rio,
 	.write =	write_rio,
diff --git a/drivers/uwb/uwb-debug.c b/drivers/uwb/uwb-debug.c
index 4a42993700c1..2eecec0c13c9 100644
--- a/drivers/uwb/uwb-debug.c
+++ b/drivers/uwb/uwb-debug.c
@@ -205,7 +205,7 @@ static ssize_t command_write(struct file *file, const char __user *buf,
 	return ret < 0 ? ret : len;
 }
 
-static struct file_operations command_fops = {
+static const struct file_operations command_fops = {
 	.open   = command_open,
 	.write  = command_write,
 	.read   = NULL,
@@ -255,7 +255,7 @@ static int reservations_open(struct inode *inode, struct file *file)
 	return single_open(file, reservations_print, inode->i_private);
 }
 
-static struct file_operations reservations_fops = {
+static const struct file_operations reservations_fops = {
 	.open    = reservations_open,
 	.read    = seq_read,
 	.llseek  = seq_lseek,
@@ -283,7 +283,7 @@ static int drp_avail_open(struct inode *inode, struct file *file)
 	return single_open(file, drp_avail_print, inode->i_private);
 }
 
-static struct file_operations drp_avail_fops = {
+static const struct file_operations drp_avail_fops = {
 	.open    = drp_avail_open,
 	.read    = seq_read,
 	.llseek  = seq_lseek,
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 80599b4e42bd..4484eb3408af 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -2326,7 +2326,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync);
 int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end,
 			    int skip_pinned);
 int btrfs_check_file(struct btrfs_root *root, struct inode *inode);
-extern struct file_operations btrfs_file_operations;
+extern const struct file_operations btrfs_file_operations;
 int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 		       struct btrfs_root *root, struct inode *inode,
 		       u64 start, u64 end, u64 locked_end,
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index a3492a3ad96b..9ed17dbe5c6e 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -1196,7 +1196,7 @@ static int btrfs_file_mmap(struct file	*filp, struct vm_area_struct *vma)
 	return 0;
 }
 
-struct file_operations btrfs_file_operations = {
+const struct file_operations btrfs_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
 	.aio_read       = generic_file_aio_read,
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index e9b76bcd1c12..b9fe06d751c0 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -62,7 +62,7 @@ static const struct inode_operations btrfs_special_inode_operations;
 static const struct inode_operations btrfs_file_inode_operations;
 static const struct address_space_operations btrfs_aops;
 static const struct address_space_operations btrfs_symlink_aops;
-static struct file_operations btrfs_dir_file_operations;
+static const struct file_operations btrfs_dir_file_operations;
 static struct extent_io_ops btrfs_extent_io_ops;
 
 static struct kmem_cache *btrfs_inode_cachep;
@@ -5544,7 +5544,7 @@ static const struct inode_operations btrfs_dir_ro_inode_operations = {
 	.permission	= btrfs_permission,
 };
 
-static struct file_operations btrfs_dir_file_operations = {
+static const struct file_operations btrfs_dir_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
 	.readdir	= btrfs_real_readdir,
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 761af77491f5..b0ab5219becb 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -770,7 +770,7 @@ static int jbd2_seq_info_release(struct inode *inode, struct file *file)
 	return seq_release(inode, file);
 }
 
-static struct file_operations jbd2_seq_info_fops = {
+static const struct file_operations jbd2_seq_info_fops = {
 	.owner		= THIS_MODULE,
 	.open           = jbd2_seq_info_open,
 	.read           = seq_read,
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 00388d2a3c99..5c01fc148ce8 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -176,7 +176,7 @@ static const struct file_operations exports_operations = {
 extern int nfsd_pool_stats_open(struct inode *inode, struct file *file);
 extern int nfsd_pool_stats_release(struct inode *inode, struct file *file);
 
-static struct file_operations pool_stats_operations = {
+static const struct file_operations pool_stats_operations = {
 	.open		= nfsd_pool_stats_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
diff --git a/fs/nilfs2/dir.c b/fs/nilfs2/dir.c
index 1a4fa04cf071..e097099bfc8f 100644
--- a/fs/nilfs2/dir.c
+++ b/fs/nilfs2/dir.c
@@ -697,7 +697,7 @@ not_empty:
 	return 0;
 }
 
-struct file_operations nilfs_dir_operations = {
+const struct file_operations nilfs_dir_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= generic_read_dir,
 	.readdir	= nilfs_readdir,
diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c
index 7d7b4983dee3..30292df443ce 100644
--- a/fs/nilfs2/file.c
+++ b/fs/nilfs2/file.c
@@ -134,7 +134,7 @@ static int nilfs_file_mmap(struct file *file, struct vm_area_struct *vma)
  * We have mostly NULL's here: the current defaults are ok for
  * the nilfs filesystem.
  */
-struct file_operations nilfs_file_operations = {
+const struct file_operations nilfs_file_operations = {
 	.llseek		= generic_file_llseek,
 	.read		= do_sync_read,
 	.write		= do_sync_write,
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index b18c4998f8d0..f6326112d647 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -433,7 +433,7 @@ static const struct address_space_operations def_mdt_aops = {
 };
 
 static const struct inode_operations def_mdt_iops;
-static struct file_operations def_mdt_fops;
+static const struct file_operations def_mdt_fops;
 
 /*
  * NILFS2 uses pseudo inodes for meta data files such as DAT, cpfile, sufile,
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index bad7368782d0..4da6f67e9a91 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -294,9 +294,9 @@ void nilfs_clear_gcdat_inode(struct the_nilfs *);
 /*
  * Inodes and files operations
  */
-extern struct file_operations nilfs_dir_operations;
+extern const struct file_operations nilfs_dir_operations;
 extern const struct inode_operations nilfs_file_inode_operations;
-extern struct file_operations nilfs_file_operations;
+extern const struct file_operations nilfs_file_operations;
 extern const struct address_space_operations nilfs_aops;
 extern const struct inode_operations nilfs_dir_inode_operations;
 extern const struct inode_operations nilfs_special_inode_operations;
diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c
index 09cc25d04611..c452d116b892 100644
--- a/fs/ocfs2/cluster/heartbeat.c
+++ b/fs/ocfs2/cluster/heartbeat.c
@@ -966,7 +966,7 @@ static ssize_t o2hb_debug_read(struct file *file, char __user *buf,
 }
 #endif  /* CONFIG_DEBUG_FS */
 
-static struct file_operations o2hb_debug_fops = {
+static const struct file_operations o2hb_debug_fops = {
 	.open =		o2hb_debug_open,
 	.release =	o2hb_debug_release,
 	.read =		o2hb_debug_read,
diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c
index cfb2be708abe..da794bc07a6c 100644
--- a/fs/ocfs2/cluster/netdebug.c
+++ b/fs/ocfs2/cluster/netdebug.c
@@ -207,7 +207,7 @@ static int nst_fop_release(struct inode *inode, struct file *file)
 	return seq_release_private(inode, file);
 }
 
-static struct file_operations nst_seq_fops = {
+static const struct file_operations nst_seq_fops = {
 	.open = nst_fop_open,
 	.read = seq_read,
 	.llseek = seq_lseek,
@@ -388,7 +388,7 @@ static int sc_fop_release(struct inode *inode, struct file *file)
 	return seq_release_private(inode, file);
 }
 
-static struct file_operations sc_seq_fops = {
+static const struct file_operations sc_seq_fops = {
 	.open = sc_fop_open,
 	.read = seq_read,
 	.llseek = seq_lseek,
diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c
index ca46002ec10e..42b0bad7a612 100644
--- a/fs/ocfs2/dlm/dlmdebug.c
+++ b/fs/ocfs2/dlm/dlmdebug.c
@@ -478,7 +478,7 @@ bail:
 	return -ENOMEM;
 }
 
-static struct file_operations debug_purgelist_fops = {
+static const struct file_operations debug_purgelist_fops = {
 	.open =		debug_purgelist_open,
 	.release =	debug_buffer_release,
 	.read =		debug_buffer_read,
@@ -538,7 +538,7 @@ bail:
 	return -ENOMEM;
 }
 
-static struct file_operations debug_mle_fops = {
+static const struct file_operations debug_mle_fops = {
 	.open =		debug_mle_open,
 	.release =	debug_buffer_release,
 	.read =		debug_buffer_read,
@@ -741,7 +741,7 @@ static int debug_lockres_release(struct inode *inode, struct file *file)
 	return seq_release_private(inode, file);
 }
 
-static struct file_operations debug_lockres_fops = {
+static const struct file_operations debug_lockres_fops = {
 	.open =		debug_lockres_open,
 	.release =	debug_lockres_release,
 	.read =		seq_read,
@@ -925,7 +925,7 @@ bail:
 	return -ENOMEM;
 }
 
-static struct file_operations debug_state_fops = {
+static const struct file_operations debug_state_fops = {
 	.open =		debug_state_open,
 	.release =	debug_buffer_release,
 	.read =		debug_buffer_read,
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 4cc3c890a2cd..c0e48aeebb1c 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -373,7 +373,7 @@ static ssize_t ocfs2_debug_read(struct file *file, char __user *buf,
 }
 #endif	/* CONFIG_DEBUG_FS */
 
-static struct file_operations ocfs2_osb_debug_fops = {
+static const struct file_operations ocfs2_osb_debug_fops = {
 	.open =		ocfs2_osb_debug_open,
 	.release =	ocfs2_debug_release,
 	.read =		ocfs2_debug_read,
diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c
index 3680bae335b5..b42d62419034 100644
--- a/fs/omfs/dir.c
+++ b/fs/omfs/dir.c
@@ -498,7 +498,7 @@ const struct inode_operations omfs_dir_inops = {
 	.rmdir = omfs_rmdir,
 };
 
-struct file_operations omfs_dir_operations = {
+const struct file_operations omfs_dir_operations = {
 	.read = generic_read_dir,
 	.readdir = omfs_readdir,
 	.llseek = generic_file_llseek,
diff --git a/fs/omfs/file.c b/fs/omfs/file.c
index 4845fbb18e6e..399487c09364 100644
--- a/fs/omfs/file.c
+++ b/fs/omfs/file.c
@@ -322,7 +322,7 @@ static sector_t omfs_bmap(struct address_space *mapping, sector_t block)
 	return generic_block_bmap(mapping, block, omfs_get_block);
 }
 
-struct file_operations omfs_file_operations = {
+const struct file_operations omfs_file_operations = {
 	.llseek = generic_file_llseek,
 	.read = do_sync_read,
 	.write = do_sync_write,
diff --git a/fs/omfs/omfs.h b/fs/omfs/omfs.h
index df71039945ac..ebe2fdbe535e 100644
--- a/fs/omfs/omfs.h
+++ b/fs/omfs/omfs.h
@@ -44,14 +44,14 @@ extern int omfs_allocate_range(struct super_block *sb, int min_request,
 extern int omfs_clear_range(struct super_block *sb, u64 block, int count);
 
 /* dir.c */
-extern struct file_operations omfs_dir_operations;
+extern const struct file_operations omfs_dir_operations;
 extern const struct inode_operations omfs_dir_inops;
 extern int omfs_make_empty(struct inode *inode, struct super_block *sb);
 extern int omfs_is_bad(struct omfs_sb_info *sbi, struct omfs_header *header,
 			u64 fsblock);
 
 /* file.c */
-extern struct file_operations omfs_file_operations;
+extern const struct file_operations omfs_file_operations;
 extern const struct inode_operations omfs_file_inops;
 extern const struct address_space_operations omfs_aops;
 extern void omfs_make_empty_table(struct buffer_head *bh, int offset);
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index b62bb9294d0c..0008dee66514 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -37,7 +37,7 @@ extern void cgroup_exit(struct task_struct *p, int run_callbacks);
 extern int cgroupstats_build(struct cgroupstats *stats,
 				struct dentry *dentry);
 
-extern struct file_operations proc_cgroup_operations;
+extern const struct file_operations proc_cgroup_operations;
 
 /* Define the enumeration of all cgroup subsystems */
 #define SUBSYS(_x) _x ## _subsys_id,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 2adaa2529f18..a1e6899d4b6c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2446,7 +2446,7 @@ static int __fops ## _open(struct inode *inode, struct file *file)	\
 	__simple_attr_check_format(__fmt, 0ull);			\
 	return simple_attr_open(inode, file, __get, __set, __fmt);	\
 }									\
-static struct file_operations __fops = {				\
+static const struct file_operations __fops = {				\
 	.owner	 = THIS_MODULE,						\
 	.open	 = __fops ## _open,					\
 	.release = simple_attr_release,					\
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 7ccba4bc5e3b..d2b88596efde 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -703,7 +703,7 @@ static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, int mode);
 static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry);
 static int cgroup_populate_dir(struct cgroup *cgrp);
 static const struct inode_operations cgroup_dir_inode_operations;
-static struct file_operations proc_cgroupstats_operations;
+static const struct file_operations proc_cgroupstats_operations;
 
 static struct backing_dev_info cgroup_backing_dev_info = {
 	.name		= "cgroup",
@@ -1863,7 +1863,7 @@ static int cgroup_seqfile_release(struct inode *inode, struct file *file)
 	return single_release(inode, file);
 }
 
-static struct file_operations cgroup_seqfile_operations = {
+static const struct file_operations cgroup_seqfile_operations = {
 	.read = seq_read,
 	.write = cgroup_file_write,
 	.llseek = seq_lseek,
@@ -1922,7 +1922,7 @@ static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry,
 	return simple_rename(old_dir, old_dentry, new_dir, new_dentry);
 }
 
-static struct file_operations cgroup_file_operations = {
+static const struct file_operations cgroup_file_operations = {
 	.read = cgroup_file_read,
 	.write = cgroup_file_write,
 	.llseek = generic_file_llseek,
@@ -3369,7 +3369,7 @@ static int cgroup_open(struct inode *inode, struct file *file)
 	return single_open(file, proc_cgroup_show, pid);
 }
 
-struct file_operations proc_cgroup_operations = {
+const struct file_operations proc_cgroup_operations = {
 	.open		= cgroup_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
@@ -3398,7 +3398,7 @@ static int cgroupstats_open(struct inode *inode, struct file *file)
 	return single_open(file, proc_cgroupstats_show, NULL);
 }
 
-static struct file_operations proc_cgroupstats_operations = {
+static const struct file_operations proc_cgroupstats_operations = {
 	.open = cgroupstats_open,
 	.read = seq_read,
 	.llseek = seq_lseek,
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index cfadc1291d0b..5240d75f4c60 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -1333,7 +1333,7 @@ static int __kprobes kprobes_open(struct inode *inode, struct file *filp)
 	return seq_open(filp, &kprobes_seq_ops);
 }
 
-static struct file_operations debugfs_kprobes_operations = {
+static const struct file_operations debugfs_kprobes_operations = {
 	.open           = kprobes_open,
 	.read           = seq_read,
 	.llseek         = seq_lseek,
@@ -1515,7 +1515,7 @@ static ssize_t write_enabled_file_bool(struct file *file,
 	return count;
 }
 
-static struct file_operations fops_kp = {
+static const struct file_operations fops_kp = {
 	.read =         read_enabled_file_bool,
 	.write =        write_enabled_file_bool,
 };
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index c89f5e9fd173..179e6ad80dc0 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -93,7 +93,7 @@ static int rcudata_open(struct inode *inode, struct file *file)
 	return single_open(file, show_rcudata, NULL);
 }
 
-static struct file_operations rcudata_fops = {
+static const struct file_operations rcudata_fops = {
 	.owner = THIS_MODULE,
 	.open = rcudata_open,
 	.read = seq_read,
@@ -145,7 +145,7 @@ static int rcudata_csv_open(struct inode *inode, struct file *file)
 	return single_open(file, show_rcudata_csv, NULL);
 }
 
-static struct file_operations rcudata_csv_fops = {
+static const struct file_operations rcudata_csv_fops = {
 	.owner = THIS_MODULE,
 	.open = rcudata_csv_open,
 	.read = seq_read,
@@ -196,7 +196,7 @@ static int rcuhier_open(struct inode *inode, struct file *file)
 	return single_open(file, show_rcuhier, NULL);
 }
 
-static struct file_operations rcuhier_fops = {
+static const struct file_operations rcuhier_fops = {
 	.owner = THIS_MODULE,
 	.open = rcuhier_open,
 	.read = seq_read,
@@ -222,7 +222,7 @@ static int rcugp_open(struct inode *inode, struct file *file)
 	return single_open(file, show_rcugp, NULL);
 }
 
-static struct file_operations rcugp_fops = {
+static const struct file_operations rcugp_fops = {
 	.owner = THIS_MODULE,
 	.open = rcugp_open,
 	.read = seq_read,
@@ -276,7 +276,7 @@ static int rcu_pending_open(struct inode *inode, struct file *file)
 	return single_open(file, show_rcu_pending, NULL);
 }
 
-static struct file_operations rcu_pending_fops = {
+static const struct file_operations rcu_pending_fops = {
 	.owner = THIS_MODULE,
 	.open = rcu_pending_open,
 	.read = seq_read,
diff --git a/kernel/sched.c b/kernel/sched.c
index ee61f454a98b..1535f3884b88 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -780,7 +780,7 @@ static int sched_feat_open(struct inode *inode, struct file *filp)
 	return single_open(filp, sched_feat_show, NULL);
 }
 
-static struct file_operations sched_feat_fops = {
+static const struct file_operations sched_feat_fops = {
 	.open		= sched_feat_open,
 	.write		= sched_feat_write,
 	.read		= seq_read,
diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c
index fddd69d16e03..1b5b7aa2fdfd 100644
--- a/kernel/time/timer_list.c
+++ b/kernel/time/timer_list.c
@@ -275,7 +275,7 @@ static int timer_list_open(struct inode *inode, struct file *filp)
 	return single_open(filp, timer_list_show, NULL);
 }
 
-static struct file_operations timer_list_fops = {
+static const struct file_operations timer_list_fops = {
 	.open		= timer_list_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
diff --git a/kernel/time/timer_stats.c b/kernel/time/timer_stats.c
index 4cde8b9c716f..ee5681f8d7ec 100644
--- a/kernel/time/timer_stats.c
+++ b/kernel/time/timer_stats.c
@@ -395,7 +395,7 @@ static int tstats_open(struct inode *inode, struct file *filp)
 	return single_open(filp, tstats_show, NULL);
 }
 
-static struct file_operations tstats_fops = {
+static const struct file_operations tstats_fops = {
 	.open		= tstats_open,
 	.read		= seq_read,
 	.write		= tstats_write,
diff --git a/samples/tracepoints/tracepoint-sample.c b/samples/tracepoints/tracepoint-sample.c
index 9cf80a11e8b6..26fab33ffa8c 100644
--- a/samples/tracepoints/tracepoint-sample.c
+++ b/samples/tracepoints/tracepoint-sample.c
@@ -28,7 +28,7 @@ static int my_open(struct inode *inode, struct file *file)
 	return -EPERM;
 }
 
-static struct file_operations mark_ops = {
+static const struct file_operations mark_ops = {
 	.open = my_open,
 };
 
diff --git a/security/integrity/ima/ima_fs.c b/security/integrity/ima/ima_fs.c
index 8e9777b76405..0c72c9c38956 100644
--- a/security/integrity/ima/ima_fs.c
+++ b/security/integrity/ima/ima_fs.c
@@ -43,7 +43,7 @@ static ssize_t ima_show_htable_violations(struct file *filp,
 	return ima_show_htable_value(buf, count, ppos, &ima_htable.violations);
 }
 
-static struct file_operations ima_htable_violations_ops = {
+static const struct file_operations ima_htable_violations_ops = {
 	.read = ima_show_htable_violations
 };
 
@@ -55,7 +55,7 @@ static ssize_t ima_show_measurements_count(struct file *filp,
 
 }
 
-static struct file_operations ima_measurements_count_ops = {
+static const struct file_operations ima_measurements_count_ops = {
 	.read = ima_show_measurements_count
 };
 
@@ -158,7 +158,7 @@ static int ima_measurements_open(struct inode *inode, struct file *file)
 	return seq_open(file, &ima_measurments_seqops);
 }
 
-static struct file_operations ima_measurements_ops = {
+static const struct file_operations ima_measurements_ops = {
 	.open = ima_measurements_open,
 	.read = seq_read,
 	.llseek = seq_lseek,
@@ -233,7 +233,7 @@ static int ima_ascii_measurements_open(struct inode *inode, struct file *file)
 	return seq_open(file, &ima_ascii_measurements_seqops);
 }
 
-static struct file_operations ima_ascii_measurements_ops = {
+static const struct file_operations ima_ascii_measurements_ops = {
 	.open = ima_ascii_measurements_open,
 	.read = seq_read,
 	.llseek = seq_lseek,
@@ -313,7 +313,7 @@ static int ima_release_policy(struct inode *inode, struct file *file)
 	return 0;
 }
 
-static struct file_operations ima_measure_policy_ops = {
+static const struct file_operations ima_measure_policy_ops = {
 	.open = ima_open_policy,
 	.write = ima_write_policy,
 	.release = ima_release_policy
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index b5e7e3f1183f..e79c54034bcd 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -2625,7 +2625,7 @@ static int vcpu_stat_get(void *_offset, u64 *val)
 
 DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, NULL, "%llu\n");
 
-static struct file_operations *stat_fops[] = {
+static const struct file_operations *stat_fops[] = {
 	[KVM_STAT_VCPU] = &vcpu_stat_fops,
 	[KVM_STAT_VM]   = &vm_stat_fops,
 };
-- 
cgit v1.2.3


From 3dece8347df6a16239fab10dadb370854f1c969c Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Thu, 1 Oct 2009 15:44:09 -0700
Subject: cgroup: catch bad css refcnt at css_put

__css_put() doesn't check a bug as refcnt goes to minus.
I think it should be caught. This patch adds a check for it.

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index d2b88596efde..ca83b73fba19 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3708,8 +3708,10 @@ static void check_for_release(struct cgroup *cgrp)
 void __css_put(struct cgroup_subsys_state *css)
 {
 	struct cgroup *cgrp = css->cgroup;
+	int val;
 	rcu_read_lock();
-	if (atomic_dec_return(&css->refcnt) == 1) {
+	val = atomic_dec_return(&css->refcnt);
+	if (val == 1) {
 		if (notify_on_release(cgrp)) {
 			set_bit(CGRP_RELEASABLE, &cgrp->flags);
 			check_for_release(cgrp);
@@ -3717,6 +3719,7 @@ void __css_put(struct cgroup_subsys_state *css)
 		cgroup_wakeup_rmdir_waiter(cgrp);
 	}
 	rcu_read_unlock();
+	WARN_ON_ONCE(val < 1);
 }
 
 /*
-- 
cgit v1.2.3


From 4e649152cbaa1aedd01821d200ab9d597fe469e4 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Thu, 1 Oct 2009 15:44:11 -0700
Subject: memcg: some modification to softlimit under hierarchical memory
 reclaim.

This patch clean up/fixes for memcg's uncharge soft limit path.

Problems:
  Now, res_counter_charge()/uncharge() handles softlimit information at
  charge/uncharge and softlimit-check is done when event counter per memcg
  goes over limit. Now, event counter per memcg is updated only when
  memory usage is over soft limit. Here, considering hierarchical memcg
  management, ancesotors should be taken care of.

  Now, ancerstors(hierarchy) are handled in charge() but not in uncharge().
  This is not good.

  Prolems:
  1. memcg's event counter incremented only when softlimit hits. That's bad.
     It makes event counter hard to be reused for other purpose.

  2. At uncharge, only the lowest level rescounter is handled. This is bug.
     Because ancesotor's event counter is not incremented, children should
     take care of them.

  3. res_counter_uncharge()'s 3rd argument is NULL in most case.
     ops under res_counter->lock should be small. No "if" sentense is better.

Fixes:
  * Removed soft_limit_xx poitner and checks in charge and uncharge.
    Do-check-only-when-necessary scheme works enough well without them.

  * make event-counter of memcg incremented at every charge/uncharge.
    (per-cpu area will be accessed soon anyway)

  * All ancestors are checked at soft-limit-check. This is necessary because
    ancesotor's event counter may never be modified. Then, they should be
    checked at the same time.

Reviewed-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/res_counter.h |   6 +--
 kernel/res_counter.c        |  18 +------
 mm/memcontrol.c             | 113 ++++++++++++++++++++------------------------
 3 files changed, 54 insertions(+), 83 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/res_counter.h b/include/linux/res_counter.h
index 731af71cddc9..fcb9884df618 100644
--- a/include/linux/res_counter.h
+++ b/include/linux/res_counter.h
@@ -114,8 +114,7 @@ void res_counter_init(struct res_counter *counter, struct res_counter *parent);
 int __must_check res_counter_charge_locked(struct res_counter *counter,
 		unsigned long val);
 int __must_check res_counter_charge(struct res_counter *counter,
-		unsigned long val, struct res_counter **limit_fail_at,
-		struct res_counter **soft_limit_at);
+		unsigned long val, struct res_counter **limit_fail_at);
 
 /*
  * uncharge - tell that some portion of the resource is released
@@ -128,8 +127,7 @@ int __must_check res_counter_charge(struct res_counter *counter,
  */
 
 void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val);
-void res_counter_uncharge(struct res_counter *counter, unsigned long val,
-				bool *was_soft_limit_excess);
+void res_counter_uncharge(struct res_counter *counter, unsigned long val);
 
 static inline bool res_counter_limit_check_locked(struct res_counter *cnt)
 {
diff --git a/kernel/res_counter.c b/kernel/res_counter.c
index 88faec23e833..bcdabf37c40b 100644
--- a/kernel/res_counter.c
+++ b/kernel/res_counter.c
@@ -37,27 +37,17 @@ int res_counter_charge_locked(struct res_counter *counter, unsigned long val)
 }
 
 int res_counter_charge(struct res_counter *counter, unsigned long val,
-			struct res_counter **limit_fail_at,
-			struct res_counter **soft_limit_fail_at)
+			struct res_counter **limit_fail_at)
 {
 	int ret;
 	unsigned long flags;
 	struct res_counter *c, *u;
 
 	*limit_fail_at = NULL;
-	if (soft_limit_fail_at)
-		*soft_limit_fail_at = NULL;
 	local_irq_save(flags);
 	for (c = counter; c != NULL; c = c->parent) {
 		spin_lock(&c->lock);
 		ret = res_counter_charge_locked(c, val);
-		/*
-		 * With soft limits, we return the highest ancestor
-		 * that exceeds its soft limit
-		 */
-		if (soft_limit_fail_at &&
-			!res_counter_soft_limit_check_locked(c))
-			*soft_limit_fail_at = c;
 		spin_unlock(&c->lock);
 		if (ret < 0) {
 			*limit_fail_at = c;
@@ -85,8 +75,7 @@ void res_counter_uncharge_locked(struct res_counter *counter, unsigned long val)
 	counter->usage -= val;
 }
 
-void res_counter_uncharge(struct res_counter *counter, unsigned long val,
-				bool *was_soft_limit_excess)
+void res_counter_uncharge(struct res_counter *counter, unsigned long val)
 {
 	unsigned long flags;
 	struct res_counter *c;
@@ -94,9 +83,6 @@ void res_counter_uncharge(struct res_counter *counter, unsigned long val,
 	local_irq_save(flags);
 	for (c = counter; c != NULL; c = c->parent) {
 		spin_lock(&c->lock);
-		if (was_soft_limit_excess)
-			*was_soft_limit_excess =
-				!res_counter_soft_limit_check_locked(c);
 		res_counter_uncharge_locked(c, val);
 		spin_unlock(&c->lock);
 	}
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 21a30629ca80..1ae8c439584a 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -352,16 +352,6 @@ __mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
 	mz->on_tree = false;
 }
 
-static void
-mem_cgroup_insert_exceeded(struct mem_cgroup *mem,
-				struct mem_cgroup_per_zone *mz,
-				struct mem_cgroup_tree_per_zone *mctz)
-{
-	spin_lock(&mctz->lock);
-	__mem_cgroup_insert_exceeded(mem, mz, mctz);
-	spin_unlock(&mctz->lock);
-}
-
 static void
 mem_cgroup_remove_exceeded(struct mem_cgroup *mem,
 				struct mem_cgroup_per_zone *mz,
@@ -392,34 +382,40 @@ static bool mem_cgroup_soft_limit_check(struct mem_cgroup *mem)
 
 static void mem_cgroup_update_tree(struct mem_cgroup *mem, struct page *page)
 {
-	unsigned long long prev_usage_in_excess, new_usage_in_excess;
-	bool updated_tree = false;
+	unsigned long long new_usage_in_excess;
 	struct mem_cgroup_per_zone *mz;
 	struct mem_cgroup_tree_per_zone *mctz;
-
-	mz = mem_cgroup_zoneinfo(mem, page_to_nid(page), page_zonenum(page));
+	int nid = page_to_nid(page);
+	int zid = page_zonenum(page);
 	mctz = soft_limit_tree_from_page(page);
 
 	/*
-	 * We do updates in lazy mode, mem's are removed
-	 * lazily from the per-zone, per-node rb tree
+	 * Necessary to update all ancestors when hierarchy is used.
+	 * because their event counter is not touched.
 	 */
-	prev_usage_in_excess = mz->usage_in_excess;
-
-	new_usage_in_excess = res_counter_soft_limit_excess(&mem->res);
-	if (prev_usage_in_excess) {
-		mem_cgroup_remove_exceeded(mem, mz, mctz);
-		updated_tree = true;
-	}
-	if (!new_usage_in_excess)
-		goto done;
-	mem_cgroup_insert_exceeded(mem, mz, mctz);
-
-done:
-	if (updated_tree) {
-		spin_lock(&mctz->lock);
-		mz->usage_in_excess = new_usage_in_excess;
-		spin_unlock(&mctz->lock);
+	for (; mem; mem = parent_mem_cgroup(mem)) {
+		mz = mem_cgroup_zoneinfo(mem, nid, zid);
+		new_usage_in_excess =
+			res_counter_soft_limit_excess(&mem->res);
+		/*
+		 * We have to update the tree if mz is on RB-tree or
+		 * mem is over its softlimit.
+		 */
+		if (new_usage_in_excess || mz->on_tree) {
+			spin_lock(&mctz->lock);
+			/* if on-tree, remove it */
+			if (mz->on_tree)
+				__mem_cgroup_remove_exceeded(mem, mz, mctz);
+			/*
+			 * if over soft limit, insert again. mz->usage_in_excess
+			 * will be updated properly.
+			 */
+			if (new_usage_in_excess)
+				__mem_cgroup_insert_exceeded(mem, mz, mctz);
+			else
+				mz->usage_in_excess = 0;
+			spin_unlock(&mctz->lock);
+		}
 	}
 }
 
@@ -1271,9 +1267,9 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 			gfp_t gfp_mask, struct mem_cgroup **memcg,
 			bool oom, struct page *page)
 {
-	struct mem_cgroup *mem, *mem_over_limit, *mem_over_soft_limit;
+	struct mem_cgroup *mem, *mem_over_limit;
 	int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
-	struct res_counter *fail_res, *soft_fail_res = NULL;
+	struct res_counter *fail_res;
 
 	if (unlikely(test_thread_flag(TIF_MEMDIE))) {
 		/* Don't account this! */
@@ -1305,17 +1301,16 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 
 		if (mem_cgroup_is_root(mem))
 			goto done;
-		ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res,
-						&soft_fail_res);
+		ret = res_counter_charge(&mem->res, PAGE_SIZE, &fail_res);
 		if (likely(!ret)) {
 			if (!do_swap_account)
 				break;
 			ret = res_counter_charge(&mem->memsw, PAGE_SIZE,
-							&fail_res, NULL);
+							&fail_res);
 			if (likely(!ret))
 				break;
 			/* mem+swap counter fails */
-			res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
+			res_counter_uncharge(&mem->res, PAGE_SIZE);
 			flags |= MEM_CGROUP_RECLAIM_NOSWAP;
 			mem_over_limit = mem_cgroup_from_res_counter(fail_res,
 									memsw);
@@ -1354,16 +1349,11 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 		}
 	}
 	/*
-	 * Insert just the ancestor, we should trickle down to the correct
-	 * cgroup for reclaim, since the other nodes will be below their
-	 * soft limit
+	 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree.
+	 * if they exceeds softlimit.
 	 */
-	if (soft_fail_res) {
-		mem_over_soft_limit =
-			mem_cgroup_from_res_counter(soft_fail_res, res);
-		if (mem_cgroup_soft_limit_check(mem_over_soft_limit))
-			mem_cgroup_update_tree(mem_over_soft_limit, page);
-	}
+	if (mem_cgroup_soft_limit_check(mem))
+		mem_cgroup_update_tree(mem, page);
 done:
 	return 0;
 nomem:
@@ -1438,10 +1428,9 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *mem,
 	if (unlikely(PageCgroupUsed(pc))) {
 		unlock_page_cgroup(pc);
 		if (!mem_cgroup_is_root(mem)) {
-			res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
+			res_counter_uncharge(&mem->res, PAGE_SIZE);
 			if (do_swap_account)
-				res_counter_uncharge(&mem->memsw, PAGE_SIZE,
-							NULL);
+				res_counter_uncharge(&mem->memsw, PAGE_SIZE);
 		}
 		css_put(&mem->css);
 		return;
@@ -1520,7 +1509,7 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
 		goto out;
 
 	if (!mem_cgroup_is_root(from))
-		res_counter_uncharge(&from->res, PAGE_SIZE, NULL);
+		res_counter_uncharge(&from->res, PAGE_SIZE);
 	mem_cgroup_charge_statistics(from, pc, false);
 
 	page = pc->page;
@@ -1540,7 +1529,7 @@ static int mem_cgroup_move_account(struct page_cgroup *pc,
 	}
 
 	if (do_swap_account && !mem_cgroup_is_root(from))
-		res_counter_uncharge(&from->memsw, PAGE_SIZE, NULL);
+		res_counter_uncharge(&from->memsw, PAGE_SIZE);
 	css_put(&from->css);
 
 	css_get(&to->css);
@@ -1611,9 +1600,9 @@ uncharge:
 	css_put(&parent->css);
 	/* uncharge if move fails */
 	if (!mem_cgroup_is_root(parent)) {
-		res_counter_uncharge(&parent->res, PAGE_SIZE, NULL);
+		res_counter_uncharge(&parent->res, PAGE_SIZE);
 		if (do_swap_account)
-			res_counter_uncharge(&parent->memsw, PAGE_SIZE, NULL);
+			res_counter_uncharge(&parent->memsw, PAGE_SIZE);
 	}
 	return ret;
 }
@@ -1804,8 +1793,7 @@ __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *ptr,
 			 * calling css_tryget
 			 */
 			if (!mem_cgroup_is_root(memcg))
-				res_counter_uncharge(&memcg->memsw, PAGE_SIZE,
-							NULL);
+				res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
 			mem_cgroup_swap_statistics(memcg, false);
 			mem_cgroup_put(memcg);
 		}
@@ -1832,9 +1820,9 @@ void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *mem)
 	if (!mem)
 		return;
 	if (!mem_cgroup_is_root(mem)) {
-		res_counter_uncharge(&mem->res, PAGE_SIZE, NULL);
+		res_counter_uncharge(&mem->res, PAGE_SIZE);
 		if (do_swap_account)
-			res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
+			res_counter_uncharge(&mem->memsw, PAGE_SIZE);
 	}
 	css_put(&mem->css);
 }
@@ -1849,7 +1837,6 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 	struct page_cgroup *pc;
 	struct mem_cgroup *mem = NULL;
 	struct mem_cgroup_per_zone *mz;
-	bool soft_limit_excess = false;
 
 	if (mem_cgroup_disabled())
 		return NULL;
@@ -1889,10 +1876,10 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 	}
 
 	if (!mem_cgroup_is_root(mem)) {
-		res_counter_uncharge(&mem->res, PAGE_SIZE, &soft_limit_excess);
+		res_counter_uncharge(&mem->res, PAGE_SIZE);
 		if (do_swap_account &&
 				(ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT))
-			res_counter_uncharge(&mem->memsw, PAGE_SIZE, NULL);
+			res_counter_uncharge(&mem->memsw, PAGE_SIZE);
 	}
 	if (ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
 		mem_cgroup_swap_statistics(mem, true);
@@ -1909,7 +1896,7 @@ __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype)
 	mz = page_cgroup_zoneinfo(pc);
 	unlock_page_cgroup(pc);
 
-	if (soft_limit_excess && mem_cgroup_soft_limit_check(mem))
+	if (mem_cgroup_soft_limit_check(mem))
 		mem_cgroup_update_tree(mem, page);
 	/* at swapout, this memcg will be accessed to record to swap */
 	if (ctype != MEM_CGROUP_CHARGE_TYPE_SWAPOUT)
@@ -1987,7 +1974,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t ent)
 		 * This memcg can be obsolete one. We avoid calling css_tryget
 		 */
 		if (!mem_cgroup_is_root(memcg))
-			res_counter_uncharge(&memcg->memsw, PAGE_SIZE, NULL);
+			res_counter_uncharge(&memcg->memsw, PAGE_SIZE);
 		mem_cgroup_swap_statistics(memcg, false);
 		mem_cgroup_put(memcg);
 	}
-- 
cgit v1.2.3


From 1087e9b4ff708976499b4de541d9e1d57d49b60a Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Sun, 4 Oct 2009 02:20:11 +0200
Subject: HWPOISON: Clean up PR_MCE_KILL interface

While writing the manpage I noticed some shortcomings in the
current interface.

- Define symbolic names for all the different values
- Boundary check the kill mode values
- For symmetry add a get interface too. This allows library
code to get/set the current state.
- For consistency define a PR_MCE_KILL_DEFAULT value

Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 include/linux/prctl.h | 12 ++++++++++++
 kernel/sys.c          | 23 ++++++++++++++++++-----
 2 files changed, 30 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/prctl.h b/include/linux/prctl.h
index 931150566ade..a3baeb2c2161 100644
--- a/include/linux/prctl.h
+++ b/include/linux/prctl.h
@@ -88,6 +88,18 @@
 #define PR_TASK_PERF_EVENTS_DISABLE		31
 #define PR_TASK_PERF_EVENTS_ENABLE		32
 
+/*
+ * Set early/late kill mode for hwpoison memory corruption.
+ * This influences when the process gets killed on a memory corruption.
+ */
 #define PR_MCE_KILL	33
+# define PR_MCE_KILL_CLEAR   0
+# define PR_MCE_KILL_SET     1
+
+# define PR_MCE_KILL_LATE    0
+# define PR_MCE_KILL_EARLY   1
+# define PR_MCE_KILL_DEFAULT 2
+
+#define PR_MCE_KILL_GET 34
 
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/sys.c b/kernel/sys.c
index 255475d163e0..f6afe07d6c0b 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1546,24 +1546,37 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 			if (arg4 | arg5)
 				return -EINVAL;
 			switch (arg2) {
-			case 0:
+			case PR_MCE_KILL_CLEAR:
 				if (arg3 != 0)
 					return -EINVAL;
 				current->flags &= ~PF_MCE_PROCESS;
 				break;
-			case 1:
+			case PR_MCE_KILL_SET:
 				current->flags |= PF_MCE_PROCESS;
-				if (arg3 != 0)
+				if (arg3 == PR_MCE_KILL_EARLY)
 					current->flags |= PF_MCE_EARLY;
-				else
+				else if (arg3 == PR_MCE_KILL_LATE)
 					current->flags &= ~PF_MCE_EARLY;
+				else if (arg3 == PR_MCE_KILL_DEFAULT)
+					current->flags &=
+						~(PF_MCE_EARLY|PF_MCE_PROCESS);
+				else
+					return -EINVAL;
 				break;
 			default:
 				return -EINVAL;
 			}
 			error = 0;
 			break;
-
+		case PR_MCE_KILL_GET:
+			if (arg2 | arg3 | arg4 | arg5)
+				return -EINVAL;
+			if (current->flags & PF_MCE_PROCESS)
+				error = (current->flags & PF_MCE_EARLY) ?
+					PR_MCE_KILL_EARLY : PR_MCE_KILL_LATE;
+			else
+				error = PR_MCE_KILL_DEFAULT;
+			break;
 		default:
 			error = -EINVAL;
 			break;
-- 
cgit v1.2.3


From fe8e5b5a60f8427940d33b205e127aecfb0bca10 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 3 Oct 2009 14:55:18 +0200
Subject: tracing: Check total refcount before releasing bufs in profile_enable
 failure

When we call the profile_enable() callback of an event, we release the
shared perf event tracing buffers unconditionnaly in the failure path.
This is wrong because there may be other users of these. Then check the
total refcount before doing this.

Reported-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Li Zefan <lizf@cn.fujitsu.com>
---
 kernel/trace/trace_event_profile.c | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index dd44b8768867..e52784b7b844 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -31,7 +31,7 @@ static int ftrace_profile_enable_event(struct ftrace_event_call *event)
 	if (atomic_inc_return(&event->profile_count))
 		return 0;
 
-	if (!total_profile_count++) {
+	if (!total_profile_count) {
 		buf = (char *)alloc_percpu(profile_buf_t);
 		if (!buf)
 			goto fail_buf;
@@ -46,14 +46,19 @@ static int ftrace_profile_enable_event(struct ftrace_event_call *event)
 	}
 
 	ret = event->profile_enable();
-	if (!ret)
+	if (!ret) {
+		total_profile_count++;
 		return 0;
+	}
 
-	kfree(trace_profile_buf_nmi);
 fail_buf_nmi:
-	kfree(trace_profile_buf);
+	if (!total_profile_count) {
+		kfree(trace_profile_buf_nmi);
+		kfree(trace_profile_buf);
+		trace_profile_buf_nmi = NULL;
+		trace_profile_buf = NULL;
+	}
 fail_buf:
-	total_profile_count--;
 	atomic_dec(&event->profile_count);
 
 	return ret;
-- 
cgit v1.2.3


From 75fb4090b39a3d7bf9ac77a28665c991ec5eaadc Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 3 Oct 2009 15:08:54 +0200
Subject: tracing: Use free_percpu instead of kfree

In the event->profile_enable() failure path, we release the per cpu
buffers using kfree which is wrong because they are per cpu pointers.
Although free_percpu only wraps kfree for now, that may change in the
future so lets use the correct way.

Reported-by: Paul Mackerras <paulus@samba.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Li Zefan <lizf@cn.fujitsu.com>
---
 kernel/trace/trace_event_profile.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
index e52784b7b844..8d5c171cc998 100644
--- a/kernel/trace/trace_event_profile.c
+++ b/kernel/trace/trace_event_profile.c
@@ -53,8 +53,8 @@ static int ftrace_profile_enable_event(struct ftrace_event_call *event)
 
 fail_buf_nmi:
 	if (!total_profile_count) {
-		kfree(trace_profile_buf_nmi);
-		kfree(trace_profile_buf);
+		free_percpu(trace_profile_buf_nmi);
+		free_percpu(trace_profile_buf);
 		trace_profile_buf_nmi = NULL;
 		trace_profile_buf = NULL;
 	}
-- 
cgit v1.2.3


From f83f9ac2632732bd1678150b5a03d152f912fe72 Mon Sep 17 00:00:00 2001
From: Peter Williams <pwil3058@bigpond.net.au>
Date: Thu, 24 Sep 2009 06:47:10 +0000
Subject: sched: Set correct normal_prio and prio values in sched_fork()

normal_prio should be updated if policy changes from RT to
SCHED_MORMAL or if static_prio/nice is changed.

Some paths through sched_fork() ignore this requirement and may
result in normal_prio having an invalid value.

Fixing this issue allows the call to effective_prio() in
wake_up_new_task() to be removed.

Signed-off-by: Peter Williams <pwil3058@bigpond.net.au>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
LKML-Reference: <f8f46736fd4e7f090ac0.1253774830@mudlark.pw.nest>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 20 +++++++++-----------
 1 file changed, 9 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 1535f3884b88..76c0e9691fc0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2514,23 +2514,18 @@ void sched_fork(struct task_struct *p, int clone_flags)
 
 	__sched_fork(p);
 
-	/*
-	 * Make sure we do not leak PI boosting priority to the child.
-	 */
-	p->prio = current->normal_prio;
-
 	/*
 	 * Revert to default priority/policy on fork if requested.
 	 */
 	if (unlikely(p->sched_reset_on_fork)) {
-		if (p->policy == SCHED_FIFO || p->policy == SCHED_RR)
+		if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) {
 			p->policy = SCHED_NORMAL;
-
-		if (p->normal_prio < DEFAULT_PRIO)
-			p->prio = DEFAULT_PRIO;
+			p->normal_prio = p->static_prio;
+		}
 
 		if (PRIO_TO_NICE(p->static_prio) < 0) {
 			p->static_prio = NICE_TO_PRIO(0);
+			p->normal_prio = p->static_prio;
 			set_load_weight(p);
 		}
 
@@ -2541,6 +2536,11 @@ void sched_fork(struct task_struct *p, int clone_flags)
 		p->sched_reset_on_fork = 0;
 	}
 
+	/*
+	 * Make sure we do not leak PI boosting priority to the child.
+	 */
+	p->prio = current->normal_prio;
+
 	if (!rt_prio(p->prio))
 		p->sched_class = &fair_sched_class;
 
@@ -2581,8 +2581,6 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 	BUG_ON(p->state != TASK_RUNNING);
 	update_rq_clock(rq);
 
-	p->prio = effective_prio(p);
-
 	if (!p->sched_class->task_new || !current->se.on_rq) {
 		activate_task(rq, p, 0);
 	} else {
-- 
cgit v1.2.3


From 162cc2794df37662beb7f97ddd1dd5bffaf85e9a Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 23 Sep 2009 16:18:13 -0700
Subject: rcu: Fix rcu_lock_map build failure on CONFIG_PROVE_LOCKING=y

Move the rcu_lock_map definition from rcutree.c to rcupdate.c so that
TINY_RCU can use lockdep.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcupdate.c | 7 +++++++
 kernel/rcutree.c  | 7 -------
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 8e795133b33d..4a189ea18b48 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -46,6 +46,13 @@
 #include <linux/module.h>
 #include <linux/kernel_stat.h>
 
+#ifdef CONFIG_DEBUG_LOCK_ALLOC
+static struct lock_class_key rcu_lock_key;
+struct lockdep_map rcu_lock_map =
+	STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
+EXPORT_SYMBOL_GPL(rcu_lock_map);
+#endif
+
 enum rcu_barrier {
 	RCU_BARRIER_STD,
 	RCU_BARRIER_BH,
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 8e52cde7b8f7..81af59b8dd88 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -49,13 +49,6 @@
 
 #include "rcutree.h"
 
-#ifdef CONFIG_DEBUG_LOCK_ALLOC
-static struct lock_class_key rcu_lock_key;
-struct lockdep_map rcu_lock_map =
-	STATIC_LOCKDEP_MAP_INIT("rcu_read_lock", &rcu_lock_key);
-EXPORT_SYMBOL_GPL(rcu_lock_map);
-#endif
-
 /* Data structures. */
 
 #define RCU_STATE_INITIALIZER(name) { \
-- 
cgit v1.2.3


From 3d76c082907e8f83c5d5c4572f38d53ad8f00c4b Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 28 Sep 2009 07:46:32 -0700
Subject: rcu: Clean up code based on review feedback from Josh Triplett, part
 3

Whitespace fixes, updated comments, and trivial code movement.

o	Fix whitespace error in RCU_HEAD_INIT()

o	Move "So where is rcu_write_lock()" comment so that it does
	not come between the rcu_read_unlock() header comment and
	the rcu_read_unlock() definition.

o	Move the module_param statements for blimit, qhimark, and
	qlowmark to immediately follow the corresponding
	definitions.

o	In __rcu_offline_cpu(), move the assignment to rdp_me
	inside the "if" statement, given that rdp_me is not used
	outside of that "if" statement.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: akpm@linux-foundation.org
Cc: mathieu.desnoyers@polymtl.ca
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: dhowells@redhat.com
LKML-Reference: <12541491931164-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/rcupdate.h | 14 +++++++-------
 kernel/rcutree.c         | 10 +++++-----
 2 files changed, 12 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 70331218e4b4..3ebd0b7bcb08 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -77,7 +77,7 @@ extern int rcu_scheduler_active;
 #error "Unknown RCU implementation specified to kernel configuration"
 #endif
 
-#define RCU_HEAD_INIT 	{ .next = NULL, .func = NULL }
+#define RCU_HEAD_INIT	{ .next = NULL, .func = NULL }
 #define RCU_HEAD(head) struct rcu_head head = RCU_HEAD_INIT
 #define INIT_RCU_HEAD(ptr) do { \
        (ptr)->next = NULL; (ptr)->func = NULL; \
@@ -129,12 +129,6 @@ static inline void rcu_read_lock(void)
 	rcu_read_acquire();
 }
 
-/**
- * rcu_read_unlock - marks the end of an RCU read-side critical section.
- *
- * See rcu_read_lock() for more information.
- */
-
 /*
  * So where is rcu_write_lock()?  It does not exist, as there is no
  * way for writers to lock out RCU readers.  This is a feature, not
@@ -144,6 +138,12 @@ static inline void rcu_read_lock(void)
  * used as well.  RCU does not care how the writers keep out of each
  * others' way, as long as they do so.
  */
+
+/**
+ * rcu_read_unlock - marks the end of an RCU read-side critical section.
+ *
+ * See rcu_read_lock() for more information.
+ */
 static inline void rcu_read_unlock(void)
 {
 	rcu_read_release();
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 81af59b8dd88..d5597830faf5 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -122,6 +122,10 @@ static int blimit = 10;		/* Maximum callbacks per softirq. */
 static int qhimark = 10000;	/* If this many pending, ignore blimit. */
 static int qlowmark = 100;	/* Once only this many pending, use blimit. */
 
+module_param(blimit, int, 0);
+module_param(qhimark, int, 0);
+module_param(qlowmark, int, 0);
+
 static void force_quiescent_state(struct rcu_state *rsp, int relaxed);
 static int rcu_pending(int cpu);
 
@@ -878,8 +882,8 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
 	 * indefinitely delay callbacks, you have far worse things to
 	 * be worrying about.
 	 */
-	rdp_me = rsp->rda[smp_processor_id()];
 	if (rdp->nxtlist != NULL) {
+		rdp_me = rsp->rda[smp_processor_id()];
 		*rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist;
 		rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
 		rdp->nxtlist = NULL;
@@ -1575,7 +1579,3 @@ void __init __rcu_init(void)
 }
 
 #include "rcutree_plugin.h"
-
-module_param(blimit, int, 0);
-module_param(qhimark, int, 0);
-module_param(qlowmark, int, 0);
-- 
cgit v1.2.3


From a0b6c9a78c41dc36732d6e1e90f0f2f57b29816f Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 28 Sep 2009 07:46:33 -0700
Subject: rcu: Clean up code based on review feedback from Josh Triplett, part
 4

These issues identified during an old-fashioned face-to-face code
review extending over many hours.  This group improves an existing
abstraction and introduces two new ones.  It also fixes an RCU
stall-warning bug found while making the other changes.

o	Make RCU_INIT_FLAVOR() declare its own variables, removing
	the need to declare them at each call site.

o	Create an rcu_for_each_leaf() macro that scans the leaf
	nodes of the rcu_node tree.

o	Create an rcu_for_each_node_breadth_first() macro that does
	a breadth-first traversal of the rcu_node tree, AKA
	stepping through the array in index-number order.

o	If all CPUs corresponding to a given leaf rcu_node
	structure go offline, then any tasks queued on that leaf
	will be moved to the root rcu_node structure.  Therefore,
	the stall-warning code must dump out tasks queued on the
	root rcu_node structure as well as those queued on the leaf
	rcu_node structures.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: akpm@linux-foundation.org
Cc: mathieu.desnoyers@polymtl.ca
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: dhowells@redhat.com
LKML-Reference: <12541491934126-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcutree.c        | 53 ++++++++++++++++++++++++++-----------------------
 kernel/rcutree.h        | 12 +++++++++++
 kernel/rcutree_plugin.h |  4 ----
 3 files changed, 40 insertions(+), 29 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index d5597830faf5..e2e272b5c277 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -462,8 +462,6 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
 	long delta;
 	unsigned long flags;
 	struct rcu_node *rnp = rcu_get_root(rsp);
-	struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
-	struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES];
 
 	/* Only let one CPU complain about others per time interval. */
 
@@ -474,18 +472,24 @@ static void print_other_cpu_stall(struct rcu_state *rsp)
 		return;
 	}
 	rsp->jiffies_stall = jiffies + RCU_SECONDS_TILL_STALL_RECHECK;
+
+	/*
+	 * Now rat on any tasks that got kicked up to the root rcu_node
+	 * due to CPU offlining.
+	 */
+	rcu_print_task_stall(rnp);
 	spin_unlock_irqrestore(&rnp->lock, flags);
 
 	/* OK, time to rat on our buddy... */
 
 	printk(KERN_ERR "INFO: RCU detected CPU stalls:");
-	for (; rnp_cur < rnp_end; rnp_cur++) {
+	rcu_for_each_leaf_node(rsp, rnp) {
 		rcu_print_task_stall(rnp);
-		if (rnp_cur->qsmask == 0)
+		if (rnp->qsmask == 0)
 			continue;
-		for (cpu = 0; cpu <= rnp_cur->grphi - rnp_cur->grplo; cpu++)
-			if (rnp_cur->qsmask & (1UL << cpu))
-				printk(" %d", rnp_cur->grplo + cpu);
+		for (cpu = 0; cpu <= rnp->grphi - rnp->grplo; cpu++)
+			if (rnp->qsmask & (1UL << cpu))
+				printk(" %d", rnp->grplo + cpu);
 	}
 	printk(" (detected by %d, t=%ld jiffies)\n",
 	       smp_processor_id(), (long)(jiffies - rsp->gp_start));
@@ -649,7 +653,7 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
 	 * one corresponding to this CPU, due to the fact that we have
 	 * irqs disabled.
 	 */
-	for (rnp = &rsp->node[0]; rnp < &rsp->node[NUM_RCU_NODES]; rnp++) {
+	rcu_for_each_node_breadth_first(rsp, rnp) {
 		spin_lock(&rnp->lock);	/* irqs already disabled. */
 		rcu_preempt_check_blocked_tasks(rnp);
 		rnp->qsmask = rnp->qsmaskinit;
@@ -1042,33 +1046,32 @@ static int rcu_process_dyntick(struct rcu_state *rsp, long lastcomp,
 	int cpu;
 	unsigned long flags;
 	unsigned long mask;
-	struct rcu_node *rnp_cur = rsp->level[NUM_RCU_LVLS - 1];
-	struct rcu_node *rnp_end = &rsp->node[NUM_RCU_NODES];
+	struct rcu_node *rnp;
 
-	for (; rnp_cur < rnp_end; rnp_cur++) {
+	rcu_for_each_leaf_node(rsp, rnp) {
 		mask = 0;
-		spin_lock_irqsave(&rnp_cur->lock, flags);
+		spin_lock_irqsave(&rnp->lock, flags);
 		if (rsp->completed != lastcomp) {
-			spin_unlock_irqrestore(&rnp_cur->lock, flags);
+			spin_unlock_irqrestore(&rnp->lock, flags);
 			return 1;
 		}
-		if (rnp_cur->qsmask == 0) {
-			spin_unlock_irqrestore(&rnp_cur->lock, flags);
+		if (rnp->qsmask == 0) {
+			spin_unlock_irqrestore(&rnp->lock, flags);
 			continue;
 		}
-		cpu = rnp_cur->grplo;
+		cpu = rnp->grplo;
 		bit = 1;
-		for (; cpu <= rnp_cur->grphi; cpu++, bit <<= 1) {
-			if ((rnp_cur->qsmask & bit) != 0 && f(rsp->rda[cpu]))
+		for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
+			if ((rnp->qsmask & bit) != 0 && f(rsp->rda[cpu]))
 				mask |= bit;
 		}
 		if (mask != 0 && rsp->completed == lastcomp) {
 
-			/* cpu_quiet_msk() releases rnp_cur->lock. */
-			cpu_quiet_msk(mask, rsp, rnp_cur, flags);
+			/* cpu_quiet_msk() releases rnp->lock. */
+			cpu_quiet_msk(mask, rsp, rnp, flags);
 			continue;
 		}
-		spin_unlock_irqrestore(&rnp_cur->lock, flags);
+		spin_unlock_irqrestore(&rnp->lock, flags);
 	}
 	return 0;
 }
@@ -1550,6 +1553,10 @@ static void __init rcu_init_one(struct rcu_state *rsp)
  */
 #define RCU_INIT_FLAVOR(rsp, rcu_data) \
 do { \
+	int i; \
+	int j; \
+	struct rcu_node *rnp; \
+	\
 	rcu_init_one(rsp); \
 	rnp = (rsp)->level[NUM_RCU_LVLS - 1]; \
 	j = 0; \
@@ -1564,10 +1571,6 @@ do { \
 
 void __init __rcu_init(void)
 {
-	int i;			/* All used by RCU_INIT_FLAVOR(). */
-	int j;
-	struct rcu_node *rnp;
-
 	rcu_bootup_announce();
 #ifdef CONFIG_RCU_CPU_STALL_DETECTOR
 	printk(KERN_INFO "RCU-based detection of stalled CPUs is enabled.\n");
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index e6ab31cc28ba..676eecd371d9 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -106,6 +106,18 @@ struct rcu_node {
 				/*  blocked_tasks[] array. */
 } ____cacheline_internodealigned_in_smp;
 
+/*
+ * Do a full breadth-first scan of the rcu_node structures for the
+ * specified rcu_state structure.
+ */
+#define rcu_for_each_node_breadth_first(rsp, rnp) \
+	for ((rnp) = &(rsp)->node[0]; \
+	     (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
+
+#define rcu_for_each_leaf_node(rsp, rnp) \
+	for ((rnp) = (rsp)->level[NUM_RCU_LVLS - 1]; \
+	     (rnp) < &(rsp)->node[NUM_RCU_NODES]; (rnp)++)
+
 /* Index values for nxttail array in struct rcu_data. */
 #define RCU_DONE_TAIL		0	/* Also RCU_WAIT head. */
 #define RCU_WAIT_TAIL		1	/* Also RCU_NEXT_READY head. */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 65250219ab6d..57200fe96d0a 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -423,10 +423,6 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
  */
 static void __init __rcu_init_preempt(void)
 {
-	int i;			/* All used by RCU_INIT_FLAVOR(). */
-	int j;
-	struct rcu_node *rnp;
-
 	RCU_INIT_FLAVOR(&rcu_preempt_state, rcu_preempt_data);
 }
 
-- 
cgit v1.2.3


From 135c8aea557cf53abe6c8847e286d01442124193 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 28 Sep 2009 21:50:21 -0700
Subject: rcu: Replace the rcu_barrier enum with pointer to call_rcu*()
 function

The rcu_barrier enum causes several problems:

  (1) you have to define the enum somewhere, and there is no
      convenient place,

  (2) the difference between TREE_RCU and TREE_PREEMPT_RCU causes
      problems when you need to map from rcu_barrier enum to struct
      rcu_state,

  (3) the switch statement are large, and

  (4) TINY_RCU really needs a different rcu_barrier() than do the
      treercu implementations.

So replace it with a functionally equivalent but cleaner function
pointer abstraction.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Acked-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: akpm@linux-foundation.org
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: dhowells@redhat.com
LKML-Reference: <12541998232366-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcupdate.c | 32 ++++++++++----------------------
 1 file changed, 10 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index 4a189ea18b48..e43242274466 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -53,12 +53,6 @@ struct lockdep_map rcu_lock_map =
 EXPORT_SYMBOL_GPL(rcu_lock_map);
 #endif
 
-enum rcu_barrier {
-	RCU_BARRIER_STD,
-	RCU_BARRIER_BH,
-	RCU_BARRIER_SCHED,
-};
-
 static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
 static atomic_t rcu_barrier_cpu_count;
 static DEFINE_MUTEX(rcu_barrier_mutex);
@@ -184,19 +178,12 @@ static void rcu_barrier_func(void *type)
 {
 	int cpu = smp_processor_id();
 	struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu);
+	void (*call_rcu_func)(struct rcu_head *head,
+			      void (*func)(struct rcu_head *head));
 
 	atomic_inc(&rcu_barrier_cpu_count);
-	switch ((enum rcu_barrier)type) {
-	case RCU_BARRIER_STD:
-		call_rcu(head, rcu_barrier_callback);
-		break;
-	case RCU_BARRIER_BH:
-		call_rcu_bh(head, rcu_barrier_callback);
-		break;
-	case RCU_BARRIER_SCHED:
-		call_rcu_sched(head, rcu_barrier_callback);
-		break;
-	}
+	call_rcu_func = type;
+	call_rcu_func(head, rcu_barrier_callback);
 }
 
 static inline void wait_migrated_callbacks(void)
@@ -209,7 +196,8 @@ static inline void wait_migrated_callbacks(void)
  * Orchestrate the specified type of RCU barrier, waiting for all
  * RCU callbacks of the specified type to complete.
  */
-static void _rcu_barrier(enum rcu_barrier type)
+static void _rcu_barrier(void (*call_rcu_func)(struct rcu_head *head,
+					       void (*func)(struct rcu_head *head)))
 {
 	BUG_ON(in_interrupt());
 	/* Take cpucontrol mutex to protect against CPU hotplug */
@@ -225,7 +213,7 @@ static void _rcu_barrier(enum rcu_barrier type)
 	 * early.
 	 */
 	atomic_set(&rcu_barrier_cpu_count, 1);
-	on_each_cpu(rcu_barrier_func, (void *)type, 1);
+	on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1);
 	if (atomic_dec_and_test(&rcu_barrier_cpu_count))
 		complete(&rcu_barrier_completion);
 	wait_for_completion(&rcu_barrier_completion);
@@ -238,7 +226,7 @@ static void _rcu_barrier(enum rcu_barrier type)
  */
 void rcu_barrier(void)
 {
-	_rcu_barrier(RCU_BARRIER_STD);
+	_rcu_barrier(call_rcu);
 }
 EXPORT_SYMBOL_GPL(rcu_barrier);
 
@@ -247,7 +235,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier);
  */
 void rcu_barrier_bh(void)
 {
-	_rcu_barrier(RCU_BARRIER_BH);
+	_rcu_barrier(call_rcu_bh);
 }
 EXPORT_SYMBOL_GPL(rcu_barrier_bh);
 
@@ -256,7 +244,7 @@ EXPORT_SYMBOL_GPL(rcu_barrier_bh);
  */
 void rcu_barrier_sched(void)
 {
-	_rcu_barrier(RCU_BARRIER_SCHED);
+	_rcu_barrier(call_rcu_sched);
 }
 EXPORT_SYMBOL_GPL(rcu_barrier_sched);
 
-- 
cgit v1.2.3


From d014e8894dfc523dd9d2f2a17b6dcb94facea810 Mon Sep 17 00:00:00 2001
From: Aaro Koskinen <aaro.koskinen@nokia.com>
Date: Fri, 2 Oct 2009 14:41:20 +0300
Subject: panic: Fix panic message visibility by calling bust_spinlocks(0)
 before dying

Commit ffd71da4e3f ("panic: decrease oops_in_progress only after
having done the panic") moved bust_spinlocks(0) to the end of the
function, which in practice is never reached.

As a result console_unblank() is not called, and on some systems
the user may not see the panic message.

Move it back up to before the unblanking.

Signed-off-by: Aaro Koskinen <aaro.koskinen@nokia.com>
Reviewed-by: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1254483680-25578-1-git-send-email-aaro.koskinen@nokia.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/panic.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/panic.c b/kernel/panic.c
index 512ab73b0ca3..bc4dcb6a389b 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -90,6 +90,8 @@ NORET_TYPE void panic(const char * fmt, ...)
 
 	atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
 
+	bust_spinlocks(0);
+
 	if (!panic_blink)
 		panic_blink = no_blink;
 
@@ -136,7 +138,6 @@ NORET_TYPE void panic(const char * fmt, ...)
 		mdelay(1);
 		i++;
 	}
-	bust_spinlocks(0);
 }
 
 EXPORT_SYMBOL(panic);
-- 
cgit v1.2.3


From eaaea8036d0261d87d7072c5bc88c7ea730c18ac Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 4 Oct 2009 09:34:17 +0200
Subject: futex: Fix locking imbalance

Rich reported a lock imbalance in the futex code:

   http://bugzilla.kernel.org/show_bug.cgi?id=14288

It's caused by the displacement of the retry_private label in
futex_wake_op(). The code unlocks the hash bucket locks in the
error handling path and retries without locking them again which
makes the next unlock fail.

Move retry_private so we lock the hash bucket locks when we retry.

Reported-by: Rich Ercolany <rercola@acm.jhu.edu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Darren Hart <dvhltc@us.ibm.com>
Cc: stable-2.6.31 <stable@kernel.org>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/futex.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index 463af2efa512..1e176f3ab26c 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -916,8 +916,8 @@ retry:
 	hb1 = hash_futex(&key1);
 	hb2 = hash_futex(&key2);
 
-	double_lock_hb(hb1, hb2);
 retry_private:
+	double_lock_hb(hb1, hb2);
 	op_ret = futex_atomic_op_inuser(op, uaddr2);
 	if (unlikely(op_ret < 0)) {
 
-- 
cgit v1.2.3


From ee949a86b3aef15845ea677aa60231008de62672 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Tue, 6 Oct 2009 01:00:49 -0500
Subject: tracing/syscalls: Use long for syscall ret format and field
 definitions

The syscall event definitions use long for the syscall exit ret
value, but unsigned long for the same thing in the format and field
definitions.  Change them all to long.

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: rostedt@goodmis.org
Cc: lizf@cn.fujitsu.com
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <1254808849-7829-4-git-send-email-tzanussi@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_syscalls.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 9fbce6c9d2e1..527e17eae575 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -166,7 +166,7 @@ int syscall_exit_format(struct ftrace_event_call *call, struct trace_seq *s)
 			       "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
 			       "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n",
 			       SYSCALL_FIELD(int, nr),
-			       SYSCALL_FIELD(unsigned long, ret));
+			       SYSCALL_FIELD(long, ret));
 	if (!ret)
 		return 0;
 
@@ -212,7 +212,7 @@ int syscall_exit_define_fields(struct ftrace_event_call *call)
 	if (ret)
 		return ret;
 
-	ret = trace_define_field(call, SYSCALL_FIELD(unsigned long, ret), 0,
+	ret = trace_define_field(call, SYSCALL_FIELD(long, ret), 0,
 				 FILTER_OTHER);
 
 	return ret;
-- 
cgit v1.2.3


From 906010b2134e14a2e377decbadd357b3d0ab9c6a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 21 Sep 2009 16:08:49 +0200
Subject: perf_event: Provide vmalloc() based mmap() backing

Some architectures such as Sparc, ARM and MIPS (basically
everything with flush_dcache_page()) need to deal with dcache
aliases by carefully placing pages in both kernel and user maps.

These architectures typically have to use vmalloc_user() for this.

However, on other architectures, vmalloc() is not needed and has
the downsides of being more restricted and slower than regular
allocations.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: David Miller <davem@davemloft.net>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <1254830228.21044.272.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/sparc/Kconfig         |   2 +
 include/linux/perf_event.h |   5 +
 init/Kconfig               |  18 ++++
 kernel/perf_event.c        | 248 +++++++++++++++++++++++++++++++++------------
 tools/perf/design.txt      |   3 +
 5 files changed, 214 insertions(+), 62 deletions(-)

(limited to 'kernel')

diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 97fca4695e0b..9b70a2f28dc7 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -26,6 +26,7 @@ config SPARC
 	select RTC_CLASS
 	select RTC_DRV_M48T59
 	select HAVE_PERF_EVENTS
+	select PERF_USE_VMALLOC
 	select HAVE_DMA_ATTRS
 	select HAVE_DMA_API_DEBUG
 
@@ -48,6 +49,7 @@ config SPARC64
 	select RTC_DRV_SUN4V
 	select RTC_DRV_STARFIRE
 	select HAVE_PERF_EVENTS
+	select PERF_USE_VMALLOC
 
 config ARCH_DEFCONFIG
 	string
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 3a9d36d1e92a..2e6d95f97419 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -442,6 +442,7 @@ enum perf_callchain_context {
 #include <linux/hrtimer.h>
 #include <linux/fs.h>
 #include <linux/pid_namespace.h>
+#include <linux/workqueue.h>
 #include <asm/atomic.h>
 
 #define PERF_MAX_STACK_DEPTH		255
@@ -513,6 +514,10 @@ struct file;
 
 struct perf_mmap_data {
 	struct rcu_head			rcu_head;
+#ifdef CONFIG_PERF_USE_VMALLOC
+	struct work_struct		work;
+#endif
+	int				data_order;
 	int				nr_pages;	/* nr of data pages  */
 	int				writable;	/* are we writable   */
 	int				nr_locked;	/* nr pages mlocked  */
diff --git a/init/Kconfig b/init/Kconfig
index c7bac39d6c61..09c5c6431f42 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -921,6 +921,11 @@ config HAVE_PERF_EVENTS
 	help
 	  See tools/perf/design.txt for details.
 
+config PERF_USE_VMALLOC
+	bool
+	help
+	  See tools/perf/design.txt for details
+
 menu "Kernel Performance Events And Counters"
 
 config PERF_EVENTS
@@ -976,6 +981,19 @@ config PERF_COUNTERS
 
 	  Say N if unsure.
 
+config DEBUG_PERF_USE_VMALLOC
+	default n
+	bool "Debug: use vmalloc to back perf mmap() buffers"
+	depends on PERF_EVENTS && DEBUG_KERNEL
+	select PERF_USE_VMALLOC
+	help
+	 Use vmalloc memory to back perf mmap() buffers.
+
+	 Mostly useful for debugging the vmalloc code on platforms
+	 that don't require it.
+
+	 Say N if unsure.
+
 endmenu
 
 config VM_EVENT_COUNTERS
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index e491fb087939..9d0b5c665883 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -20,6 +20,7 @@
 #include <linux/percpu.h>
 #include <linux/ptrace.h>
 #include <linux/vmstat.h>
+#include <linux/vmalloc.h>
 #include <linux/hardirq.h>
 #include <linux/rculist.h>
 #include <linux/uaccess.h>
@@ -2091,49 +2092,31 @@ unlock:
 	rcu_read_unlock();
 }
 
-static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+static unsigned long perf_data_size(struct perf_mmap_data *data)
 {
-	struct perf_event *event = vma->vm_file->private_data;
-	struct perf_mmap_data *data;
-	int ret = VM_FAULT_SIGBUS;
-
-	if (vmf->flags & FAULT_FLAG_MKWRITE) {
-		if (vmf->pgoff == 0)
-			ret = 0;
-		return ret;
-	}
-
-	rcu_read_lock();
-	data = rcu_dereference(event->data);
-	if (!data)
-		goto unlock;
-
-	if (vmf->pgoff == 0) {
-		vmf->page = virt_to_page(data->user_page);
-	} else {
-		int nr = vmf->pgoff - 1;
-
-		if ((unsigned)nr > data->nr_pages)
-			goto unlock;
+	return data->nr_pages << (PAGE_SHIFT + data->data_order);
+}
 
-		if (vmf->flags & FAULT_FLAG_WRITE)
-			goto unlock;
+#ifndef CONFIG_PERF_USE_VMALLOC
 
-		vmf->page = virt_to_page(data->data_pages[nr]);
-	}
+/*
+ * Back perf_mmap() with regular GFP_KERNEL-0 pages.
+ */
 
-	get_page(vmf->page);
-	vmf->page->mapping = vma->vm_file->f_mapping;
-	vmf->page->index   = vmf->pgoff;
+static struct page *
+perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
+{
+	if (pgoff > data->nr_pages)
+		return NULL;
 
-	ret = 0;
-unlock:
-	rcu_read_unlock();
+	if (pgoff == 0)
+		return virt_to_page(data->user_page);
 
-	return ret;
+	return virt_to_page(data->data_pages[pgoff - 1]);
 }
 
-static int perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
+static struct perf_mmap_data *
+perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
 {
 	struct perf_mmap_data *data;
 	unsigned long size;
@@ -2158,19 +2141,10 @@ static int perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
 			goto fail_data_pages;
 	}
 
+	data->data_order = 0;
 	data->nr_pages = nr_pages;
-	atomic_set(&data->lock, -1);
-
-	if (event->attr.watermark) {
-		data->watermark = min_t(long, PAGE_SIZE * nr_pages,
-				      event->attr.wakeup_watermark);
-	}
-	if (!data->watermark)
-		data->watermark = max(PAGE_SIZE, PAGE_SIZE * nr_pages / 4);
 
-	rcu_assign_pointer(event->data, data);
-
-	return 0;
+	return data;
 
 fail_data_pages:
 	for (i--; i >= 0; i--)
@@ -2182,7 +2156,7 @@ fail_user_page:
 	kfree(data);
 
 fail:
-	return -ENOMEM;
+	return NULL;
 }
 
 static void perf_mmap_free_page(unsigned long addr)
@@ -2193,28 +2167,169 @@ static void perf_mmap_free_page(unsigned long addr)
 	__free_page(page);
 }
 
-static void __perf_mmap_data_free(struct rcu_head *rcu_head)
+static void perf_mmap_data_free(struct perf_mmap_data *data)
 {
-	struct perf_mmap_data *data;
 	int i;
 
-	data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
-
 	perf_mmap_free_page((unsigned long)data->user_page);
 	for (i = 0; i < data->nr_pages; i++)
 		perf_mmap_free_page((unsigned long)data->data_pages[i]);
+}
+
+#else
+
+/*
+ * Back perf_mmap() with vmalloc memory.
+ *
+ * Required for architectures that have d-cache aliasing issues.
+ */
+
+static struct page *
+perf_mmap_to_page(struct perf_mmap_data *data, unsigned long pgoff)
+{
+	if (pgoff > (1UL << data->data_order))
+		return NULL;
+
+	return vmalloc_to_page((void *)data->user_page + pgoff * PAGE_SIZE);
+}
+
+static void perf_mmap_unmark_page(void *addr)
+{
+	struct page *page = vmalloc_to_page(addr);
+
+	page->mapping = NULL;
+}
+
+static void perf_mmap_data_free_work(struct work_struct *work)
+{
+	struct perf_mmap_data *data;
+	void *base;
+	int i, nr;
+
+	data = container_of(work, struct perf_mmap_data, work);
+	nr = 1 << data->data_order;
+
+	base = data->user_page;
+	for (i = 0; i < nr + 1; i++)
+		perf_mmap_unmark_page(base + (i * PAGE_SIZE));
+
+	vfree(base);
+}
+
+static void perf_mmap_data_free(struct perf_mmap_data *data)
+{
+	schedule_work(&data->work);
+}
+
+static struct perf_mmap_data *
+perf_mmap_data_alloc(struct perf_event *event, int nr_pages)
+{
+	struct perf_mmap_data *data;
+	unsigned long size;
+	void *all_buf;
 
+	WARN_ON(atomic_read(&event->mmap_count));
+
+	size = sizeof(struct perf_mmap_data);
+	size += sizeof(void *);
+
+	data = kzalloc(size, GFP_KERNEL);
+	if (!data)
+		goto fail;
+
+	INIT_WORK(&data->work, perf_mmap_data_free_work);
+
+	all_buf = vmalloc_user((nr_pages + 1) * PAGE_SIZE);
+	if (!all_buf)
+		goto fail_all_buf;
+
+	data->user_page = all_buf;
+	data->data_pages[0] = all_buf + PAGE_SIZE;
+	data->data_order = ilog2(nr_pages);
+	data->nr_pages = 1;
+
+	return data;
+
+fail_all_buf:
+	kfree(data);
+
+fail:
+	return NULL;
+}
+
+#endif
+
+static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+	struct perf_event *event = vma->vm_file->private_data;
+	struct perf_mmap_data *data;
+	int ret = VM_FAULT_SIGBUS;
+
+	if (vmf->flags & FAULT_FLAG_MKWRITE) {
+		if (vmf->pgoff == 0)
+			ret = 0;
+		return ret;
+	}
+
+	rcu_read_lock();
+	data = rcu_dereference(event->data);
+	if (!data)
+		goto unlock;
+
+	if (vmf->pgoff && (vmf->flags & FAULT_FLAG_WRITE))
+		goto unlock;
+
+	vmf->page = perf_mmap_to_page(data, vmf->pgoff);
+	if (!vmf->page)
+		goto unlock;
+
+	get_page(vmf->page);
+	vmf->page->mapping = vma->vm_file->f_mapping;
+	vmf->page->index   = vmf->pgoff;
+
+	ret = 0;
+unlock:
+	rcu_read_unlock();
+
+	return ret;
+}
+
+static void
+perf_mmap_data_init(struct perf_event *event, struct perf_mmap_data *data)
+{
+	long max_size = perf_data_size(data);
+
+	atomic_set(&data->lock, -1);
+
+	if (event->attr.watermark) {
+		data->watermark = min_t(long, max_size,
+					event->attr.wakeup_watermark);
+	}
+
+	if (!data->watermark)
+		data->watermark = max_t(long, PAGE_SIZE, max_size / 2);
+
+
+	rcu_assign_pointer(event->data, data);
+}
+
+static void perf_mmap_data_free_rcu(struct rcu_head *rcu_head)
+{
+	struct perf_mmap_data *data;
+
+	data = container_of(rcu_head, struct perf_mmap_data, rcu_head);
+	perf_mmap_data_free(data);
 	kfree(data);
 }
 
-static void perf_mmap_data_free(struct perf_event *event)
+static void perf_mmap_data_release(struct perf_event *event)
 {
 	struct perf_mmap_data *data = event->data;
 
 	WARN_ON(atomic_read(&event->mmap_count));
 
 	rcu_assign_pointer(event->data, NULL);
-	call_rcu(&data->rcu_head, __perf_mmap_data_free);
+	call_rcu(&data->rcu_head, perf_mmap_data_free_rcu);
 }
 
 static void perf_mmap_open(struct vm_area_struct *vma)
@@ -2230,11 +2345,12 @@ static void perf_mmap_close(struct vm_area_struct *vma)
 
 	WARN_ON_ONCE(event->ctx->parent_ctx);
 	if (atomic_dec_and_mutex_lock(&event->mmap_count, &event->mmap_mutex)) {
+		unsigned long size = perf_data_size(event->data);
 		struct user_struct *user = current_user();
 
-		atomic_long_sub(event->data->nr_pages + 1, &user->locked_vm);
+		atomic_long_sub((size >> PAGE_SHIFT) + 1, &user->locked_vm);
 		vma->vm_mm->locked_vm -= event->data->nr_locked;
-		perf_mmap_data_free(event);
+		perf_mmap_data_release(event);
 		mutex_unlock(&event->mmap_mutex);
 	}
 }
@@ -2252,6 +2368,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	unsigned long user_locked, user_lock_limit;
 	struct user_struct *user = current_user();
 	unsigned long locked, lock_limit;
+	struct perf_mmap_data *data;
 	unsigned long vma_size;
 	unsigned long nr_pages;
 	long user_extra, extra;
@@ -2314,10 +2431,15 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	}
 
 	WARN_ON(event->data);
-	ret = perf_mmap_data_alloc(event, nr_pages);
-	if (ret)
+
+	data = perf_mmap_data_alloc(event, nr_pages);
+	ret = -ENOMEM;
+	if (!data)
 		goto unlock;
 
+	ret = 0;
+	perf_mmap_data_init(event, data);
+
 	atomic_set(&event->mmap_count, 1);
 	atomic_long_add(user_extra, &user->locked_vm);
 	vma->vm_mm->locked_vm += extra;
@@ -2505,7 +2627,7 @@ static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
 	if (!data->writable)
 		return true;
 
-	mask = (data->nr_pages << PAGE_SHIFT) - 1;
+	mask = perf_data_size(data) - 1;
 
 	offset = (offset - tail) & mask;
 	head   = (head   - tail) & mask;
@@ -2610,7 +2732,7 @@ void perf_output_copy(struct perf_output_handle *handle,
 		      const void *buf, unsigned int len)
 {
 	unsigned int pages_mask;
-	unsigned int offset;
+	unsigned long offset;
 	unsigned int size;
 	void **pages;
 
@@ -2619,12 +2741,14 @@ void perf_output_copy(struct perf_output_handle *handle,
 	pages		= handle->data->data_pages;
 
 	do {
-		unsigned int page_offset;
+		unsigned long page_offset;
+		unsigned long page_size;
 		int nr;
 
 		nr	    = (offset >> PAGE_SHIFT) & pages_mask;
-		page_offset = offset & (PAGE_SIZE - 1);
-		size	    = min_t(unsigned int, PAGE_SIZE - page_offset, len);
+		page_size   = 1UL << (handle->data->data_order + PAGE_SHIFT);
+		page_offset = offset & (page_size - 1);
+		size	    = min_t(unsigned int, page_size - page_offset, len);
 
 		memcpy(pages[nr] + page_offset, buf, size);
 
diff --git a/tools/perf/design.txt b/tools/perf/design.txt
index f1946d107b10..fdd42a824c98 100644
--- a/tools/perf/design.txt
+++ b/tools/perf/design.txt
@@ -455,3 +455,6 @@ will need at least this:
 
 If your architecture does have hardware capabilities, you can override the
 weak stub hw_perf_event_init() to register hardware counters.
+
+Architectures that have d-cache aliassing issues, such as Sparc and ARM,
+should select PERF_USE_VMALLOC in order to avoid these for perf mmap().
-- 
cgit v1.2.3


From b0f56f1a63b7b968e6feeeefeace24bc8e0a4a65 Mon Sep 17 00:00:00 2001
From: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Date: Thu, 1 Oct 2009 13:33:28 +0900
Subject: trace: Fix missing assignment in trace_ctxwake_*

The state char variable S should be reassigned, if S == 0.

We are missing the state of the task that is going to sleep for the
context switch events (in the raw mode).

Fortunately the problem arises with the sched_switch/wake_up
tracers, not the sched trace events.

The formers are legacy now. But still, that was buggy.

Signed-off-by: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Cc: Steven Rostedt <srostedt@redhat.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <4AC43118.6050409@ct.jp.nec.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_output.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index f572f44c6e1e..cda766f9f421 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -883,7 +883,7 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S)
 	trace_assign_type(field, iter->ent);
 
 	if (!S)
-		task_state_char(field->prev_state);
+		S = task_state_char(field->prev_state);
 	T = task_state_char(field->next_state);
 	if (!trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n",
 			      field->prev_pid,
@@ -918,7 +918,7 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S)
 	trace_assign_type(field, iter->ent);
 
 	if (!S)
-		task_state_char(field->prev_state);
+		S = task_state_char(field->prev_state);
 	T = task_state_char(field->next_state);
 
 	SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid);
-- 
cgit v1.2.3


From fc6b177dee33365ccb29fe6d2092223cf8d679f9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 5 Oct 2009 18:17:32 +0200
Subject: futex: Nullify robust lists after cleanup

The robust list pointers of user space held futexes are kept intact
over an exec() call. When the exec'ed task exits exit_robust_list() is
called with the stale pointer. The risk of corruption is minimal, but
still it is incorrect to keep the pointers valid. Actually glibc
should uninstall the robust list before calling exec() but we have to
deal with it anyway.

Nullify the pointers after [compat_]exit_robust_list() has been
called.

Reported-by: Anirban Sinha <ani@anirban.org>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Cc: stable@kernel.org
---
 kernel/fork.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index bfee931ee3fb..88ef51ca84de 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -543,11 +543,15 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
 
 	/* Get rid of any futexes when releasing the mm */
 #ifdef CONFIG_FUTEX
-	if (unlikely(tsk->robust_list))
+	if (unlikely(tsk->robust_list)) {
 		exit_robust_list(tsk);
+		tsk->robust_list = NULL;
+	}
 #ifdef CONFIG_COMPAT
-	if (unlikely(tsk->compat_robust_list))
+	if (unlikely(tsk->compat_robust_list)) {
 		compat_exit_robust_list(tsk);
+		tsk->compat_robust_list = NULL;
+	}
 #endif
 #endif
 
-- 
cgit v1.2.3


From 322a2c100a8998158445599ea437fb556aa95b11 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 5 Oct 2009 18:18:03 +0200
Subject: futex: Move exit_pi_state() call to release_mm()

exit_pi_state() is called from do_exit() but not from do_execve().
Move it to release_mm() so it gets called from do_execve() as well.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
LKML-Reference: <new-submission>
Cc: stable@kernel.org
Cc: Anirban Sinha <ani@anirban.org>
Cc: Peter Zijlstra <peterz@infradead.org>
---
 kernel/exit.c | 2 --
 kernel/fork.c | 2 ++
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index ae5d8660ddff..bc2b1fdfc354 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -989,8 +989,6 @@ NORET_TYPE void do_exit(long code)
 	tsk->mempolicy = NULL;
 #endif
 #ifdef CONFIG_FUTEX
-	if (unlikely(!list_empty(&tsk->pi_state_list)))
-		exit_pi_state_list(tsk);
 	if (unlikely(current->pi_state_cache))
 		kfree(current->pi_state_cache);
 #endif
diff --git a/kernel/fork.c b/kernel/fork.c
index 88ef51ca84de..341965b0ab1c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -553,6 +553,8 @@ void mm_release(struct task_struct *tsk, struct mm_struct *mm)
 		tsk->compat_robust_list = NULL;
 	}
 #endif
+	if (unlikely(!list_empty(&tsk->pi_state_list)))
+		exit_pi_state_list(tsk);
 #endif
 
 	/* Get rid of any cached register state */
-- 
cgit v1.2.3


From d0ec774cb2599c858be9d923bb873cf6697520d8 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 6 Oct 2009 21:48:16 -0700
Subject: rcu: Move rcu_barrier() to rcutree

Move the existing rcu_barrier() implementation to rcutree.c,
consistent with the fact that the rcu_barrier() implementation is
tied quite tightly to the RCU implementation.

This opens the way to simplify and fix rcutree.c's rcu_barrier()
implementation in a later patch.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: akpm@linux-foundation.org
Cc: mathieu.desnoyers@polymtl.ca
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: dhowells@redhat.com
LKML-Reference: <12548908982563-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcupdate.c | 120 +-----------------------------------------------------
 kernel/rcutree.c  | 119 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 120 insertions(+), 119 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index e43242274466..400183346ad2 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -53,16 +53,8 @@ struct lockdep_map rcu_lock_map =
 EXPORT_SYMBOL_GPL(rcu_lock_map);
 #endif
 
-static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
-static atomic_t rcu_barrier_cpu_count;
-static DEFINE_MUTEX(rcu_barrier_mutex);
-static struct completion rcu_barrier_completion;
 int rcu_scheduler_active __read_mostly;
 
-static atomic_t rcu_migrate_type_count = ATOMIC_INIT(0);
-static struct rcu_head rcu_migrate_head[3];
-static DECLARE_WAIT_QUEUE_HEAD(rcu_migrate_wq);
-
 /*
  * Awaken the corresponding synchronize_rcu() instance now that a
  * grace period has elapsed.
@@ -165,120 +157,10 @@ void synchronize_rcu_bh(void)
 }
 EXPORT_SYMBOL_GPL(synchronize_rcu_bh);
 
-static void rcu_barrier_callback(struct rcu_head *notused)
-{
-	if (atomic_dec_and_test(&rcu_barrier_cpu_count))
-		complete(&rcu_barrier_completion);
-}
-
-/*
- * Called with preemption disabled, and from cross-cpu IRQ context.
- */
-static void rcu_barrier_func(void *type)
-{
-	int cpu = smp_processor_id();
-	struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu);
-	void (*call_rcu_func)(struct rcu_head *head,
-			      void (*func)(struct rcu_head *head));
-
-	atomic_inc(&rcu_barrier_cpu_count);
-	call_rcu_func = type;
-	call_rcu_func(head, rcu_barrier_callback);
-}
-
-static inline void wait_migrated_callbacks(void)
-{
-	wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count));
-	smp_mb(); /* In case we didn't sleep. */
-}
-
-/*
- * Orchestrate the specified type of RCU barrier, waiting for all
- * RCU callbacks of the specified type to complete.
- */
-static void _rcu_barrier(void (*call_rcu_func)(struct rcu_head *head,
-					       void (*func)(struct rcu_head *head)))
-{
-	BUG_ON(in_interrupt());
-	/* Take cpucontrol mutex to protect against CPU hotplug */
-	mutex_lock(&rcu_barrier_mutex);
-	init_completion(&rcu_barrier_completion);
-	/*
-	 * Initialize rcu_barrier_cpu_count to 1, then invoke
-	 * rcu_barrier_func() on each CPU, so that each CPU also has
-	 * incremented rcu_barrier_cpu_count.  Only then is it safe to
-	 * decrement rcu_barrier_cpu_count -- otherwise the first CPU
-	 * might complete its grace period before all of the other CPUs
-	 * did their increment, causing this function to return too
-	 * early.
-	 */
-	atomic_set(&rcu_barrier_cpu_count, 1);
-	on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1);
-	if (atomic_dec_and_test(&rcu_barrier_cpu_count))
-		complete(&rcu_barrier_completion);
-	wait_for_completion(&rcu_barrier_completion);
-	mutex_unlock(&rcu_barrier_mutex);
-	wait_migrated_callbacks();
-}
-
-/**
- * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
- */
-void rcu_barrier(void)
-{
-	_rcu_barrier(call_rcu);
-}
-EXPORT_SYMBOL_GPL(rcu_barrier);
-
-/**
- * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete.
- */
-void rcu_barrier_bh(void)
-{
-	_rcu_barrier(call_rcu_bh);
-}
-EXPORT_SYMBOL_GPL(rcu_barrier_bh);
-
-/**
- * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks.
- */
-void rcu_barrier_sched(void)
-{
-	_rcu_barrier(call_rcu_sched);
-}
-EXPORT_SYMBOL_GPL(rcu_barrier_sched);
-
-static void rcu_migrate_callback(struct rcu_head *notused)
-{
-	if (atomic_dec_and_test(&rcu_migrate_type_count))
-		wake_up(&rcu_migrate_wq);
-}
-
 static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self,
 		unsigned long action, void *hcpu)
 {
-	rcu_cpu_notify(self, action, hcpu);
-	if (action == CPU_DYING) {
-		/*
-		 * preempt_disable() in on_each_cpu() prevents stop_machine(),
-		 * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);"
-		 * returns, all online cpus have queued rcu_barrier_func(),
-		 * and the dead cpu(if it exist) queues rcu_migrate_callback()s.
-		 *
-		 * These callbacks ensure _rcu_barrier() waits for all
-		 * RCU callbacks of the specified type to complete.
-		 */
-		atomic_set(&rcu_migrate_type_count, 3);
-		call_rcu_bh(rcu_migrate_head, rcu_migrate_callback);
-		call_rcu_sched(rcu_migrate_head + 1, rcu_migrate_callback);
-		call_rcu(rcu_migrate_head + 2, rcu_migrate_callback);
-	} else if (action == CPU_DOWN_PREPARE) {
-		/* Don't need to wait until next removal operation. */
-		/* rcu_migrate_head is protected by cpu_add_remove_lock */
-		wait_migrated_callbacks();
-	}
-
-	return NOTIFY_OK;
+	return rcu_cpu_notify(self, action, hcpu);
 }
 
 void __init rcu_init(void)
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index e2e272b5c277..0108570a192c 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1363,6 +1363,103 @@ int rcu_needs_cpu(int cpu)
 	       rcu_preempt_needs_cpu(cpu);
 }
 
+static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
+static atomic_t rcu_barrier_cpu_count;
+static DEFINE_MUTEX(rcu_barrier_mutex);
+static struct completion rcu_barrier_completion;
+static atomic_t rcu_migrate_type_count = ATOMIC_INIT(0);
+static struct rcu_head rcu_migrate_head[3];
+static DECLARE_WAIT_QUEUE_HEAD(rcu_migrate_wq);
+
+static void rcu_barrier_callback(struct rcu_head *notused)
+{
+	if (atomic_dec_and_test(&rcu_barrier_cpu_count))
+		complete(&rcu_barrier_completion);
+}
+
+/*
+ * Called with preemption disabled, and from cross-cpu IRQ context.
+ */
+static void rcu_barrier_func(void *type)
+{
+	int cpu = smp_processor_id();
+	struct rcu_head *head = &per_cpu(rcu_barrier_head, cpu);
+	void (*call_rcu_func)(struct rcu_head *head,
+			      void (*func)(struct rcu_head *head));
+
+	atomic_inc(&rcu_barrier_cpu_count);
+	call_rcu_func = type;
+	call_rcu_func(head, rcu_barrier_callback);
+}
+
+static inline void wait_migrated_callbacks(void)
+{
+	wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count));
+	smp_mb(); /* In case we didn't sleep. */
+}
+
+/*
+ * Orchestrate the specified type of RCU barrier, waiting for all
+ * RCU callbacks of the specified type to complete.
+ */
+static void _rcu_barrier(void (*call_rcu_func)(struct rcu_head *head,
+					       void (*func)(struct rcu_head *head)))
+{
+	BUG_ON(in_interrupt());
+	/* Take cpucontrol mutex to protect against CPU hotplug */
+	mutex_lock(&rcu_barrier_mutex);
+	init_completion(&rcu_barrier_completion);
+	/*
+	 * Initialize rcu_barrier_cpu_count to 1, then invoke
+	 * rcu_barrier_func() on each CPU, so that each CPU also has
+	 * incremented rcu_barrier_cpu_count.  Only then is it safe to
+	 * decrement rcu_barrier_cpu_count -- otherwise the first CPU
+	 * might complete its grace period before all of the other CPUs
+	 * did their increment, causing this function to return too
+	 * early.
+	 */
+	atomic_set(&rcu_barrier_cpu_count, 1);
+	on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1);
+	if (atomic_dec_and_test(&rcu_barrier_cpu_count))
+		complete(&rcu_barrier_completion);
+	wait_for_completion(&rcu_barrier_completion);
+	mutex_unlock(&rcu_barrier_mutex);
+	wait_migrated_callbacks();
+}
+
+/**
+ * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
+ */
+void rcu_barrier(void)
+{
+	_rcu_barrier(call_rcu);
+}
+EXPORT_SYMBOL_GPL(rcu_barrier);
+
+/**
+ * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete.
+ */
+void rcu_barrier_bh(void)
+{
+	_rcu_barrier(call_rcu_bh);
+}
+EXPORT_SYMBOL_GPL(rcu_barrier_bh);
+
+/**
+ * rcu_barrier_sched - Wait for in-flight call_rcu_sched() callbacks.
+ */
+void rcu_barrier_sched(void)
+{
+	_rcu_barrier(call_rcu_sched);
+}
+EXPORT_SYMBOL_GPL(rcu_barrier_sched);
+
+static void rcu_migrate_callback(struct rcu_head *notused)
+{
+	if (atomic_dec_and_test(&rcu_migrate_type_count))
+		wake_up(&rcu_migrate_wq);
+}
+
 /*
  * Do boot-time initialization of a CPU's per-CPU RCU data.
  */
@@ -1459,6 +1556,28 @@ int __cpuinit rcu_cpu_notify(struct notifier_block *self,
 	case CPU_UP_PREPARE_FROZEN:
 		rcu_online_cpu(cpu);
 		break;
+	case CPU_DOWN_PREPARE:
+	case CPU_DOWN_PREPARE_FROZEN:
+		/* Don't need to wait until next removal operation. */
+		/* rcu_migrate_head is protected by cpu_add_remove_lock */
+		wait_migrated_callbacks();
+		break;
+	case CPU_DYING:
+	case CPU_DYING_FROZEN:
+		/*
+		 * preempt_disable() in on_each_cpu() prevents stop_machine(),
+		 * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);"
+		 * returns, all online cpus have queued rcu_barrier_func(),
+		 * and the dead cpu(if it exist) queues rcu_migrate_callback()s.
+		 *
+		 * These callbacks ensure _rcu_barrier() waits for all
+		 * RCU callbacks of the specified type to complete.
+		 */
+		atomic_set(&rcu_migrate_type_count, 3);
+		call_rcu_bh(rcu_migrate_head, rcu_migrate_callback);
+		call_rcu_sched(rcu_migrate_head + 1, rcu_migrate_callback);
+		call_rcu(rcu_migrate_head + 2, rcu_migrate_callback);
+		break;
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN:
 	case CPU_UP_CANCELED:
-- 
cgit v1.2.3


From e74f4c4564455c91a3b4075bb1721993c2a95dda Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 6 Oct 2009 21:48:17 -0700
Subject: rcu: Make hot-unplugged CPU relinquish its own RCU callbacks

The current interaction between RCU and CPU hotplug requires that
RCU block in CPU notifiers waiting for callbacks to drain.

This can be greatly simplified by having each CPU relinquish its
own callbacks, and for both _rcu_barrier() and CPU_DEAD notifiers
to adopt all callbacks that were previously relinquished.

This change also eliminates the possibility of certain types of
hangs due to the previous practice of waiting for callbacks to be
invoked from within CPU notifiers.  If you don't every wait, you
cannot hang.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: akpm@linux-foundation.org
Cc: mathieu.desnoyers@polymtl.ca
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: dhowells@redhat.com
LKML-Reference: <1254890898456-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcutree.c        | 151 +++++++++++++++++++++++++-----------------------
 kernel/rcutree.h        |  11 +++-
 kernel/rcutree_plugin.h |  34 +++++++++++
 kernel/rcutree_trace.c  |   4 +-
 4 files changed, 125 insertions(+), 75 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 0108570a192c..d8d98655c9e7 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -63,6 +63,9 @@
 	.gpnum = -300, \
 	.completed = -300, \
 	.onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \
+	.orphan_cbs_list = NULL, \
+	.orphan_cbs_tail = &name.orphan_cbs_list, \
+	.orphan_qlen = 0, \
 	.fqslock = __SPIN_LOCK_UNLOCKED(&name.fqslock), \
 	.n_force_qs = 0, \
 	.n_force_qs_ngp = 0, \
@@ -837,18 +840,64 @@ rcu_check_quiescent_state(struct rcu_state *rsp, struct rcu_data *rdp)
 
 #ifdef CONFIG_HOTPLUG_CPU
 
+/*
+ * Move a dying CPU's RCU callbacks to the ->orphan_cbs_list for the
+ * specified flavor of RCU.  The callbacks will be adopted by the next
+ * _rcu_barrier() invocation or by the CPU_DEAD notifier, whichever
+ * comes first.  Because this is invoked from the CPU_DYING notifier,
+ * irqs are already disabled.
+ */
+static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
+{
+	int i;
+	struct rcu_data *rdp = rsp->rda[smp_processor_id()];
+
+	if (rdp->nxtlist == NULL)
+		return;  /* irqs disabled, so comparison is stable. */
+	spin_lock(&rsp->onofflock);  /* irqs already disabled. */
+	*rsp->orphan_cbs_tail = rdp->nxtlist;
+	rsp->orphan_cbs_tail = rdp->nxttail[RCU_NEXT_TAIL];
+	rdp->nxtlist = NULL;
+	for (i = 0; i < RCU_NEXT_SIZE; i++)
+		rdp->nxttail[i] = &rdp->nxtlist;
+	rsp->orphan_qlen += rdp->qlen;
+	rdp->qlen = 0;
+	spin_unlock(&rsp->onofflock);  /* irqs remain disabled. */
+}
+
+/*
+ * Adopt previously orphaned RCU callbacks.
+ */
+static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
+{
+	unsigned long flags;
+	struct rcu_data *rdp;
+
+	spin_lock_irqsave(&rsp->onofflock, flags);
+	rdp = rsp->rda[smp_processor_id()];
+	if (rsp->orphan_cbs_list == NULL) {
+		spin_unlock_irqrestore(&rsp->onofflock, flags);
+		return;
+	}
+	*rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_list;
+	rdp->nxttail[RCU_NEXT_TAIL] = rsp->orphan_cbs_tail;
+	rdp->qlen += rsp->orphan_qlen;
+	rsp->orphan_cbs_list = NULL;
+	rsp->orphan_cbs_tail = &rsp->orphan_cbs_list;
+	rsp->orphan_qlen = 0;
+	spin_unlock_irqrestore(&rsp->onofflock, flags);
+}
+
 /*
  * Remove the outgoing CPU from the bitmasks in the rcu_node hierarchy
  * and move all callbacks from the outgoing CPU to the current one.
  */
 static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
 {
-	int i;
 	unsigned long flags;
 	long lastcomp;
 	unsigned long mask;
 	struct rcu_data *rdp = rsp->rda[cpu];
-	struct rcu_data *rdp_me;
 	struct rcu_node *rnp;
 
 	/* Exclude any attempts to start a new grace period. */
@@ -871,32 +920,9 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
 	} while (rnp != NULL);
 	lastcomp = rsp->completed;
 
-	spin_unlock(&rsp->onofflock);		/* irqs remain disabled. */
+	spin_unlock_irqrestore(&rsp->onofflock, flags);
 
-	/*
-	 * Move callbacks from the outgoing CPU to the running CPU.
-	 * Note that the outgoing CPU is now quiescent, so it is now
-	 * (uncharacteristically) safe to access its rcu_data structure.
-	 * Note also that we must carefully retain the order of the
-	 * outgoing CPU's callbacks in order for rcu_barrier() to work
-	 * correctly.  Finally, note that we start all the callbacks
-	 * afresh, even those that have passed through a grace period
-	 * and are therefore ready to invoke.  The theory is that hotplug
-	 * events are rare, and that if they are frequent enough to
-	 * indefinitely delay callbacks, you have far worse things to
-	 * be worrying about.
-	 */
-	if (rdp->nxtlist != NULL) {
-		rdp_me = rsp->rda[smp_processor_id()];
-		*rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxtlist;
-		rdp_me->nxttail[RCU_NEXT_TAIL] = rdp->nxttail[RCU_NEXT_TAIL];
-		rdp->nxtlist = NULL;
-		for (i = 0; i < RCU_NEXT_SIZE; i++)
-			rdp->nxttail[i] = &rdp->nxtlist;
-		rdp_me->qlen += rdp->qlen;
-		rdp->qlen = 0;
-	}
-	local_irq_restore(flags);
+	rcu_adopt_orphan_cbs(rsp);
 }
 
 /*
@@ -914,6 +940,14 @@ static void rcu_offline_cpu(int cpu)
 
 #else /* #ifdef CONFIG_HOTPLUG_CPU */
 
+static void rcu_send_cbs_to_orphanage(struct rcu_state *rsp)
+{
+}
+
+static void rcu_adopt_orphan_cbs(struct rcu_state *rsp)
+{
+}
+
 static void rcu_offline_cpu(int cpu)
 {
 }
@@ -1367,9 +1401,6 @@ static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
 static atomic_t rcu_barrier_cpu_count;
 static DEFINE_MUTEX(rcu_barrier_mutex);
 static struct completion rcu_barrier_completion;
-static atomic_t rcu_migrate_type_count = ATOMIC_INIT(0);
-static struct rcu_head rcu_migrate_head[3];
-static DECLARE_WAIT_QUEUE_HEAD(rcu_migrate_wq);
 
 static void rcu_barrier_callback(struct rcu_head *notused)
 {
@@ -1392,21 +1423,16 @@ static void rcu_barrier_func(void *type)
 	call_rcu_func(head, rcu_barrier_callback);
 }
 
-static inline void wait_migrated_callbacks(void)
-{
-	wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count));
-	smp_mb(); /* In case we didn't sleep. */
-}
-
 /*
  * Orchestrate the specified type of RCU barrier, waiting for all
  * RCU callbacks of the specified type to complete.
  */
-static void _rcu_barrier(void (*call_rcu_func)(struct rcu_head *head,
+static void _rcu_barrier(struct rcu_state *rsp,
+			 void (*call_rcu_func)(struct rcu_head *head,
 					       void (*func)(struct rcu_head *head)))
 {
 	BUG_ON(in_interrupt());
-	/* Take cpucontrol mutex to protect against CPU hotplug */
+	/* Take mutex to serialize concurrent rcu_barrier() requests. */
 	mutex_lock(&rcu_barrier_mutex);
 	init_completion(&rcu_barrier_completion);
 	/*
@@ -1419,29 +1445,22 @@ static void _rcu_barrier(void (*call_rcu_func)(struct rcu_head *head,
 	 * early.
 	 */
 	atomic_set(&rcu_barrier_cpu_count, 1);
+	preempt_disable(); /* stop CPU_DYING from filling orphan_cbs_list */
+	rcu_adopt_orphan_cbs(rsp);
 	on_each_cpu(rcu_barrier_func, (void *)call_rcu_func, 1);
+	preempt_enable(); /* CPU_DYING can again fill orphan_cbs_list */
 	if (atomic_dec_and_test(&rcu_barrier_cpu_count))
 		complete(&rcu_barrier_completion);
 	wait_for_completion(&rcu_barrier_completion);
 	mutex_unlock(&rcu_barrier_mutex);
-	wait_migrated_callbacks();
-}
-
-/**
- * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
- */
-void rcu_barrier(void)
-{
-	_rcu_barrier(call_rcu);
 }
-EXPORT_SYMBOL_GPL(rcu_barrier);
 
 /**
  * rcu_barrier_bh - Wait until all in-flight call_rcu_bh() callbacks complete.
  */
 void rcu_barrier_bh(void)
 {
-	_rcu_barrier(call_rcu_bh);
+	_rcu_barrier(&rcu_bh_state, call_rcu_bh);
 }
 EXPORT_SYMBOL_GPL(rcu_barrier_bh);
 
@@ -1450,16 +1469,10 @@ EXPORT_SYMBOL_GPL(rcu_barrier_bh);
  */
 void rcu_barrier_sched(void)
 {
-	_rcu_barrier(call_rcu_sched);
+	_rcu_barrier(&rcu_sched_state, call_rcu_sched);
 }
 EXPORT_SYMBOL_GPL(rcu_barrier_sched);
 
-static void rcu_migrate_callback(struct rcu_head *notused)
-{
-	if (atomic_dec_and_test(&rcu_migrate_type_count))
-		wake_up(&rcu_migrate_wq);
-}
-
 /*
  * Do boot-time initialization of a CPU's per-CPU RCU data.
  */
@@ -1556,27 +1569,21 @@ int __cpuinit rcu_cpu_notify(struct notifier_block *self,
 	case CPU_UP_PREPARE_FROZEN:
 		rcu_online_cpu(cpu);
 		break;
-	case CPU_DOWN_PREPARE:
-	case CPU_DOWN_PREPARE_FROZEN:
-		/* Don't need to wait until next removal operation. */
-		/* rcu_migrate_head is protected by cpu_add_remove_lock */
-		wait_migrated_callbacks();
-		break;
 	case CPU_DYING:
 	case CPU_DYING_FROZEN:
 		/*
-		 * preempt_disable() in on_each_cpu() prevents stop_machine(),
+		 * preempt_disable() in _rcu_barrier() prevents stop_machine(),
 		 * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);"
-		 * returns, all online cpus have queued rcu_barrier_func(),
-		 * and the dead cpu(if it exist) queues rcu_migrate_callback()s.
-		 *
-		 * These callbacks ensure _rcu_barrier() waits for all
-		 * RCU callbacks of the specified type to complete.
+		 * returns, all online cpus have queued rcu_barrier_func().
+		 * The dying CPU clears its cpu_online_mask bit and
+		 * moves all of its RCU callbacks to ->orphan_cbs_list
+		 * in the context of stop_machine(), so subsequent calls
+		 * to _rcu_barrier() will adopt these callbacks and only
+		 * then queue rcu_barrier_func() on all remaining CPUs.
 		 */
-		atomic_set(&rcu_migrate_type_count, 3);
-		call_rcu_bh(rcu_migrate_head, rcu_migrate_callback);
-		call_rcu_sched(rcu_migrate_head + 1, rcu_migrate_callback);
-		call_rcu(rcu_migrate_head + 2, rcu_migrate_callback);
+		rcu_send_cbs_to_orphanage(&rcu_bh_state);
+		rcu_send_cbs_to_orphanage(&rcu_sched_state);
+		rcu_preempt_send_cbs_to_orphanage();
 		break;
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN:
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 676eecd371d9..b40ac5706040 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -244,7 +244,15 @@ struct rcu_state {
 	/* End  of fields guarded by root rcu_node's lock. */
 
 	spinlock_t onofflock;			/* exclude on/offline and */
-						/*  starting new GP. */
+						/*  starting new GP.  Also */
+						/*  protects the following */
+						/*  orphan_cbs fields. */
+	struct rcu_head *orphan_cbs_list;	/* list of rcu_head structs */
+						/*  orphaned by all CPUs in */
+						/*  a given leaf rcu_node */
+						/*  going offline. */
+	struct rcu_head **orphan_cbs_tail;	/* And tail pointer. */
+	long orphan_qlen;			/* Number of orphaned cbs. */
 	spinlock_t fqslock;			/* Only one task forcing */
 						/*  quiescent states. */
 	unsigned long jiffies_force_qs;		/* Time at which to invoke */
@@ -305,6 +313,7 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu));
 static int rcu_preempt_pending(int cpu);
 static int rcu_preempt_needs_cpu(int cpu);
 static void __cpuinit rcu_preempt_init_percpu_data(int cpu);
+static void rcu_preempt_send_cbs_to_orphanage(void);
 static void __init __rcu_init_preempt(void);
 
 #endif /* #else #ifdef RCU_TREE_NONCORE */
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index 57200fe96d0a..c0cb783aa16a 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -410,6 +410,15 @@ static int rcu_preempt_needs_cpu(int cpu)
 	return !!per_cpu(rcu_preempt_data, cpu).nxtlist;
 }
 
+/**
+ * rcu_barrier - Wait until all in-flight call_rcu() callbacks complete.
+ */
+void rcu_barrier(void)
+{
+	_rcu_barrier(&rcu_preempt_state, call_rcu);
+}
+EXPORT_SYMBOL_GPL(rcu_barrier);
+
 /*
  * Initialize preemptable RCU's per-CPU data.
  */
@@ -418,6 +427,14 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
 	rcu_init_percpu_data(cpu, &rcu_preempt_state, 1);
 }
 
+/*
+ * Move preemptable RCU's callbacks to ->orphan_cbs_list.
+ */
+static void rcu_preempt_send_cbs_to_orphanage(void)
+{
+	rcu_send_cbs_to_orphanage(&rcu_preempt_state);
+}
+
 /*
  * Initialize preemptable RCU's state structures.
  */
@@ -563,6 +580,16 @@ static int rcu_preempt_needs_cpu(int cpu)
 	return 0;
 }
 
+/*
+ * Because preemptable RCU does not exist, rcu_barrier() is just
+ * another name for rcu_barrier_sched().
+ */
+void rcu_barrier(void)
+{
+	rcu_barrier_sched();
+}
+EXPORT_SYMBOL_GPL(rcu_barrier);
+
 /*
  * Because preemptable RCU does not exist, there is no per-CPU
  * data to initialize.
@@ -571,6 +598,13 @@ static void __cpuinit rcu_preempt_init_percpu_data(int cpu)
 {
 }
 
+/*
+ * Because there is no preemptable RCU, there are no callbacks to move.
+ */
+static void rcu_preempt_send_cbs_to_orphanage(void)
+{
+}
+
 /*
  * Because preemptable RCU does not exist, it need not be initialized.
  */
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index f09af28b8262..4b31c779e62e 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -159,13 +159,13 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
 	struct rcu_node *rnp;
 
 	seq_printf(m, "c=%ld g=%ld s=%d jfq=%ld j=%x "
-		      "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu\n",
+		      "nfqs=%lu/nfqsng=%lu(%lu) fqlh=%lu oqlen=%ld\n",
 		   rsp->completed, rsp->gpnum, rsp->signaled,
 		   (long)(rsp->jiffies_force_qs - jiffies),
 		   (int)(jiffies & 0xffff),
 		   rsp->n_force_qs, rsp->n_force_qs_ngp,
 		   rsp->n_force_qs - rsp->n_force_qs_ngp,
-		   rsp->n_force_qs_lh);
+		   rsp->n_force_qs_lh, rsp->orphan_qlen);
 	for (rnp = &rsp->node[0]; rnp - &rsp->node[0] < NUM_RCU_NODES; rnp++) {
 		if (rnp->level != level) {
 			seq_puts(m, "\n");
-- 
cgit v1.2.3


From 978c0b88146a7f9b364b71b5b83c5b12e7b413d7 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 6 Oct 2009 21:48:18 -0700
Subject: rcu: Place root rcu_node structure in separate lockdep class

Before this patch, all of the rcu_node structures were in the same lockdep
class, so that lockdep would complain when rcu_preempt_offline_tasks()
acquired the root rcu_node structure's lock while holding one of the leaf
rcu_nodes' locks.

This patch changes rcu_init_one() to use a separate
spin_lock_init() for the root rcu_node structure's lock than is
used for that of all of the rest of the rcu_node structures, which
puts the root rcu_node structure's lock in its own lockdep class.

Suggested-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: akpm@linux-foundation.org
Cc: mathieu.desnoyers@polymtl.ca
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: dhowells@redhat.com
LKML-Reference: <12548908983277-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcutree.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index d8d98655c9e7..705f02ac7433 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -1647,7 +1647,8 @@ static void __init rcu_init_one(struct rcu_state *rsp)
 		cpustride *= rsp->levelspread[i];
 		rnp = rsp->level[i];
 		for (j = 0; j < rsp->levelcnt[i]; j++, rnp++) {
-			spin_lock_init(&rnp->lock);
+			if (rnp != rcu_get_root(rsp))
+				spin_lock_init(&rnp->lock);
 			rnp->gpnum = 0;
 			rnp->qsmask = 0;
 			rnp->qsmaskinit = 0;
@@ -1670,6 +1671,7 @@ static void __init rcu_init_one(struct rcu_state *rsp)
 			INIT_LIST_HEAD(&rnp->blocked_tasks[1]);
 		}
 	}
+	spin_lock_init(&rcu_get_root(rsp)->lock);
 }
 
 /*
-- 
cgit v1.2.3


From fdc6f192e7e1ae80565af23cc33dc88e3dcdf184 Mon Sep 17 00:00:00 2001
From: Eero Nurkkala <ext-eero.nurkkala@nokia.com>
Date: Wed, 7 Oct 2009 11:54:26 +0300
Subject: NOHZ: update idle state also when NOHZ is inactive

Commit f2e21c9610991e95621a81407cdbab881226419b had unfortunate side
effects with cpufreq governors on some systems.

If the system did not switch into NOHZ mode ts->inidle is not set when
tick_nohz_stop_sched_tick() is called from the idle routine. Therefor
all subsequent calls from irq_exit() to tick_nohz_stop_sched_tick()
fail to call tick_nohz_start_idle(). This results in bogus idle
accounting information which is passed to cpufreq governors.

Set the inidle flag unconditionally of the NOHZ active state to keep
the idle time accounting correct in any case.

[ tglx: Added comment and tweaked the changelog ]

Reported-by: Steven Noonan <steven@uplinklabs.net>
Signed-off-by: Eero Nurkkala <ext-eero.nurkkala@nokia.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Cc: Greg KH <greg@kroah.com>
Cc: Steven Noonan <steven@uplinklabs.net>
Cc: stable@kernel.org
LKML-Reference: <1254907901.30157.93.camel@eenurkka-desktop>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/tick-sched.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index e0f59a21c061..89aed5933ed4 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -231,6 +231,13 @@ void tick_nohz_stop_sched_tick(int inidle)
 	if (!inidle && !ts->inidle)
 		goto end;
 
+	/*
+	 * Set ts->inidle unconditionally. Even if the system did not
+	 * switch to NOHZ mode the cpu frequency governers rely on the
+	 * update of the idle time accounting in tick_nohz_start_idle().
+	 */
+	ts->inidle = 1;
+
 	now = tick_nohz_start_idle(ts);
 
 	/*
@@ -248,8 +255,6 @@ void tick_nohz_stop_sched_tick(int inidle)
 	if (unlikely(ts->nohz_mode == NOHZ_MODE_INACTIVE))
 		goto end;
 
-	ts->inidle = 1;
-
 	if (need_resched())
 		goto end;
 
-- 
cgit v1.2.3


From 829b876dfc94ea8be3a47e200d06f1f217bb104f Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Sun, 27 Sep 2009 07:02:07 -0400
Subject: tracing: fix transposed numbers of lock_depth and preempt_count

The lock_depth and preempt_count numbers in the latency format is
transposed.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_output.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index cda766f9f421..ed17565826b0 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -486,16 +486,18 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry)
 				hardirq ? 'h' : softirq ? 's' : '.'))
 		return 0;
 
-	if (entry->lock_depth < 0)
-		ret = trace_seq_putc(s, '.');
+	if (entry->preempt_count)
+		ret = trace_seq_printf(s, "%x", entry->preempt_count);
 	else
-		ret = trace_seq_printf(s, "%d", entry->lock_depth);
+		ret = trace_seq_putc(s, '.');
+
 	if (!ret)
 		return 0;
 
-	if (entry->preempt_count)
-		return trace_seq_printf(s, "%x", entry->preempt_count);
-	return trace_seq_putc(s, '.');
+	if (entry->lock_depth < 0)
+		return trace_seq_putc(s, '.');
+
+	return trace_seq_printf(s, "%d", entry->lock_depth);
 }
 
 static int
-- 
cgit v1.2.3


From da085681014fb43d67d9bf6d14bc068e9254bd49 Mon Sep 17 00:00:00 2001
From: Darren Hart <dvhltc@us.ibm.com>
Date: Wed, 7 Oct 2009 11:46:54 -0700
Subject: futex: fix requeue_pi key imbalance

If futex_wait_requeue_pi() wakes prior to requeue, we drop the
reference to the source futex_key twice, once in
handle_early_requeue_pi_wakeup() and once on our way out.

Remove the drop from the handle_early_requeue_pi_wakeup() and keep
the get/drops together in futex_wait_requeue_pi().

Reported-by: Helge Bahmann <hcb@chaoticmind.net>
Signed-off-by: Darren Hart <dvhltc@us.ibm.com>
Cc: Helge Bahmann <hcb@chaoticmind.net>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Dinakar Guniguntala <dino@in.ibm.com>
Cc: John Stultz <johnstul@us.ibm.com>
Cc: stable-2.6.31 <stable@kernel.org>
LKML-Reference: <4ACCE21E.5030805@us.ibm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/futex.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index 1e176f3ab26c..c3bb2fce11ba 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2111,7 +2111,6 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
 		 * Unqueue the futex_q and determine which it was.
 		 */
 		plist_del(&q->list, &q->list.plist);
-		drop_futex_key_refs(&q->key);
 
 		if (timeout && !timeout->task)
 			ret = -ETIMEDOUT;
-- 
cgit v1.2.3


From e7247a15ff3bbdab0a8b402dffa1171e5c05a8e0 Mon Sep 17 00:00:00 2001
From: "jolsa@redhat.com" <jolsa@redhat.com>
Date: Wed, 7 Oct 2009 19:00:35 +0200
Subject: tracing: correct module boundaries for ftrace_release

When the module is about the unload we release its call records.
The ftrace_release function was given wrong values representing
the module core boundaries, thus not releasing its call records.

Plus making ftrace_release function module specific.

Signed-off-by: Jiri Olsa <jolsa@redhat.com>
LKML-Reference: <1254934835-363-3-git-send-email-jolsa@redhat.com>
Cc: stable@kernel.org
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 include/linux/ftrace.h |  2 +-
 kernel/trace/ftrace.c  | 12 ++++--------
 2 files changed, 5 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index cd3d2abaf30a..0b4f97d24d7f 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -241,7 +241,7 @@ extern void ftrace_enable_daemon(void);
 # define ftrace_set_filter(buf, len, reset)	do { } while (0)
 # define ftrace_disable_daemon()		do { } while (0)
 # define ftrace_enable_daemon()			do { } while (0)
-static inline void ftrace_release(void *start, unsigned long size) { }
+static inline void ftrace_release_mod(struct module *mod) {}
 static inline int register_ftrace_command(struct ftrace_func_command *cmd)
 {
 	return -EINVAL;
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 46592feab5a6..c701476a648b 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2658,19 +2658,17 @@ static int ftrace_convert_nops(struct module *mod,
 }
 
 #ifdef CONFIG_MODULES
-void ftrace_release(void *start, void *end)
+void ftrace_release_mod(struct module *mod)
 {
 	struct dyn_ftrace *rec;
 	struct ftrace_page *pg;
-	unsigned long s = (unsigned long)start;
-	unsigned long e = (unsigned long)end;
 
-	if (ftrace_disabled || !start || start == end)
+	if (ftrace_disabled)
 		return;
 
 	mutex_lock(&ftrace_lock);
 	do_for_each_ftrace_rec(pg, rec) {
-		if ((rec->ip >= s) && (rec->ip < e)) {
+		if (within_module_core(rec->ip, mod)) {
 			/*
 			 * rec->ip is changed in ftrace_free_rec()
 			 * It should not between s and e if record was freed.
@@ -2702,9 +2700,7 @@ static int ftrace_module_notify(struct notifier_block *self,
 				   mod->num_ftrace_callsites);
 		break;
 	case MODULE_STATE_GOING:
-		ftrace_release(mod->ftrace_callsites,
-			       mod->ftrace_callsites +
-			       mod->num_ftrace_callsites);
+		ftrace_release_mod(mod);
 		break;
 	}
 
-- 
cgit v1.2.3


From 3279ba37db5d65c4ab0dcdee3b211ccb85bb563f Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 7 Oct 2009 16:57:56 -0400
Subject: ftrace: check for failure for all conversions

Due to legacy code from back when the dynamic tracer used a daemon,
only core kernel code was checking for failures. This is no longer
the case. We must check for failures any time we perform text modifications.

Cc: stable@kernel.org
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index c701476a648b..f136fe5da07a 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1074,14 +1074,9 @@ static void ftrace_replace_code(int enable)
 		failed = __ftrace_replace_code(rec, enable);
 		if (failed) {
 			rec->flags |= FTRACE_FL_FAILED;
-			if ((system_state == SYSTEM_BOOTING) ||
-			    !core_kernel_text(rec->ip)) {
-				ftrace_free_rec(rec);
-				} else {
-				ftrace_bug(failed, rec->ip);
-					/* Stop processing */
-					return;
-				}
+			ftrace_bug(failed, rec->ip);
+			/* Stop processing */
+			return;
 		}
 	} while_for_each_ftrace_rec();
 }
-- 
cgit v1.2.3


From c8647b28726b09b087155417bb698e7b3789f8a0 Mon Sep 17 00:00:00 2001
From: Zhenwen Xu <helight.xu@gmail.com>
Date: Thu, 8 Oct 2009 09:21:46 +0800
Subject: tracing: fix warning on kernel/trace/trace_branch.c
 andtrace_hw_branches.c

fix warnings that caused the API change of trace_buffer_lock_reserve()
change files: kernel/trace/trace_hw_branch.c
              kernel/trace/trace_branch.c

Signed-off-by: Zhenwen Xu <helight.xu@gmail.com>
LKML-Reference: <20091008012146.GA4170@helight>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_branch.c      | 2 +-
 kernel/trace/trace_hw_branches.c | 8 +++++---
 2 files changed, 6 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 7a7a9fd249a9..216e2dd302a0 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -54,7 +54,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
 		goto out;
 
 	pc = preempt_count();
-	event = trace_buffer_lock_reserve(tr, TRACE_BRANCH,
+	event = trace_buffer_lock_reserve(tr->buffer, TRACE_BRANCH,
 					  sizeof(*entry), flags, pc);
 	if (!event)
 		goto out;
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index 23b63859130e..69543a905cd5 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -165,6 +165,7 @@ void trace_hw_branch(u64 from, u64 to)
 	struct ftrace_event_call *call = &event_hw_branch;
 	struct trace_array *tr = hw_branch_trace;
 	struct ring_buffer_event *event;
+	struct ring_buffer *buf;
 	struct hw_branch_entry *entry;
 	unsigned long irq1;
 	int cpu;
@@ -180,7 +181,8 @@ void trace_hw_branch(u64 from, u64 to)
 	if (atomic_inc_return(&tr->data[cpu]->disabled) != 1)
 		goto out;
 
-	event = trace_buffer_lock_reserve(tr, TRACE_HW_BRANCHES,
+	buf = tr->buffer;
+	event = trace_buffer_lock_reserve(buf, TRACE_HW_BRANCHES,
 					  sizeof(*entry), 0, 0);
 	if (!event)
 		goto out;
@@ -189,8 +191,8 @@ void trace_hw_branch(u64 from, u64 to)
 	entry->ent.type = TRACE_HW_BRANCHES;
 	entry->from = from;
 	entry->to   = to;
-	if (!filter_check_discard(call, entry, tr->buffer, event))
-		trace_buffer_unlock_commit(tr, event, 0, 0);
+	if (!filter_check_discard(call, entry, buf, event))
+		trace_buffer_unlock_commit(buf, event, 0, 0);
 
  out:
 	atomic_dec(&tr->data[cpu]->disabled);
-- 
cgit v1.2.3


From 8f6e8a314ab37cadd72da5ace9027f2d04aba854 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 7 Oct 2009 21:53:41 -0400
Subject: tracing: user local buffer variable for trace branch tracer

Just using the tr->buffer for the API to trace_buffer_lock_reserve
is not good enough. This is because the tr->buffer may change, and we
do not want to commit with a different buffer that we reserved from.

This patch uses a local variable to hold the buffer that was used to
reserve and commit with.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace_branch.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 216e2dd302a0..4a194f08f88c 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -34,6 +34,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
 	struct trace_array *tr = branch_tracer;
 	struct ring_buffer_event *event;
 	struct trace_branch *entry;
+	struct ring_buffer *buffer;
 	unsigned long flags;
 	int cpu, pc;
 	const char *p;
@@ -54,7 +55,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
 		goto out;
 
 	pc = preempt_count();
-	event = trace_buffer_lock_reserve(tr->buffer, TRACE_BRANCH,
+	buffer = tr->buffer;
+	event = trace_buffer_lock_reserve(buffer, TRACE_BRANCH,
 					  sizeof(*entry), flags, pc);
 	if (!event)
 		goto out;
@@ -74,8 +76,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
 	entry->line = f->line;
 	entry->correct = val == expect;
 
-	if (!filter_check_discard(call, entry, tr->buffer, event))
-		ring_buffer_unlock_commit(tr->buffer, event);
+	if (!filter_check_discard(call, entry, buffer, event))
+		ring_buffer_unlock_commit(buffer, event);
 
  out:
 	atomic_dec(&tr->data[cpu]->disabled);
-- 
cgit v1.2.3


From a813a159766ee9d36aa1fc717c60d63325a6d077 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 9 Oct 2009 01:41:35 -0400
Subject: tracing: fix trace_vprintk call

The addition of trace_array_{v}printk used the wrong function for
trace_vprintk to call. This broke trace_marker and trace_vprintk
itself. Although trace_printk may not have been affected by those
that end up calling trace_vbprintk.

Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/trace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 45068269ebb1..c820b0310a12 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1393,7 +1393,7 @@ int trace_array_vprintk(struct trace_array *tr,
 
 int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
 {
-	return trace_array_printk(&global_trace, ip, fmt, args);
+	return trace_array_vprintk(&global_trace, ip, fmt, args);
 }
 EXPORT_SYMBOL_GPL(trace_vprintk);
 
-- 
cgit v1.2.3


From d25105e8911bff1dbd68e387f12901c5b1a15fe8 Mon Sep 17 00:00:00 2001
From: Wu Fengguang <fengguang.wu@intel.com>
Date: Fri, 9 Oct 2009 12:40:42 +0200
Subject: writeback: account IO throttling wait as iowait

It makes sense to do IOWAIT when someone is blocked
due to IO throttle, as suggested by Kame and Peter.

There is an old comment for not doing IOWAIT on throttle,
however it has been mismatching the code for a long time.

If we stop accounting IOWAIT for 2.6.32, it could be an
undesirable behavior change. So restore the io_schedule.

CC: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
CC: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 kernel/sched.c      | 3 ---
 mm/page-writeback.c | 3 ++-
 2 files changed, 2 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 1535f3884b88..074f753f7449 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6720,9 +6720,6 @@ EXPORT_SYMBOL(yield);
 /*
  * This task is about to go to sleep on IO. Increment rq->nr_iowait so
  * that process accounting knows that this is a task in IO wait state.
- *
- * But don't do that if it is a deliberate, throttling IO wait (this task
- * has set its backing_dev_info: the queue against which it should throttle)
  */
 void __sched io_schedule(void)
 {
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index a3b14090b1fb..2c5d79236ead 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -566,7 +566,8 @@ static void balance_dirty_pages(struct address_space *mapping,
 		if (pages_written >= write_chunk)
 			break;		/* We've done our duty */
 
-		schedule_timeout_interruptible(pause);
+		__set_current_state(TASK_INTERRUPTIBLE);
+		io_schedule_timeout(pause);
 
 		/*
 		 * Increase the delay for each loop, up to our previous
-- 
cgit v1.2.3


From 3365e7798760dc6c190a9bbb0945a38f02625438 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 9 Oct 2009 10:12:41 +0200
Subject: lockdep: Use cpu_clock() for lockstat

Some tracepoint magic (TRACE_EVENT(lock_acquired)) relies on
the fact that lock hold times are positive and uses div64 on
that. That triggered a build warning on MIPS, and probably
causes bad output in certain circumstances as well.

Make it truly positive.

Reported-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1254818502.21044.112.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 20 ++++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 3815ac1d58b2..9af56723c096 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -142,6 +142,11 @@ static inline struct lock_class *hlock_class(struct held_lock *hlock)
 #ifdef CONFIG_LOCK_STAT
 static DEFINE_PER_CPU(struct lock_class_stats[MAX_LOCKDEP_KEYS], lock_stats);
 
+static inline u64 lockstat_clock(void)
+{
+	return cpu_clock(smp_processor_id());
+}
+
 static int lock_point(unsigned long points[], unsigned long ip)
 {
 	int i;
@@ -158,7 +163,7 @@ static int lock_point(unsigned long points[], unsigned long ip)
 	return i;
 }
 
-static void lock_time_inc(struct lock_time *lt, s64 time)
+static void lock_time_inc(struct lock_time *lt, u64 time)
 {
 	if (time > lt->max)
 		lt->max = time;
@@ -234,12 +239,12 @@ static void put_lock_stats(struct lock_class_stats *stats)
 static void lock_release_holdtime(struct held_lock *hlock)
 {
 	struct lock_class_stats *stats;
-	s64 holdtime;
+	u64 holdtime;
 
 	if (!lock_stat)
 		return;
 
-	holdtime = sched_clock() - hlock->holdtime_stamp;
+	holdtime = lockstat_clock() - hlock->holdtime_stamp;
 
 	stats = get_lock_stats(hlock_class(hlock));
 	if (hlock->read)
@@ -2792,7 +2797,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 	hlock->references = references;
 #ifdef CONFIG_LOCK_STAT
 	hlock->waittime_stamp = 0;
-	hlock->holdtime_stamp = sched_clock();
+	hlock->holdtime_stamp = lockstat_clock();
 #endif
 
 	if (check == 2 && !mark_irqflags(curr, hlock))
@@ -3322,7 +3327,7 @@ found_it:
 	if (hlock->instance != lock)
 		return;
 
-	hlock->waittime_stamp = sched_clock();
+	hlock->waittime_stamp = lockstat_clock();
 
 	contention_point = lock_point(hlock_class(hlock)->contention_point, ip);
 	contending_point = lock_point(hlock_class(hlock)->contending_point,
@@ -3345,8 +3350,7 @@ __lock_acquired(struct lockdep_map *lock, unsigned long ip)
 	struct held_lock *hlock, *prev_hlock;
 	struct lock_class_stats *stats;
 	unsigned int depth;
-	u64 now;
-	s64 waittime = 0;
+	u64 now, waittime = 0;
 	int i, cpu;
 
 	depth = curr->lockdep_depth;
@@ -3374,7 +3378,7 @@ found_it:
 
 	cpu = smp_processor_id();
 	if (hlock->waittime_stamp) {
-		now = sched_clock();
+		now = lockstat_clock();
 		waittime = now - hlock->waittime_stamp;
 		hlock->holdtime_stamp = now;
 	}
-- 
cgit v1.2.3


From f5dc37530ba8a35aae0f7f4f13781d1904f71e94 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Fri, 9 Oct 2009 08:35:03 +0200
Subject: sched: Update the clock of runqueue select_task_rq() selected

In try_to_wake_up(), we update the runqueue clock, but
select_task_rq() may select a different runqueue than the one we
updated, leaving the new runqueue's clock stale for a bit.

This patch cures occasional huge latencies reported by latencytop
when coming out of idle on a mostly idle NO_HZ box.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1255070103.7639.30.camel@marge.simson.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 76c0e9691fc0..e3f8f4f61c8e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2311,7 +2311,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
 {
 	int cpu, orig_cpu, this_cpu, success = 0;
 	unsigned long flags;
-	struct rq *rq;
+	struct rq *rq, *orig_rq;
 
 	if (!sched_feat(SYNC_WAKEUPS))
 		wake_flags &= ~WF_SYNC;
@@ -2319,7 +2319,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
 	this_cpu = get_cpu();
 
 	smp_wmb();
-	rq = task_rq_lock(p, &flags);
+	rq = orig_rq = task_rq_lock(p, &flags);
 	update_rq_clock(rq);
 	if (!(p->state & state))
 		goto out;
@@ -2350,6 +2350,10 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state,
 		set_task_cpu(p, cpu);
 
 	rq = task_rq_lock(p, &flags);
+
+	if (rq != orig_rq)
+		update_rq_clock(rq);
+
 	WARN_ON(p->state != TASK_WAKING);
 	cpu = task_cpu(p);
 
-- 
cgit v1.2.3


From d43c36dc6b357fa1806800f18aa30123c747a6d1 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Wed, 7 Oct 2009 17:09:06 +0400
Subject: headers: remove sched.h from interrupt.h

After m68k's task_thread_info() doesn't refer to current,
it's possible to remove sched.h from interrupt.h and not break m68k!
Many thanks to Heiko Carstens for allowing this.

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
---
 arch/arm/kernel/time.c                          | 1 +
 arch/arm/mach-integrator/pci_v3.c               | 1 +
 arch/arm/plat-s3c24xx/adc.c                     | 1 +
 arch/blackfin/kernel/time.c                     | 1 +
 arch/m32r/kernel/smp.c                          | 1 +
 arch/um/drivers/line.c                          | 1 +
 arch/um/drivers/port_kern.c                     | 1 +
 arch/um/kernel/irq.c                            | 1 +
 arch/x86/kernel/cpu/mcheck/mce_intel.c          | 1 +
 arch/x86/kernel/pci-gart_64.c                   | 1 +
 arch/x86/kernel/reboot.c                        | 1 +
 arch/xtensa/kernel/time.c                       | 1 +
 crypto/aead.c                                   | 1 +
 drivers/char/applicom.c                         | 1 +
 drivers/char/epca.c                             | 1 +
 drivers/char/generic_serial.c                   | 1 +
 drivers/char/istallion.c                        | 1 +
 drivers/char/nozomi.c                           | 1 +
 drivers/char/pty.c                              | 1 +
 drivers/char/rio/riocmd.c                       | 1 +
 drivers/char/rio/rioctrl.c                      | 1 +
 drivers/char/rio/riotty.c                       | 1 +
 drivers/char/ser_a2232.c                        | 1 +
 drivers/char/stallion.c                         | 1 +
 drivers/char/tlclk.c                            | 1 +
 drivers/hwmon/sht15.c                           | 1 +
 drivers/ieee1394/raw1394.c                      | 1 +
 drivers/ieee1394/video1394.c                    | 1 +
 drivers/infiniband/core/iwcm.c                  | 1 +
 drivers/infiniband/core/ucma.c                  | 1 +
 drivers/infiniband/hw/cxgb3/iwch_provider.c     | 1 +
 drivers/infiniband/hw/cxgb3/iwch_qp.c           | 1 +
 drivers/infiniband/hw/ipath/ipath_driver.c      | 1 +
 drivers/infiniband/hw/ipath/ipath_iba7220.c     | 1 +
 drivers/infiniband/hw/ipath/ipath_intr.c        | 1 +
 drivers/infiniband/hw/ipath/ipath_qp.c          | 1 +
 drivers/infiniband/hw/ipath/ipath_ruc.c         | 1 +
 drivers/infiniband/hw/ipath/ipath_ud.c          | 1 +
 drivers/infiniband/hw/ipath/ipath_user_pages.c  | 1 +
 drivers/infiniband/hw/ipath/ipath_user_sdma.c   | 1 +
 drivers/infiniband/hw/ipath/ipath_verbs_mcast.c | 1 +
 drivers/input/keyboard/hilkbd.c                 | 1 +
 drivers/input/keyboard/sunkbd.c                 | 1 +
 drivers/input/serio/libps2.c                    | 1 +
 drivers/input/serio/serio_raw.c                 | 1 +
 drivers/input/serio/serport.c                   | 1 +
 drivers/isdn/capi/kcapi.c                       | 1 +
 drivers/isdn/hisax/arcofi.c                     | 1 +
 drivers/isdn/hisax/hfc_2bds0.c                  | 1 +
 drivers/isdn/hisax/hfc_pci.c                    | 1 +
 drivers/isdn/hysdn/hysdn_procconf.c             | 1 +
 drivers/isdn/hysdn/hysdn_proclog.c              | 1 +
 drivers/isdn/pcbit/drv.c                        | 1 +
 drivers/isdn/pcbit/layer2.c                     | 1 +
 drivers/isdn/sc/init.c                          | 1 +
 drivers/lguest/interrupts_and_traps.c           | 1 +
 drivers/media/dvb/dvb-core/dvb_net.c            | 1 +
 drivers/media/video/meye.c                      | 1 +
 drivers/media/video/videobuf-core.c             | 1 +
 drivers/media/video/videobuf-dma-sg.c           | 1 +
 drivers/message/fusion/mptlan.c                 | 1 +
 drivers/mfd/ucb1x00-core.c                      | 1 +
 drivers/misc/hpilo.c                            | 1 +
 drivers/misc/ibmasm/command.c                   | 1 +
 drivers/misc/ibmasm/event.c                     | 1 +
 drivers/misc/ibmasm/r_heartbeat.c               | 1 +
 drivers/misc/phantom.c                          | 1 +
 drivers/mtd/devices/m25p80.c                    | 1 +
 drivers/mtd/devices/sst25l.c                    | 1 +
 drivers/net/bonding/bond_sysfs.c                | 1 +
 drivers/net/depca.c                             | 1 +
 drivers/net/e100.c                              | 1 +
 drivers/net/eql.c                               | 1 +
 drivers/net/ethoc.c                             | 1 +
 drivers/net/ewrk3.c                             | 1 +
 drivers/net/forcedeth.c                         | 1 +
 drivers/net/hamachi.c                           | 1 +
 drivers/net/hamradio/baycom_epp.c               | 1 +
 drivers/net/hamradio/baycom_ser_fdx.c           | 1 +
 drivers/net/hamradio/baycom_ser_hdx.c           | 1 +
 drivers/net/hamradio/hdlcdrv.c                  | 1 +
 drivers/net/hp100.c                             | 1 +
 drivers/net/igb/igb_ethtool.c                   | 1 +
 drivers/net/irda/toim3232-sir.c                 | 1 +
 drivers/net/ns83820.c                           | 1 +
 drivers/net/pcnet32.c                           | 1 +
 drivers/net/sb1000.c                            | 1 +
 drivers/net/sis900.c                            | 1 +
 drivers/net/skfp/skfddi.c                       | 1 +
 drivers/net/skge.c                              | 1 +
 drivers/net/slip.c                              | 1 +
 drivers/net/sungem.c                            | 1 +
 drivers/net/tokenring/ibmtr.c                   | 1 +
 drivers/net/typhoon.c                           | 1 +
 drivers/net/wan/cosa.c                          | 1 +
 drivers/net/wan/cycx_x25.c                      | 1 +
 drivers/net/wan/dscc4.c                         | 1 +
 drivers/net/wan/farsync.c                       | 1 +
 drivers/net/wireless/b43/pio.c                  | 1 +
 drivers/net/wireless/b43legacy/main.c           | 1 +
 drivers/net/wireless/b43legacy/phy.c            | 1 +
 drivers/net/wireless/hostap/hostap_info.c       | 1 +
 drivers/net/wireless/hostap/hostap_ioctl.c      | 1 +
 drivers/net/wireless/ipw2x00/ipw2200.c          | 1 +
 drivers/net/wireless/iwlwifi/iwl-3945.c         | 1 +
 drivers/net/wireless/iwlwifi/iwl-4965.c         | 1 +
 drivers/net/wireless/iwlwifi/iwl-5000.c         | 1 +
 drivers/net/wireless/iwlwifi/iwl-agn.c          | 1 +
 drivers/net/wireless/iwlwifi/iwl-core.c         | 1 +
 drivers/net/wireless/iwlwifi/iwl-hcmd.c         | 1 +
 drivers/net/wireless/iwlwifi/iwl-tx.c           | 1 +
 drivers/net/wireless/iwlwifi/iwl3945-base.c     | 1 +
 drivers/net/wireless/iwmc3200wifi/cfg80211.c    | 1 +
 drivers/net/wireless/iwmc3200wifi/commands.c    | 1 +
 drivers/net/wireless/iwmc3200wifi/main.c        | 1 +
 drivers/net/wireless/iwmc3200wifi/rx.c          | 1 +
 drivers/net/wireless/libertas/cmd.c             | 1 +
 drivers/net/wireless/libertas/tx.c              | 1 +
 drivers/net/wireless/prism54/isl_ioctl.c        | 1 +
 drivers/net/wireless/prism54/islpci_dev.c       | 1 +
 drivers/net/wireless/prism54/islpci_mgt.c       | 1 +
 drivers/net/wireless/rt2x00/rt2x00debug.c       | 1 +
 drivers/pci/pcie/aer/aerdrv.c                   | 1 +
 drivers/rtc/interface.c                         | 1 +
 drivers/rtc/rtc-dev.c                           | 1 +
 drivers/uio/uio.c                               | 1 +
 drivers/uwb/whc-rc.c                            | 1 +
 fs/file.c                                       | 1 +
 include/linux/interrupt.h                       | 2 +-
 include/linux/mmc/host.h                        | 1 +
 kernel/irq/handle.c                             | 1 +
 kernel/mutex-debug.c                            | 1 +
 kernel/time/timekeeping.c                       | 1 +
 lib/debugobjects.c                              | 1 +
 lib/fault-inject.c                              | 1 +
 mm/vmalloc.c                                    | 1 +
 net/irda/ircomm/ircomm_tty_attach.c             | 1 +
 net/irda/irlan/irlan_common.c                   | 1 +
 net/irda/irlan/irlan_eth.c                      | 1 +
 net/irda/irnet/irnet_irda.c                     | 1 +
 net/irda/irnet/irnet_ppp.c                      | 1 +
 net/mac80211/rc80211_pid_debugfs.c              | 1 +
 net/netfilter/nf_conntrack_core.c               | 1 +
 net/sunrpc/xprtrdma/svc_rdma_transport.c        | 1 +
 net/wireless/core.c                             | 1 +
 145 files changed, 145 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/arch/arm/kernel/time.c b/arch/arm/kernel/time.c
index 4cdc4a0bd02d..d38cdf2c8276 100644
--- a/arch/arm/kernel/time.c
+++ b/arch/arm/kernel/time.c
@@ -21,6 +21,7 @@
 #include <linux/interrupt.h>
 #include <linux/time.h>
 #include <linux/init.h>
+#include <linux/sched.h>
 #include <linux/smp.h>
 #include <linux/timex.h>
 #include <linux/errno.h>
diff --git a/arch/arm/mach-integrator/pci_v3.c b/arch/arm/mach-integrator/pci_v3.c
index 901cc205015e..148d25fc636f 100644
--- a/arch/arm/mach-integrator/pci_v3.c
+++ b/arch/arm/mach-integrator/pci_v3.c
@@ -31,6 +31,7 @@
 
 #include <mach/hardware.h>
 #include <asm/irq.h>
+#include <asm/signal.h>
 #include <asm/system.h>
 #include <asm/mach/pci.h>
 #include <asm/irq_regs.h>
diff --git a/arch/arm/plat-s3c24xx/adc.c b/arch/arm/plat-s3c24xx/adc.c
index 11117a7ba911..4d36b784fb8b 100644
--- a/arch/arm/plat-s3c24xx/adc.c
+++ b/arch/arm/plat-s3c24xx/adc.c
@@ -14,6 +14,7 @@
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/platform_device.h>
+#include <linux/sched.h>
 #include <linux/list.h>
 #include <linux/err.h>
 #include <linux/clk.h>
diff --git a/arch/blackfin/kernel/time.c b/arch/blackfin/kernel/time.c
index e5069fe6861e..bd3b53da295e 100644
--- a/arch/blackfin/kernel/time.c
+++ b/arch/blackfin/kernel/time.c
@@ -14,6 +14,7 @@
 #include <linux/time.h>
 #include <linux/irq.h>
 #include <linux/delay.h>
+#include <linux/sched.h>
 
 #include <asm/blackfin.h>
 #include <asm/time.h>
diff --git a/arch/m32r/kernel/smp.c b/arch/m32r/kernel/smp.c
index 1b7598e6f6e8..8a88f1f0a3e2 100644
--- a/arch/m32r/kernel/smp.c
+++ b/arch/m32r/kernel/smp.c
@@ -17,6 +17,7 @@
 
 #include <linux/irq.h>
 #include <linux/interrupt.h>
+#include <linux/sched.h>
 #include <linux/spinlock.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
diff --git a/arch/um/drivers/line.c b/arch/um/drivers/line.c
index 14a102e877d6..cf8a97f34518 100644
--- a/arch/um/drivers/line.c
+++ b/arch/um/drivers/line.c
@@ -5,6 +5,7 @@
 
 #include "linux/irqreturn.h"
 #include "linux/kd.h"
+#include "linux/sched.h"
 #include "chan_kern.h"
 #include "irq_kern.h"
 #include "irq_user.h"
diff --git a/arch/um/drivers/port_kern.c b/arch/um/drivers/port_kern.c
index 19930081d3d8..4ebc8a34738f 100644
--- a/arch/um/drivers/port_kern.c
+++ b/arch/um/drivers/port_kern.c
@@ -7,6 +7,7 @@
 #include "linux/interrupt.h"
 #include "linux/list.h"
 #include "linux/mutex.h"
+#include "linux/workqueue.h"
 #include "asm/atomic.h"
 #include "init.h"
 #include "irq_kern.h"
diff --git a/arch/um/kernel/irq.c b/arch/um/kernel/irq.c
index 454cdb43e351..039270b9b73b 100644
--- a/arch/um/kernel/irq.c
+++ b/arch/um/kernel/irq.c
@@ -10,6 +10,7 @@
 #include "linux/interrupt.h"
 #include "linux/kernel_stat.h"
 #include "linux/module.h"
+#include "linux/sched.h"
 #include "linux/seq_file.h"
 #include "as-layout.h"
 #include "kern_util.h"
diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel.c b/arch/x86/kernel/cpu/mcheck/mce_intel.c
index 889f665fe93d..7c785634af2b 100644
--- a/arch/x86/kernel/cpu/mcheck/mce_intel.c
+++ b/arch/x86/kernel/cpu/mcheck/mce_intel.c
@@ -8,6 +8,7 @@
 #include <linux/init.h>
 #include <linux/interrupt.h>
 #include <linux/percpu.h>
+#include <linux/sched.h>
 #include <asm/apic.h>
 #include <asm/processor.h>
 #include <asm/msr.h>
diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c
index 98a827ee9ed7..a7f1b64f86e0 100644
--- a/arch/x86/kernel/pci-gart_64.c
+++ b/arch/x86/kernel/pci-gart_64.c
@@ -16,6 +16,7 @@
 #include <linux/agp_backend.h>
 #include <linux/init.h>
 #include <linux/mm.h>
+#include <linux/sched.h>
 #include <linux/string.h>
 #include <linux/spinlock.h>
 #include <linux/pci.h>
diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c
index 27349f92a6d7..a1a3cdda06e1 100644
--- a/arch/x86/kernel/reboot.c
+++ b/arch/x86/kernel/reboot.c
@@ -4,6 +4,7 @@
 #include <linux/pm.h>
 #include <linux/efi.h>
 #include <linux/dmi.h>
+#include <linux/sched.h>
 #include <linux/tboot.h>
 #include <acpi/reboot.h>
 #include <asm/io.h>
diff --git a/arch/xtensa/kernel/time.c b/arch/xtensa/kernel/time.c
index 19085ff0484a..19f7df30937f 100644
--- a/arch/xtensa/kernel/time.c
+++ b/arch/xtensa/kernel/time.c
@@ -13,6 +13,7 @@
  */
 
 #include <linux/errno.h>
+#include <linux/sched.h>
 #include <linux/time.h>
 #include <linux/clocksource.h>
 #include <linux/interrupt.h>
diff --git a/crypto/aead.c b/crypto/aead.c
index d9aa733db164..0a55da70845e 100644
--- a/crypto/aead.c
+++ b/crypto/aead.c
@@ -18,6 +18,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/rtnetlink.h>
+#include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/seq_file.h>
 
diff --git a/drivers/char/applicom.c b/drivers/char/applicom.c
index 73a0765344b6..fe2cb2f5db17 100644
--- a/drivers/char/applicom.c
+++ b/drivers/char/applicom.c
@@ -23,6 +23,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/interrupt.h>
+#include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/errno.h>
 #include <linux/miscdevice.h>
diff --git a/drivers/char/epca.c b/drivers/char/epca.c
index 9d589e3144de..dde5134713e2 100644
--- a/drivers/char/epca.c
+++ b/drivers/char/epca.c
@@ -30,6 +30,7 @@
 #include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/init.h>
+#include <linux/sched.h>
 #include <linux/serial.h>
 #include <linux/delay.h>
 #include <linux/ctype.h>
diff --git a/drivers/char/generic_serial.c b/drivers/char/generic_serial.c
index 9e4e569dc00d..d400cbd280f2 100644
--- a/drivers/char/generic_serial.c
+++ b/drivers/char/generic_serial.c
@@ -22,6 +22,7 @@
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/tty.h>
+#include <linux/sched.h>
 #include <linux/serial.h>
 #include <linux/mm.h>
 #include <linux/generic_serial.h>
diff --git a/drivers/char/istallion.c b/drivers/char/istallion.c
index ab2f3349c5c4..402838f4083e 100644
--- a/drivers/char/istallion.c
+++ b/drivers/char/istallion.c
@@ -19,6 +19,7 @@
 /*****************************************************************************/
 
 #include <linux/module.h>
+#include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/smp_lock.h>
 #include <linux/interrupt.h>
diff --git a/drivers/char/nozomi.c b/drivers/char/nozomi.c
index ec58d8c387ff..d3400b20444f 100644
--- a/drivers/char/nozomi.c
+++ b/drivers/char/nozomi.c
@@ -48,6 +48,7 @@
 #include <linux/tty.h>
 #include <linux/tty_driver.h>
 #include <linux/tty_flip.h>
+#include <linux/sched.h>
 #include <linux/serial.h>
 #include <linux/interrupt.h>
 #include <linux/kmod.h>
diff --git a/drivers/char/pty.c b/drivers/char/pty.c
index e066c4fdf81b..62f282e67638 100644
--- a/drivers/char/pty.c
+++ b/drivers/char/pty.c
@@ -18,6 +18,7 @@
 #include <linux/tty.h>
 #include <linux/tty_flip.h>
 #include <linux/fcntl.h>
+#include <linux/sched.h>
 #include <linux/string.h>
 #include <linux/major.h>
 #include <linux/mm.h>
diff --git a/drivers/char/rio/riocmd.c b/drivers/char/rio/riocmd.c
index 01f2654d5a2e..f121357e5af0 100644
--- a/drivers/char/rio/riocmd.c
+++ b/drivers/char/rio/riocmd.c
@@ -32,6 +32,7 @@
 */
 
 #include <linux/module.h>
+#include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/errno.h>
 #include <linux/tty.h>
diff --git a/drivers/char/rio/rioctrl.c b/drivers/char/rio/rioctrl.c
index 74339559f0b9..780506326a73 100644
--- a/drivers/char/rio/rioctrl.c
+++ b/drivers/char/rio/rioctrl.c
@@ -31,6 +31,7 @@
 */
 
 #include <linux/module.h>
+#include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/errno.h>
 #include <asm/io.h>
diff --git a/drivers/char/rio/riotty.c b/drivers/char/rio/riotty.c
index 2fb49e89b324..47fab7c33073 100644
--- a/drivers/char/rio/riotty.c
+++ b/drivers/char/rio/riotty.c
@@ -33,6 +33,7 @@
 #define __EXPLICIT_DEF_H__
 
 #include <linux/module.h>
+#include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/errno.h>
 #include <linux/tty.h>
diff --git a/drivers/char/ser_a2232.c b/drivers/char/ser_a2232.c
index 33a2b531802e..9610861d1f5f 100644
--- a/drivers/char/ser_a2232.c
+++ b/drivers/char/ser_a2232.c
@@ -89,6 +89,7 @@
 #include <linux/interrupt.h>
 #include <linux/kernel.h>
 #include <linux/errno.h>
+#include <linux/sched.h>
 #include <linux/tty.h>
 
 #include <asm/setup.h>
diff --git a/drivers/char/stallion.c b/drivers/char/stallion.c
index 53e504f41b20..db6dcfa35ba0 100644
--- a/drivers/char/stallion.c
+++ b/drivers/char/stallion.c
@@ -27,6 +27,7 @@
 /*****************************************************************************/
 
 #include <linux/module.h>
+#include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/interrupt.h>
 #include <linux/tty.h>
diff --git a/drivers/char/tlclk.c b/drivers/char/tlclk.c
index 8f2284be68e1..80ea6bcfffdc 100644
--- a/drivers/char/tlclk.c
+++ b/drivers/char/tlclk.c
@@ -32,6 +32,7 @@
 #include <linux/kernel.h>	/* printk() */
 #include <linux/fs.h>		/* everything... */
 #include <linux/errno.h>	/* error codes */
+#include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/ioport.h>
 #include <linux/interrupt.h>
diff --git a/drivers/hwmon/sht15.c b/drivers/hwmon/sht15.c
index 303c02694c3c..2da6fb2c325e 100644
--- a/drivers/hwmon/sht15.c
+++ b/drivers/hwmon/sht15.c
@@ -30,6 +30,7 @@
 #include <linux/hwmon-sysfs.h>
 #include <linux/mutex.h>
 #include <linux/platform_device.h>
+#include <linux/sched.h>
 #include <linux/delay.h>
 #include <linux/jiffies.h>
 #include <linux/err.h>
diff --git a/drivers/ieee1394/raw1394.c b/drivers/ieee1394/raw1394.c
index 0bc3d78ce7b1..8aa56ac07e29 100644
--- a/drivers/ieee1394/raw1394.c
+++ b/drivers/ieee1394/raw1394.c
@@ -29,6 +29,7 @@
 
 #include <linux/kernel.h>
 #include <linux/list.h>
+#include <linux/sched.h>
 #include <linux/string.h>
 #include <linux/slab.h>
 #include <linux/fs.h>
diff --git a/drivers/ieee1394/video1394.c b/drivers/ieee1394/video1394.c
index d287ba79821d..949064a05675 100644
--- a/drivers/ieee1394/video1394.c
+++ b/drivers/ieee1394/video1394.c
@@ -30,6 +30,7 @@
  */
 #include <linux/kernel.h>
 #include <linux/list.h>
+#include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/interrupt.h>
 #include <linux/wait.h>
diff --git a/drivers/infiniband/core/iwcm.c b/drivers/infiniband/core/iwcm.c
index 625fec5a741c..0f89909abce9 100644
--- a/drivers/infiniband/core/iwcm.c
+++ b/drivers/infiniband/core/iwcm.c
@@ -40,6 +40,7 @@
 #include <linux/idr.h>
 #include <linux/interrupt.h>
 #include <linux/rbtree.h>
+#include <linux/sched.h>
 #include <linux/spinlock.h>
 #include <linux/workqueue.h>
 #include <linux/completion.h>
diff --git a/drivers/infiniband/core/ucma.c b/drivers/infiniband/core/ucma.c
index 4346a24568fb..bb96d3c4b0f4 100644
--- a/drivers/infiniband/core/ucma.c
+++ b/drivers/infiniband/core/ucma.c
@@ -34,6 +34,7 @@
 #include <linux/file.h>
 #include <linux/mutex.h>
 #include <linux/poll.h>
+#include <linux/sched.h>
 #include <linux/idr.h>
 #include <linux/in.h>
 #include <linux/in6.h>
diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c
index 03cfaecc3bb7..ed7175549ebd 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_provider.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c
@@ -37,6 +37,7 @@
 #include <linux/delay.h>
 #include <linux/errno.h>
 #include <linux/list.h>
+#include <linux/sched.h>
 #include <linux/spinlock.h>
 #include <linux/ethtool.h>
 #include <linux/rtnetlink.h>
diff --git a/drivers/infiniband/hw/cxgb3/iwch_qp.c b/drivers/infiniband/hw/cxgb3/iwch_qp.c
index 6e8653471941..1cecf98829ac 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_qp.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_qp.c
@@ -29,6 +29,7 @@
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
+#include <linux/sched.h>
 #include "iwch_provider.h"
 #include "iwch.h"
 #include "iwch_cm.h"
diff --git a/drivers/infiniband/hw/ipath/ipath_driver.c b/drivers/infiniband/hw/ipath/ipath_driver.c
index 04e88b600558..013d1380e77c 100644
--- a/drivers/infiniband/hw/ipath/ipath_driver.c
+++ b/drivers/infiniband/hw/ipath/ipath_driver.c
@@ -31,6 +31,7 @@
  * SOFTWARE.
  */
 
+#include <linux/sched.h>
 #include <linux/spinlock.h>
 #include <linux/idr.h>
 #include <linux/pci.h>
diff --git a/drivers/infiniband/hw/ipath/ipath_iba7220.c b/drivers/infiniband/hw/ipath/ipath_iba7220.c
index b2a9d4c155d1..a805402dd4ae 100644
--- a/drivers/infiniband/hw/ipath/ipath_iba7220.c
+++ b/drivers/infiniband/hw/ipath/ipath_iba7220.c
@@ -37,6 +37,7 @@
 
 #include <linux/interrupt.h>
 #include <linux/pci.h>
+#include <linux/sched.h>
 #include <linux/delay.h>
 #include <linux/io.h>
 #include <rdma/ib_verbs.h>
diff --git a/drivers/infiniband/hw/ipath/ipath_intr.c b/drivers/infiniband/hw/ipath/ipath_intr.c
index 6c21b4b5ec71..c0a03ac03ee7 100644
--- a/drivers/infiniband/hw/ipath/ipath_intr.c
+++ b/drivers/infiniband/hw/ipath/ipath_intr.c
@@ -33,6 +33,7 @@
 
 #include <linux/pci.h>
 #include <linux/delay.h>
+#include <linux/sched.h>
 
 #include "ipath_kernel.h"
 #include "ipath_verbs.h"
diff --git a/drivers/infiniband/hw/ipath/ipath_qp.c b/drivers/infiniband/hw/ipath/ipath_qp.c
index 3a5a89b609c4..cb2d3ef2ae12 100644
--- a/drivers/infiniband/hw/ipath/ipath_qp.c
+++ b/drivers/infiniband/hw/ipath/ipath_qp.c
@@ -32,6 +32,7 @@
  */
 
 #include <linux/err.h>
+#include <linux/sched.h>
 #include <linux/vmalloc.h>
 
 #include "ipath_verbs.h"
diff --git a/drivers/infiniband/hw/ipath/ipath_ruc.c b/drivers/infiniband/hw/ipath/ipath_ruc.c
index 2296832f94da..1f95bbaf7602 100644
--- a/drivers/infiniband/hw/ipath/ipath_ruc.c
+++ b/drivers/infiniband/hw/ipath/ipath_ruc.c
@@ -31,6 +31,7 @@
  * SOFTWARE.
  */
 
+#include <linux/sched.h>
 #include <linux/spinlock.h>
 
 #include "ipath_verbs.h"
diff --git a/drivers/infiniband/hw/ipath/ipath_ud.c b/drivers/infiniband/hw/ipath/ipath_ud.c
index 6076cb61bf6a..7420715256a9 100644
--- a/drivers/infiniband/hw/ipath/ipath_ud.c
+++ b/drivers/infiniband/hw/ipath/ipath_ud.c
@@ -31,6 +31,7 @@
  * SOFTWARE.
  */
 
+#include <linux/sched.h>
 #include <rdma/ib_smi.h>
 
 #include "ipath_verbs.h"
diff --git a/drivers/infiniband/hw/ipath/ipath_user_pages.c b/drivers/infiniband/hw/ipath/ipath_user_pages.c
index 855911e7396d..82878e348627 100644
--- a/drivers/infiniband/hw/ipath/ipath_user_pages.c
+++ b/drivers/infiniband/hw/ipath/ipath_user_pages.c
@@ -33,6 +33,7 @@
 
 #include <linux/mm.h>
 #include <linux/device.h>
+#include <linux/sched.h>
 
 #include "ipath_kernel.h"
 
diff --git a/drivers/infiniband/hw/ipath/ipath_user_sdma.c b/drivers/infiniband/hw/ipath/ipath_user_sdma.c
index 7bff4b9baa0a..be78f6643c06 100644
--- a/drivers/infiniband/hw/ipath/ipath_user_sdma.c
+++ b/drivers/infiniband/hw/ipath/ipath_user_sdma.c
@@ -33,6 +33,7 @@
 #include <linux/types.h>
 #include <linux/device.h>
 #include <linux/dmapool.h>
+#include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/list.h>
 #include <linux/highmem.h>
diff --git a/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c b/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c
index d73e32232879..6923e1d986da 100644
--- a/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c
+++ b/drivers/infiniband/hw/ipath/ipath_verbs_mcast.c
@@ -32,6 +32,7 @@
  */
 
 #include <linux/rculist.h>
+#include <linux/sched.h>
 
 #include "ipath_verbs.h"
 
diff --git a/drivers/input/keyboard/hilkbd.c b/drivers/input/keyboard/hilkbd.c
index e9d639ec283d..5f72440b50c8 100644
--- a/drivers/input/keyboard/hilkbd.c
+++ b/drivers/input/keyboard/hilkbd.c
@@ -24,6 +24,7 @@
 #include <linux/interrupt.h>
 #include <linux/hil.h>
 #include <linux/io.h>
+#include <linux/sched.h>
 #include <linux/spinlock.h>
 #include <asm/irq.h>
 #ifdef CONFIG_HP300
diff --git a/drivers/input/keyboard/sunkbd.c b/drivers/input/keyboard/sunkbd.c
index 472b56639cdb..a99a04b03ee4 100644
--- a/drivers/input/keyboard/sunkbd.c
+++ b/drivers/input/keyboard/sunkbd.c
@@ -27,6 +27,7 @@
  */
 
 #include <linux/delay.h>
+#include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/interrupt.h>
diff --git a/drivers/input/serio/libps2.c b/drivers/input/serio/libps2.c
index 769ba65a585a..f3876acc3e83 100644
--- a/drivers/input/serio/libps2.c
+++ b/drivers/input/serio/libps2.c
@@ -13,6 +13,7 @@
 
 #include <linux/delay.h>
 #include <linux/module.h>
+#include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/interrupt.h>
 #include <linux/input.h>
diff --git a/drivers/input/serio/serio_raw.c b/drivers/input/serio/serio_raw.c
index b03009bb7468..27fdaaffbb40 100644
--- a/drivers/input/serio/serio_raw.c
+++ b/drivers/input/serio/serio_raw.c
@@ -9,6 +9,7 @@
  * the Free Software Foundation.
  */
 
+#include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/smp_lock.h>
 #include <linux/poll.h>
diff --git a/drivers/input/serio/serport.c b/drivers/input/serio/serport.c
index b9694b6445d0..6d345112bcb7 100644
--- a/drivers/input/serio/serport.c
+++ b/drivers/input/serio/serport.c
@@ -15,6 +15,7 @@
 
 #include <asm/uaccess.h>
 #include <linux/kernel.h>
+#include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/init.h>
diff --git a/drivers/isdn/capi/kcapi.c b/drivers/isdn/capi/kcapi.c
index 57d26360f64e..dc506ab99cac 100644
--- a/drivers/isdn/capi/kcapi.c
+++ b/drivers/isdn/capi/kcapi.c
@@ -18,6 +18,7 @@
 #include <linux/interrupt.h>
 #include <linux/ioport.h>
 #include <linux/proc_fs.h>
+#include <linux/sched.h>
 #include <linux/seq_file.h>
 #include <linux/skbuff.h>
 #include <linux/workqueue.h>
diff --git a/drivers/isdn/hisax/arcofi.c b/drivers/isdn/hisax/arcofi.c
index d30ce5b978c2..85a8fd8dd0b7 100644
--- a/drivers/isdn/hisax/arcofi.c
+++ b/drivers/isdn/hisax/arcofi.c
@@ -10,6 +10,7 @@
  *
  */
  
+#include <linux/sched.h>
 #include "hisax.h"
 #include "isdnl1.h"
 #include "isac.h"
diff --git a/drivers/isdn/hisax/hfc_2bds0.c b/drivers/isdn/hisax/hfc_2bds0.c
index 5c46a7130e06..8d22f50760eb 100644
--- a/drivers/isdn/hisax/hfc_2bds0.c
+++ b/drivers/isdn/hisax/hfc_2bds0.c
@@ -11,6 +11,7 @@
  */
 
 #include <linux/init.h>
+#include <linux/sched.h>
 #include "hisax.h"
 #include "hfc_2bds0.h"
 #include "isdnl1.h"
diff --git a/drivers/isdn/hisax/hfc_pci.c b/drivers/isdn/hisax/hfc_pci.c
index d110a77940a4..10914731b304 100644
--- a/drivers/isdn/hisax/hfc_pci.c
+++ b/drivers/isdn/hisax/hfc_pci.c
@@ -20,6 +20,7 @@
 #include "hfc_pci.h"
 #include "isdnl1.h"
 #include <linux/pci.h>
+#include <linux/sched.h>
 #include <linux/interrupt.h>
 
 static const char *hfcpci_revision = "$Revision: 1.48.2.4 $";
diff --git a/drivers/isdn/hysdn/hysdn_procconf.c b/drivers/isdn/hysdn/hysdn_procconf.c
index 8f9f4912de32..90b35e1a4b7e 100644
--- a/drivers/isdn/hysdn/hysdn_procconf.c
+++ b/drivers/isdn/hysdn/hysdn_procconf.c
@@ -11,6 +11,7 @@
  *
  */
 
+#include <linux/cred.h>
 #include <linux/module.h>
 #include <linux/poll.h>
 #include <linux/proc_fs.h>
diff --git a/drivers/isdn/hysdn/hysdn_proclog.c b/drivers/isdn/hysdn/hysdn_proclog.c
index 8991d2c8ee4a..8bcae28c4409 100644
--- a/drivers/isdn/hysdn/hysdn_proclog.c
+++ b/drivers/isdn/hysdn/hysdn_proclog.c
@@ -13,6 +13,7 @@
 #include <linux/module.h>
 #include <linux/poll.h>
 #include <linux/proc_fs.h>
+#include <linux/sched.h>
 #include <linux/smp_lock.h>
 
 #include "hysdn_defs.h"
diff --git a/drivers/isdn/pcbit/drv.c b/drivers/isdn/pcbit/drv.c
index 8c66bcb953a1..123c1d6c43b4 100644
--- a/drivers/isdn/pcbit/drv.c
+++ b/drivers/isdn/pcbit/drv.c
@@ -23,6 +23,7 @@
 #include <linux/kernel.h>
 
 #include <linux/types.h>
+#include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/mm.h>
 #include <linux/interrupt.h>
diff --git a/drivers/isdn/pcbit/layer2.c b/drivers/isdn/pcbit/layer2.c
index e075e8d2fce0..30f0f45e3139 100644
--- a/drivers/isdn/pcbit/layer2.c
+++ b/drivers/isdn/pcbit/layer2.c
@@ -27,6 +27,7 @@
 #include <linux/string.h>
 #include <linux/kernel.h>
 #include <linux/types.h>
+#include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/interrupt.h>
 #include <linux/workqueue.h>
diff --git a/drivers/isdn/sc/init.c b/drivers/isdn/sc/init.c
index dd0acd06750b..5a0774880d56 100644
--- a/drivers/isdn/sc/init.c
+++ b/drivers/isdn/sc/init.c
@@ -8,6 +8,7 @@
 #include <linux/init.h>
 #include <linux/interrupt.h>
 #include <linux/delay.h>
+#include <linux/sched.h>
 #include "includes.h"
 #include "hardware.h"
 #include "card.h"
diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c
index 18648180db02..daaf86631647 100644
--- a/drivers/lguest/interrupts_and_traps.c
+++ b/drivers/lguest/interrupts_and_traps.c
@@ -16,6 +16,7 @@
 #include <linux/uaccess.h>
 #include <linux/interrupt.h>
 #include <linux/module.h>
+#include <linux/sched.h>
 #include "lg.h"
 
 /* Allow Guests to use a non-128 (ie. non-Linux) syscall trap. */
diff --git a/drivers/media/dvb/dvb-core/dvb_net.c b/drivers/media/dvb/dvb-core/dvb_net.c
index 8c9ae0a3a272..0241a7c5c34a 100644
--- a/drivers/media/dvb/dvb-core/dvb_net.c
+++ b/drivers/media/dvb/dvb-core/dvb_net.c
@@ -63,6 +63,7 @@
 #include <asm/uaccess.h>
 #include <linux/crc32.h>
 #include <linux/mutex.h>
+#include <linux/sched.h>
 
 #include "dvb_demux.h"
 #include "dvb_net.h"
diff --git a/drivers/media/video/meye.c b/drivers/media/video/meye.c
index 4b1bc05a462c..01e1eefcf1eb 100644
--- a/drivers/media/video/meye.c
+++ b/drivers/media/video/meye.c
@@ -28,6 +28,7 @@
  */
 #include <linux/module.h>
 #include <linux/pci.h>
+#include <linux/sched.h>
 #include <linux/init.h>
 #include <linux/videodev.h>
 #include <media/v4l2-common.h>
diff --git a/drivers/media/video/videobuf-core.c b/drivers/media/video/videobuf-core.c
index f1ccf98c0a6f..8e93c6f25c83 100644
--- a/drivers/media/video/videobuf-core.c
+++ b/drivers/media/video/videobuf-core.c
@@ -17,6 +17,7 @@
 #include <linux/module.h>
 #include <linux/moduleparam.h>
 #include <linux/mm.h>
+#include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/interrupt.h>
 
diff --git a/drivers/media/video/videobuf-dma-sg.c b/drivers/media/video/videobuf-dma-sg.c
index 53cdd67cebe1..032ebae0134a 100644
--- a/drivers/media/video/videobuf-dma-sg.c
+++ b/drivers/media/video/videobuf-dma-sg.c
@@ -21,6 +21,7 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/moduleparam.h>
+#include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/interrupt.h>
 
diff --git a/drivers/message/fusion/mptlan.c b/drivers/message/fusion/mptlan.c
index bc2ec2182c00..34f3f36f819b 100644
--- a/drivers/message/fusion/mptlan.c
+++ b/drivers/message/fusion/mptlan.c
@@ -56,6 +56,7 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/fs.h>
+#include <linux/sched.h>
 
 #define my_VERSION	MPT_LINUX_VERSION_COMMON
 #define MYNAM		"mptlan"
diff --git a/drivers/mfd/ucb1x00-core.c b/drivers/mfd/ucb1x00-core.c
index fea9085fe52c..60c3988f3cf3 100644
--- a/drivers/mfd/ucb1x00-core.c
+++ b/drivers/mfd/ucb1x00-core.c
@@ -18,6 +18,7 @@
  */
 #include <linux/module.h>
 #include <linux/kernel.h>
+#include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/init.h>
 #include <linux/errno.h>
diff --git a/drivers/misc/hpilo.c b/drivers/misc/hpilo.c
index 1ad27c6abcca..a92a3a742b43 100644
--- a/drivers/misc/hpilo.c
+++ b/drivers/misc/hpilo.c
@@ -18,6 +18,7 @@
 #include <linux/device.h>
 #include <linux/file.h>
 #include <linux/cdev.h>
+#include <linux/sched.h>
 #include <linux/spinlock.h>
 #include <linux/delay.h>
 #include <linux/uaccess.h>
diff --git a/drivers/misc/ibmasm/command.c b/drivers/misc/ibmasm/command.c
index 276d3fb68094..e2031739aa29 100644
--- a/drivers/misc/ibmasm/command.c
+++ b/drivers/misc/ibmasm/command.c
@@ -22,6 +22,7 @@
  *
  */
 
+#include <linux/sched.h>
 #include "ibmasm.h"
 #include "lowlevel.h"
 
diff --git a/drivers/misc/ibmasm/event.c b/drivers/misc/ibmasm/event.c
index 68a0a5b94795..572d41ffc186 100644
--- a/drivers/misc/ibmasm/event.c
+++ b/drivers/misc/ibmasm/event.c
@@ -22,6 +22,7 @@
  *
  */
 
+#include <linux/sched.h>
 #include "ibmasm.h"
 #include "lowlevel.h"
 
diff --git a/drivers/misc/ibmasm/r_heartbeat.c b/drivers/misc/ibmasm/r_heartbeat.c
index bec9e2c44bef..2de487ac788c 100644
--- a/drivers/misc/ibmasm/r_heartbeat.c
+++ b/drivers/misc/ibmasm/r_heartbeat.c
@@ -20,6 +20,7 @@
  *
  */
 
+#include <linux/sched.h>
 #include "ibmasm.h"
 #include "dot_command.h"
 
diff --git a/drivers/misc/phantom.c b/drivers/misc/phantom.c
index 90a95ce8dc34..04c27266f567 100644
--- a/drivers/misc/phantom.c
+++ b/drivers/misc/phantom.c
@@ -22,6 +22,7 @@
 #include <linux/interrupt.h>
 #include <linux/cdev.h>
 #include <linux/phantom.h>
+#include <linux/sched.h>
 #include <linux/smp_lock.h>
 
 #include <asm/atomic.h>
diff --git a/drivers/mtd/devices/m25p80.c b/drivers/mtd/devices/m25p80.c
index 379c316f329e..4c19269de91a 100644
--- a/drivers/mtd/devices/m25p80.c
+++ b/drivers/mtd/devices/m25p80.c
@@ -21,6 +21,7 @@
 #include <linux/interrupt.h>
 #include <linux/mutex.h>
 #include <linux/math64.h>
+#include <linux/sched.h>
 
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/partitions.h>
diff --git a/drivers/mtd/devices/sst25l.c b/drivers/mtd/devices/sst25l.c
index c2baf3353f84..0a11721f146e 100644
--- a/drivers/mtd/devices/sst25l.c
+++ b/drivers/mtd/devices/sst25l.c
@@ -20,6 +20,7 @@
 #include <linux/device.h>
 #include <linux/mutex.h>
 #include <linux/interrupt.h>
+#include <linux/sched.h>
 
 #include <linux/mtd/mtd.h>
 #include <linux/mtd/partitions.h>
diff --git a/drivers/net/bonding/bond_sysfs.c b/drivers/net/bonding/bond_sysfs.c
index ff449de6f3c0..8762a27a2a18 100644
--- a/drivers/net/bonding/bond_sysfs.c
+++ b/drivers/net/bonding/bond_sysfs.c
@@ -22,6 +22,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/device.h>
+#include <linux/sched.h>
 #include <linux/sysdev.h>
 #include <linux/fs.h>
 #include <linux/types.h>
diff --git a/drivers/net/depca.c b/drivers/net/depca.c
index 9686c1fa28f1..7a3bdac84abe 100644
--- a/drivers/net/depca.c
+++ b/drivers/net/depca.c
@@ -237,6 +237,7 @@
 
 #include <linux/module.h>
 #include <linux/kernel.h>
+#include <linux/sched.h>
 #include <linux/string.h>
 #include <linux/errno.h>
 #include <linux/ioport.h>
diff --git a/drivers/net/e100.c b/drivers/net/e100.c
index 679965c2bb86..5d2f48f02251 100644
--- a/drivers/net/e100.c
+++ b/drivers/net/e100.c
@@ -151,6 +151,7 @@
 #include <linux/moduleparam.h>
 #include <linux/kernel.h>
 #include <linux/types.h>
+#include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/delay.h>
 #include <linux/init.h>
diff --git a/drivers/net/eql.c b/drivers/net/eql.c
index d4d9a3eda695..f5b96cadeb25 100644
--- a/drivers/net/eql.c
+++ b/drivers/net/eql.c
@@ -111,6 +111,7 @@
  * Sorry, I had to rewrite most of this for 2.5.x -DaveM
  */
 
+#include <linux/capability.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
diff --git a/drivers/net/ethoc.c b/drivers/net/ethoc.c
index 34d0c69e67f7..96f5b2a2d2c5 100644
--- a/drivers/net/ethoc.c
+++ b/drivers/net/ethoc.c
@@ -17,6 +17,7 @@
 #include <linux/mii.h>
 #include <linux/phy.h>
 #include <linux/platform_device.h>
+#include <linux/sched.h>
 #include <net/ethoc.h>
 
 static int buffer_size = 0x8000; /* 32 KBytes */
diff --git a/drivers/net/ewrk3.c b/drivers/net/ewrk3.c
index b2a5ec8f3721..dd4ba01fd92d 100644
--- a/drivers/net/ewrk3.c
+++ b/drivers/net/ewrk3.c
@@ -145,6 +145,7 @@
 
 #include <linux/module.h>
 #include <linux/kernel.h>
+#include <linux/sched.h>
 #include <linux/string.h>
 #include <linux/errno.h>
 #include <linux/ioport.h>
diff --git a/drivers/net/forcedeth.c b/drivers/net/forcedeth.c
index 0a1c2bb27d4d..e1da4666f204 100644
--- a/drivers/net/forcedeth.c
+++ b/drivers/net/forcedeth.c
@@ -49,6 +49,7 @@
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
 #include <linux/delay.h>
+#include <linux/sched.h>
 #include <linux/spinlock.h>
 #include <linux/ethtool.h>
 #include <linux/timer.h>
diff --git a/drivers/net/hamachi.c b/drivers/net/hamachi.c
index 1d5064a09aca..f7519a594945 100644
--- a/drivers/net/hamachi.c
+++ b/drivers/net/hamachi.c
@@ -145,6 +145,7 @@ static int tx_params[MAX_UNITS] = {-1, -1, -1, -1, -1, -1, -1, -1};
 /* Time in jiffies before concluding the transmitter is hung. */
 #define TX_TIMEOUT  (5*HZ)
 
+#include <linux/capability.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/string.h>
diff --git a/drivers/net/hamradio/baycom_epp.c b/drivers/net/hamradio/baycom_epp.c
index 7bcaf7c66243..e344c84c0ef9 100644
--- a/drivers/net/hamradio/baycom_epp.c
+++ b/drivers/net/hamradio/baycom_epp.c
@@ -44,6 +44,7 @@
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/init.h>
+#include <linux/sched.h>
 #include <linux/string.h>
 #include <linux/workqueue.h>
 #include <linux/fs.h>
diff --git a/drivers/net/hamradio/baycom_ser_fdx.c b/drivers/net/hamradio/baycom_ser_fdx.c
index aa4488e871b2..ed60fd664273 100644
--- a/drivers/net/hamradio/baycom_ser_fdx.c
+++ b/drivers/net/hamradio/baycom_ser_fdx.c
@@ -71,6 +71,7 @@
 
 /*****************************************************************************/
 
+#include <linux/capability.h>
 #include <linux/module.h>
 #include <linux/ioport.h>
 #include <linux/string.h>
diff --git a/drivers/net/hamradio/baycom_ser_hdx.c b/drivers/net/hamradio/baycom_ser_hdx.c
index 88c593596020..1686f6dcbbce 100644
--- a/drivers/net/hamradio/baycom_ser_hdx.c
+++ b/drivers/net/hamradio/baycom_ser_hdx.c
@@ -61,6 +61,7 @@
 
 /*****************************************************************************/
 
+#include <linux/capability.h>
 #include <linux/module.h>
 #include <linux/ioport.h>
 #include <linux/string.h>
diff --git a/drivers/net/hamradio/hdlcdrv.c b/drivers/net/hamradio/hdlcdrv.c
index 0013c409782c..91c5790c9581 100644
--- a/drivers/net/hamradio/hdlcdrv.c
+++ b/drivers/net/hamradio/hdlcdrv.c
@@ -42,6 +42,7 @@
 
 /*****************************************************************************/
 
+#include <linux/capability.h>
 #include <linux/module.h>
 #include <linux/types.h>
 #include <linux/net.h>
diff --git a/drivers/net/hp100.c b/drivers/net/hp100.c
index a9a1a99f02dd..dd8665138062 100644
--- a/drivers/net/hp100.c
+++ b/drivers/net/hp100.c
@@ -98,6 +98,7 @@
 
 #include <linux/module.h>
 #include <linux/kernel.h>
+#include <linux/sched.h>
 #include <linux/string.h>
 #include <linux/errno.h>
 #include <linux/ioport.h>
diff --git a/drivers/net/igb/igb_ethtool.c b/drivers/net/igb/igb_ethtool.c
index d004c359244c..deaea8fa1032 100644
--- a/drivers/net/igb/igb_ethtool.c
+++ b/drivers/net/igb/igb_ethtool.c
@@ -34,6 +34,7 @@
 #include <linux/interrupt.h>
 #include <linux/if_ether.h>
 #include <linux/ethtool.h>
+#include <linux/sched.h>
 
 #include "igb.h"
 
diff --git a/drivers/net/irda/toim3232-sir.c b/drivers/net/irda/toim3232-sir.c
index fcf287b749db..99e1ec02a011 100644
--- a/drivers/net/irda/toim3232-sir.c
+++ b/drivers/net/irda/toim3232-sir.c
@@ -120,6 +120,7 @@
 #include <linux/module.h>
 #include <linux/delay.h>
 #include <linux/init.h>
+#include <linux/sched.h>
 
 #include <net/irda/irda.h>
 
diff --git a/drivers/net/ns83820.c b/drivers/net/ns83820.c
index c594e1946476..57fd483dbb1f 100644
--- a/drivers/net/ns83820.c
+++ b/drivers/net/ns83820.c
@@ -111,6 +111,7 @@
 #include <linux/compiler.h>
 #include <linux/prefetch.h>
 #include <linux/ethtool.h>
+#include <linux/sched.h>
 #include <linux/timer.h>
 #include <linux/if_vlan.h>
 #include <linux/rtnetlink.h>
diff --git a/drivers/net/pcnet32.c b/drivers/net/pcnet32.c
index 6d28b18e7e28..c1b3f09f452c 100644
--- a/drivers/net/pcnet32.c
+++ b/drivers/net/pcnet32.c
@@ -31,6 +31,7 @@ static const char *const version =
 
 #include <linux/module.h>
 #include <linux/kernel.h>
+#include <linux/sched.h>
 #include <linux/string.h>
 #include <linux/errno.h>
 #include <linux/ioport.h>
diff --git a/drivers/net/sb1000.c b/drivers/net/sb1000.c
index ee366c5a8fa3..c9c70ab0cce0 100644
--- a/drivers/net/sb1000.c
+++ b/drivers/net/sb1000.c
@@ -36,6 +36,7 @@ static char version[] = "sb1000.c:v1.1.2 6/01/98 (fventuri@mediaone.net)\n";
 
 #include <linux/module.h>
 #include <linux/kernel.h>
+#include <linux/sched.h>
 #include <linux/string.h>
 #include <linux/interrupt.h>
 #include <linux/errno.h>
diff --git a/drivers/net/sis900.c b/drivers/net/sis900.c
index 97949d0a699b..c072f7f36acf 100644
--- a/drivers/net/sis900.c
+++ b/drivers/net/sis900.c
@@ -52,6 +52,7 @@
 #include <linux/module.h>
 #include <linux/moduleparam.h>
 #include <linux/kernel.h>
+#include <linux/sched.h>
 #include <linux/string.h>
 #include <linux/timer.h>
 #include <linux/errno.h>
diff --git a/drivers/net/skfp/skfddi.c b/drivers/net/skfp/skfddi.c
index 38a508b4aad9..b27156eaf267 100644
--- a/drivers/net/skfp/skfddi.c
+++ b/drivers/net/skfp/skfddi.c
@@ -73,6 +73,7 @@ static const char * const boot_msg =
 
 /* Include files */
 
+#include <linux/capability.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/errno.h>
diff --git a/drivers/net/skge.c b/drivers/net/skge.c
index 01f6811f1324..8f5414348e86 100644
--- a/drivers/net/skge.c
+++ b/drivers/net/skge.c
@@ -37,6 +37,7 @@
 #include <linux/crc32.h>
 #include <linux/dma-mapping.h>
 #include <linux/debugfs.h>
+#include <linux/sched.h>
 #include <linux/seq_file.h>
 #include <linux/mii.h>
 #include <asm/irq.h>
diff --git a/drivers/net/slip.c b/drivers/net/slip.c
index e17c535a577e..fe3cebb984de 100644
--- a/drivers/net/slip.c
+++ b/drivers/net/slip.c
@@ -67,6 +67,7 @@
 #include <asm/system.h>
 #include <asm/uaccess.h>
 #include <linux/bitops.h>
+#include <linux/sched.h>
 #include <linux/string.h>
 #include <linux/mm.h>
 #include <linux/interrupt.h>
diff --git a/drivers/net/sungem.c b/drivers/net/sungem.c
index 305ec3d783db..7019a0d1a82b 100644
--- a/drivers/net/sungem.c
+++ b/drivers/net/sungem.c
@@ -38,6 +38,7 @@
 #include <linux/interrupt.h>
 #include <linux/ioport.h>
 #include <linux/in.h>
+#include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/string.h>
 #include <linux/delay.h>
diff --git a/drivers/net/tokenring/ibmtr.c b/drivers/net/tokenring/ibmtr.c
index 525bbc5b9c9d..36cb2423bcf1 100644
--- a/drivers/net/tokenring/ibmtr.c
+++ b/drivers/net/tokenring/ibmtr.c
@@ -108,6 +108,7 @@ in the event that chatty debug messages are desired - jjs 12/30/98 */
 #define IBMTR_DEBUG_MESSAGES 0
 
 #include <linux/module.h>
+#include <linux/sched.h>
 
 #ifdef PCMCIA		/* required for ibmtr_cs.c to build */
 #undef MODULE		/* yes, really */
diff --git a/drivers/net/typhoon.c b/drivers/net/typhoon.c
index d6d345229fe9..5921f5bdd764 100644
--- a/drivers/net/typhoon.c
+++ b/drivers/net/typhoon.c
@@ -108,6 +108,7 @@ static const int multicast_filter_limit = 32;
 
 #include <linux/module.h>
 #include <linux/kernel.h>
+#include <linux/sched.h>
 #include <linux/string.h>
 #include <linux/timer.h>
 #include <linux/errno.h>
diff --git a/drivers/net/wan/cosa.c b/drivers/net/wan/cosa.c
index 66360a2a14c2..e2c33c06190b 100644
--- a/drivers/net/wan/cosa.c
+++ b/drivers/net/wan/cosa.c
@@ -76,6 +76,7 @@
 
 #include <linux/module.h>
 #include <linux/kernel.h>
+#include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/poll.h>
 #include <linux/fs.h>
diff --git a/drivers/net/wan/cycx_x25.c b/drivers/net/wan/cycx_x25.c
index 2573c18b6aa5..cd8cb95c5bd7 100644
--- a/drivers/net/wan/cycx_x25.c
+++ b/drivers/net/wan/cycx_x25.c
@@ -84,6 +84,7 @@
 #include <linux/kernel.h>	/* printk(), and other useful stuff */
 #include <linux/module.h>
 #include <linux/string.h>	/* inline memset(), etc. */
+#include <linux/sched.h>
 #include <linux/slab.h>		/* kmalloc(), kfree() */
 #include <linux/stddef.h>	/* offsetof(), etc. */
 #include <linux/wanrouter.h>	/* WAN router definitions */
diff --git a/drivers/net/wan/dscc4.c b/drivers/net/wan/dscc4.c
index 81c8aec9df92..07d00b4cf48a 100644
--- a/drivers/net/wan/dscc4.c
+++ b/drivers/net/wan/dscc4.c
@@ -81,6 +81,7 @@
  */
 
 #include <linux/module.h>
+#include <linux/sched.h>
 #include <linux/types.h>
 #include <linux/errno.h>
 #include <linux/list.h>
diff --git a/drivers/net/wan/farsync.c b/drivers/net/wan/farsync.c
index 3e90eb816181..beda387f2fc7 100644
--- a/drivers/net/wan/farsync.c
+++ b/drivers/net/wan/farsync.c
@@ -19,6 +19,7 @@
 #include <linux/kernel.h>
 #include <linux/version.h>
 #include <linux/pci.h>
+#include <linux/sched.h>
 #include <linux/ioport.h>
 #include <linux/init.h>
 #include <linux/if.h>
diff --git a/drivers/net/wireless/b43/pio.c b/drivers/net/wireless/b43/pio.c
index 9c1397996e0a..5e87650b07fb 100644
--- a/drivers/net/wireless/b43/pio.c
+++ b/drivers/net/wireless/b43/pio.c
@@ -30,6 +30,7 @@
 #include "xmit.h"
 
 #include <linux/delay.h>
+#include <linux/sched.h>
 
 
 static u16 generate_cookie(struct b43_pio_txqueue *q,
diff --git a/drivers/net/wireless/b43legacy/main.c b/drivers/net/wireless/b43legacy/main.c
index 1d9223b3d4c4..4b60148a5e61 100644
--- a/drivers/net/wireless/b43legacy/main.c
+++ b/drivers/net/wireless/b43legacy/main.c
@@ -37,6 +37,7 @@
 #include <linux/firmware.h>
 #include <linux/wireless.h>
 #include <linux/workqueue.h>
+#include <linux/sched.h>
 #include <linux/skbuff.h>
 #include <linux/dma-mapping.h>
 #include <net/dst.h>
diff --git a/drivers/net/wireless/b43legacy/phy.c b/drivers/net/wireless/b43legacy/phy.c
index 11319ec2d64a..aaf227203a98 100644
--- a/drivers/net/wireless/b43legacy/phy.c
+++ b/drivers/net/wireless/b43legacy/phy.c
@@ -31,6 +31,7 @@
 
 #include <linux/delay.h>
 #include <linux/pci.h>
+#include <linux/sched.h>
 #include <linux/types.h>
 
 #include "b43legacy.h"
diff --git a/drivers/net/wireless/hostap/hostap_info.c b/drivers/net/wireless/hostap/hostap_info.c
index 6fa14a4e4b53..4dfb40a84c96 100644
--- a/drivers/net/wireless/hostap/hostap_info.c
+++ b/drivers/net/wireless/hostap/hostap_info.c
@@ -1,6 +1,7 @@
 /* Host AP driver Info Frame processing (part of hostap.o module) */
 
 #include <linux/if_arp.h>
+#include <linux/sched.h>
 #include "hostap_wlan.h"
 #include "hostap.h"
 #include "hostap_ap.h"
diff --git a/drivers/net/wireless/hostap/hostap_ioctl.c b/drivers/net/wireless/hostap/hostap_ioctl.c
index 3f2bda881a4f..9419cebca8a5 100644
--- a/drivers/net/wireless/hostap/hostap_ioctl.c
+++ b/drivers/net/wireless/hostap/hostap_ioctl.c
@@ -1,6 +1,7 @@
 /* ioctl() (mostly Linux Wireless Extensions) routines for Host AP driver */
 
 #include <linux/types.h>
+#include <linux/sched.h>
 #include <linux/ethtool.h>
 #include <linux/if_arp.h>
 #include <net/lib80211.h>
diff --git a/drivers/net/wireless/ipw2x00/ipw2200.c b/drivers/net/wireless/ipw2x00/ipw2200.c
index 8d58e6ed4e7d..827824d45de9 100644
--- a/drivers/net/wireless/ipw2x00/ipw2200.c
+++ b/drivers/net/wireless/ipw2x00/ipw2200.c
@@ -30,6 +30,7 @@
 
 ******************************************************************************/
 
+#include <linux/sched.h>
 #include "ipw2200.h"
 
 
diff --git a/drivers/net/wireless/iwlwifi/iwl-3945.c b/drivers/net/wireless/iwlwifi/iwl-3945.c
index e70c5b0af364..68136172b823 100644
--- a/drivers/net/wireless/iwlwifi/iwl-3945.c
+++ b/drivers/net/wireless/iwlwifi/iwl-3945.c
@@ -30,6 +30,7 @@
 #include <linux/pci.h>
 #include <linux/dma-mapping.h>
 #include <linux/delay.h>
+#include <linux/sched.h>
 #include <linux/skbuff.h>
 #include <linux/netdevice.h>
 #include <linux/wireless.h>
diff --git a/drivers/net/wireless/iwlwifi/iwl-4965.c b/drivers/net/wireless/iwlwifi/iwl-4965.c
index a22a0501c190..6f703a041847 100644
--- a/drivers/net/wireless/iwlwifi/iwl-4965.c
+++ b/drivers/net/wireless/iwlwifi/iwl-4965.c
@@ -30,6 +30,7 @@
 #include <linux/pci.h>
 #include <linux/dma-mapping.h>
 #include <linux/delay.h>
+#include <linux/sched.h>
 #include <linux/skbuff.h>
 #include <linux/netdevice.h>
 #include <linux/wireless.h>
diff --git a/drivers/net/wireless/iwlwifi/iwl-5000.c b/drivers/net/wireless/iwlwifi/iwl-5000.c
index eb08f4411000..d6bc0e051043 100644
--- a/drivers/net/wireless/iwlwifi/iwl-5000.c
+++ b/drivers/net/wireless/iwlwifi/iwl-5000.c
@@ -29,6 +29,7 @@
 #include <linux/pci.h>
 #include <linux/dma-mapping.h>
 #include <linux/delay.h>
+#include <linux/sched.h>
 #include <linux/skbuff.h>
 #include <linux/netdevice.h>
 #include <linux/wireless.h>
diff --git a/drivers/net/wireless/iwlwifi/iwl-agn.c b/drivers/net/wireless/iwlwifi/iwl-agn.c
index cdc07c477457..313d3e5ee84b 100644
--- a/drivers/net/wireless/iwlwifi/iwl-agn.c
+++ b/drivers/net/wireless/iwlwifi/iwl-agn.c
@@ -33,6 +33,7 @@
 #include <linux/pci.h>
 #include <linux/dma-mapping.h>
 #include <linux/delay.h>
+#include <linux/sched.h>
 #include <linux/skbuff.h>
 #include <linux/netdevice.h>
 #include <linux/wireless.h>
diff --git a/drivers/net/wireless/iwlwifi/iwl-core.c b/drivers/net/wireless/iwlwifi/iwl-core.c
index 484d5c1a7312..2dc928755454 100644
--- a/drivers/net/wireless/iwlwifi/iwl-core.c
+++ b/drivers/net/wireless/iwlwifi/iwl-core.c
@@ -29,6 +29,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/etherdevice.h>
+#include <linux/sched.h>
 #include <net/mac80211.h>
 
 #include "iwl-eeprom.h"
diff --git a/drivers/net/wireless/iwlwifi/iwl-hcmd.c b/drivers/net/wireless/iwlwifi/iwl-hcmd.c
index 532c8d6cd8da..a6856daf14cb 100644
--- a/drivers/net/wireless/iwlwifi/iwl-hcmd.c
+++ b/drivers/net/wireless/iwlwifi/iwl-hcmd.c
@@ -28,6 +28,7 @@
 
 #include <linux/kernel.h>
 #include <linux/module.h>
+#include <linux/sched.h>
 #include <net/mac80211.h>
 
 #include "iwl-dev.h" /* FIXME: remove */
diff --git a/drivers/net/wireless/iwlwifi/iwl-tx.c b/drivers/net/wireless/iwlwifi/iwl-tx.c
index c18907544701..fb9bcfa6d947 100644
--- a/drivers/net/wireless/iwlwifi/iwl-tx.c
+++ b/drivers/net/wireless/iwlwifi/iwl-tx.c
@@ -28,6 +28,7 @@
  *****************************************************************************/
 
 #include <linux/etherdevice.h>
+#include <linux/sched.h>
 #include <net/mac80211.h>
 #include "iwl-eeprom.h"
 #include "iwl-dev.h"
diff --git a/drivers/net/wireless/iwlwifi/iwl3945-base.c b/drivers/net/wireless/iwlwifi/iwl3945-base.c
index c390dbd877e4..aa49230422f3 100644
--- a/drivers/net/wireless/iwlwifi/iwl3945-base.c
+++ b/drivers/net/wireless/iwlwifi/iwl3945-base.c
@@ -33,6 +33,7 @@
 #include <linux/pci.h>
 #include <linux/dma-mapping.h>
 #include <linux/delay.h>
+#include <linux/sched.h>
 #include <linux/skbuff.h>
 #include <linux/netdevice.h>
 #include <linux/wireless.h>
diff --git a/drivers/net/wireless/iwmc3200wifi/cfg80211.c b/drivers/net/wireless/iwmc3200wifi/cfg80211.c
index a56a2b0ac99a..f3c55658225b 100644
--- a/drivers/net/wireless/iwmc3200wifi/cfg80211.c
+++ b/drivers/net/wireless/iwmc3200wifi/cfg80211.c
@@ -23,6 +23,7 @@
 
 #include <linux/kernel.h>
 #include <linux/netdevice.h>
+#include <linux/sched.h>
 #include <linux/etherdevice.h>
 #include <linux/wireless.h>
 #include <linux/ieee80211.h>
diff --git a/drivers/net/wireless/iwmc3200wifi/commands.c b/drivers/net/wireless/iwmc3200wifi/commands.c
index 23b52fa2605f..84158b6d35d8 100644
--- a/drivers/net/wireless/iwmc3200wifi/commands.c
+++ b/drivers/net/wireless/iwmc3200wifi/commands.c
@@ -40,6 +40,7 @@
 #include <linux/wireless.h>
 #include <linux/etherdevice.h>
 #include <linux/ieee80211.h>
+#include <linux/sched.h>
 
 #include "iwm.h"
 #include "bus.h"
diff --git a/drivers/net/wireless/iwmc3200wifi/main.c b/drivers/net/wireless/iwmc3200wifi/main.c
index d668e4756324..222eb2cf1b30 100644
--- a/drivers/net/wireless/iwmc3200wifi/main.c
+++ b/drivers/net/wireless/iwmc3200wifi/main.c
@@ -38,6 +38,7 @@
 
 #include <linux/kernel.h>
 #include <linux/netdevice.h>
+#include <linux/sched.h>
 #include <linux/ieee80211.h>
 #include <linux/wireless.h>
 
diff --git a/drivers/net/wireless/iwmc3200wifi/rx.c b/drivers/net/wireless/iwmc3200wifi/rx.c
index 40dbcbc16593..771a301003c9 100644
--- a/drivers/net/wireless/iwmc3200wifi/rx.c
+++ b/drivers/net/wireless/iwmc3200wifi/rx.c
@@ -38,6 +38,7 @@
 
 #include <linux/kernel.h>
 #include <linux/netdevice.h>
+#include <linux/sched.h>
 #include <linux/etherdevice.h>
 #include <linux/wireless.h>
 #include <linux/ieee80211.h>
diff --git a/drivers/net/wireless/libertas/cmd.c b/drivers/net/wireless/libertas/cmd.c
index 685098148e10..0a324dcd264c 100644
--- a/drivers/net/wireless/libertas/cmd.c
+++ b/drivers/net/wireless/libertas/cmd.c
@@ -6,6 +6,7 @@
 #include <net/iw_handler.h>
 #include <net/lib80211.h>
 #include <linux/kfifo.h>
+#include <linux/sched.h>
 #include "host.h"
 #include "hostcmd.h"
 #include "decl.h"
diff --git a/drivers/net/wireless/libertas/tx.c b/drivers/net/wireless/libertas/tx.c
index 4c018f7a0a8d..8c3766a6e8e7 100644
--- a/drivers/net/wireless/libertas/tx.c
+++ b/drivers/net/wireless/libertas/tx.c
@@ -3,6 +3,7 @@
   */
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
+#include <linux/sched.h>
 
 #include "hostcmd.h"
 #include "radiotap.h"
diff --git a/drivers/net/wireless/prism54/isl_ioctl.c b/drivers/net/wireless/prism54/isl_ioctl.c
index 4c97c6ad6f5d..bc08464d8323 100644
--- a/drivers/net/wireless/prism54/isl_ioctl.c
+++ b/drivers/net/wireless/prism54/isl_ioctl.c
@@ -19,6 +19,7 @@
  *
  */
 
+#include <linux/capability.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
 #include <linux/if_arp.h>
diff --git a/drivers/net/wireless/prism54/islpci_dev.c b/drivers/net/wireless/prism54/islpci_dev.c
index e26d7b3ceab5..2505be56ae39 100644
--- a/drivers/net/wireless/prism54/islpci_dev.c
+++ b/drivers/net/wireless/prism54/islpci_dev.c
@@ -23,6 +23,7 @@
 #include <linux/netdevice.h>
 #include <linux/ethtool.h>
 #include <linux/pci.h>
+#include <linux/sched.h>
 #include <linux/etherdevice.h>
 #include <linux/delay.h>
 #include <linux/if_arp.h>
diff --git a/drivers/net/wireless/prism54/islpci_mgt.c b/drivers/net/wireless/prism54/islpci_mgt.c
index f7c677e2094d..69d2f882fd06 100644
--- a/drivers/net/wireless/prism54/islpci_mgt.c
+++ b/drivers/net/wireless/prism54/islpci_mgt.c
@@ -20,6 +20,7 @@
 #include <linux/netdevice.h>
 #include <linux/module.h>
 #include <linux/pci.h>
+#include <linux/sched.h>
 
 #include <asm/io.h>
 #include <asm/system.h>
diff --git a/drivers/net/wireless/rt2x00/rt2x00debug.c b/drivers/net/wireless/rt2x00/rt2x00debug.c
index 7b3ee8c2eaef..68bc9bb1dbf9 100644
--- a/drivers/net/wireless/rt2x00/rt2x00debug.c
+++ b/drivers/net/wireless/rt2x00/rt2x00debug.c
@@ -27,6 +27,7 @@
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/poll.h>
+#include <linux/sched.h>
 #include <linux/uaccess.h>
 
 #include "rt2x00.h"
diff --git a/drivers/pci/pcie/aer/aerdrv.c b/drivers/pci/pcie/aer/aerdrv.c
index 2ce8f9ccc66e..d49ecc94bd49 100644
--- a/drivers/pci/pcie/aer/aerdrv.c
+++ b/drivers/pci/pcie/aer/aerdrv.c
@@ -17,6 +17,7 @@
 
 #include <linux/module.h>
 #include <linux/pci.h>
+#include <linux/sched.h>
 #include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/pm.h>
diff --git a/drivers/rtc/interface.c b/drivers/rtc/interface.c
index 4cdb31a362ca..a0c816238aa9 100644
--- a/drivers/rtc/interface.c
+++ b/drivers/rtc/interface.c
@@ -12,6 +12,7 @@
 */
 
 #include <linux/rtc.h>
+#include <linux/sched.h>
 #include <linux/log2.h>
 
 int rtc_read_time(struct rtc_device *rtc, struct rtc_time *tm)
diff --git a/drivers/rtc/rtc-dev.c b/drivers/rtc/rtc-dev.c
index 8a11de9552cd..62227cd52410 100644
--- a/drivers/rtc/rtc-dev.c
+++ b/drivers/rtc/rtc-dev.c
@@ -13,6 +13,7 @@
 
 #include <linux/module.h>
 #include <linux/rtc.h>
+#include <linux/sched.h>
 #include "rtc-core.h"
 
 static dev_t rtc_devt;
diff --git a/drivers/uio/uio.c b/drivers/uio/uio.c
index a9d707047202..e941367dd28f 100644
--- a/drivers/uio/uio.c
+++ b/drivers/uio/uio.c
@@ -19,6 +19,7 @@
 #include <linux/device.h>
 #include <linux/mm.h>
 #include <linux/idr.h>
+#include <linux/sched.h>
 #include <linux/string.h>
 #include <linux/kobject.h>
 #include <linux/uio_driver.h>
diff --git a/drivers/uwb/whc-rc.c b/drivers/uwb/whc-rc.c
index 1d9a6f54658e..01950c62dc8d 100644
--- a/drivers/uwb/whc-rc.c
+++ b/drivers/uwb/whc-rc.c
@@ -42,6 +42,7 @@
 #include <linux/init.h>
 #include <linux/module.h>
 #include <linux/pci.h>
+#include <linux/sched.h>
 #include <linux/dma-mapping.h>
 #include <linux/interrupt.h>
 #include <linux/workqueue.h>
diff --git a/fs/file.c b/fs/file.c
index f313314f996f..87e129030ab1 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -10,6 +10,7 @@
 #include <linux/fs.h>
 #include <linux/mm.h>
 #include <linux/time.h>
+#include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/file.h>
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index b78cf8194957..7ca72b74eec7 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -10,7 +10,6 @@
 #include <linux/irqreturn.h>
 #include <linux/irqnr.h>
 #include <linux/hardirq.h>
-#include <linux/sched.h>
 #include <linux/irqflags.h>
 #include <linux/smp.h>
 #include <linux/percpu.h>
@@ -610,6 +609,7 @@ extern void debug_poll_all_shared_irqs(void);
 static inline void debug_poll_all_shared_irqs(void) { }
 #endif
 
+struct seq_file;
 int show_interrupts(struct seq_file *p, void *v);
 
 struct irq_desc;
diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h
index 81bb42358595..eaf36364b7d4 100644
--- a/include/linux/mmc/host.h
+++ b/include/linux/mmc/host.h
@@ -11,6 +11,7 @@
 #define LINUX_MMC_HOST_H
 
 #include <linux/leds.h>
+#include <linux/sched.h>
 
 #include <linux/mmc/core.h>
 
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index a81cf80554db..17c71bb565c6 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -11,6 +11,7 @@
  */
 
 #include <linux/irq.h>
+#include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/module.h>
 #include <linux/random.h>
diff --git a/kernel/mutex-debug.c b/kernel/mutex-debug.c
index 50d022e5a560..ec815a960b5d 100644
--- a/kernel/mutex-debug.c
+++ b/kernel/mutex-debug.c
@@ -16,6 +16,7 @@
 #include <linux/delay.h>
 #include <linux/module.h>
 #include <linux/poison.h>
+#include <linux/sched.h>
 #include <linux/spinlock.h>
 #include <linux/kallsyms.h>
 #include <linux/interrupt.h>
diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c
index fb0f46fa1ecd..c3a4e2907eaa 100644
--- a/kernel/time/timekeeping.c
+++ b/kernel/time/timekeeping.c
@@ -13,6 +13,7 @@
 #include <linux/percpu.h>
 #include <linux/init.h>
 #include <linux/mm.h>
+#include <linux/sched.h>
 #include <linux/sysdev.h>
 #include <linux/clocksource.h>
 #include <linux/jiffies.h>
diff --git a/lib/debugobjects.c b/lib/debugobjects.c
index 2755a3bd16a1..eae56fddfa3b 100644
--- a/lib/debugobjects.c
+++ b/lib/debugobjects.c
@@ -9,6 +9,7 @@
  */
 #include <linux/debugobjects.h>
 #include <linux/interrupt.h>
+#include <linux/sched.h>
 #include <linux/seq_file.h>
 #include <linux/debugfs.h>
 #include <linux/hash.h>
diff --git a/lib/fault-inject.c b/lib/fault-inject.c
index f97af55bdd96..7e65af70635e 100644
--- a/lib/fault-inject.c
+++ b/lib/fault-inject.c
@@ -1,6 +1,7 @@
 #include <linux/kernel.h>
 #include <linux/init.h>
 #include <linux/random.h>
+#include <linux/sched.h>
 #include <linux/stat.h>
 #include <linux/types.h>
 #include <linux/fs.h>
diff --git a/mm/vmalloc.c b/mm/vmalloc.c
index 5e7aed0802bf..0f551a4a44cd 100644
--- a/mm/vmalloc.c
+++ b/mm/vmalloc.c
@@ -12,6 +12,7 @@
 #include <linux/mm.h>
 #include <linux/module.h>
 #include <linux/highmem.h>
+#include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/spinlock.h>
 #include <linux/interrupt.h>
diff --git a/net/irda/ircomm/ircomm_tty_attach.c b/net/irda/ircomm/ircomm_tty_attach.c
index eafc010907c2..3c1754023022 100644
--- a/net/irda/ircomm/ircomm_tty_attach.c
+++ b/net/irda/ircomm/ircomm_tty_attach.c
@@ -30,6 +30,7 @@
  ********************************************************************/
 
 #include <linux/init.h>
+#include <linux/sched.h>
 
 #include <net/irda/irda.h>
 #include <net/irda/irlmp.h>
diff --git a/net/irda/irlan/irlan_common.c b/net/irda/irlan/irlan_common.c
index 62116829b817..315ead3cb926 100644
--- a/net/irda/irlan/irlan_common.c
+++ b/net/irda/irlan/irlan_common.c
@@ -30,6 +30,7 @@
 #include <linux/init.h>
 #include <linux/errno.h>
 #include <linux/proc_fs.h>
+#include <linux/sched.h>
 #include <linux/seq_file.h>
 #include <linux/random.h>
 #include <linux/netdevice.h>
diff --git a/net/irda/irlan/irlan_eth.c b/net/irda/irlan/irlan_eth.c
index 7b6b631f647f..d340110f5c0c 100644
--- a/net/irda/irlan/irlan_eth.c
+++ b/net/irda/irlan/irlan_eth.c
@@ -30,6 +30,7 @@
 #include <linux/inetdevice.h>
 #include <linux/if_arp.h>
 #include <linux/module.h>
+#include <linux/sched.h>
 #include <net/arp.h>
 
 #include <net/irda/irda.h>
diff --git a/net/irda/irnet/irnet_irda.c b/net/irda/irnet/irnet_irda.c
index cf9a4b531a98..cccc2e93234f 100644
--- a/net/irda/irnet/irnet_irda.c
+++ b/net/irda/irnet/irnet_irda.c
@@ -9,6 +9,7 @@
  */
 
 #include "irnet_irda.h"		/* Private header */
+#include <linux/sched.h>
 #include <linux/seq_file.h>
 #include <asm/unaligned.h>
 
diff --git a/net/irda/irnet/irnet_ppp.c b/net/irda/irnet/irnet_ppp.c
index 68cbcb19cbd8..7dea882dbb75 100644
--- a/net/irda/irnet/irnet_ppp.c
+++ b/net/irda/irnet/irnet_ppp.c
@@ -13,6 +13,7 @@
  *	2) as a control channel (write commands, read events)
  */
 
+#include <linux/sched.h>
 #include <linux/smp_lock.h>
 #include "irnet_ppp.h"		/* Private header */
 /* Please put other headers in irnet.h - Thanks */
diff --git a/net/mac80211/rc80211_pid_debugfs.c b/net/mac80211/rc80211_pid_debugfs.c
index a59043fbb0ff..45667054a5f3 100644
--- a/net/mac80211/rc80211_pid_debugfs.c
+++ b/net/mac80211/rc80211_pid_debugfs.c
@@ -6,6 +6,7 @@
  * published by the Free Software Foundation.
  */
 
+#include <linux/sched.h>
 #include <linux/spinlock.h>
 #include <linux/poll.h>
 #include <linux/netdevice.h>
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 7c9ec3dee96e..ca6e68dcd8a8 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -14,6 +14,7 @@
 #include <linux/types.h>
 #include <linux/netfilter.h>
 #include <linux/module.h>
+#include <linux/sched.h>
 #include <linux/skbuff.h>
 #include <linux/proc_fs.h>
 #include <linux/vmalloc.h>
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index 0cf5e8c27a10..3fa5751af0ec 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -42,6 +42,7 @@
 #include <linux/sunrpc/svc_xprt.h>
 #include <linux/sunrpc/debug.h>
 #include <linux/sunrpc/rpc_rdma.h>
+#include <linux/sched.h>
 #include <linux/spinlock.h>
 #include <rdma/ib_verbs.h>
 #include <rdma/rdma_cm.h>
diff --git a/net/wireless/core.c b/net/wireless/core.c
index 45b2be3274db..a595f712b5bf 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -14,6 +14,7 @@
 #include <linux/device.h>
 #include <linux/etherdevice.h>
 #include <linux/rtnetlink.h>
+#include <linux/sched.h>
 #include <net/genetlink.h>
 #include <net/cfg80211.h>
 #include "nl80211.h"
-- 
cgit v1.2.3


From e17b38bf9e70d74f3739a600db75240078ac1407 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Sun, 11 Oct 2009 19:12:00 -0700
Subject: sched: Fix missing kernel-doc notation

The following htmldocs warnings:

  Warning(kernel/sched.c:685): No description found for parameter 'cpu'
  Warning(kernel/sched.c:3676): No description found for parameter 'sd'

Trigger because new parameters were added to update_rq_clock() and
update_group_power() without updating the kernel-doc notation.

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <4AD29070.7070002@oracle.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index e3f8f4f61c8e..789001da0a94 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -676,6 +676,7 @@ inline void update_rq_clock(struct rq *rq)
 
 /**
  * runqueue_is_locked
+ * @cpu: the processor in question.
  *
  * Returns true if the current cpu runqueue is locked.
  * This interface allows printk to be called with the runqueue lock
@@ -3660,6 +3661,7 @@ static void update_group_power(struct sched_domain *sd, int cpu)
 
 /**
  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
+ * @sd: The sched_domain whose statistics are to be updated.
  * @group: sched_group whose statistics are to be updated.
  * @this_cpu: Cpu for which load balance is currently performed.
  * @idle: Idle status of this_cpu
-- 
cgit v1.2.3


From 8ad807318fcd62aba0e18c7c7fbfcc1af3fcdbab Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 13 Oct 2009 09:28:57 +0800
Subject: tracing/filters: Fix memory leak when setting a filter

Every time we set a filter, we leak memory allocated by
postfix_append_operand() and postfix_append_op().

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: <stable@kernel.org> # for v2.6.31.x
LKML-Reference: <4AD3D7D9.4070400@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_events_filter.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 23245785927f..98a6cc5c64ed 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -933,8 +933,9 @@ static void postfix_clear(struct filter_parse_state *ps)
 
 	while (!list_empty(&ps->postfix)) {
 		elt = list_first_entry(&ps->postfix, struct postfix_elt, list);
-		kfree(elt->operand);
 		list_del(&elt->list);
+		kfree(elt->operand);
+		kfree(elt);
 	}
 }
 
-- 
cgit v1.2.3


From d58e6576b0deec6f0b9ff8450fe282da18c50883 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 13 Oct 2009 20:40:43 +0200
Subject: futex: Handle spurious wake up

The futex code does not handle spurious wake up in futex_wait and
futex_wait_requeue_pi.

The code assumes that any wake up which was not caused by futex_wake /
requeue or by a timeout was caused by a signal wake up and returns one
of the syscall restart error codes.

In case of a spurious wake up the signal delivery code which deals
with the restart error codes is not invoked and we return that error
code to user space. That causes applications which actually check the
return codes to fail. Blaise reported that on preempt-rt a python test
program run into a exception trap. -rt exposed that due to a built in
spurious wake up accelerator :)

Solve this by checking signal_pending(current) in the wake up path and
handle the spurious wake up case w/o returning to user space.

Reported-by: Blaise Gassend <blaise@willowgarage.com>
Debugged-by: Darren Hart <dvhltc@us.ibm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: stable@kernel.org
LKML-Reference: <new-submission>
---
 kernel/futex.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index 4949d336d88d..5c88839bd993 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1791,6 +1791,7 @@ static int futex_wait(u32 __user *uaddr, int fshared,
 					     current->timer_slack_ns);
 	}
 
+retry:
 	/* Prepare to wait on uaddr. */
 	ret = futex_wait_setup(uaddr, val, fshared, &q, &hb);
 	if (ret)
@@ -1808,9 +1809,14 @@ static int futex_wait(u32 __user *uaddr, int fshared,
 		goto out_put_key;
 
 	/*
-	 * We expect signal_pending(current), but another thread may
-	 * have handled it for us already.
+	 * We expect signal_pending(current), but we might be the
+	 * victim of a spurious wakeup as well.
 	 */
+	if (!signal_pending(current)) {
+		put_futex_key(fshared, &q.key);
+		goto retry;
+	}
+
 	ret = -ERESTARTSYS;
 	if (!abs_time)
 		goto out_put_key;
@@ -2118,9 +2124,11 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
 		 */
 		plist_del(&q->list, &q->list.plist);
 
+		/* Handle spurious wakeups gracefully */
+		ret = -EAGAIN;
 		if (timeout && !timeout->task)
 			ret = -ETIMEDOUT;
-		else
+		else if (signal_pending(current))
 			ret = -ERESTARTNOINTR;
 	}
 	return ret;
@@ -2198,6 +2206,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
 	debug_rt_mutex_init_waiter(&rt_waiter);
 	rt_waiter.task = NULL;
 
+retry:
 	key2 = FUTEX_KEY_INIT;
 	ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE);
 	if (unlikely(ret != 0))
@@ -2292,6 +2301,9 @@ out_put_keys:
 out_key2:
 	put_futex_key(fshared, &key2);
 
+	/* Spurious wakeup ? */
+	if (ret == -EAGAIN)
+		goto retry;
 out:
 	if (to) {
 		hrtimer_cancel(&to->timer);
-- 
cgit v1.2.3


From 03541f8b69c058162e4cf9675ec9181e6a204d55 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@samba.org>
Date: Wed, 14 Oct 2009 16:58:03 +1100
Subject: perf_event: Adjust frequency and unthrottle for non-group-leader
 events

The loop in perf_ctx_adjust_freq checks the frequency of sampling
event counters, and adjusts the event interval and unthrottles the
event if required, and resets the interrupt count for the event.
However, at present it only looks at group leaders.

This means that a sampling event that is not a group leader will
eventually get throttled, once its interrupt count reaches
sysctl_perf_event_sample_rate/HZ --- and that is guaranteed to
happen, if the event is active for long enough, since the interrupt
count never gets reset.  Once it is throttled it never gets
unthrottled, so it basically just stops working at that point.

This fixes it by making perf_ctx_adjust_freq use ctx->event_list
rather than ctx->group_list.  The existing spin_lock/spin_unlock
around the loop makes it unnecessary to put rcu_read_lock/
rcu_read_unlock around the list_for_each_entry_rcu().

Reported-by: Mark W. Krentel <krentel@cs.rice.edu>
Signed-off-by: Paul Mackerras <paulus@samba.org>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <19157.26731.855609.165622@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 9d0b5c665883..afb7ef3dbc44 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -1355,7 +1355,7 @@ static void perf_ctx_adjust_freq(struct perf_event_context *ctx)
 	u64 interrupts, freq;
 
 	spin_lock(&ctx->lock);
-	list_for_each_entry(event, &ctx->group_list, group_entry) {
+	list_for_each_entry_rcu(event, &ctx->event_list, event_entry) {
 		if (event->state != PERF_EVENT_STATE_ACTIVE)
 			continue;
 
-- 
cgit v1.2.3


From 92f6a5e37a2e2d3342dafb2b39c2f8bc340bbf84 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 9 Oct 2009 12:43:07 +0200
Subject: sched: Do less agressive buddy clearing

Yanmin reported a hackbench regression due to:

 > commit de69a80be32445b0a71e8e3b757e584d7beb90f7
 > Author: Peter Zijlstra <a.p.zijlstra@chello.nl>
 > Date:   Thu Sep 17 09:01:20 2009 +0200
 >
 >     sched: Stop buddies from hogging the system

I really liked de69a80b, and it affecting hackbench shows I wasn't
crazy ;-)

So hackbench is a multi-cast, with one sender spraying multiple
receivers, who in their turn don't spray back.

This would be exactly the scenario that patch 'cures'. Previously
we would not clear the last buddy after running the next task,
allowing the sender to get back to work sooner than it otherwise
ought to have been, increasing latencies for other tasks.

Now, since those receivers don't poke back, they don't enforce the
buddy relation, which means there's nothing to re-elect the sender.

Cure this by less agressively clearing the buddy stats. Only clear
buddies when they were not chosen. It should still avoid a buddy
sticking around long after its served its time.

Reported-by: "Zhang, Yanmin" <yanmin_zhang@linux.intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
CC: Mike Galbraith <efault@gmx.de>
LKML-Reference: <1255084986.8802.46.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c | 27 +++++++++++++--------------
 1 file changed, 13 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 4e777b47eeda..c32c3e643daa 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -861,12 +861,21 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
 static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
 {
 	struct sched_entity *se = __pick_next_entity(cfs_rq);
+	struct sched_entity *buddy;
 
-	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, se) < 1)
-		return cfs_rq->next;
+	if (cfs_rq->next) {
+		buddy = cfs_rq->next;
+		cfs_rq->next = NULL;
+		if (wakeup_preempt_entity(buddy, se) < 1)
+			return buddy;
+	}
 
-	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, se) < 1)
-		return cfs_rq->last;
+	if (cfs_rq->last) {
+		buddy = cfs_rq->last;
+		cfs_rq->last = NULL;
+		if (wakeup_preempt_entity(buddy, se) < 1)
+			return buddy;
+	}
 
 	return se;
 }
@@ -1654,16 +1663,6 @@ static struct task_struct *pick_next_task_fair(struct rq *rq)
 
 	do {
 		se = pick_next_entity(cfs_rq);
-		/*
-		 * If se was a buddy, clear it so that it will have to earn
-		 * the favour again.
-		 *
-		 * If se was not a buddy, clear the buddies because neither
-		 * was elegible to run, let them earn it again.
-		 *
-		 * IOW. unconditionally clear buddies.
-		 */
-		__clear_buddies(cfs_rq, NULL);
 		set_next_entity(cfs_rq, se);
 		cfs_rq = group_cfs_rq(se);
 	} while (cfs_rq);
-- 
cgit v1.2.3


From 43046b606673c9c991919ff75b980b72541e9ede Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Wed, 14 Oct 2009 09:16:42 -0700
Subject: workqueue: add 'flush_delayed_work()' to run and wait for delayed
 work

It basically turns a delayed work into an immediate work, and then waits
for it to finish.
---
 include/linux/workqueue.h |  1 +
 kernel/workqueue.c        | 18 ++++++++++++++++++
 2 files changed, 19 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 7ef0c7b94f31..cf24c20de9e4 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -207,6 +207,7 @@ extern int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
 
 extern void flush_workqueue(struct workqueue_struct *wq);
 extern void flush_scheduled_work(void);
+extern void flush_delayed_work(struct delayed_work *work);
 
 extern int schedule_work(struct work_struct *work);
 extern int schedule_work_on(int cpu, struct work_struct *work);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index addfe2df93b1..ccefe574dcf7 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -639,6 +639,24 @@ int schedule_delayed_work(struct delayed_work *dwork,
 }
 EXPORT_SYMBOL(schedule_delayed_work);
 
+/**
+ * flush_delayed_work - block until a dwork_struct's callback has terminated
+ * @dwork: the delayed work which is to be flushed
+ *
+ * Any timeout is cancelled, and any pending work is run immediately.
+ */
+void flush_delayed_work(struct delayed_work *dwork)
+{
+	if (del_timer(&dwork->timer)) {
+		struct cpu_workqueue_struct *cwq;
+		cwq = wq_per_cpu(keventd_wq, get_cpu());
+		__queue_work(cwq, &dwork->work);
+		put_cpu();
+	}
+	flush_work(&dwork->work);
+}
+EXPORT_SYMBOL(flush_delayed_work);
+
 /**
  * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
  * @cpu: cpu to use
-- 
cgit v1.2.3


From 2bc872036e1c5948b5b02942810bbdd8dbdb9812 Mon Sep 17 00:00:00 2001
From: Darren Hart <dvhltc@us.ibm.com>
Date: Wed, 14 Oct 2009 10:12:39 -0700
Subject: futex: Check for NULL keys in match_futex

If userspace tries to perform a requeue_pi on a non-requeue_pi waiter,
it will find the futex_q->requeue_pi_key to be NULL and OOPS.

Check for NULL in match_futex() instead of doing explicit NULL pointer
checks on all call sites.  While match_futex(NULL, NULL) returning
false is a little odd, it's still correct as we expect valid key
references.

Signed-off-by: Darren Hart <dvhltc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@elte.hu>
CC: Eric Dumazet <eric.dumazet@gmail.com>
CC: Dinakar Guniguntala <dino@in.ibm.com>
CC: John Stultz <johnstul@us.ibm.com>
Cc: stable@kernel.org
LKML-Reference: <4AD60687.10306@us.ibm.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/futex.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index 5c88839bd993..06938e560ac9 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -150,7 +150,8 @@ static struct futex_hash_bucket *hash_futex(union futex_key *key)
  */
 static inline int match_futex(union futex_key *key1, union futex_key *key2)
 {
-	return (key1->both.word == key2->both.word
+	return (key1 && key2
+		&& key1->both.word == key2->both.word
 		&& key1->both.ptr == key2->both.ptr
 		&& key1->both.offset == key2->both.offset);
 }
-- 
cgit v1.2.3


From 8c53e46314562fe814b0afef6cfcbd2f562b017c Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Wed, 14 Oct 2009 09:16:42 -0700
Subject: workqueue: add 'flush_delayed_work()' to run and wait for delayed
 work

It basically turns a delayed work into an immediate work, and then waits
for it to finish, thus allowing you to force (and wait for) an immediate
flush of a delayed work.

We'll want to use this in the tty layer to clean up tty_flush_to_ldisc().

Acked-by: Oleg Nesterov <oleg@redhat.com>
[ Fixed to use 'del_timer_sync()' as noted by Oleg ]
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/workqueue.h |  1 +
 kernel/workqueue.c        | 18 ++++++++++++++++++
 2 files changed, 19 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 7ef0c7b94f31..cf24c20de9e4 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -207,6 +207,7 @@ extern int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
 
 extern void flush_workqueue(struct workqueue_struct *wq);
 extern void flush_scheduled_work(void);
+extern void flush_delayed_work(struct delayed_work *work);
 
 extern int schedule_work(struct work_struct *work);
 extern int schedule_work_on(int cpu, struct work_struct *work);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index addfe2df93b1..47cdd7e76f2b 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -639,6 +639,24 @@ int schedule_delayed_work(struct delayed_work *dwork,
 }
 EXPORT_SYMBOL(schedule_delayed_work);
 
+/**
+ * flush_delayed_work - block until a dwork_struct's callback has terminated
+ * @dwork: the delayed work which is to be flushed
+ *
+ * Any timeout is cancelled, and any pending work is run immediately.
+ */
+void flush_delayed_work(struct delayed_work *dwork)
+{
+	if (del_timer_sync(&dwork->timer)) {
+		struct cpu_workqueue_struct *cwq;
+		cwq = wq_per_cpu(keventd_wq, get_cpu());
+		__queue_work(cwq, &dwork->work);
+		put_cpu();
+	}
+	flush_work(&dwork->work);
+}
+EXPORT_SYMBOL(flush_delayed_work);
+
 /**
  * schedule_delayed_work_on - queue work in global workqueue on CPU after delay
  * @cpu: cpu to use
-- 
cgit v1.2.3


From 37c72e56f6b234ea7387ba530434a80abf2658d8 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 14 Oct 2009 10:15:55 -0700
Subject: rcu: Prevent RCU IPI storms in presence of high call_rcu() load

As the number of callbacks on a given CPU rises, invoke
force_quiescent_state() only every blimit number of callbacks
(defaults to 10,000), and even then only if no other CPU has
invoked force_quiescent_state() in the meantime.

This should fix the performance regression reported by Nick.

Reported-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: dhowells@redhat.com
Cc: jens.axboe@oracle.com
LKML-Reference: <12555405592133-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcutree.c | 29 ++++++++++++++++++++++++-----
 kernel/rcutree.h |  4 ++++
 2 files changed, 28 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 705f02ac7433..ddbf111e9e18 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -958,7 +958,7 @@ static void rcu_offline_cpu(int cpu)
  * Invoke any RCU callbacks that have made it to the end of their grace
  * period.  Thottle as specified by rdp->blimit.
  */
-static void rcu_do_batch(struct rcu_data *rdp)
+static void rcu_do_batch(struct rcu_state *rsp, struct rcu_data *rdp)
 {
 	unsigned long flags;
 	struct rcu_head *next, *list, **tail;
@@ -1011,6 +1011,13 @@ static void rcu_do_batch(struct rcu_data *rdp)
 	if (rdp->blimit == LONG_MAX && rdp->qlen <= qlowmark)
 		rdp->blimit = blimit;
 
+	/* Reset ->qlen_last_fqs_check trigger if enough CBs have drained. */
+	if (rdp->qlen == 0 && rdp->qlen_last_fqs_check != 0) {
+		rdp->qlen_last_fqs_check = 0;
+		rdp->n_force_qs_snap = rsp->n_force_qs;
+	} else if (rdp->qlen < rdp->qlen_last_fqs_check - qhimark)
+		rdp->qlen_last_fqs_check = rdp->qlen;
+
 	local_irq_restore(flags);
 
 	/* Re-raise the RCU softirq if there are callbacks remaining. */
@@ -1224,7 +1231,7 @@ __rcu_process_callbacks(struct rcu_state *rsp, struct rcu_data *rdp)
 	}
 
 	/* If there are callbacks ready, invoke them. */
-	rcu_do_batch(rdp);
+	rcu_do_batch(rsp, rdp);
 }
 
 /*
@@ -1288,10 +1295,20 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
 		rcu_start_gp(rsp, nestflag);  /* releases rnp_root->lock. */
 	}
 
-	/* Force the grace period if too many callbacks or too long waiting. */
-	if (unlikely(++rdp->qlen > qhimark)) {
+	/*
+	 * Force the grace period if too many callbacks or too long waiting.
+	 * Enforce hysteresis, and don't invoke force_quiescent_state()
+	 * if some other CPU has recently done so.  Also, don't bother
+	 * invoking force_quiescent_state() if the newly enqueued callback
+	 * is the only one waiting for a grace period to complete.
+	 */
+	if (unlikely(++rdp->qlen > rdp->qlen_last_fqs_check + qhimark)) {
 		rdp->blimit = LONG_MAX;
-		force_quiescent_state(rsp, 0);
+		if (rsp->n_force_qs == rdp->n_force_qs_snap &&
+		    *rdp->nxttail[RCU_DONE_TAIL] != head)
+			force_quiescent_state(rsp, 0);
+		rdp->n_force_qs_snap = rsp->n_force_qs;
+		rdp->qlen_last_fqs_check = rdp->qlen;
 	} else if ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)
 		force_quiescent_state(rsp, 1);
 	local_irq_restore(flags);
@@ -1523,6 +1540,8 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptable)
 	rdp->beenonline = 1;	 /* We have now been online. */
 	rdp->preemptable = preemptable;
 	rdp->passed_quiesc_completed = lastcomp - 1;
+	rdp->qlen_last_fqs_check = 0;
+	rdp->n_force_qs_snap = rsp->n_force_qs;
 	rdp->blimit = blimit;
 	spin_unlock(&rnp->lock);		/* irqs remain disabled. */
 
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index b40ac5706040..599161f309fb 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -167,6 +167,10 @@ struct rcu_data {
 	struct rcu_head *nxtlist;
 	struct rcu_head **nxttail[RCU_NEXT_SIZE];
 	long		qlen;		/* # of queued callbacks */
+	long		qlen_last_fqs_check;
+					/* qlen at last check for QS forcing */
+	unsigned long	n_force_qs_snap;
+					/* did other CPU force QS recently? */
 	long		blimit;		/* Upper limit on a processed batch */
 
 #ifdef CONFIG_NO_HZ
-- 
cgit v1.2.3


From 019129d595caaa5bd0b41d128308da1be6a91869 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 14 Oct 2009 10:15:56 -0700
Subject: rcu: Stopgap fix for synchronize_rcu_expedited() for TREE_PREEMPT_RCU

For the short term, map synchronize_rcu_expedited() to
synchronize_rcu() for TREE_PREEMPT_RCU and to
synchronize_sched_expedited() for TREE_RCU.

Longer term, there needs to be a real expedited grace period for
TREE_PREEMPT_RCU, but candidate patches to date are considerably
more complex and intrusive.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: dhowells@redhat.com
Cc: npiggin@suse.de
Cc: jens.axboe@oracle.com
LKML-Reference: <12555405592331-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/rcutree.h |  6 +-----
 kernel/rcutree_plugin.h | 21 +++++++++++++++++++++
 2 files changed, 22 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index 46e9ab3ee6e1..9642c6bcb399 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -76,11 +76,7 @@ static inline void __rcu_read_unlock_bh(void)
 
 extern void call_rcu_sched(struct rcu_head *head,
 			   void (*func)(struct rcu_head *rcu));
-
-static inline void synchronize_rcu_expedited(void)
-{
-	synchronize_sched_expedited();
-}
+extern void synchronize_rcu_expedited(void);
 
 static inline void synchronize_rcu_bh_expedited(void)
 {
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index c0cb783aa16a..ebd20ee7707d 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -392,6 +392,17 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
 }
 EXPORT_SYMBOL_GPL(call_rcu);
 
+/*
+ * Wait for an rcu-preempt grace period.  We are supposed to expedite the
+ * grace period, but this is the crude slow compatability hack, so just
+ * invoke synchronize_rcu().
+ */
+void synchronize_rcu_expedited(void)
+{
+	synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
+
 /*
  * Check to see if there is any immediate preemptable-RCU-related work
  * to be done.
@@ -564,6 +575,16 @@ void call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu))
 }
 EXPORT_SYMBOL_GPL(call_rcu);
 
+/*
+ * Wait for an rcu-preempt grace period, but make it happen quickly.
+ * But because preemptable RCU does not exist, map to rcu-sched.
+ */
+void synchronize_rcu_expedited(void)
+{
+	synchronize_sched_expedited();
+}
+EXPORT_SYMBOL_GPL(synchronize_rcu_expedited);
+
 /*
  * Because preemptable RCU does not exist, it never has any work to do.
  */
-- 
cgit v1.2.3


From 237c80c5c8fb7ec128cf2a756b550dc41ad7eac7 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 15 Oct 2009 09:26:14 -0700
Subject: rcu: Fix TREE_PREEMPT_RCU CPU_HOTPLUG bad-luck hang

If the following sequence of events occurs, then
TREE_PREEMPT_RCU will hang waiting for a grace period to
complete, eventually OOMing the system:

o	A TREE_PREEMPT_RCU build of the kernel is booted on a system
	with more than 64 physical CPUs present (32 on a 32-bit system).
	Alternatively, a TREE_PREEMPT_RCU build of the kernel is booted
	with RCU_FANOUT set to a sufficiently small value that the
	physical CPUs populate two or more leaf rcu_node structures.

o	A task is preempted in an RCU read-side critical section
	while running on a CPU corresponding to a given leaf rcu_node
	structure.

o	All CPUs corresponding to this same leaf rcu_node structure
	record quiescent states for the current grace period.

o	All of these same CPUs go offline (hence the need for enough
	physical CPUs to populate more than one leaf rcu_node structure).
	This causes the preempted task to be moved to the root rcu_node
	structure.

At this point, there is nothing left to cause the quiescent
state to be propagated up the rcu_node tree, so the current
grace period never completes.

The simplest fix, especially after considering the deadlock
possibilities, is to detect this situation when the last CPU is
offlined, and to set that CPU's ->qsmask bit in its leaf
rcu_node structure.  This will cause the next invocation of
force_quiescent_state() to end the grace period.

Without this fix, this hang can be triggered in an hour or so on
some machines with rcutorture and random CPU onlining/offlining.
With this fix, these same machines pass a full 10 hours of this
sort of abuse.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: dhowells@redhat.com
LKML-Reference: <20091015162614.GA19131@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcutree.c        | 15 ++++++++++++++-
 kernel/rcutree.h        |  6 +++---
 kernel/rcutree_plugin.h | 25 +++++++++++++++++--------
 3 files changed, 34 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index ddbf111e9e18..0536125b0497 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -913,7 +913,20 @@ static void __rcu_offline_cpu(int cpu, struct rcu_state *rsp)
 			spin_unlock(&rnp->lock); /* irqs remain disabled. */
 			break;
 		}
-		rcu_preempt_offline_tasks(rsp, rnp, rdp);
+
+		/*
+		 * If there was a task blocking the current grace period,
+		 * and if all CPUs have checked in, we need to propagate
+		 * the quiescent state up the rcu_node hierarchy.  But that
+		 * is inconvenient at the moment due to deadlock issues if
+		 * this should end the current grace period.  So set the
+		 * offlined CPU's bit in ->qsmask in order to force the
+		 * next force_quiescent_state() invocation to clean up this
+		 * mess in a deadlock-free manner.
+		 */
+		if (rcu_preempt_offline_tasks(rsp, rnp, rdp) && !rnp->qsmask)
+			rnp->qsmask |= mask;
+
 		mask = rnp->grpmask;
 		spin_unlock(&rnp->lock);	/* irqs remain disabled. */
 		rnp = rnp->parent;
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 599161f309fb..1823c6e20609 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -306,9 +306,9 @@ static void rcu_print_task_stall(struct rcu_node *rnp);
 #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */
 static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp);
 #ifdef CONFIG_HOTPLUG_CPU
-static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
-				      struct rcu_node *rnp,
-				      struct rcu_data *rdp);
+static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
+				     struct rcu_node *rnp,
+				     struct rcu_data *rdp);
 static void rcu_preempt_offline_cpu(int cpu);
 #endif /* #ifdef CONFIG_HOTPLUG_CPU */
 static void rcu_preempt_check_callbacks(int cpu);
diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h
index ebd20ee7707d..ef2a58c2b9d5 100644
--- a/kernel/rcutree_plugin.h
+++ b/kernel/rcutree_plugin.h
@@ -304,21 +304,25 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
  * parent is to remove the need for rcu_read_unlock_special() to
  * make more than two attempts to acquire the target rcu_node's lock.
  *
+ * Returns 1 if there was previously a task blocking the current grace
+ * period on the specified rcu_node structure.
+ *
  * The caller must hold rnp->lock with irqs disabled.
  */
-static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
-				      struct rcu_node *rnp,
-				      struct rcu_data *rdp)
+static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
+				     struct rcu_node *rnp,
+				     struct rcu_data *rdp)
 {
 	int i;
 	struct list_head *lp;
 	struct list_head *lp_root;
+	int retval = rcu_preempted_readers(rnp);
 	struct rcu_node *rnp_root = rcu_get_root(rsp);
 	struct task_struct *tp;
 
 	if (rnp == rnp_root) {
 		WARN_ONCE(1, "Last CPU thought to be offlined?");
-		return;  /* Shouldn't happen: at least one CPU online. */
+		return 0;  /* Shouldn't happen: at least one CPU online. */
 	}
 	WARN_ON_ONCE(rnp != rdp->mynode &&
 		     (!list_empty(&rnp->blocked_tasks[0]) ||
@@ -342,6 +346,8 @@ static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
 			spin_unlock(&rnp_root->lock); /* irqs remain disabled */
 		}
 	}
+
+	return retval;
 }
 
 /*
@@ -532,12 +538,15 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
 
 /*
  * Because preemptable RCU does not exist, it never needs to migrate
- * tasks that were blocked within RCU read-side critical sections.
+ * tasks that were blocked within RCU read-side critical sections, and
+ * such non-existent tasks cannot possibly have been blocking the current
+ * grace period.
  */
-static void rcu_preempt_offline_tasks(struct rcu_state *rsp,
-				      struct rcu_node *rnp,
-				      struct rcu_data *rdp)
+static int rcu_preempt_offline_tasks(struct rcu_state *rsp,
+				     struct rcu_node *rnp,
+				     struct rcu_data *rdp)
 {
+	return 0;
 }
 
 /*
-- 
cgit v1.2.3


From 89061d3d58e1f0742139605dc6a7950aa1ecc019 Mon Sep 17 00:00:00 2001
From: Darren Hart <dvhltc@us.ibm.com>
Date: Thu, 15 Oct 2009 15:30:48 -0700
Subject: futex: Move drop_futex_key_refs out of spinlock'ed region

When requeuing tasks from one futex to another, the reference held
by the requeued task to the original futex location needs to be
dropped eventually.

Dropping the reference may ultimately lead to a call to
"iput_final" and subsequently call into filesystem- specific code -
which may be non-atomic.

It is therefore safer to defer this drop operation until after the
futex_hash_bucket spinlock has been dropped.

Originally-From: Helge Bahmann <hcb@chaoticmind.net>
Signed-off-by: Darren Hart <dvhltc@us.ibm.com>
Cc: <stable@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: Dinakar Guniguntala <dino@in.ibm.com>
Cc: John Stultz <johnstul@linux.vnet.ibm.com>
Cc: Sven-Thorsten Dietrich <sdietrich@novell.com>
Cc: John Kacur <jkacur@redhat.com>
LKML-Reference: <4AD7A298.5040802@us.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/futex.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index 06938e560ac9..642f3bbaacc7 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1029,7 +1029,6 @@ static inline
 void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key,
 			   struct futex_hash_bucket *hb)
 {
-	drop_futex_key_refs(&q->key);
 	get_futex_key_refs(key);
 	q->key = *key;
 
@@ -1227,6 +1226,7 @@ retry_private:
 		 */
 		if (ret == 1) {
 			WARN_ON(pi_state);
+			drop_count++;
 			task_count++;
 			ret = get_futex_value_locked(&curval2, uaddr2);
 			if (!ret)
@@ -1305,6 +1305,7 @@ retry_private:
 			if (ret == 1) {
 				/* We got the lock. */
 				requeue_pi_wake_futex(this, &key2, hb2);
+				drop_count++;
 				continue;
 			} else if (ret) {
 				/* -EDEADLK */
-- 
cgit v1.2.3


From 65a64464349883891e21e74af16c05d6e1eeb4e9 Mon Sep 17 00:00:00 2001
From: Andi Kleen <ak@linux.intel.com>
Date: Wed, 14 Oct 2009 06:22:47 +0200
Subject: HWPOISON: Allow schedule_on_each_cpu() from keventd

Right now when calling schedule_on_each_cpu() from keventd there
is a deadlock because it tries to schedule a work item on the current CPU
too. This happens via lru_add_drain_all() in hwpoison.

Just call the function for the current CPU in this case. This is actually
faster too.

Debugging with Fengguang Wu & Max Asbock

Signed-off-by: Andi Kleen <ak@linux.intel.com>
---
 kernel/workqueue.c | 21 +++++++++++++++++++--
 1 file changed, 19 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index addfe2df93b1..f61a2fecf281 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -667,21 +667,38 @@ EXPORT_SYMBOL(schedule_delayed_work_on);
 int schedule_on_each_cpu(work_func_t func)
 {
 	int cpu;
+	int orig = -1;
 	struct work_struct *works;
 
 	works = alloc_percpu(struct work_struct);
 	if (!works)
 		return -ENOMEM;
 
+	/*
+	 * when running in keventd don't schedule a work item on itself.
+	 * Can just call directly because the work queue is already bound.
+	 * This also is faster.
+	 * Make this a generic parameter for other workqueues?
+	 */
+	if (current_is_keventd()) {
+		orig = raw_smp_processor_id();
+		INIT_WORK(per_cpu_ptr(works, orig), func);
+		func(per_cpu_ptr(works, orig));
+	}
+
 	get_online_cpus();
 	for_each_online_cpu(cpu) {
 		struct work_struct *work = per_cpu_ptr(works, cpu);
 
+		if (cpu == orig)
+			continue;
 		INIT_WORK(work, func);
 		schedule_work_on(cpu, work);
 	}
-	for_each_online_cpu(cpu)
-		flush_work(per_cpu_ptr(works, cpu));
+	for_each_online_cpu(cpu) {
+		if (cpu != orig)
+			flush_work(per_cpu_ptr(works, cpu));
+	}
 	put_online_cpus();
 	free_percpu(works);
 	return 0;
-- 
cgit v1.2.3


From 04bf7539c08d64184736cdc5e4ad617eda77eb0f Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Tue, 20 Oct 2009 06:45:02 +0200
Subject: PM: Make warning in suspend_test_finish() less likely to happen

Increase TEST_SUSPEND_SECONDS to 10 so the warning in
suspend_test_finish() doesn't annoy the users of slower systems so much.

Also, make the warning print the suspend-resume cycle time, so that we
know why the warning actually triggered.

Patch prepared during the hacking session at the Kernel Summit in Tokyo.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/power/suspend_test.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/suspend_test.c b/kernel/power/suspend_test.c
index 17d8bb1acf9c..25596e450ac7 100644
--- a/kernel/power/suspend_test.c
+++ b/kernel/power/suspend_test.c
@@ -19,7 +19,7 @@
  * The time it takes is system-specific though, so when we test this
  * during system bootup we allow a LOT of time.
  */
-#define TEST_SUSPEND_SECONDS	5
+#define TEST_SUSPEND_SECONDS	10
 
 static unsigned long suspend_test_start_time;
 
@@ -49,7 +49,8 @@ void suspend_test_finish(const char *label)
 	 * has some performance issues.  The stack dump of a WARN_ON
 	 * is more likely to get the right attention than a printk...
 	 */
-	WARN(msec > (TEST_SUSPEND_SECONDS * 1000), "Component: %s\n", label);
+	WARN(msec > (TEST_SUSPEND_SECONDS * 1000),
+	     "Component: %s, time: %u\n", label, msec);
 }
 
 /*
-- 
cgit v1.2.3


From 721a669b7225edeeb0ca8e2bf71b83882326a71b Mon Sep 17 00:00:00 2001
From: Soeren Sandmann <sandmann@daimi.au.dk>
Date: Tue, 15 Sep 2009 14:33:08 +0200
Subject: perf events: Fix swevent hrtimer sampling by keeping track of
 remaining time when enabling/disabling swevent hrtimers
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Make the hrtimer based events work for sysprof.

Whenever a swevent is scheduled out, the hrtimer is canceled.
When it is scheduled back in, the timer is restarted. This
happens every scheduler tick, which means the timer never
expired because it was getting repeatedly restarted over and
over with the same period.

To fix that, save the remaining time when disabling; when
reenabling, use that saved time as the period instead of the
user-specified sampling period.

Also, move the starting and stopping of the hrtimers to helper
functions instead of duplicating the code.

Signed-off-by: Søren Sandmann Pedersen <sandmann@redhat.com>
LKML-Reference: <ye8vdi7mluz.fsf@camel16.daimi.au.dk>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_event.h |  4 +--
 kernel/perf_event.c        | 61 +++++++++++++++++++++++++++++++---------------
 2 files changed, 43 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 2e6d95f97419..9e7012689a84 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -471,8 +471,8 @@ struct hw_perf_event {
 			unsigned long	event_base;
 			int		idx;
 		};
-		union { /* software */
-			atomic64_t	count;
+		struct { /* software */
+			s64		remaining;
 			struct hrtimer	hrtimer;
 		};
 	};
diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index afb7ef3dbc44..33ff019f9aa6 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -3969,6 +3969,42 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
 	return ret;
 }
 
+static void perf_swevent_start_hrtimer(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
+	hwc->hrtimer.function = perf_swevent_hrtimer;
+	if (hwc->sample_period) {
+		u64 period;
+
+		if (hwc->remaining) {
+			if (hwc->remaining < 0)
+				period = 10000;
+			else
+				period = hwc->remaining;
+			hwc->remaining = 0;
+		} else {
+			period = max_t(u64, 10000, hwc->sample_period);
+		}
+		__hrtimer_start_range_ns(&hwc->hrtimer,
+				ns_to_ktime(period), 0,
+				HRTIMER_MODE_REL, 0);
+	}
+}
+
+static void perf_swevent_cancel_hrtimer(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+
+	if (hwc->sample_period) {
+		ktime_t remaining = hrtimer_get_remaining(&hwc->hrtimer);
+		hwc->remaining = ktime_to_ns(remaining);
+
+		hrtimer_cancel(&hwc->hrtimer);
+	}
+}
+
 /*
  * Software event: cpu wall time clock
  */
@@ -3991,22 +4027,14 @@ static int cpu_clock_perf_event_enable(struct perf_event *event)
 	int cpu = raw_smp_processor_id();
 
 	atomic64_set(&hwc->prev_count, cpu_clock(cpu));
-	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-	hwc->hrtimer.function = perf_swevent_hrtimer;
-	if (hwc->sample_period) {
-		u64 period = max_t(u64, 10000, hwc->sample_period);
-		__hrtimer_start_range_ns(&hwc->hrtimer,
-				ns_to_ktime(period), 0,
-				HRTIMER_MODE_REL, 0);
-	}
+	perf_swevent_start_hrtimer(event);
 
 	return 0;
 }
 
 static void cpu_clock_perf_event_disable(struct perf_event *event)
 {
-	if (event->hw.sample_period)
-		hrtimer_cancel(&event->hw.hrtimer);
+	perf_swevent_cancel_hrtimer(event);
 	cpu_clock_perf_event_update(event);
 }
 
@@ -4043,22 +4071,15 @@ static int task_clock_perf_event_enable(struct perf_event *event)
 	now = event->ctx->time;
 
 	atomic64_set(&hwc->prev_count, now);
-	hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
-	hwc->hrtimer.function = perf_swevent_hrtimer;
-	if (hwc->sample_period) {
-		u64 period = max_t(u64, 10000, hwc->sample_period);
-		__hrtimer_start_range_ns(&hwc->hrtimer,
-				ns_to_ktime(period), 0,
-				HRTIMER_MODE_REL, 0);
-	}
+
+	perf_swevent_start_hrtimer(event);
 
 	return 0;
 }
 
 static void task_clock_perf_event_disable(struct perf_event *event)
 {
-	if (event->hw.sample_period)
-		hrtimer_cancel(&event->hw.hrtimer);
+	perf_swevent_cancel_hrtimer(event);
 	task_clock_perf_event_update(event, event->ctx->time);
 
 }
-- 
cgit v1.2.3


From 54f4407608c59712a8f5ec1e10dfac40bef5a2e7 Mon Sep 17 00:00:00 2001
From: Soeren Sandmann <sandmann@daimi.au.dk>
Date: Thu, 22 Oct 2009 18:34:08 +0200
Subject: perf events: Don't generate events for the idle task when
 exclude_idle is set
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Getting samples for the idle task is often not interesting, so
don't generate them when exclude_idle is set for the event in
question.

Signed-off-by: Søren Sandmann Pedersen <sandmann@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <ye8pr8fmlq7.fsf@camel16.daimi.au.dk>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_event.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/perf_event.c b/kernel/perf_event.c
index 33ff019f9aa6..7f29643c8985 100644
--- a/kernel/perf_event.c
+++ b/kernel/perf_event.c
@@ -3959,8 +3959,9 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer)
 		regs = task_pt_regs(current);
 
 	if (regs) {
-		if (perf_event_overflow(event, 0, &data, regs))
-			ret = HRTIMER_NORESTART;
+		if (!(event->attr.exclude_idle && current->pid == 0))
+			if (perf_event_overflow(event, 0, &data, regs))
+				ret = HRTIMER_NORESTART;
 	}
 
 	period = max_t(u64, 10000, event->hw.sample_period);
-- 
cgit v1.2.3


From f685ceacab07d3f6c236f04803e2f2f0dbcc5afb Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Fri, 23 Oct 2009 23:09:22 +0200
Subject: sched: Strengthen buddies and mitigate buddy induced latencies

This patch restores the effectiveness of LAST_BUDDY in preventing
pgsql+oltp from collapsing due to wakeup preemption. It also
switches LAST_BUDDY to exclusively do what it does best, namely
mitigate the effects of aggressive wakeup preemption, which
improves vmark throughput markedly, and restores mysql+oltp
scalability.

Since buddies are about scalability, enable them beginning at the
point where we begin expanding sched_latency, namely
sched_nr_latency. Previously, buddies were cleared aggressively,
which seriously reduced their effectiveness. Not clearing
aggressively however, produces a small drop in mysql+oltp
throughput immediately after peak, indicating that LAST_BUDDY is
actually doing some harm. This is right at the point where X on the
desktop in competition with another load wants low latency service.
Ergo, do not enable until we need to scale.

To mitigate latency induced by buddies, or by a task just missing
wakeup preemption, check latency at tick time.

Last hunk prevents buddies from stymieing BALANCE_NEWIDLE via
CACHE_HOT_BUDDY.

Supporting performance tests:

 tip   = v2.6.32-rc5-1497-ga525b32
 tipx  = NO_GENTLE_FAIR_SLEEPERS NEXT_BUDDY granularity knobs = 31 knobs + 31 buddies
 tip+x = NO_GENTLE_FAIR_SLEEPERS granularity knobs = 31 knobs

(Three run averages except where noted.)

 vmark:
 ------
 tip           108466 messages per second
 tip+          125307 messages per second
 tip+x         125335 messages per second
 tipx          117781 messages per second
 2.6.31.3      122729 messages per second

 mysql+oltp:
 -----------
 clients          1        2        4        8       16       32       64        128    256
 ..........................................................................................
 tip        9949.89 18690.20 34801.24 34460.04 32682.88 30765.97 28305.27 25059.64 19548.08
 tip+      10013.90 18526.84 34900.38 34420.14 33069.83 32083.40 30578.30 28010.71 25605.47
 tipx       9698.71 18002.70 34477.56 33420.01 32634.30 31657.27 29932.67 26827.52 21487.18
 2.6.31.3   8243.11 18784.20 34404.83 33148.38 31900.32 31161.90 29663.81 25995.94 18058.86

 pgsql+oltp:
 -----------
 clients          1        2        4        8       16       32       64      128      256
 ..........................................................................................
 tip       13686.37 26609.25 51934.28 51347.81 49479.51 45312.65 36691.91 26851.57 24145.35
 tip+ (1x) 13907.85 27135.87 52951.98 52514.04 51742.52 50705.43 49947.97 48374.19 46227.94
 tip+x     13906.78 27065.81 52951.19 52542.59 52176.11 51815.94 50838.90 49439.46 46891.00
 tipx      13742.46 26769.81 52351.99 51891.73 51320.79 50938.98 50248.65 48908.70 46553.84
 2.6.31.3  13815.35 26906.46 52683.34 52061.31 51937.10 51376.80 50474.28 49394.47 47003.25

Signed-off-by: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c      |  2 +-
 kernel/sched_fair.c | 73 ++++++++++++++++++++++++++++++++++-------------------
 2 files changed, 48 insertions(+), 27 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 789001da0a94..cae6700bedb3 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2008,7 +2008,7 @@ task_hot(struct task_struct *p, u64 now, struct sched_domain *sd)
 	/*
 	 * Buddy candidates are cache hot:
 	 */
-	if (sched_feat(CACHE_HOT_BUDDY) &&
+	if (sched_feat(CACHE_HOT_BUDDY) && this_rq()->nr_running &&
 			(&p->se == cfs_rq_of(&p->se)->next ||
 			 &p->se == cfs_rq_of(&p->se)->last))
 		return 1;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index c32c3e643daa..37087a7fac22 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -822,6 +822,26 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr)
 		 * re-elected due to buddy favours.
 		 */
 		clear_buddies(cfs_rq, curr);
+		return;
+	}
+
+	/*
+	 * Ensure that a task that missed wakeup preemption by a
+	 * narrow margin doesn't have to wait for a full slice.
+	 * This also mitigates buddy induced latencies under load.
+	 */
+	if (!sched_feat(WAKEUP_PREEMPT))
+		return;
+
+	if (delta_exec < sysctl_sched_min_granularity)
+		return;
+
+	if (cfs_rq->nr_running > 1) {
+		struct sched_entity *se = __pick_next_entity(cfs_rq);
+		s64 delta = curr->vruntime - se->vruntime;
+
+		if (delta > ideal_runtime)
+			resched_task(rq_of(cfs_rq)->curr);
 	}
 }
 
@@ -861,21 +881,18 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se);
 static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
 {
 	struct sched_entity *se = __pick_next_entity(cfs_rq);
-	struct sched_entity *buddy;
+	struct sched_entity *left = se;
 
-	if (cfs_rq->next) {
-		buddy = cfs_rq->next;
-		cfs_rq->next = NULL;
-		if (wakeup_preempt_entity(buddy, se) < 1)
-			return buddy;
-	}
+	if (cfs_rq->next && wakeup_preempt_entity(cfs_rq->next, left) < 1)
+		se = cfs_rq->next;
 
-	if (cfs_rq->last) {
-		buddy = cfs_rq->last;
-		cfs_rq->last = NULL;
-		if (wakeup_preempt_entity(buddy, se) < 1)
-			return buddy;
-	}
+	/*
+	 * Prefer last buddy, try to return the CPU to a preempted task.
+	 */
+	if (cfs_rq->last && wakeup_preempt_entity(cfs_rq->last, left) < 1)
+		se = cfs_rq->last;
+
+	clear_buddies(cfs_rq, se);
 
 	return se;
 }
@@ -1577,6 +1594,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 	struct sched_entity *se = &curr->se, *pse = &p->se;
 	struct cfs_rq *cfs_rq = task_cfs_rq(curr);
 	int sync = wake_flags & WF_SYNC;
+	int scale = cfs_rq->nr_running >= sched_nr_latency;
 
 	update_curr(cfs_rq);
 
@@ -1591,18 +1609,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 	if (unlikely(se == pse))
 		return;
 
-	/*
-	 * Only set the backward buddy when the current task is still on the
-	 * rq. This can happen when a wakeup gets interleaved with schedule on
-	 * the ->pre_schedule() or idle_balance() point, either of which can
-	 * drop the rq lock.
-	 *
-	 * Also, during early boot the idle thread is in the fair class, for
-	 * obvious reasons its a bad idea to schedule back to the idle thread.
-	 */
-	if (sched_feat(LAST_BUDDY) && likely(se->on_rq && curr != rq->idle))
-		set_last_buddy(se);
-	if (sched_feat(NEXT_BUDDY) && !(wake_flags & WF_FORK))
+	if (sched_feat(NEXT_BUDDY) && scale && !(wake_flags & WF_FORK))
 		set_next_buddy(pse);
 
 	/*
@@ -1648,8 +1655,22 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
 
 	BUG_ON(!pse);
 
-	if (wakeup_preempt_entity(se, pse) == 1)
+	if (wakeup_preempt_entity(se, pse) == 1) {
 		resched_task(curr);
+		/*
+		 * Only set the backward buddy when the current task is still
+		 * on the rq. This can happen when a wakeup gets interleaved
+		 * with schedule on the ->pre_schedule() or idle_balance()
+		 * point, either of which can * drop the rq lock.
+		 *
+		 * Also, during early boot the idle thread is in the fair class,
+		 * for obvious reasons its a bad idea to schedule back to it.
+		 */
+		if (unlikely(!se->on_rq || curr == rq->idle))
+			return;
+		if (sched_feat(LAST_BUDDY) && scale && entity_is_task(se))
+			set_last_buddy(se);
+	}
 }
 
 static struct task_struct *pick_next_task_fair(struct rq *rq)
-- 
cgit v1.2.3


From cf8517cf905b5cd31d5790250b9ac39f7cb8aa53 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@redhat.com>
Date: Fri, 23 Oct 2009 19:36:16 -0400
Subject: tracing: Update *ppos instead of filp->f_pos

Instead of directly updating filp->f_pos we should update the *ppos
argument. The filp->f_pos gets updated within the file_pos_write()
function called from sys_write().

Signed-off-by: Jiri Olsa <jolsa@redhat.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20091023233646.399670810@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 2 +-
 kernel/trace/trace.c  | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 37ba67e33265..9c451a1930b6 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -740,7 +740,7 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf,
  out:
 	mutex_unlock(&ftrace_profile_lock);
 
-	filp->f_pos += cnt;
+	*ppos += cnt;
 
 	return cnt;
 }
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c820b0310a12..b20d3ec75de9 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2440,7 +2440,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
 			return ret;
 	}
 
-	filp->f_pos += cnt;
+	*ppos += cnt;
 
 	return cnt;
 }
@@ -2582,7 +2582,7 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf,
 	}
 	mutex_unlock(&trace_types_lock);
 
-	filp->f_pos += cnt;
+	*ppos += cnt;
 
 	return cnt;
 }
@@ -2764,7 +2764,7 @@ tracing_set_trace_write(struct file *filp, const char __user *ubuf,
 	if (err)
 		return err;
 
-	filp->f_pos += ret;
+	*ppos += ret;
 
 	return ret;
 }
@@ -3299,7 +3299,7 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
 		}
 	}
 
-	filp->f_pos += cnt;
+	*ppos += cnt;
 
 	/* If check pages failed, return ENOMEM */
 	if (tracing_disabled)
-- 
cgit v1.2.3


From 3e69533b51930a7169235db2caf703884e6e3bbb Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@redhat.com>
Date: Fri, 23 Oct 2009 19:36:17 -0400
Subject: tracing: Fix trace_seq_printf() return value

trace_seq_printf() return value is a little ambiguous. It
currently returns the length of the space available in the
buffer. printf usually returns the amount written. This is not
adequate here, because:

  trace_seq_printf(s, "");

is perfectly legal, and returning 0 would indicate that it
failed.

We can always see the amount written by looking at the before
and after values of s->len. This is not quite the same use as
printf. We only care if the string was successfully written to
the buffer or not.

Make trace_seq_printf() return 0 if the trace oversizes the
buffer's free space, 1 otherwise.

Signed-off-by: Jiri Olsa <jolsa@redhat.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20091023233646.631787612@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_output.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index ed17565826b0..b6c12c6a1bcd 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -69,6 +69,9 @@ enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter)
  * @s: trace sequence descriptor
  * @fmt: printf format string
  *
+ * It returns 0 if the trace oversizes the buffer's free
+ * space, 1 otherwise.
+ *
  * The tracer may use either sequence operations or its own
  * copy to user routines. To simplify formating of a trace
  * trace_seq_printf is used to store strings into a special
@@ -95,7 +98,7 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
 
 	s->len += ret;
 
-	return len;
+	return 1;
 }
 EXPORT_SYMBOL_GPL(trace_seq_printf);
 
-- 
cgit v1.2.3


From 67b394f7f26d84edb7294cc6528ab7ca6daa2ad1 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@redhat.com>
Date: Fri, 23 Oct 2009 19:36:18 -0400
Subject: tracing: Fix comment typo and documentation example

Trivial patch to fix a documentation example and to fix a
comment.

Signed-off-by: Jiri Olsa <jolsa@redhat.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20091023233646.871719877@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/trace/ftrace.txt | 2 ++
 kernel/trace/ring_buffer.c     | 2 +-
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/Documentation/trace/ftrace.txt b/Documentation/trace/ftrace.txt
index 957b22fde2df..8179692fbb90 100644
--- a/Documentation/trace/ftrace.txt
+++ b/Documentation/trace/ftrace.txt
@@ -1231,6 +1231,7 @@ something like this simple program:
 #include <sys/stat.h>
 #include <fcntl.h>
 #include <unistd.h>
+#include <string.h>
 
 #define _STR(x) #x
 #define STR(x) _STR(x)
@@ -1265,6 +1266,7 @@ const char *find_debugfs(void)
                return NULL;
        }
 
+       strcat(debugfs, "/tracing/");
        debugfs_found = 1;
 
        return debugfs;
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index d4ff01970547..217f6991184f 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2681,7 +2681,7 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer)
 EXPORT_SYMBOL_GPL(ring_buffer_entries);
 
 /**
- * ring_buffer_overrun_cpu - get the number of overruns in buffer
+ * ring_buffer_overruns - get the number of overruns in buffer
  * @buffer: The ring buffer
  *
  * Returns the total number of overruns in the ring buffer
-- 
cgit v1.2.3


From 6d3f1e12f46a2f9a1bb7e7aa433df8dd31ce5647 Mon Sep 17 00:00:00 2001
From: Jiri Olsa <jolsa@redhat.com>
Date: Fri, 23 Oct 2009 19:36:19 -0400
Subject: tracing: Remove cpu arg from the rb_time_stamp() function

The cpu argument is not used inside the rb_time_stamp() function.
Plus fix a typo.

Signed-off-by: Jiri Olsa <jolsa@redhat.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20091023233647.118547500@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/trace_seq.h  |  2 +-
 kernel/trace/ring_buffer.c | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h
index c134dd1fe6b6..09077f6ed128 100644
--- a/include/linux/trace_seq.h
+++ b/include/linux/trace_seq.h
@@ -7,7 +7,7 @@
 
 /*
  * Trace sequences are used to allow a function to call several other functions
- * to create a string of data to use (up to a max of PAGE_SIZE.
+ * to create a string of data to use (up to a max of PAGE_SIZE).
  */
 
 struct trace_seq {
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 217f6991184f..3ffa502fb243 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -483,7 +483,7 @@ struct ring_buffer_iter {
 /* Up this if you want to test the TIME_EXTENTS and normalization */
 #define DEBUG_SHIFT 0
 
-static inline u64 rb_time_stamp(struct ring_buffer *buffer, int cpu)
+static inline u64 rb_time_stamp(struct ring_buffer *buffer)
 {
 	/* shift to debug/test normalization and TIME_EXTENTS */
 	return buffer->clock() << DEBUG_SHIFT;
@@ -494,7 +494,7 @@ u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu)
 	u64 time;
 
 	preempt_disable_notrace();
-	time = rb_time_stamp(buffer, cpu);
+	time = rb_time_stamp(buffer);
 	preempt_enable_no_resched_notrace();
 
 	return time;
@@ -599,7 +599,7 @@ static struct list_head *rb_list_head(struct list_head *list)
 }
 
 /*
- * rb_is_head_page - test if the give page is the head page
+ * rb_is_head_page - test if the given page is the head page
  *
  * Because the reader may move the head_page pointer, we can
  * not trust what the head page is (it may be pointing to
@@ -1868,7 +1868,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer,
 		 * Nested commits always have zero deltas, so
 		 * just reread the time stamp
 		 */
-		*ts = rb_time_stamp(buffer, cpu_buffer->cpu);
+		*ts = rb_time_stamp(buffer);
 		next_page->page->time_stamp = *ts;
 	}
 
@@ -2111,7 +2111,7 @@ rb_reserve_next_event(struct ring_buffer *buffer,
 	if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
 		goto out_fail;
 
-	ts = rb_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu);
+	ts = rb_time_stamp(cpu_buffer->buffer);
 
 	/*
 	 * Only the first commit can update the timestamp.
-- 
cgit v1.2.3


From 4a6cc4bd32e580722882115d4c8b964d732c11e4 Mon Sep 17 00:00:00 2001
From: Jiri Kosina <jkosina@suse.cz>
Date: Thu, 29 Oct 2009 00:26:00 +0900
Subject: sched: move rq_weight data array out of .percpu

Commit 34d76c41 introduced percpu array update_shares_data, size of which
being proportional to NR_CPUS. Unfortunately this blows up ia64 for large
NR_CPUS configuration, as ia64 allows only 64k for .percpu section.

Fix this by allocating this array dynamically and keep only pointer to it
percpu.

The per-cpu handling doesn't impose significant performance penalty on
potentially contented path in tg_shares_up().

...
ffffffff8104337c:       65 48 8b 14 25 20 cd    mov    %gs:0xcd20,%rdx
ffffffff81043383:       00 00
ffffffff81043385:       48 c7 c0 00 e1 00 00    mov    $0xe100,%rax
ffffffff8104338c:       48 c7 45 a0 00 00 00    movq   $0x0,-0x60(%rbp)
ffffffff81043393:       00
ffffffff81043394:       48 c7 45 a8 00 00 00    movq   $0x0,-0x58(%rbp)
ffffffff8104339b:       00
ffffffff8104339c:       48 01 d0                add    %rdx,%rax
ffffffff8104339f:       49 8d 94 24 08 01 00    lea    0x108(%r12),%rdx
ffffffff810433a6:       00
ffffffff810433a7:       b9 ff ff ff ff          mov    $0xffffffff,%ecx
ffffffff810433ac:       48 89 45 b0             mov    %rax,-0x50(%rbp)
ffffffff810433b0:       bb 00 04 00 00          mov    $0x400,%ebx
ffffffff810433b5:       48 89 55 c0             mov    %rdx,-0x40(%rbp)
...

After:

...
ffffffff8104337c:       65 8b 04 25 28 cd 00    mov    %gs:0xcd28,%eax
ffffffff81043383:       00
ffffffff81043384:       48 98                   cltq
ffffffff81043386:       49 8d bc 24 08 01 00    lea    0x108(%r12),%rdi
ffffffff8104338d:       00
ffffffff8104338e:       48 8b 15 d3 7f 76 00    mov    0x767fd3(%rip),%rdx        # ffffffff817ab368 <update_shares_data>
ffffffff81043395:       48 8b 34 c5 00 ee 6d    mov    -0x7e921200(,%rax,8),%rsi
ffffffff8104339c:       81
ffffffff8104339d:       48 c7 45 a0 00 00 00    movq   $0x0,-0x60(%rbp)
ffffffff810433a4:       00
ffffffff810433a5:       b9 ff ff ff ff          mov    $0xffffffff,%ecx
ffffffff810433aa:       48 89 7d c0             mov    %rdi,-0x40(%rbp)
ffffffff810433ae:       48 c7 45 a8 00 00 00    movq   $0x0,-0x58(%rbp)
ffffffff810433b5:       00
ffffffff810433b6:       bb 00 04 00 00          mov    $0x400,%ebx
ffffffff810433bb:       48 01 f2                add    %rsi,%rdx
ffffffff810433be:       48 89 55 b0             mov    %rdx,-0x50(%rbp)
...

Signed-off-by: Jiri Kosina <jkosina@suse.cz>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/sched.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index ee61f454a98b..526d237b8ce5 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1563,11 +1563,7 @@ static unsigned long cpu_avg_load_per_task(int cpu)
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
 
-struct update_shares_data {
-	unsigned long rq_weight[NR_CPUS];
-};
-
-static DEFINE_PER_CPU(struct update_shares_data, update_shares_data);
+static __read_mostly unsigned long *update_shares_data;
 
 static void __set_se_shares(struct sched_entity *se, unsigned long shares);
 
@@ -1577,12 +1573,12 @@ static void __set_se_shares(struct sched_entity *se, unsigned long shares);
 static void update_group_shares_cpu(struct task_group *tg, int cpu,
 				    unsigned long sd_shares,
 				    unsigned long sd_rq_weight,
-				    struct update_shares_data *usd)
+				    unsigned long *usd_rq_weight)
 {
 	unsigned long shares, rq_weight;
 	int boost = 0;
 
-	rq_weight = usd->rq_weight[cpu];
+	rq_weight = usd_rq_weight[cpu];
 	if (!rq_weight) {
 		boost = 1;
 		rq_weight = NICE_0_LOAD;
@@ -1617,7 +1613,7 @@ static void update_group_shares_cpu(struct task_group *tg, int cpu,
 static int tg_shares_up(struct task_group *tg, void *data)
 {
 	unsigned long weight, rq_weight = 0, shares = 0;
-	struct update_shares_data *usd;
+	unsigned long *usd_rq_weight;
 	struct sched_domain *sd = data;
 	unsigned long flags;
 	int i;
@@ -1626,11 +1622,11 @@ static int tg_shares_up(struct task_group *tg, void *data)
 		return 0;
 
 	local_irq_save(flags);
-	usd = &__get_cpu_var(update_shares_data);
+	usd_rq_weight = per_cpu_ptr(update_shares_data, smp_processor_id());
 
 	for_each_cpu(i, sched_domain_span(sd)) {
 		weight = tg->cfs_rq[i]->load.weight;
-		usd->rq_weight[i] = weight;
+		usd_rq_weight[i] = weight;
 
 		/*
 		 * If there are currently no tasks on the cpu pretend there
@@ -1651,7 +1647,7 @@ static int tg_shares_up(struct task_group *tg, void *data)
 		shares = tg->shares;
 
 	for_each_cpu(i, sched_domain_span(sd))
-		update_group_shares_cpu(tg, i, shares, rq_weight, usd);
+		update_group_shares_cpu(tg, i, shares, rq_weight, usd_rq_weight);
 
 	local_irq_restore(flags);
 
@@ -9406,6 +9402,10 @@ void __init sched_init(void)
 #endif /* CONFIG_USER_SCHED */
 #endif /* CONFIG_GROUP_SCHED */
 
+#if defined CONFIG_FAIR_GROUP_SCHED && defined CONFIG_SMP
+	update_shares_data = __alloc_percpu(nr_cpu_ids * sizeof(unsigned long),
+					    __alignof__(unsigned long));
+#endif
 	for_each_possible_cpu(i) {
 		struct rq *rq;
 
-- 
cgit v1.2.3


From 11df6dddcbc38affb7473aad3d962baf8414a947 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 28 Oct 2009 20:26:48 +0100
Subject: futex: Fix spurious wakeup for requeue_pi really

The requeue_pi path doesn't use unqueue_me() (and the racy lock_ptr ==
NULL test) nor does it use the wake_list of futex_wake() which where
the reason for commit 41890f2 (futex: Handle spurious wake up)

See debugging discussing on LKML Message-ID: <4AD4080C.20703@us.ibm.com>

The changes in this fix to the wait_requeue_pi path were considered to
be a likely unecessary, but harmless safety net. But it turns out that
due to the fact that for unknown $@#!*( reasons EWOULDBLOCK is defined
as EAGAIN we built an endless loop in the code path which returns
correctly EWOULDBLOCK.

Spurious wakeups in wait_requeue_pi code path are unlikely so we do
the easy solution and return EWOULDBLOCK^WEAGAIN to user space and let
it deal with the spurious wakeup.

Cc: Darren Hart <dvhltc@us.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: John Stultz <johnstul@linux.vnet.ibm.com>
Cc: Dinakar Guniguntala <dino@in.ibm.com>
LKML-Reference: <4AE23C74.1090502@us.ibm.com>
Cc: stable@kernel.org
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/futex.c | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index 642f3bbaacc7..fb65e822fc41 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -2127,7 +2127,7 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb,
 		plist_del(&q->list, &q->list.plist);
 
 		/* Handle spurious wakeups gracefully */
-		ret = -EAGAIN;
+		ret = -EWOULDBLOCK;
 		if (timeout && !timeout->task)
 			ret = -ETIMEDOUT;
 		else if (signal_pending(current))
@@ -2208,7 +2208,6 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared,
 	debug_rt_mutex_init_waiter(&rt_waiter);
 	rt_waiter.task = NULL;
 
-retry:
 	key2 = FUTEX_KEY_INIT;
 	ret = get_futex_key(uaddr2, fshared, &key2, VERIFY_WRITE);
 	if (unlikely(ret != 0))
@@ -2303,9 +2302,6 @@ out_put_keys:
 out_key2:
 	put_futex_key(fshared, &key2);
 
-	/* Spurious wakeup ? */
-	if (ret == -EAGAIN)
-		goto retry;
 out:
 	if (to) {
 		hrtimer_cancel(&to->timer);
-- 
cgit v1.2.3


From 65afac7d80ab3bc9f81e75eafb71eeb92a3ebdef Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 29 Oct 2009 08:56:16 -0600
Subject: param: fix lots of bugs with writing charp params from sysfs, by
 leaking mem.

e180a6b7759a "param: fix charp parameters set via sysfs" fixed the case
where charp parameters written via sysfs were freed, leaving drivers
accessing random memory.

Unfortunately, storing a flag in the kparam struct was a bad idea: it's
rodata so setting it causes an oops on some archs.  But that's not all:

1) module_param_array() on charp doesn't work reliably, since we use an
   uninitialized temporary struct kernel_param.
2) there's a fundamental race if a module uses this parameter and then
   it's changed: they will still access the old, freed, memory.

The simplest fix (ie. for 2.6.32) is to never free the memory.  This
prevents all these problems, at cost of a memory leak.  In practice, there
are only 18 places where a charp is writable via sysfs, and all are
root-only writable.

Reported-by: Takashi Iwai <tiwai@suse.de>
Cc: Sitsofe Wheeler <sitsofe@yahoo.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Christof Schmitt <christof.schmitt@de.ibm.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: stable@kernel.org
---
 include/linux/moduleparam.h |  1 -
 kernel/params.c             | 10 +---------
 2 files changed, 1 insertion(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h
index 6547c3cdbc4c..82a9124f7d75 100644
--- a/include/linux/moduleparam.h
+++ b/include/linux/moduleparam.h
@@ -37,7 +37,6 @@ typedef int (*param_set_fn)(const char *val, struct kernel_param *kp);
 typedef int (*param_get_fn)(char *buffer, struct kernel_param *kp);
 
 /* Flag bits for kernel_param.flags */
-#define KPARAM_KMALLOCED	1
 #define KPARAM_ISBOOL		2
 
 struct kernel_param {
diff --git a/kernel/params.c b/kernel/params.c
index 9da58eabdcb2..95ef27cf8e82 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -218,13 +218,9 @@ int param_set_charp(const char *val, struct kernel_param *kp)
 		return -ENOSPC;
 	}
 
-	if (kp->flags & KPARAM_KMALLOCED)
-		kfree(*(char **)kp->arg);
-
 	/* This is a hack.  We can't need to strdup in early boot, and we
 	 * don't need to; this mangled commandline is preserved. */
 	if (slab_is_available()) {
-		kp->flags |= KPARAM_KMALLOCED;
 		*(char **)kp->arg = kstrdup(val, GFP_KERNEL);
 		if (!kp->arg)
 			return -ENOMEM;
@@ -605,11 +601,7 @@ void module_param_sysfs_remove(struct module *mod)
 
 void destroy_params(const struct kernel_param *params, unsigned num)
 {
-	unsigned int i;
-
-	for (i = 0; i < num; i++)
-		if (params[i].flags & KPARAM_KMALLOCED)
-			kfree(*(char **)params[i].arg);
+	/* FIXME: This should free kmalloced charp parameters.  It doesn't. */
 }
 
 static void __init kernel_add_sysfs_param(const char *name,
-- 
cgit v1.2.3


From d553ad864e3b3dde3f1038d491e207021b2d6293 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 29 Oct 2009 08:56:17 -0600
Subject: param: fix NULL comparison on oom

kp->arg is always true: it's the contents of that pointer we care about.

Reported-by: Takashi Iwai <tiwai@suse.de>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: stable@kernel.org
---
 kernel/params.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/params.c b/kernel/params.c
index 95ef27cf8e82..00520c43d88c 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -222,7 +222,7 @@ int param_set_charp(const char *val, struct kernel_param *kp)
 	 * don't need to; this mangled commandline is preserved. */
 	if (slab_is_available()) {
 		*(char **)kp->arg = kstrdup(val, GFP_KERNEL);
-		if (!kp->arg)
+		if (!*(char **)kp->arg)
 			return -ENOMEM;
 	} else
 		*(const char **)kp->arg = val;
-- 
cgit v1.2.3


From 3c7d76e371ac1a3802ae1673f5c63554af59325c Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 29 Oct 2009 08:56:19 -0600
Subject: param: fix setting arrays of bool

We create a dummy struct kernel_param on the stack for parsing each
array element, but we didn't initialize the flags word.  This matters
for arrays of type "bool", where the flag indicates if it really is
an array of bools or unsigned int (old-style).

Reported-by: Takashi Iwai <tiwai@suse.de>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: stable@kernel.org
---
 kernel/params.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/params.c b/kernel/params.c
index 00520c43d88c..d656c276508d 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -300,6 +300,7 @@ static int param_array(const char *name,
 		       unsigned int min, unsigned int max,
 		       void *elem, int elemsize,
 		       int (*set)(const char *, struct kernel_param *kp),
+		       u16 flags,
 		       unsigned int *num)
 {
 	int ret;
@@ -309,6 +310,7 @@ static int param_array(const char *name,
 	/* Get the name right for errors. */
 	kp.name = name;
 	kp.arg = elem;
+	kp.flags = flags;
 
 	/* No equals sign? */
 	if (!val) {
@@ -354,7 +356,8 @@ int param_array_set(const char *val, struct kernel_param *kp)
 	unsigned int temp_num;
 
 	return param_array(kp->name, val, 1, arr->max, arr->elem,
-			   arr->elemsize, arr->set, arr->num ?: &temp_num);
+			   arr->elemsize, arr->set, kp->flags,
+			   arr->num ?: &temp_num);
 }
 
 int param_array_get(char *buffer, struct kernel_param *kp)
-- 
cgit v1.2.3


From 0d0df599f1f11f12d589318bacb59a50fb5c0310 Mon Sep 17 00:00:00 2001
From: Christian Borntraeger <borntraeger@de.ibm.com>
Date: Mon, 26 Oct 2009 16:49:34 -0700
Subject: connector: fix regression introduced by sid connector

Since commit 02b51df1b07b4e9ca823c89284e704cadb323cd1 (proc connector: add
event for process becoming session leader) we have the following warning:

Badness at kernel/softirq.c:143
[...]
Krnl PSW : 0404c00180000000 00000000001481d4 (local_bh_enable+0xb0/0xe0)
[...]
Call Trace:
([<000000013fe04100>] 0x13fe04100)
 [<000000000048a946>] sk_filter+0x9a/0xd0
 [<000000000049d938>] netlink_broadcast+0x2c0/0x53c
 [<00000000003ba9ae>] cn_netlink_send+0x272/0x2b0
 [<00000000003baef0>] proc_sid_connector+0xc4/0xd4
 [<0000000000142604>] __set_special_pids+0x58/0x90
 [<0000000000159938>] sys_setsid+0xb4/0xd8
 [<00000000001187fe>] sysc_noemu+0x10/0x16
 [<00000041616cb266>] 0x41616cb266

The warning is
--->    WARN_ON_ONCE(in_irq() || irqs_disabled());

The network code must not be called with disabled interrupts but
sys_setsid holds the tasklist_lock with spinlock_irq while calling the
connector.

After a discussion we agreed that we can move proc_sid_connector from
__set_special_pids to sys_setsid.

We also agreed that it is sufficient to change the check from
task_session(curr) != pid into err > 0, since if we don't change the
session, this means we were already the leader and return -EPERM.

One last thing:
There is also daemonize(), and some people might want to get a
notification in that case. Since daemonize() is only needed if a user
space does kernel_thread this does not look important (and there seems
to be no consensus if this connector should be called in daemonize). If
we really want this, we can add proc_sid_connector to daemonize() in an
additional patch (Scott?)

Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Scott James Remnant <scott@ubuntu.com>
Cc: Matt Helsley <matthltc@us.ibm.com>
Cc: David S. Miller <davem@davemloft.net>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Evgeniy Polyakov <zbr@ioremap.net>
Acked-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 4 +---
 kernel/sys.c  | 2 ++
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index e61891f80123..f7864ac2ecc1 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -359,10 +359,8 @@ void __set_special_pids(struct pid *pid)
 {
 	struct task_struct *curr = current->group_leader;
 
-	if (task_session(curr) != pid) {
+	if (task_session(curr) != pid)
 		change_pid(curr, PIDTYPE_SID, pid);
-		proc_sid_connector(curr);
-	}
 
 	if (task_pgrp(curr) != pid)
 		change_pid(curr, PIDTYPE_PGID, pid);
diff --git a/kernel/sys.c b/kernel/sys.c
index 255475d163e0..1828f8d10844 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1110,6 +1110,8 @@ SYSCALL_DEFINE0(setsid)
 	err = session;
 out:
 	write_unlock_irq(&tasklist_lock);
+	if (err > 0)
+		proc_sid_connector(group_leader);
 	return err;
 }
 
-- 
cgit v1.2.3


From 478988d3b28e98a31e0cfe24e011e28ba0f3cf0d Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Mon, 26 Oct 2009 16:49:36 -0700
Subject: cgroup: fix strstrip() misuse

cgroup_write_X64() and cgroup_write_string() ignore the return value of
strstrip().  it makes small inconsistent behavior.

example:
=========================
 # cd /mnt/cgroup/hoge
 # cat memory.swappiness
 60
 # echo "59 " > memory.swappiness
 # cat memory.swappiness
 59
 # echo " 58" > memory.swappiness
 bash: echo: write error: Invalid argument

This patch fixes it.

Cc: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Paul Menage <menage@google.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index ca83b73fba19..0249f4be9b5c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1710,14 +1710,13 @@ static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft,
 		return -EFAULT;
 
 	buffer[nbytes] = 0;     /* nul-terminate */
-	strstrip(buffer);
 	if (cft->write_u64) {
-		u64 val = simple_strtoull(buffer, &end, 0);
+		u64 val = simple_strtoull(strstrip(buffer), &end, 0);
 		if (*end)
 			return -EINVAL;
 		retval = cft->write_u64(cgrp, cft, val);
 	} else {
-		s64 val = simple_strtoll(buffer, &end, 0);
+		s64 val = simple_strtoll(strstrip(buffer), &end, 0);
 		if (*end)
 			return -EINVAL;
 		retval = cft->write_s64(cgrp, cft, val);
@@ -1753,8 +1752,7 @@ static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft,
 	}
 
 	buffer[nbytes] = 0;     /* nul-terminate */
-	strstrip(buffer);
-	retval = cft->write_string(cgrp, cft, buffer);
+	retval = cft->write_string(cgrp, cft, strstrip(buffer));
 	if (!retval)
 		retval = nbytes;
 out:
-- 
cgit v1.2.3


From 8c85dd8730bfb696e691145335f884c7baef8277 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Mon, 26 Oct 2009 16:50:07 -0700
Subject: sysctl: fix false positives when PROC_SYSCTL=n

Having ->procname but not ->proc_handler is valid when PROC_SYSCTL=n,
people use such combination to reduce ifdefs with non-standard handlers.

Addresses http://bugzilla.kernel.org/show_bug.cgi?id=14408

Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Reported-by: Peter Teoh <htmldeveloper@gmail.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sysctl_check.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index b38423ca711a..b6e7aaea4604 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -1521,7 +1521,7 @@ int sysctl_check_table(struct nsproxy *namespaces, struct ctl_table *table)
 			if (!table->ctl_name && table->strategy)
 				set_fail(&fail, table, "Strategy without ctl_name");
 #endif
-#ifdef CONFIG_PROC_FS
+#ifdef CONFIG_PROC_SYSCTL
 			if (table->procname && !table->proc_handler)
 				set_fail(&fail, table, "No proc_handler");
 #endif
-- 
cgit v1.2.3


From 49557e620339cb134127b5bfbcfecc06b77d0232 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 2 Nov 2009 20:37:20 +1030
Subject: sched: Fix boot crash by zalloc()ing most of the cpu masks

I got a boot crash when forcing cpumasks offstack on 32 bit,
because find_new_ilb() returned 3 on my UP system (nohz.cpu_mask
wasn't zeroed).

AFAICT the others need to be zeroed too: only
nohz.ilb_grp_nohz_mask is initialized before use.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <200911022037.21282.rusty@rustcorp.com.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index cae6700bedb3..bf21adb6c9fc 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -9535,13 +9535,13 @@ void __init sched_init(void)
 	current->sched_class = &fair_sched_class;
 
 	/* Allocate the nohz_cpu_mask if CONFIG_CPUMASK_OFFSTACK */
-	alloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
+	zalloc_cpumask_var(&nohz_cpu_mask, GFP_NOWAIT);
 #ifdef CONFIG_SMP
 #ifdef CONFIG_NO_HZ
-	alloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
+	zalloc_cpumask_var(&nohz.cpu_mask, GFP_NOWAIT);
 	alloc_cpumask_var(&nohz.ilb_grp_nohz_mask, GFP_NOWAIT);
 #endif
-	alloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
+	zalloc_cpumask_var(&cpu_isolated_map, GFP_NOWAIT);
 #endif /* SMP */
 
 	perf_event_init();
-- 
cgit v1.2.3


From b00bc0b237055b4c45816325ee14f0bd83e6f590 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 2 Nov 2009 13:01:56 +0100
Subject: uids: Prevent tear down race

Ingo triggered the following warning:

WARNING: at lib/debugobjects.c:255 debug_print_object+0x42/0x50()
Hardware name: System Product Name
ODEBUG: init active object type: timer_list
Modules linked in:
Pid: 2619, comm: dmesg Tainted: G        W  2.6.32-rc5-tip+ #5298
Call Trace:
 [<81035443>] warn_slowpath_common+0x6a/0x81
 [<8120e483>] ? debug_print_object+0x42/0x50
 [<81035498>] warn_slowpath_fmt+0x29/0x2c
 [<8120e483>] debug_print_object+0x42/0x50
 [<8120ec2a>] __debug_object_init+0x279/0x2d7
 [<8120ecb3>] debug_object_init+0x13/0x18
 [<810409d2>] init_timer_key+0x17/0x6f
 [<81041526>] free_uid+0x50/0x6c
 [<8104ed2d>] put_cred_rcu+0x61/0x72
 [<81067fac>] rcu_do_batch+0x70/0x121

debugobjects warns about an enqueued timer being initialized. If
CONFIG_USER_SCHED=y the user management code uses delayed work to
remove the user from the hash table and tear down the sysfs objects.

free_uid is called from RCU and initializes/schedules delayed work if
the usage count of the user_struct is 0. The init/schedule happens
outside of the uidhash_lock protected region which allows a concurrent
caller of find_user() to reference the about to be destroyed
user_struct w/o preventing the work from being scheduled. If the next
free_uid call happens before the work timer expired then the active
timer is initialized and the work scheduled again.

The race was introduced in commit 5cb350ba (sched: group scheduling,
sysfs tunables) and made more prominent by commit 3959214f (sched:
delayed cleanup of user_struct)

Move the init/schedule_delayed_work inside of the uidhash_lock
protected region to prevent the race.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Acked-by: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Cc: Paul E. McKenney <paulmck@us.ibm.com>
Cc: Kay Sievers <kay.sievers@vrfy.org>
Cc: stable@kernel.org
---
 kernel/user.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/user.c b/kernel/user.c
index 2c000e7132ac..46d0165ca70c 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -330,9 +330,9 @@ done:
  */
 static void free_user(struct user_struct *up, unsigned long flags)
 {
-	spin_unlock_irqrestore(&uidhash_lock, flags);
 	INIT_DELAYED_WORK(&up->work, cleanup_user_struct);
 	schedule_delayed_work(&up->work, msecs_to_jiffies(1000));
+	spin_unlock_irqrestore(&uidhash_lock, flags);
 }
 
 #else	/* CONFIG_USER_SCHED && CONFIG_SYSFS */
-- 
cgit v1.2.3


From 83f5b01ffbbaea6f97c9a79d21e240dbfb69f2f1 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 28 Oct 2009 08:14:49 -0700
Subject: rcu: Fix long-grace-period race between forcing and initialization

Very long RCU read-side critical sections (50 milliseconds or
so) can cause a race between force_quiescent_state() and
rcu_start_gp() as follows on kernel builds with multi-level
rcu_node hierarchies:

1.	CPU 0 calls force_quiescent_state(), sees that there is a
	grace period in progress, and acquires ->fsqlock.

2.	CPU 1 detects the end of the grace period, and so
	cpu_quiet_msk_finish() sets rsp->completed to rsp->gpnum.
	This operation is carried out under the root rnp->lock,
	but CPU 0 has not yet acquired that lock.  Note that
	rsp->signaled is still RCU_SAVE_DYNTICK from the last
	grace period.

3.	CPU 1 calls rcu_start_gp(), but no one wants a new grace
	period, so it drops the root rnp->lock and returns.

4.	CPU 0 acquires the root rnp->lock and picks up rsp->completed
	and rsp->signaled, then drops rnp->lock.  It then enters the
	RCU_SAVE_DYNTICK leg of the switch statement.

5.	CPU 2 invokes call_rcu(), and now needs a new grace period.
	It calls rcu_start_gp(), which acquires the root rnp->lock, sets
	rsp->signaled to RCU_GP_INIT (too bad that CPU 0 is already in
	the RCU_SAVE_DYNTICK leg of the switch statement!)  and starts
	initializing the rcu_node hierarchy.  If there are multiple
	levels to the hierarchy, it will drop the root rnp->lock and
	initialize the lower levels of the hierarchy.

6.	CPU 0 notes that rsp->completed has not changed, which permits
        both CPU 2 and CPU 0 to try updating it concurrently.  If CPU 0's
	update prevails, later calls to force_quiescent_state() can
	count old quiescent states against the new grace period, which
	can in turn result in premature ending of grace periods.

	Not good.

This patch adds an RCU_GP_IDLE state for rsp->signaled that is
set initially at boot time and any time a grace period ends.
This prevents CPU 0 from getting into the workings of
force_quiescent_state() in step 4.  Additional locking and
checks prevent the concurrent update of rsp->signaled in step 6.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: mathieu.desnoyers@polymtl.ca
Cc: josh@joshtriplett.org
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
Cc: Valdis.Kletnieks@vt.edu
Cc: dhowells@redhat.com
LKML-Reference: <1256742889199-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcutree.c | 16 +++++++++++-----
 kernel/rcutree.h |  7 ++++---
 2 files changed, 15 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 0536125b0497..f3077c0ab181 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -59,7 +59,7 @@
 		NUM_RCU_LVL_2, \
 		NUM_RCU_LVL_3, /* == MAX_RCU_LVLS */ \
 	}, \
-	.signaled = RCU_SIGNAL_INIT, \
+	.signaled = RCU_GP_IDLE, \
 	.gpnum = -300, \
 	.completed = -300, \
 	.onofflock = __SPIN_LOCK_UNLOCKED(&name.onofflock), \
@@ -657,14 +657,17 @@ rcu_start_gp(struct rcu_state *rsp, unsigned long flags)
 	 * irqs disabled.
 	 */
 	rcu_for_each_node_breadth_first(rsp, rnp) {
-		spin_lock(&rnp->lock);	/* irqs already disabled. */
+		spin_lock(&rnp->lock);		/* irqs already disabled. */
 		rcu_preempt_check_blocked_tasks(rnp);
 		rnp->qsmask = rnp->qsmaskinit;
 		rnp->gpnum = rsp->gpnum;
-		spin_unlock(&rnp->lock);	/* irqs already disabled. */
+		spin_unlock(&rnp->lock);	/* irqs remain disabled. */
 	}
 
+	rnp = rcu_get_root(rsp);
+	spin_lock(&rnp->lock);			/* irqs already disabled. */
 	rsp->signaled = RCU_SIGNAL_INIT; /* force_quiescent_state now OK. */
+	spin_unlock(&rnp->lock);		/* irqs remain disabled. */
 	spin_unlock_irqrestore(&rsp->onofflock, flags);
 }
 
@@ -706,6 +709,7 @@ static void cpu_quiet_msk_finish(struct rcu_state *rsp, unsigned long flags)
 {
 	WARN_ON_ONCE(!rcu_gp_in_progress(rsp));
 	rsp->completed = rsp->gpnum;
+	rsp->signaled = RCU_GP_IDLE;
 	rcu_process_gp_end(rsp, rsp->rda[smp_processor_id()]);
 	rcu_start_gp(rsp, flags);  /* releases root node's rnp->lock. */
 }
@@ -1162,9 +1166,10 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
 	}
 	spin_unlock(&rnp->lock);
 	switch (signaled) {
+	case RCU_GP_IDLE:
 	case RCU_GP_INIT:
 
-		break; /* grace period still initializing, ignore. */
+		break; /* grace period idle or initializing, ignore. */
 
 	case RCU_SAVE_DYNTICK:
 
@@ -1178,7 +1183,8 @@ static void force_quiescent_state(struct rcu_state *rsp, int relaxed)
 
 		/* Update state, record completion counter. */
 		spin_lock(&rnp->lock);
-		if (lastcomp == rsp->completed) {
+		if (lastcomp == rsp->completed &&
+		    rsp->signaled == RCU_SAVE_DYNTICK) {
 			rsp->signaled = RCU_FORCE_QS;
 			dyntick_record_completed(rsp, lastcomp);
 		}
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
index 1823c6e20609..1899023b0962 100644
--- a/kernel/rcutree.h
+++ b/kernel/rcutree.h
@@ -201,9 +201,10 @@ struct rcu_data {
 };
 
 /* Values for signaled field in struct rcu_state. */
-#define RCU_GP_INIT		0	/* Grace period being initialized. */
-#define RCU_SAVE_DYNTICK	1	/* Need to scan dyntick state. */
-#define RCU_FORCE_QS		2	/* Need to force quiescent state. */
+#define RCU_GP_IDLE		0	/* No grace period in progress. */
+#define RCU_GP_INIT		1	/* Grace period being initialized. */
+#define RCU_SAVE_DYNTICK	2	/* Need to scan dyntick state. */
+#define RCU_FORCE_QS		3	/* Need to force quiescent state. */
 #ifdef CONFIG_NO_HZ
 #define RCU_SIGNAL_INIT		RCU_SAVE_DYNTICK
 #else /* #ifdef CONFIG_NO_HZ */
-- 
cgit v1.2.3


From b84ff7d6f1b7f8a43414e74d972ec4c8f3361db4 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Thu, 29 Oct 2009 11:48:30 +0100
Subject: sched: Fix kthread_bind() by moving the body of kthread_bind() to
 sched.c

Eric Paris reported that commit
f685ceacab07d3f6c236f04803e2f2f0dbcc5afb causes boot time
PREEMPT_DEBUG complaints.

 [    4.590699] BUG: using smp_processor_id() in preemptible [00000000] code: rmmod/1314
 [    4.593043] caller is task_hot+0x86/0xd0

Since kthread_bind() messes with scheduler internals, move the
body to sched.c, and lock the runqueue.

Reported-by: Eric Paris <eparis@redhat.com>
Signed-off-by: Mike Galbraith <efault@gmx.de>
Tested-by: Eric Paris <eparis@redhat.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1256813310.7574.3.camel@marge.simson.net>
[ v2: fix !SMP build and clean up ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/kthread.c | 23 -----------------------
 kernel/sched.c   | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 32 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kthread.c b/kernel/kthread.c
index 5fe709982caa..ab7ae57773e1 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -149,29 +149,6 @@ struct task_struct *kthread_create(int (*threadfn)(void *data),
 }
 EXPORT_SYMBOL(kthread_create);
 
-/**
- * kthread_bind - bind a just-created kthread to a cpu.
- * @k: thread created by kthread_create().
- * @cpu: cpu (might not be online, must be possible) for @k to run on.
- *
- * Description: This function is equivalent to set_cpus_allowed(),
- * except that @cpu doesn't need to be online, and the thread must be
- * stopped (i.e., just returned from kthread_create()).
- */
-void kthread_bind(struct task_struct *k, unsigned int cpu)
-{
-	/* Must have done schedule() in kthread() before we set_task_cpu */
-	if (!wait_task_inactive(k, TASK_UNINTERRUPTIBLE)) {
-		WARN_ON(1);
-		return;
-	}
-	set_task_cpu(k, cpu);
-	k->cpus_allowed = cpumask_of_cpu(cpu);
-	k->rt.nr_cpus_allowed = 1;
-	k->flags |= PF_THREAD_BOUND;
-}
-EXPORT_SYMBOL(kthread_bind);
-
 /**
  * kthread_stop - stop a thread created by kthread_create().
  * @k: thread created by kthread_create().
diff --git a/kernel/sched.c b/kernel/sched.c
index bf21adb6c9fc..5cb7d637e33a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1996,6 +1996,38 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
 		p->sched_class->prio_changed(rq, p, oldprio, running);
 }
 
+/**
+ * kthread_bind - bind a just-created kthread to a cpu.
+ * @k: thread created by kthread_create().
+ * @cpu: cpu (might not be online, must be possible) for @k to run on.
+ *
+ * Description: This function is equivalent to set_cpus_allowed(),
+ * except that @cpu doesn't need to be online, and the thread must be
+ * stopped (i.e., just returned from kthread_create()).
+ *
+ * Function lives here instead of kthread.c because it messes with
+ * scheduler internals which require locking.
+ */
+void kthread_bind(struct task_struct *p, unsigned int cpu)
+{
+	struct rq *rq = cpu_rq(cpu);
+	unsigned long flags;
+
+	/* Must have done schedule() in kthread() before we set_task_cpu */
+	if (!wait_task_inactive(p, TASK_UNINTERRUPTIBLE)) {
+		WARN_ON(1);
+		return;
+	}
+
+	spin_lock_irqsave(&rq->lock, flags);
+	set_task_cpu(p, cpu);
+	p->cpus_allowed = cpumask_of_cpu(cpu);
+	p->rt.nr_cpus_allowed = 1;
+	p->flags |= PF_THREAD_BOUND;
+	spin_unlock_irqrestore(&rq->lock, flags);
+}
+EXPORT_SYMBOL(kthread_bind);
+
 #ifdef CONFIG_SMP
 /*
  * Is this task likely cache-hot:
-- 
cgit v1.2.3


From 76b57e613f6006ff525a17876c89326d127cadc9 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Wed, 7 Oct 2009 22:37:35 +0200
Subject: PM / Hibernate: Fix blkdev refleaks

While cruising through the swsusp code I found few blkdev reference
leaks of resume_bdev.

swsusp_read: remove blkdev_put altogether. Some fail paths do
             not do that.
swsusp_check: make sure we always put a reference on fail paths
software_resume: all fail paths between swsusp_check and swsusp_read
                 omit swsusp_close. Add it in those cases. And since
                 swsusp_read doesn't drop the reference anymore, do
                 it here unconditionally.

[rjw: Fixed a small coding style issue.]

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 kernel/power/hibernate.c | 11 ++++++++---
 kernel/power/swap.c      |  8 ++++----
 2 files changed, 12 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c
index 04b3a83d686f..04a9e90d248f 100644
--- a/kernel/power/hibernate.c
+++ b/kernel/power/hibernate.c
@@ -693,21 +693,22 @@ static int software_resume(void)
 	/* The snapshot device should not be opened while we're running */
 	if (!atomic_add_unless(&snapshot_device_available, -1, 0)) {
 		error = -EBUSY;
+		swsusp_close(FMODE_READ);
 		goto Unlock;
 	}
 
 	pm_prepare_console();
 	error = pm_notifier_call_chain(PM_RESTORE_PREPARE);
 	if (error)
-		goto Finish;
+		goto close_finish;
 
 	error = usermodehelper_disable();
 	if (error)
-		goto Finish;
+		goto close_finish;
 
 	error = create_basic_memory_bitmaps();
 	if (error)
-		goto Finish;
+		goto close_finish;
 
 	pr_debug("PM: Preparing processes for restore.\n");
 	error = prepare_processes();
@@ -719,6 +720,7 @@ static int software_resume(void)
 	pr_debug("PM: Reading hibernation image.\n");
 
 	error = swsusp_read(&flags);
+	swsusp_close(FMODE_READ);
 	if (!error)
 		hibernation_restore(flags & SF_PLATFORM_MODE);
 
@@ -737,6 +739,9 @@ static int software_resume(void)
 	mutex_unlock(&pm_mutex);
 	pr_debug("PM: Resume from disk failed.\n");
 	return error;
+close_finish:
+	swsusp_close(FMODE_READ);
+	goto Finish;
 }
 
 late_initcall(software_resume);
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index b101cdc4df3f..a4388624ed98 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -572,8 +572,6 @@ int swsusp_read(unsigned int *flags_p)
 		error = load_image(&handle, &snapshot, header->pages - 1);
 	release_swap_reader(&handle);
 
-	blkdev_put(resume_bdev, FMODE_READ);
-
 	if (!error)
 		pr_debug("PM: Image successfully loaded\n");
 	else
@@ -596,7 +594,7 @@ int swsusp_check(void)
 		error = bio_read_page(swsusp_resume_block,
 					swsusp_header, NULL);
 		if (error)
-			return error;
+			goto put;
 
 		if (!memcmp(SWSUSP_SIG, swsusp_header->sig, 10)) {
 			memcpy(swsusp_header->sig, swsusp_header->orig_sig, 10);
@@ -604,8 +602,10 @@ int swsusp_check(void)
 			error = bio_write_page(swsusp_resume_block,
 						swsusp_header, NULL);
 		} else {
-			return -EINVAL;
+			error = -EINVAL;
 		}
+
+put:
 		if (error)
 			blkdev_put(resume_bdev, FMODE_READ);
 		else
-- 
cgit v1.2.3


From 4ff277f9e42fa16314045bd124a61519286094c0 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Wed, 28 Oct 2009 22:55:33 +0100
Subject: PM / Hibernate: Fix error handling in save_image()

There are too many retval variables in save_image(). Thus error return
value from snapshot_read_next() may be ignored and only part of the
snapshot (successfully) written.

Remove 'error' variable, invert the condition in the do-while loop
and convert the loop to use only 'ret' variable.

Switch the rest of the function to consider only 'ret'.

Also make sure we end printed line by \n if an error occurs.

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 kernel/power/swap.c | 32 ++++++++++++++++----------------
 1 file changed, 16 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index a4388624ed98..afa052b61161 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -314,7 +314,6 @@ static int save_image(struct swap_map_handle *handle,
 {
 	unsigned int m;
 	int ret;
-	int error = 0;
 	int nr_pages;
 	int err2;
 	struct bio *bio;
@@ -329,26 +328,27 @@ static int save_image(struct swap_map_handle *handle,
 	nr_pages = 0;
 	bio = NULL;
 	do_gettimeofday(&start);
-	do {
+	while (1) {
 		ret = snapshot_read_next(snapshot, PAGE_SIZE);
-		if (ret > 0) {
-			error = swap_write_page(handle, data_of(*snapshot),
-						&bio);
-			if (error)
-				break;
-			if (!(nr_pages % m))
-				printk("\b\b\b\b%3d%%", nr_pages / m);
-			nr_pages++;
-		}
-	} while (ret > 0);
+		if (ret <= 0)
+			break;
+		ret = swap_write_page(handle, data_of(*snapshot), &bio);
+		if (ret)
+			break;
+		if (!(nr_pages % m))
+			printk("\b\b\b\b%3d%%", nr_pages / m);
+		nr_pages++;
+	}
 	err2 = wait_on_bio_chain(&bio);
 	do_gettimeofday(&stop);
-	if (!error)
-		error = err2;
-	if (!error)
+	if (!ret)
+		ret = err2;
+	if (!ret)
 		printk("\b\b\b\bdone\n");
+	else
+		printk("\n");
 	swsusp_show_speed(&start, &stop, nr_to_write, "Wrote");
-	return error;
+	return ret;
 }
 
 /**
-- 
cgit v1.2.3


From bf9fd67a0328d56eff6022f80d4eb88ba6614119 Mon Sep 17 00:00:00 2001
From: Jiri Slaby <jirislaby@gmail.com>
Date: Wed, 28 Oct 2009 22:55:42 +0100
Subject: PM / Hibernate: Add newline to load_image() fail path

Finish a line by \n when load_image fails in the middle of loading.

Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
---
 kernel/power/swap.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index afa052b61161..890f6b11b1d3 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -536,7 +536,8 @@ static int load_image(struct swap_map_handle *handle,
 		snapshot_write_finalize(snapshot);
 		if (!snapshot_image_loaded(snapshot))
 			error = -ENODATA;
-	}
+	} else
+		printk("\n");
 	swsusp_show_speed(&start, &stop, nr_to_read, "Read");
 	return error;
 }
-- 
cgit v1.2.3


From 1d510750941a53a1d3049c1d33c75d6dfcd78618 Mon Sep 17 00:00:00 2001
From: Ian Campbell <Ian.Campbell@citrix.com>
Date: Tue, 3 Nov 2009 10:11:14 +0000
Subject: Correct nr_processes() when CPUs have been unplugged

nr_processes() returns the sum of the per cpu counter process_counts for
all online CPUs. This counter is incremented for the current CPU on
fork() and decremented for the current CPU on exit(). Since a process
does not necessarily fork and exit on the same CPU the process_count for
an individual CPU can be either positive or negative and effectively has
no meaning in isolation.

Therefore calculating the sum of process_counts over only the online
CPUs omits the processes which were started or stopped on any CPU which
has since been unplugged. Only the sum of process_counts across all
possible CPUs has meaning.

The only caller of nr_processes() is proc_root_getattr() which
calculates the number of links to /proc as
        stat->nlink = proc_root.nlink + nr_processes();

You don't have to be all that unlucky for the nr_processes() to return a
negative value leading to a negative number of links (or rather, an
apparently enormous number of links). If this happens then you can get
failures where things like "ls /proc" start to fail because they got an
-EOVERFLOW from some stat() call.

Example with some debugging inserted to show what goes on:
        # ps haux|wc -l
        nr_processes: CPU0:     90
        nr_processes: CPU1:     1030
        nr_processes: CPU2:     -900
        nr_processes: CPU3:     -136
        nr_processes: TOTAL:    84
        proc_root_getattr. nlink 12 + nr_processes() 84 = 96
        84
        # echo 0 >/sys/devices/system/cpu/cpu1/online
        # ps haux|wc -l
        nr_processes: CPU0:     85
        nr_processes: CPU2:     -901
        nr_processes: CPU3:     -137
        nr_processes: TOTAL:    -953
        proc_root_getattr. nlink 12 + nr_processes() -953 = -941
        75
        # stat /proc/
        nr_processes: CPU0:     84
        nr_processes: CPU2:     -901
        nr_processes: CPU3:     -137
        nr_processes: TOTAL:    -954
        proc_root_getattr. nlink 12 + nr_processes() -954 = -942
          File: `/proc/'
          Size: 0               Blocks: 0          IO Block: 1024   directory
        Device: 3h/3d   Inode: 1           Links: 4294966354
        Access: (0555/dr-xr-xr-x)  Uid: (    0/    root)   Gid: (    0/    root)
        Access: 2009-11-03 09:06:55.000000000 +0000
        Modify: 2009-11-03 09:06:55.000000000 +0000
        Change: 2009-11-03 09:06:55.000000000 +0000

I'm not 100% convinced that the per_cpu regions remain valid for offline
CPUs, although my testing suggests that they do. If not then I think the
correct solution would be to aggregate the process_count for a given CPU
into a global base value in cpu_down().

This bug appears to pre-date the transition to git and it looks like it
may even have been present in linux-2.6.0-test7-bk3 since it looks like
the code Rusty patched in http://lwn.net/Articles/64773/ was already
wrong.

Signed-off-by: Ian Campbell <ian.campbell@citrix.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 4c20fff8c13a..166b8c49257c 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -91,7 +91,7 @@ int nr_processes(void)
 	int cpu;
 	int total = 0;
 
-	for_each_online_cpu(cpu)
+	for_each_possible_cpu(cpu)
 		total += per_cpu(process_counts, cpu);
 
 	return total;
-- 
cgit v1.2.3


From f7112949f6a4cd6883d66c882d568c2197321de6 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Tue, 3 Nov 2009 19:42:45 +0800
Subject: ring-buffer: Synchronize resizing buffer with reader lock

We got a sudden panic when we reduced the size of the
ringbuffer.

We can reproduce the panic by the following steps:

echo 1 > events/sched/enable
cat trace_pipe > /dev/null &

while ((1))
do
echo 12000 > buffer_size_kb
echo 512 > buffer_size_kb
done

(not more than 5 seconds, panic ...)

Reported-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
LKML-Reference: <4AF01735.9060409@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ring_buffer.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 3ffa502fb243..5dd017fea6f5 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1193,6 +1193,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
 	atomic_inc(&cpu_buffer->record_disabled);
 	synchronize_sched();
 
+	spin_lock_irq(&cpu_buffer->reader_lock);
 	rb_head_page_deactivate(cpu_buffer);
 
 	for (i = 0; i < nr_pages; i++) {
@@ -1207,6 +1208,7 @@ rb_remove_pages(struct ring_buffer_per_cpu *cpu_buffer, unsigned nr_pages)
 		return;
 
 	rb_reset_cpu(cpu_buffer);
+	spin_unlock_irq(&cpu_buffer->reader_lock);
 
 	rb_check_pages(cpu_buffer);
 
-- 
cgit v1.2.3


From ed146b25942b428f8e8056587b7638ce76573c2f Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 3 Nov 2009 08:55:38 +0800
Subject: ftrace: Fix unmatched locking in ftrace_regex_write()

When a command is passed to the set_ftrace_filter, then
the ftrace_regex_lock is still held going back to user space.

 # echo 'do_open : foo' > set_ftrace_filter
 (still holding ftrace_regex_lock when returning to user space!)

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
LKML-Reference: <4AEF7F8A.3080300@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
---
 kernel/trace/ftrace.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 9c451a1930b6..6dc4e5ef7a01 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2222,15 +2222,15 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
 		ret = ftrace_process_regex(parser->buffer,
 					   parser->idx, enable);
 		if (ret)
-			goto out;
+			goto out_unlock;
 
 		trace_parser_clear(parser);
 	}
 
 	ret = read;
-
+out_unlock:
 	mutex_unlock(&ftrace_regex_lock);
-out:
+
 	return ret;
 }
 
-- 
cgit v1.2.3


From e7e7e0c084ef862d5754701108d4a038514d6314 Mon Sep 17 00:00:00 2001
From: Yong Zhang <yong.zhang0@gmail.com>
Date: Sat, 7 Nov 2009 11:16:13 +0800
Subject: genirq: try_one_irq() must be called with irq disabled

Prarit reported:
=================================
[ INFO: inconsistent lock state ]
2.6.32-rc5 #1
---------------------------------
inconsistent {IN-HARDIRQ-W} -> {HARDIRQ-ON-W} usage.
swapper/0 [HC0[0]:SC1[1]:HE1:SE0] takes:
 (&irq_desc_lock_class){?.-...}, at: [<ffffffff810c264e>] try_one_irq+0x32/0x138
{IN-HARDIRQ-W} state was registered at:
 [<ffffffff81095160>] __lock_acquire+0x2fc/0xd5d
 [<ffffffff81095cb4>] lock_acquire+0xf3/0x12d
 [<ffffffff814cdadd>] _spin_lock+0x40/0x89
 [<ffffffff810c3389>] handle_level_irq+0x30/0x105
 [<ffffffff81014e0e>] handle_irq+0x95/0xb7
 [<ffffffff810141bd>] do_IRQ+0x6a/0xe0
 [<ffffffff81012813>] ret_from_intr+0x0/0x16
irq event stamp: 195096
hardirqs last  enabled at (195096): [<ffffffff814cd7f7>] _spin_unlock_irq+0x3a/0x5c
hardirqs last disabled at (195095): [<ffffffff814cdbdd>] _spin_lock_irq+0x29/0x95
softirqs last  enabled at (195088): [<ffffffff81068c92>] __do_softirq+0x1c1/0x1ef
softirqs last disabled at (195093): [<ffffffff8101304c>] call_softirq+0x1c/0x30

other info that might help us debug this:
1 lock held by swapper/0:
 #0:  (kernel/irq/spurious.c:21){+.-...}, at: [<ffffffff81070cf2>]
run_timer_softirq+0x1a9/0x315

stack backtrace:
Pid: 0, comm: swapper Not tainted 2.6.32-rc5 #1
Call Trace:
 <IRQ>  [<ffffffff81093e94>] valid_state+0x187/0x1ae
 [<ffffffff81093fe4>] mark_lock+0x129/0x253
 [<ffffffff810951d4>] __lock_acquire+0x370/0xd5d
 [<ffffffff81095cb4>] lock_acquire+0xf3/0x12d
 [<ffffffff814cdadd>] _spin_lock+0x40/0x89
 [<ffffffff810c264e>] try_one_irq+0x32/0x138
 [<ffffffff810c2795>] poll_all_shared_irqs+0x41/0x6d
 [<ffffffff810c27dd>] poll_spurious_irqs+0x1c/0x49
 [<ffffffff81070d82>] run_timer_softirq+0x239/0x315
 [<ffffffff81068bd3>] __do_softirq+0x102/0x1ef
 [<ffffffff8101304c>] call_softirq+0x1c/0x30
 [<ffffffff81014b65>] do_softirq+0x59/0xca
 [<ffffffff810686ad>] irq_exit+0x58/0xae
 [<ffffffff81029b84>] smp_apic_timer_interrupt+0x94/0xba
 [<ffffffff81012a33>] apic_timer_interrupt+0x13/0x20

The reason is that try_one_irq() is called from hardirq context with
interrupts disabled and from softirq context (poll_all_shared_irqs())
with interrupts enabled.

Disable interrupts before calling it from poll_all_shared_irqs().

Reported-and-tested-by: Prarit Bhargava <prarit@redhat.com>
Signed-off-by: Yong Zhang <yong.zhang0@gmail.com>
LKML-Reference: <1257563773-4620-1-git-send-email-yong.zhang0@gmail.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/irq/spurious.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index 114e704760fe..bd7273e6282e 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -121,7 +121,9 @@ static void poll_all_shared_irqs(void)
 		if (!(status & IRQ_SPURIOUS_DISABLED))
 			continue;
 
+		local_irq_disable();
 		try_one_irq(i, desc);
+		local_irq_enable();
 	}
 }
 
-- 
cgit v1.2.3


From 968c86458a5975efa7a95f832a4ec9fb21471137 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Fri, 6 Nov 2009 15:31:08 -0800
Subject: sched: Fix kernel-doc function parameter name

Fix variable name in sched.c kernel-doc notation.

Fixes this DocBook warning:

 Warning(kernel/sched.c:2008): No description found for parameter
 'p' Warning(kernel/sched.c:2008): Excess function parameter 'k'
 description in 'kthread_bind'

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
LKML-Reference: <4AF4B1BC.8020604@oracle.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 28dd4f490bfc..7d7d5fcca4cb 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1994,7 +1994,7 @@ static inline void check_class_changed(struct rq *rq, struct task_struct *p,
 
 /**
  * kthread_bind - bind a just-created kthread to a cpu.
- * @k: thread created by kthread_create().
+ * @p: thread created by kthread_create().
  * @cpu: cpu (might not be online, must be possible) for @k to run on.
  *
  * Description: This function is equivalent to set_cpus_allowed(),
-- 
cgit v1.2.3


From e9036b36eed4d3cdb33fa9cbcdd9888ae516889f Mon Sep 17 00:00:00 2001
From: Cyrill Gorcunov <gorcunov@openvz.org>
Date: Mon, 26 Oct 2009 22:24:14 +0300
Subject: sched: Use root_task_group_empty only with FAIR_GROUP_SCHED

root_task_group_empty is used only with FAIR_GROUP_SCHED
so if we use other scheduler options we get:

  kernel/sched.c:314: warning: 'root_task_group_empty' defined but not used

So move CONFIG_FAIR_GROUP_SCHED up that it covers
root_task_group_empty().

Signed-off-by: Cyrill Gorcunov <gorcunov@openvz.org>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <20091026192414.GB5321@lenovo>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 7d7d5fcca4cb..3c11ae0a948d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -309,6 +309,8 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct rt_rq, init_rt_rq);
  */
 static DEFINE_SPINLOCK(task_group_lock);
 
+#ifdef CONFIG_FAIR_GROUP_SCHED
+
 #ifdef CONFIG_SMP
 static int root_task_group_empty(void)
 {
@@ -316,7 +318,6 @@ static int root_task_group_empty(void)
 }
 #endif
 
-#ifdef CONFIG_FAIR_GROUP_SCHED
 #ifdef CONFIG_USER_SCHED
 # define INIT_TASK_GROUP_LOAD	(2*NICE_0_LOAD)
 #else /* !CONFIG_USER_SCHED */
-- 
cgit v1.2.3


From 9398180097e359646d46083c3e079a54e20bee82 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Tue, 17 Nov 2009 14:06:20 -0800
Subject: workqueue: fix race condition in schedule_on_each_cpu()

Commit 65a64464349883891e21e74af16c05d6e1eeb4e9 ("HWPOISON: Allow
schedule_on_each_cpu() from keventd") which allows schedule_on_each_cpu()
to be called from keventd added a race condition.  schedule_on_each_cpu()
may race with cpu hotplug and end up executing the function twice on a
cpu.

Fix it by moving direct execution into the section protected with
get/put_online_cpus().  While at it, update code such that direct
execution is done after works have been scheduled for all other cpus and
drop unnecessary cpu != orig test from flush loop.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Andi Kleen <ak@linux.intel.com>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Cc: Ingo Molnar <mingo@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/workqueue.c | 28 +++++++++++++---------------
 1 file changed, 13 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 12328147132c..67e526b6ae81 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -692,31 +692,29 @@ int schedule_on_each_cpu(work_func_t func)
 	if (!works)
 		return -ENOMEM;
 
+	get_online_cpus();
+
 	/*
-	 * when running in keventd don't schedule a work item on itself.
-	 * Can just call directly because the work queue is already bound.
-	 * This also is faster.
-	 * Make this a generic parameter for other workqueues?
+	 * When running in keventd don't schedule a work item on
+	 * itself.  Can just call directly because the work queue is
+	 * already bound.  This also is faster.
 	 */
-	if (current_is_keventd()) {
+	if (current_is_keventd())
 		orig = raw_smp_processor_id();
-		INIT_WORK(per_cpu_ptr(works, orig), func);
-		func(per_cpu_ptr(works, orig));
-	}
 
-	get_online_cpus();
 	for_each_online_cpu(cpu) {
 		struct work_struct *work = per_cpu_ptr(works, cpu);
 
-		if (cpu == orig)
-			continue;
 		INIT_WORK(work, func);
-		schedule_work_on(cpu, work);
-	}
-	for_each_online_cpu(cpu) {
 		if (cpu != orig)
-			flush_work(per_cpu_ptr(works, cpu));
+			schedule_work_on(cpu, work);
 	}
+	if (orig >= 0)
+		func(per_cpu_ptr(works, orig));
+
+	for_each_online_cpu(cpu)
+		flush_work(per_cpu_ptr(works, cpu));
+
 	put_online_cpus();
 	free_percpu(works);
 	return 0;
-- 
cgit v1.2.3


From 3d7a641e544e428191667e8b1f83f96fa46dbd65 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 19 Nov 2009 18:10:23 +0000
Subject: SLOW_WORK: Wait for outstanding work items belonging to a module to
 clear

Wait for outstanding slow work items belonging to a module to clear when
unregistering that module as a user of the facility.  This prevents the put_ref
code of a work item from being taken away before it returns.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 Documentation/slow-work.txt |  13 ++++-
 fs/fscache/main.c           |   6 +-
 fs/fscache/object.c         |   1 +
 fs/fscache/operation.c      |   1 +
 fs/gfs2/main.c              |   4 +-
 fs/gfs2/recovery.c          |   1 +
 include/linux/slow-work.h   |   8 ++-
 kernel/slow-work.c          | 132 ++++++++++++++++++++++++++++++++++++++++++--
 8 files changed, 150 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/slow-work.txt b/Documentation/slow-work.txt
index ebc50f808ea4..f12fda31dcdc 100644
--- a/Documentation/slow-work.txt
+++ b/Documentation/slow-work.txt
@@ -64,9 +64,11 @@ USING SLOW WORK ITEMS
 Firstly, a module or subsystem wanting to make use of slow work items must
 register its interest:
 
-	 int ret = slow_work_register_user();
+	 int ret = slow_work_register_user(struct module *module);
 
-This will return 0 if successful, or a -ve error upon failure.
+This will return 0 if successful, or a -ve error upon failure.  The module
+pointer should be the module interested in using this facility (almost
+certainly THIS_MODULE).
 
 
 Slow work items may then be set up by:
@@ -110,7 +112,12 @@ operation.  When all a module's slow work items have been processed, and the
 module has no further interest in the facility, it should unregister its
 interest:
 
-	slow_work_unregister_user();
+	slow_work_unregister_user(struct module *module);
+
+The module pointer is used to wait for all outstanding work items for that
+module before completing the unregistration.  This prevents the put_ref() code
+from being taken away before it completes.  module should almost certainly be
+THIS_MODULE.
 
 
 ===============
diff --git a/fs/fscache/main.c b/fs/fscache/main.c
index 4de41b597499..add6bdb53f04 100644
--- a/fs/fscache/main.c
+++ b/fs/fscache/main.c
@@ -48,7 +48,7 @@ static int __init fscache_init(void)
 {
 	int ret;
 
-	ret = slow_work_register_user();
+	ret = slow_work_register_user(THIS_MODULE);
 	if (ret < 0)
 		goto error_slow_work;
 
@@ -80,7 +80,7 @@ error_kobj:
 error_cookie_jar:
 	fscache_proc_cleanup();
 error_proc:
-	slow_work_unregister_user();
+	slow_work_unregister_user(THIS_MODULE);
 error_slow_work:
 	return ret;
 }
@@ -97,7 +97,7 @@ static void __exit fscache_exit(void)
 	kobject_put(fscache_root);
 	kmem_cache_destroy(fscache_cookie_jar);
 	fscache_proc_cleanup();
-	slow_work_unregister_user();
+	slow_work_unregister_user(THIS_MODULE);
 	printk(KERN_NOTICE "FS-Cache: Unloaded\n");
 }
 
diff --git a/fs/fscache/object.c b/fs/fscache/object.c
index 392a41b1b79d..d236eb1d6f37 100644
--- a/fs/fscache/object.c
+++ b/fs/fscache/object.c
@@ -45,6 +45,7 @@ static void fscache_enqueue_dependents(struct fscache_object *);
 static void fscache_dequeue_object(struct fscache_object *);
 
 const struct slow_work_ops fscache_object_slow_work_ops = {
+	.owner		= THIS_MODULE,
 	.get_ref	= fscache_object_slow_work_get_ref,
 	.put_ref	= fscache_object_slow_work_put_ref,
 	.execute	= fscache_object_slow_work_execute,
diff --git a/fs/fscache/operation.c b/fs/fscache/operation.c
index e7f8d53b8b6b..f1a2857b2ff5 100644
--- a/fs/fscache/operation.c
+++ b/fs/fscache/operation.c
@@ -453,6 +453,7 @@ static void fscache_op_execute(struct slow_work *work)
 }
 
 const struct slow_work_ops fscache_op_slow_work_ops = {
+	.owner		= THIS_MODULE,
 	.get_ref	= fscache_op_get_ref,
 	.put_ref	= fscache_op_put_ref,
 	.execute	= fscache_op_execute,
diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c
index eacd78a5d082..5b31f7741a8f 100644
--- a/fs/gfs2/main.c
+++ b/fs/gfs2/main.c
@@ -114,7 +114,7 @@ static int __init init_gfs2_fs(void)
 	if (error)
 		goto fail_unregister;
 
-	error = slow_work_register_user();
+	error = slow_work_register_user(THIS_MODULE);
 	if (error)
 		goto fail_slow;
 
@@ -163,7 +163,7 @@ static void __exit exit_gfs2_fs(void)
 	gfs2_unregister_debugfs();
 	unregister_filesystem(&gfs2_fs_type);
 	unregister_filesystem(&gfs2meta_fs_type);
-	slow_work_unregister_user();
+	slow_work_unregister_user(THIS_MODULE);
 
 	kmem_cache_destroy(gfs2_quotad_cachep);
 	kmem_cache_destroy(gfs2_rgrpd_cachep);
diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c
index 59d2695509d3..b2bb779f09ed 100644
--- a/fs/gfs2/recovery.c
+++ b/fs/gfs2/recovery.c
@@ -593,6 +593,7 @@ fail:
 }
 
 struct slow_work_ops gfs2_recover_ops = {
+	.owner	 = THIS_MODULE,
 	.get_ref = gfs2_recover_get_ref,
 	.put_ref = gfs2_recover_put_ref,
 	.execute = gfs2_recover_work,
diff --git a/include/linux/slow-work.h b/include/linux/slow-work.h
index b65c8881f07a..9adb2b30754f 100644
--- a/include/linux/slow-work.h
+++ b/include/linux/slow-work.h
@@ -24,6 +24,9 @@ struct slow_work;
  * The operations used to support slow work items
  */
 struct slow_work_ops {
+	/* owner */
+	struct module *owner;
+
 	/* get a ref on a work item
 	 * - return 0 if successful, -ve if not
 	 */
@@ -42,6 +45,7 @@ struct slow_work_ops {
  *   queued
  */
 struct slow_work {
+	struct module		*owner;	/* the owning module */
 	unsigned long		flags;
 #define SLOW_WORK_PENDING	0	/* item pending (further) execution */
 #define SLOW_WORK_EXECUTING	1	/* item currently executing */
@@ -84,8 +88,8 @@ static inline void vslow_work_init(struct slow_work *work,
 }
 
 extern int slow_work_enqueue(struct slow_work *work);
-extern int slow_work_register_user(void);
-extern void slow_work_unregister_user(void);
+extern int slow_work_register_user(struct module *owner);
+extern void slow_work_unregister_user(struct module *owner);
 
 #ifdef CONFIG_SYSCTL
 extern ctl_table slow_work_sysctls[];
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index 0d31135efbf4..dd08f376e406 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -22,6 +22,8 @@
 #define SLOW_WORK_OOM_TIMEOUT (5 * HZ)	/* can't start new threads for 5s after
 					 * OOM */
 
+#define SLOW_WORK_THREAD_LIMIT	255	/* abs maximum number of slow-work threads */
+
 static void slow_work_cull_timeout(unsigned long);
 static void slow_work_oom_timeout(unsigned long);
 
@@ -46,7 +48,7 @@ static unsigned vslow_work_proportion = 50; /* % of threads that may process
 
 #ifdef CONFIG_SYSCTL
 static const int slow_work_min_min_threads = 2;
-static int slow_work_max_max_threads = 255;
+static int slow_work_max_max_threads = SLOW_WORK_THREAD_LIMIT;
 static const int slow_work_min_vslow = 1;
 static const int slow_work_max_vslow = 99;
 
@@ -97,6 +99,23 @@ static DEFINE_TIMER(slow_work_cull_timer, slow_work_cull_timeout, 0, 0);
 static DEFINE_TIMER(slow_work_oom_timer, slow_work_oom_timeout, 0, 0);
 static struct slow_work slow_work_new_thread; /* new thread starter */
 
+/*
+ * slow work ID allocation (use slow_work_queue_lock)
+ */
+static DECLARE_BITMAP(slow_work_ids, SLOW_WORK_THREAD_LIMIT);
+
+/*
+ * Unregistration tracking to prevent put_ref() from disappearing during module
+ * unload
+ */
+#ifdef CONFIG_MODULES
+static struct module *slow_work_thread_processing[SLOW_WORK_THREAD_LIMIT];
+static struct module *slow_work_unreg_module;
+static struct slow_work *slow_work_unreg_work_item;
+static DECLARE_WAIT_QUEUE_HEAD(slow_work_unreg_wq);
+static DEFINE_MUTEX(slow_work_unreg_sync_lock);
+#endif
+
 /*
  * The queues of work items and the lock governing access to them.  These are
  * shared between all the CPUs.  It doesn't make sense to have per-CPU queues
@@ -149,8 +168,11 @@ static unsigned slow_work_calc_vsmax(void)
  * Attempt to execute stuff queued on a slow thread.  Return true if we managed
  * it, false if there was nothing to do.
  */
-static bool slow_work_execute(void)
+static bool slow_work_execute(int id)
 {
+#ifdef CONFIG_MODULES
+	struct module *module;
+#endif
 	struct slow_work *work = NULL;
 	unsigned vsmax;
 	bool very_slow;
@@ -186,6 +208,12 @@ static bool slow_work_execute(void)
 	} else {
 		very_slow = false; /* avoid the compiler warning */
 	}
+
+#ifdef CONFIG_MODULES
+	if (work)
+		slow_work_thread_processing[id] = work->owner;
+#endif
+
 	spin_unlock_irq(&slow_work_queue_lock);
 
 	if (!work)
@@ -219,7 +247,18 @@ static bool slow_work_execute(void)
 		spin_unlock_irq(&slow_work_queue_lock);
 	}
 
+	/* sort out the race between module unloading and put_ref() */
 	work->ops->put_ref(work);
+
+#ifdef CONFIG_MODULES
+	module = slow_work_thread_processing[id];
+	slow_work_thread_processing[id] = NULL;
+	smp_mb();
+	if (slow_work_unreg_work_item == work ||
+	    slow_work_unreg_module == module)
+		wake_up_all(&slow_work_unreg_wq);
+#endif
+
 	return true;
 
 auto_requeue:
@@ -232,6 +271,7 @@ auto_requeue:
 	else
 		list_add_tail(&work->link, &slow_work_queue);
 	spin_unlock_irq(&slow_work_queue_lock);
+	slow_work_thread_processing[id] = NULL;
 	return true;
 }
 
@@ -368,13 +408,22 @@ static inline bool slow_work_available(int vsmax)
  */
 static int slow_work_thread(void *_data)
 {
-	int vsmax;
+	int vsmax, id;
 
 	DEFINE_WAIT(wait);
 
 	set_freezable();
 	set_user_nice(current, -5);
 
+	/* allocate ourselves an ID */
+	spin_lock_irq(&slow_work_queue_lock);
+	id = find_first_zero_bit(slow_work_ids, SLOW_WORK_THREAD_LIMIT);
+	BUG_ON(id < 0 || id >= SLOW_WORK_THREAD_LIMIT);
+	__set_bit(id, slow_work_ids);
+	spin_unlock_irq(&slow_work_queue_lock);
+
+	sprintf(current->comm, "kslowd%03u", id);
+
 	for (;;) {
 		vsmax = vslow_work_proportion;
 		vsmax *= atomic_read(&slow_work_thread_count);
@@ -395,7 +444,7 @@ static int slow_work_thread(void *_data)
 		vsmax *= atomic_read(&slow_work_thread_count);
 		vsmax /= 100;
 
-		if (slow_work_available(vsmax) && slow_work_execute()) {
+		if (slow_work_available(vsmax) && slow_work_execute(id)) {
 			cond_resched();
 			if (list_empty(&slow_work_queue) &&
 			    list_empty(&vslow_work_queue) &&
@@ -412,6 +461,10 @@ static int slow_work_thread(void *_data)
 			break;
 	}
 
+	spin_lock_irq(&slow_work_queue_lock);
+	__clear_bit(id, slow_work_ids);
+	spin_unlock_irq(&slow_work_queue_lock);
+
 	if (atomic_dec_and_test(&slow_work_thread_count))
 		complete_and_exit(&slow_work_last_thread_exited, 0);
 	return 0;
@@ -475,6 +528,7 @@ static void slow_work_new_thread_execute(struct slow_work *work)
 }
 
 static const struct slow_work_ops slow_work_new_thread_ops = {
+	.owner		= THIS_MODULE,
 	.get_ref	= slow_work_new_thread_get_ref,
 	.put_ref	= slow_work_new_thread_put_ref,
 	.execute	= slow_work_new_thread_execute,
@@ -546,12 +600,13 @@ static int slow_work_max_threads_sysctl(struct ctl_table *table, int write,
 
 /**
  * slow_work_register_user - Register a user of the facility
+ * @module: The module about to make use of the facility
  *
  * Register a user of the facility, starting up the initial threads if there
  * aren't any other users at this point.  This will return 0 if successful, or
  * an error if not.
  */
-int slow_work_register_user(void)
+int slow_work_register_user(struct module *module)
 {
 	struct task_struct *p;
 	int loop;
@@ -598,14 +653,79 @@ error:
 }
 EXPORT_SYMBOL(slow_work_register_user);
 
+/*
+ * wait for all outstanding items from the calling module to complete
+ * - note that more items may be queued whilst we're waiting
+ */
+static void slow_work_wait_for_items(struct module *module)
+{
+	DECLARE_WAITQUEUE(myself, current);
+	struct slow_work *work;
+	int loop;
+
+	mutex_lock(&slow_work_unreg_sync_lock);
+	add_wait_queue(&slow_work_unreg_wq, &myself);
+
+	for (;;) {
+		spin_lock_irq(&slow_work_queue_lock);
+
+		/* first of all, we wait for the last queued item in each list
+		 * to be processed */
+		list_for_each_entry_reverse(work, &vslow_work_queue, link) {
+			if (work->owner == module) {
+				set_current_state(TASK_UNINTERRUPTIBLE);
+				slow_work_unreg_work_item = work;
+				goto do_wait;
+			}
+		}
+		list_for_each_entry_reverse(work, &slow_work_queue, link) {
+			if (work->owner == module) {
+				set_current_state(TASK_UNINTERRUPTIBLE);
+				slow_work_unreg_work_item = work;
+				goto do_wait;
+			}
+		}
+
+		/* then we wait for the items being processed to finish */
+		slow_work_unreg_module = module;
+		smp_mb();
+		for (loop = 0; loop < SLOW_WORK_THREAD_LIMIT; loop++) {
+			if (slow_work_thread_processing[loop] == module)
+				goto do_wait;
+		}
+		spin_unlock_irq(&slow_work_queue_lock);
+		break; /* okay, we're done */
+
+	do_wait:
+		spin_unlock_irq(&slow_work_queue_lock);
+		schedule();
+		slow_work_unreg_work_item = NULL;
+		slow_work_unreg_module = NULL;
+	}
+
+	remove_wait_queue(&slow_work_unreg_wq, &myself);
+	mutex_unlock(&slow_work_unreg_sync_lock);
+}
+
 /**
  * slow_work_unregister_user - Unregister a user of the facility
+ * @module: The module whose items should be cleared
  *
  * Unregister a user of the facility, killing all the threads if this was the
  * last one.
+ *
+ * This waits for all the work items belonging to the nominated module to go
+ * away before proceeding.
  */
-void slow_work_unregister_user(void)
+void slow_work_unregister_user(struct module *module)
 {
+	/* first of all, wait for all outstanding items from the calling module
+	 * to complete */
+	if (module)
+		slow_work_wait_for_items(module);
+
+	/* then we can actually go about shutting down the facility if need
+	 * be */
 	mutex_lock(&slow_work_user_lock);
 
 	BUG_ON(slow_work_user_count <= 0);
-- 
cgit v1.2.3


From 4d8bb2cbccf6dccaada509aafeb01c6205c9d8c4 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Thu, 19 Nov 2009 18:10:39 +0000
Subject: SLOW_WORK: Make slow_work_ops ->get_ref/->put_ref optional

Make the ability for the slow-work facility to take references on a work item
optional as not everyone requires this.

Even the internal slow-work stubs them out, so those can be got rid of too.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: David Howells <dhowells@redhat.com>
---
 Documentation/slow-work.txt |  2 +-
 kernel/slow-work.c          | 36 ++++++++++++++++--------------------
 2 files changed, 17 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/slow-work.txt b/Documentation/slow-work.txt
index f12fda31dcdc..c655c517fc68 100644
--- a/Documentation/slow-work.txt
+++ b/Documentation/slow-work.txt
@@ -125,7 +125,7 @@ ITEM OPERATIONS
 ===============
 
 Each work item requires a table of operations of type struct slow_work_ops.
-All members are required:
+Only ->execute() is required, getting and putting of a reference are optional.
 
  (*) Get a reference on an item:
 
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index dd08f376e406..fccf421eb5c1 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -145,6 +145,20 @@ static DECLARE_COMPLETION(slow_work_last_thread_exited);
 static int slow_work_user_count;
 static DEFINE_MUTEX(slow_work_user_lock);
 
+static inline int slow_work_get_ref(struct slow_work *work)
+{
+	if (work->ops->get_ref)
+		return work->ops->get_ref(work);
+
+	return 0;
+}
+
+static inline void slow_work_put_ref(struct slow_work *work)
+{
+	if (work->ops->put_ref)
+		work->ops->put_ref(work);
+}
+
 /*
  * Calculate the maximum number of active threads in the pool that are
  * permitted to process very slow work items.
@@ -248,7 +262,7 @@ static bool slow_work_execute(int id)
 	}
 
 	/* sort out the race between module unloading and put_ref() */
-	work->ops->put_ref(work);
+	slow_work_put_ref(work);
 
 #ifdef CONFIG_MODULES
 	module = slow_work_thread_processing[id];
@@ -309,7 +323,6 @@ int slow_work_enqueue(struct slow_work *work)
 	BUG_ON(slow_work_user_count <= 0);
 	BUG_ON(!work);
 	BUG_ON(!work->ops);
-	BUG_ON(!work->ops->get_ref);
 
 	/* when honouring an enqueue request, we only promise that we will run
 	 * the work function in the future; we do not promise to run it once
@@ -339,7 +352,7 @@ int slow_work_enqueue(struct slow_work *work)
 		if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) {
 			set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags);
 		} else {
-			if (work->ops->get_ref(work) < 0)
+			if (slow_work_get_ref(work) < 0)
 				goto cant_get_ref;
 			if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
 				list_add_tail(&work->link, &vslow_work_queue);
@@ -479,21 +492,6 @@ static void slow_work_cull_timeout(unsigned long data)
 	wake_up(&slow_work_thread_wq);
 }
 
-/*
- * Get a reference on slow work thread starter
- */
-static int slow_work_new_thread_get_ref(struct slow_work *work)
-{
-	return 0;
-}
-
-/*
- * Drop a reference on slow work thread starter
- */
-static void slow_work_new_thread_put_ref(struct slow_work *work)
-{
-}
-
 /*
  * Start a new slow work thread
  */
@@ -529,8 +527,6 @@ static void slow_work_new_thread_execute(struct slow_work *work)
 
 static const struct slow_work_ops slow_work_new_thread_ops = {
 	.owner		= THIS_MODULE,
-	.get_ref	= slow_work_new_thread_get_ref,
-	.put_ref	= slow_work_new_thread_put_ref,
 	.execute	= slow_work_new_thread_execute,
 };
 
-- 
cgit v1.2.3


From 0160950297c08f8233c89b9f9e7dd59cfb080809 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Thu, 19 Nov 2009 18:10:43 +0000
Subject: SLOW_WORK: Add support for cancellation of slow work

Add support for cancellation of queued slow work and delayed slow work items.
The cancellation functions will wait for items that are pending or undergoing
execution to be discarded by the slow work facility.

Attempting to enqueue work that is in the process of being cancelled will
result in ECANCELED.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: David Howells <dhowells@redhat.com>
---
 Documentation/slow-work.txt | 12 ++++++-
 include/linux/slow-work.h   |  2 ++
 kernel/slow-work.c          | 81 +++++++++++++++++++++++++++++++++++++++++----
 3 files changed, 88 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/slow-work.txt b/Documentation/slow-work.txt
index c655c517fc68..2e384bd4dead 100644
--- a/Documentation/slow-work.txt
+++ b/Documentation/slow-work.txt
@@ -108,7 +108,17 @@ on the item, 0 otherwise.
 
 
 The items are reference counted, so there ought to be no need for a flush
-operation.  When all a module's slow work items have been processed, and the
+operation.  But as the reference counting is optional, means to cancel
+existing work items are also included:
+
+	cancel_slow_work(&myitem);
+
+can be used to cancel pending work.  The above cancel function waits for
+existing work to have been executed (or prevent execution of them, depending
+on timing).
+
+
+When all a module's slow work items have been processed, and the
 module has no further interest in the facility, it should unregister its
 interest:
 
diff --git a/include/linux/slow-work.h b/include/linux/slow-work.h
index 9adb2b30754f..eef20182d5b4 100644
--- a/include/linux/slow-work.h
+++ b/include/linux/slow-work.h
@@ -51,6 +51,7 @@ struct slow_work {
 #define SLOW_WORK_EXECUTING	1	/* item currently executing */
 #define SLOW_WORK_ENQ_DEFERRED	2	/* item enqueue deferred */
 #define SLOW_WORK_VERY_SLOW	3	/* item is very slow */
+#define SLOW_WORK_CANCELLING	4	/* item is being cancelled, don't enqueue */
 	const struct slow_work_ops *ops; /* operations table for this item */
 	struct list_head	link;	/* link in queue */
 };
@@ -88,6 +89,7 @@ static inline void vslow_work_init(struct slow_work *work,
 }
 
 extern int slow_work_enqueue(struct slow_work *work);
+extern void slow_work_cancel(struct slow_work *work);
 extern int slow_work_register_user(struct module *owner);
 extern void slow_work_unregister_user(struct module *owner);
 
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index fccf421eb5c1..671cc434532a 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -236,12 +236,17 @@ static bool slow_work_execute(int id)
 	if (!test_and_clear_bit(SLOW_WORK_PENDING, &work->flags))
 		BUG();
 
-	work->ops->execute(work);
+	/* don't execute if the work is in the process of being cancelled */
+	if (!test_bit(SLOW_WORK_CANCELLING, &work->flags))
+		work->ops->execute(work);
 
 	if (very_slow)
 		atomic_dec(&vslow_work_executing_count);
 	clear_bit_unlock(SLOW_WORK_EXECUTING, &work->flags);
 
+	/* wake up anyone waiting for this work to be complete */
+	wake_up_bit(&work->flags, SLOW_WORK_EXECUTING);
+
 	/* if someone tried to enqueue the item whilst we were executing it,
 	 * then it'll be left unenqueued to avoid multiple threads trying to
 	 * execute it simultaneously
@@ -314,11 +319,16 @@ auto_requeue:
  * allowed to pick items to execute.  This ensures that very slow items won't
  * overly block ones that are just ordinarily slow.
  *
- * Returns 0 if successful, -EAGAIN if not.
+ * Returns 0 if successful, -EAGAIN if not (or -ECANCELED if cancelled work is
+ * attempted queued)
  */
 int slow_work_enqueue(struct slow_work *work)
 {
 	unsigned long flags;
+	int ret;
+
+	if (test_bit(SLOW_WORK_CANCELLING, &work->flags))
+		return -ECANCELED;
 
 	BUG_ON(slow_work_user_count <= 0);
 	BUG_ON(!work);
@@ -335,6 +345,9 @@ int slow_work_enqueue(struct slow_work *work)
 	if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) {
 		spin_lock_irqsave(&slow_work_queue_lock, flags);
 
+		if (unlikely(test_bit(SLOW_WORK_CANCELLING, &work->flags)))
+			goto cancelled;
+
 		/* we promise that we will not attempt to execute the work
 		 * function in more than one thread simultaneously
 		 *
@@ -352,8 +365,9 @@ int slow_work_enqueue(struct slow_work *work)
 		if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) {
 			set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags);
 		} else {
-			if (slow_work_get_ref(work) < 0)
-				goto cant_get_ref;
+			ret = slow_work_get_ref(work);
+			if (ret < 0)
+				goto failed;
 			if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
 				list_add_tail(&work->link, &vslow_work_queue);
 			else
@@ -365,12 +379,67 @@ int slow_work_enqueue(struct slow_work *work)
 	}
 	return 0;
 
-cant_get_ref:
+cancelled:
+	ret = -ECANCELED;
+failed:
 	spin_unlock_irqrestore(&slow_work_queue_lock, flags);
-	return -EAGAIN;
+	return ret;
 }
 EXPORT_SYMBOL(slow_work_enqueue);
 
+static int slow_work_wait(void *word)
+{
+	schedule();
+	return 0;
+}
+
+/**
+ * slow_work_cancel - Cancel a slow work item
+ * @work: The work item to cancel
+ *
+ * This function will cancel a previously enqueued work item. If we cannot
+ * cancel the work item, it is guarenteed to have run when this function
+ * returns.
+ */
+void slow_work_cancel(struct slow_work *work)
+{
+	bool wait = true, put = false;
+
+	set_bit(SLOW_WORK_CANCELLING, &work->flags);
+
+	spin_lock_irq(&slow_work_queue_lock);
+
+	if (test_bit(SLOW_WORK_PENDING, &work->flags) &&
+	    !list_empty(&work->link)) {
+		/* the link in the pending queue holds a reference on the item
+		 * that we will need to release */
+		list_del_init(&work->link);
+		wait = false;
+		put = true;
+		clear_bit(SLOW_WORK_PENDING, &work->flags);
+
+	} else if (test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags)) {
+		/* the executor is holding our only reference on the item, so
+		 * we merely need to wait for it to finish executing */
+		clear_bit(SLOW_WORK_PENDING, &work->flags);
+	}
+
+	spin_unlock_irq(&slow_work_queue_lock);
+
+	/* the EXECUTING flag is set by the executor whilst the spinlock is set
+	 * and before the item is dequeued - so assuming the above doesn't
+	 * actually dequeue it, simply waiting for the EXECUTING flag to be
+	 * released here should be sufficient */
+	if (wait)
+		wait_on_bit(&work->flags, SLOW_WORK_EXECUTING, slow_work_wait,
+			    TASK_UNINTERRUPTIBLE);
+
+	clear_bit(SLOW_WORK_CANCELLING, &work->flags);
+	if (put)
+		slow_work_put_ref(work);
+}
+EXPORT_SYMBOL(slow_work_cancel);
+
 /*
  * Schedule a cull of the thread pool at some time in the near future
  */
-- 
cgit v1.2.3


From 6b8268b17a1ffc942bc72d7d00274e433d6b6719 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Thu, 19 Nov 2009 18:10:47 +0000
Subject: SLOW_WORK: Add delayed_slow_work support

This adds support for starting slow work with a delay, similar
to the functionality we have for workqueues.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: David Howells <dhowells@redhat.com>
---
 Documentation/slow-work.txt |  16 +++++-
 include/linux/slow-work.h   |  29 ++++++++++
 kernel/slow-work.c          | 129 +++++++++++++++++++++++++++++++++++++++++++-
 3 files changed, 171 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/slow-work.txt b/Documentation/slow-work.txt
index 2e384bd4dead..a9d1b0ffdded 100644
--- a/Documentation/slow-work.txt
+++ b/Documentation/slow-work.txt
@@ -41,6 +41,13 @@ expand files, provided the time taken to do so isn't too long.
 Operations of both types may sleep during execution, thus tying up the thread
 loaned to it.
 
+A further class of work item is available, based on the slow work item class:
+
+ (*) Delayed slow work items.
+
+These are slow work items that have a timer to defer queueing of the item for
+a while.
+
 
 THREAD-TO-CLASS ALLOCATION
 --------------------------
@@ -93,6 +100,10 @@ Slow work items may then be set up by:
 
 	slow_work_init(&myitem, &myitem_ops);
 
+     or:
+
+	delayed_slow_work_init(&myitem, &myitem_ops);
+
      or:
 
 	vslow_work_init(&myitem, &myitem_ops);
@@ -104,7 +115,9 @@ A suitably set up work item can then be enqueued for processing:
 	int ret = slow_work_enqueue(&myitem);
 
 This will return a -ve error if the thread pool is unable to gain a reference
-on the item, 0 otherwise.
+on the item, 0 otherwise, or (for delayed work):
+
+	int ret = delayed_slow_work_enqueue(&myitem, my_jiffy_delay);
 
 
 The items are reference counted, so there ought to be no need for a flush
@@ -112,6 +125,7 @@ operation.  But as the reference counting is optional, means to cancel
 existing work items are also included:
 
 	cancel_slow_work(&myitem);
+	cancel_delayed_slow_work(&myitem);
 
 can be used to cancel pending work.  The above cancel function waits for
 existing work to have been executed (or prevent execution of them, depending
diff --git a/include/linux/slow-work.h b/include/linux/slow-work.h
index eef20182d5b4..b245b9a9cc0b 100644
--- a/include/linux/slow-work.h
+++ b/include/linux/slow-work.h
@@ -17,6 +17,7 @@
 #ifdef CONFIG_SLOW_WORK
 
 #include <linux/sysctl.h>
+#include <linux/timer.h>
 
 struct slow_work;
 
@@ -52,10 +53,16 @@ struct slow_work {
 #define SLOW_WORK_ENQ_DEFERRED	2	/* item enqueue deferred */
 #define SLOW_WORK_VERY_SLOW	3	/* item is very slow */
 #define SLOW_WORK_CANCELLING	4	/* item is being cancelled, don't enqueue */
+#define SLOW_WORK_DELAYED	5	/* item is struct delayed_slow_work with active timer */
 	const struct slow_work_ops *ops; /* operations table for this item */
 	struct list_head	link;	/* link in queue */
 };
 
+struct delayed_slow_work {
+	struct slow_work	work;
+	struct timer_list	timer;
+};
+
 /**
  * slow_work_init - Initialise a slow work item
  * @work: The work item to initialise
@@ -71,6 +78,20 @@ static inline void slow_work_init(struct slow_work *work,
 	INIT_LIST_HEAD(&work->link);
 }
 
+/**
+ * slow_work_init - Initialise a delayed slow work item
+ * @work: The work item to initialise
+ * @ops: The operations to use to handle the slow work item
+ *
+ * Initialise a delayed slow work item.
+ */
+static inline void delayed_slow_work_init(struct delayed_slow_work *dwork,
+					  const struct slow_work_ops *ops)
+{
+	init_timer(&dwork->timer);
+	slow_work_init(&dwork->work, ops);
+}
+
 /**
  * vslow_work_init - Initialise a very slow work item
  * @work: The work item to initialise
@@ -93,6 +114,14 @@ extern void slow_work_cancel(struct slow_work *work);
 extern int slow_work_register_user(struct module *owner);
 extern void slow_work_unregister_user(struct module *owner);
 
+extern int delayed_slow_work_enqueue(struct delayed_slow_work *dwork,
+				     unsigned long delay);
+
+static inline void delayed_slow_work_cancel(struct delayed_slow_work *dwork)
+{
+	slow_work_cancel(&dwork->work);
+}
+
 #ifdef CONFIG_SYSCTL
 extern ctl_table slow_work_sysctls[];
 #endif
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index 671cc434532a..f67e1daae93d 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -406,11 +406,40 @@ void slow_work_cancel(struct slow_work *work)
 	bool wait = true, put = false;
 
 	set_bit(SLOW_WORK_CANCELLING, &work->flags);
+	smp_mb();
+
+	/* if the work item is a delayed work item with an active timer, we
+	 * need to wait for the timer to finish _before_ getting the spinlock,
+	 * lest we deadlock against the timer routine
+	 *
+	 * the timer routine will leave DELAYED set if it notices the
+	 * CANCELLING flag in time
+	 */
+	if (test_bit(SLOW_WORK_DELAYED, &work->flags)) {
+		struct delayed_slow_work *dwork =
+			container_of(work, struct delayed_slow_work, work);
+		del_timer_sync(&dwork->timer);
+	}
 
 	spin_lock_irq(&slow_work_queue_lock);
 
-	if (test_bit(SLOW_WORK_PENDING, &work->flags) &&
-	    !list_empty(&work->link)) {
+	if (test_bit(SLOW_WORK_DELAYED, &work->flags)) {
+		/* the timer routine aborted or never happened, so we are left
+		 * holding the timer's reference on the item and should just
+		 * drop the pending flag and wait for any ongoing execution to
+		 * finish */
+		struct delayed_slow_work *dwork =
+			container_of(work, struct delayed_slow_work, work);
+
+		BUG_ON(timer_pending(&dwork->timer));
+		BUG_ON(!list_empty(&work->link));
+
+		clear_bit(SLOW_WORK_DELAYED, &work->flags);
+		put = true;
+		clear_bit(SLOW_WORK_PENDING, &work->flags);
+
+	} else if (test_bit(SLOW_WORK_PENDING, &work->flags) &&
+		   !list_empty(&work->link)) {
 		/* the link in the pending queue holds a reference on the item
 		 * that we will need to release */
 		list_del_init(&work->link);
@@ -440,6 +469,102 @@ void slow_work_cancel(struct slow_work *work)
 }
 EXPORT_SYMBOL(slow_work_cancel);
 
+/*
+ * Handle expiry of the delay timer, indicating that a delayed slow work item
+ * should now be queued if not cancelled
+ */
+static void delayed_slow_work_timer(unsigned long data)
+{
+	struct slow_work *work = (struct slow_work *) data;
+	unsigned long flags;
+	bool queued = false, put = false;
+
+	spin_lock_irqsave(&slow_work_queue_lock, flags);
+	if (likely(!test_bit(SLOW_WORK_CANCELLING, &work->flags))) {
+		clear_bit(SLOW_WORK_DELAYED, &work->flags);
+
+		if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) {
+			/* we discard the reference the timer was holding in
+			 * favour of the one the executor holds */
+			set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags);
+			put = true;
+		} else {
+			if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
+				list_add_tail(&work->link, &vslow_work_queue);
+			else
+				list_add_tail(&work->link, &slow_work_queue);
+			queued = true;
+		}
+	}
+
+	spin_unlock_irqrestore(&slow_work_queue_lock, flags);
+	if (put)
+		slow_work_put_ref(work);
+	if (queued)
+		wake_up(&slow_work_thread_wq);
+}
+
+/**
+ * delayed_slow_work_enqueue - Schedule a delayed slow work item for processing
+ * @dwork: The delayed work item to queue
+ * @delay: When to start executing the work, in jiffies from now
+ *
+ * This is similar to slow_work_enqueue(), but it adds a delay before the work
+ * is actually queued for processing.
+ *
+ * The item can have delayed processing requested on it whilst it is being
+ * executed.  The delay will begin immediately, and if it expires before the
+ * item finishes executing, the item will be placed back on the queue when it
+ * has done executing.
+ */
+int delayed_slow_work_enqueue(struct delayed_slow_work *dwork,
+			      unsigned long delay)
+{
+	struct slow_work *work = &dwork->work;
+	unsigned long flags;
+	int ret;
+
+	if (delay == 0)
+		return slow_work_enqueue(&dwork->work);
+
+	BUG_ON(slow_work_user_count <= 0);
+	BUG_ON(!work);
+	BUG_ON(!work->ops);
+
+	if (test_bit(SLOW_WORK_CANCELLING, &work->flags))
+		return -ECANCELED;
+
+	if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) {
+		spin_lock_irqsave(&slow_work_queue_lock, flags);
+
+		if (test_bit(SLOW_WORK_CANCELLING, &work->flags))
+			goto cancelled;
+
+		/* the timer holds a reference whilst it is pending */
+		ret = work->ops->get_ref(work);
+		if (ret < 0)
+			goto cant_get_ref;
+
+		if (test_and_set_bit(SLOW_WORK_DELAYED, &work->flags))
+			BUG();
+		dwork->timer.expires = jiffies + delay;
+		dwork->timer.data = (unsigned long) work;
+		dwork->timer.function = delayed_slow_work_timer;
+		add_timer(&dwork->timer);
+
+		spin_unlock_irqrestore(&slow_work_queue_lock, flags);
+	}
+
+	return 0;
+
+cancelled:
+	ret = -ECANCELED;
+cant_get_ref:
+	spin_unlock_irqrestore(&slow_work_queue_lock, flags);
+	return ret;
+}
+EXPORT_SYMBOL(delayed_slow_work_enqueue);
+
 /*
  * Schedule a cull of the thread pool at some time in the near future
  */
-- 
cgit v1.2.3


From 8fba10a42d191de612e60e7009c8f0313f90a9b3 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 19 Nov 2009 18:10:51 +0000
Subject: SLOW_WORK: Allow the work items to be viewed through a /proc file

Allow the executing and queued work items to be viewed through a /proc file
for debugging purposes.  The contents look something like the following:

    THR PID   ITEM ADDR        FL MARK  DESC
    === ===== ================ == ===== ==========
      0  3005 ffff880023f52348  a 952ms FSC: OBJ17d3: LOOK
      1  3006 ffff880024e33668  2 160ms FSC: OBJ17e5 OP60d3b: Write1/Store fl=2
      2  3165 ffff8800296dd180  a 424ms FSC: OBJ17e4: LOOK
      3  4089 ffff8800262c8d78  a 212ms FSC: OBJ17ea: CRTN
      4  4090 ffff88002792bed8  2 388ms FSC: OBJ17e8 OP60d36: Write1/Store fl=2
      5  4092 ffff88002a0ef308  2 388ms FSC: OBJ17e7 OP60d2e: Write1/Store fl=2
      6  4094 ffff88002abaf4b8  2 132ms FSC: OBJ17e2 OP60d4e: Write1/Store fl=2
      7  4095 ffff88002bb188e0  a 388ms FSC: OBJ17e9: CRTN
    vsq     - ffff880023d99668  1 308ms FSC: OBJ17e0 OP60f91: Write1/EnQ fl=2
    vsq     - ffff8800295d1740  1 212ms FSC: OBJ16be OP4d4b6: Write1/EnQ fl=2
    vsq     - ffff880025ba3308  1 160ms FSC: OBJ179a OP58dec: Write1/EnQ fl=2
    vsq     - ffff880024ec83e0  1 160ms FSC: OBJ17ae OP599f2: Write1/EnQ fl=2
    vsq     - ffff880026618e00  1 160ms FSC: OBJ17e6 OP60d33: Write1/EnQ fl=2
    vsq     - ffff880025a2a4b8  1 132ms FSC: OBJ16a2 OP4d583: Write1/EnQ fl=2
    vsq     - ffff880023cbe6d8  9 212ms FSC: OBJ17eb: LOOK
    vsq     - ffff880024d37590  9 212ms FSC: OBJ17ec: LOOK
    vsq     - ffff880027746cb0  9 212ms FSC: OBJ17ed: LOOK
    vsq     - ffff880024d37ae8  9 212ms FSC: OBJ17ee: LOOK
    vsq     - ffff880024d37cb0  9 212ms FSC: OBJ17ef: LOOK
    vsq     - ffff880025036550  9 212ms FSC: OBJ17f0: LOOK
    vsq     - ffff8800250368e0  9 212ms FSC: OBJ17f1: LOOK
    vsq     - ffff880025036aa8  9 212ms FSC: OBJ17f2: LOOK

In the 'THR' column, executing items show the thread they're occupying and
queued threads indicate which queue they're on.  'PID' shows the process ID of
a slow-work thread that's executing something.  'FL' shows the work item flags.
'MARK' indicates how long since an item was queued or began executing.  Lastly,
the 'DESC' column permits the owner of an item to give some information.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 Documentation/slow-work.txt |  60 +++++++++++-
 include/linux/slow-work.h   |  11 +++
 init/Kconfig                |  10 ++
 kernel/Makefile             |   1 +
 kernel/slow-work-proc.c     | 227 ++++++++++++++++++++++++++++++++++++++++++++
 kernel/slow-work.c          |  44 ++++++---
 kernel/slow-work.h          |  72 ++++++++++++++
 7 files changed, 413 insertions(+), 12 deletions(-)
 create mode 100644 kernel/slow-work-proc.c
 create mode 100644 kernel/slow-work.h

(limited to 'kernel')

diff --git a/Documentation/slow-work.txt b/Documentation/slow-work.txt
index a9d1b0ffdded..f120238e70fe 100644
--- a/Documentation/slow-work.txt
+++ b/Documentation/slow-work.txt
@@ -149,7 +149,8 @@ ITEM OPERATIONS
 ===============
 
 Each work item requires a table of operations of type struct slow_work_ops.
-Only ->execute() is required, getting and putting of a reference are optional.
+Only ->execute() is required; the getting and putting of a reference and the
+describing of an item are all optional.
 
  (*) Get a reference on an item:
 
@@ -179,6 +180,16 @@ Only ->execute() is required, getting and putting of a reference are optional.
      This should perform the work required of the item.  It may sleep, it may
      perform disk I/O and it may wait for locks.
 
+ (*) View an item through /proc:
+
+	void (*desc)(struct slow_work *work, struct seq_file *m);
+
+     If supplied, this should print to 'm' a small string describing the work
+     the item is to do.  This should be no more than about 40 characters, and
+     shouldn't include a newline character.
+
+     See the 'Viewing executing and queued items' section below.
+
 
 ==================
 POOL CONFIGURATION
@@ -203,3 +214,50 @@ The slow-work thread pool has a number of configurables:
      is bounded to between 1 and one fewer than the number of active threads.
      This ensures there is always at least one thread that can process very
      slow work items, and always at least one thread that won't.
+
+
+==================================
+VIEWING EXECUTING AND QUEUED ITEMS
+==================================
+
+If CONFIG_SLOW_WORK_PROC is enabled, a proc file is made available:
+
+	/proc/slow_work_rq
+
+through which the list of work items being executed and the queues of items to
+be executed may be viewed.  The owner of a work item is given the chance to
+add some information of its own.
+
+The contents look something like the following:
+
+    THR PID   ITEM ADDR        FL MARK  DESC
+    === ===== ================ == ===== ==========
+      0  3005 ffff880023f52348  a 952ms FSC: OBJ17d3: LOOK
+      1  3006 ffff880024e33668  2 160ms FSC: OBJ17e5 OP60d3b: Write1/Store fl=2
+      2  3165 ffff8800296dd180  a 424ms FSC: OBJ17e4: LOOK
+      3  4089 ffff8800262c8d78  a 212ms FSC: OBJ17ea: CRTN
+      4  4090 ffff88002792bed8  2 388ms FSC: OBJ17e8 OP60d36: Write1/Store fl=2
+      5  4092 ffff88002a0ef308  2 388ms FSC: OBJ17e7 OP60d2e: Write1/Store fl=2
+      6  4094 ffff88002abaf4b8  2 132ms FSC: OBJ17e2 OP60d4e: Write1/Store fl=2
+      7  4095 ffff88002bb188e0  a 388ms FSC: OBJ17e9: CRTN
+    vsq     - ffff880023d99668  1 308ms FSC: OBJ17e0 OP60f91: Write1/EnQ fl=2
+    vsq     - ffff8800295d1740  1 212ms FSC: OBJ16be OP4d4b6: Write1/EnQ fl=2
+    vsq     - ffff880025ba3308  1 160ms FSC: OBJ179a OP58dec: Write1/EnQ fl=2
+    vsq     - ffff880024ec83e0  1 160ms FSC: OBJ17ae OP599f2: Write1/EnQ fl=2
+    vsq     - ffff880026618e00  1 160ms FSC: OBJ17e6 OP60d33: Write1/EnQ fl=2
+    vsq     - ffff880025a2a4b8  1 132ms FSC: OBJ16a2 OP4d583: Write1/EnQ fl=2
+    vsq     - ffff880023cbe6d8  9 212ms FSC: OBJ17eb: LOOK
+    vsq     - ffff880024d37590  9 212ms FSC: OBJ17ec: LOOK
+    vsq     - ffff880027746cb0  9 212ms FSC: OBJ17ed: LOOK
+    vsq     - ffff880024d37ae8  9 212ms FSC: OBJ17ee: LOOK
+    vsq     - ffff880024d37cb0  9 212ms FSC: OBJ17ef: LOOK
+    vsq     - ffff880025036550  9 212ms FSC: OBJ17f0: LOOK
+    vsq     - ffff8800250368e0  9 212ms FSC: OBJ17f1: LOOK
+    vsq     - ffff880025036aa8  9 212ms FSC: OBJ17f2: LOOK
+
+In the 'THR' column, executing items show the thread they're occupying and
+queued threads indicate which queue they're on.  'PID' shows the process ID of
+a slow-work thread that's executing something.  'FL' shows the work item flags.
+'MARK' indicates how long since an item was queued or began executing.  Lastly,
+the 'DESC' column permits the owner of an item to give some information.
+
diff --git a/include/linux/slow-work.h b/include/linux/slow-work.h
index b245b9a9cc0b..f41485145ed1 100644
--- a/include/linux/slow-work.h
+++ b/include/linux/slow-work.h
@@ -20,6 +20,9 @@
 #include <linux/timer.h>
 
 struct slow_work;
+#ifdef CONFIG_SLOW_WORK_PROC
+struct seq_file;
+#endif
 
 /*
  * The operations used to support slow work items
@@ -38,6 +41,11 @@ struct slow_work_ops {
 
 	/* execute a work item */
 	void (*execute)(struct slow_work *work);
+
+#ifdef CONFIG_SLOW_WORK_PROC
+	/* describe a work item for /proc */
+	void (*desc)(struct slow_work *work, struct seq_file *m);
+#endif
 };
 
 /*
@@ -56,6 +64,9 @@ struct slow_work {
 #define SLOW_WORK_DELAYED	5	/* item is struct delayed_slow_work with active timer */
 	const struct slow_work_ops *ops; /* operations table for this item */
 	struct list_head	link;	/* link in queue */
+#ifdef CONFIG_SLOW_WORK_PROC
+	struct timespec		mark;	/* jiffies at which queued or exec begun */
+#endif
 };
 
 struct delayed_slow_work {
diff --git a/init/Kconfig b/init/Kconfig
index 9e03ef8b311e..ab5c64801fe5 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1098,6 +1098,16 @@ config SLOW_WORK
 
 	  See Documentation/slow-work.txt.
 
+config SLOW_WORK_PROC
+	bool "Slow work debugging through /proc"
+	default n
+	depends on SLOW_WORK && PROC_FS
+	help
+	  Display the contents of the slow work run queue through /proc,
+	  including items currently executing.
+
+	  See Documentation/slow-work.txt.
+
 endmenu		# General setup
 
 config HAVE_GENERIC_DMA_COHERENT
diff --git a/kernel/Makefile b/kernel/Makefile
index b8d4cd8ac0b9..776ffed1556d 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -94,6 +94,7 @@ obj-$(CONFIG_X86_DS) += trace/
 obj-$(CONFIG_RING_BUFFER) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
 obj-$(CONFIG_SLOW_WORK) += slow-work.o
+obj-$(CONFIG_SLOW_WORK_PROC) += slow-work-proc.o
 obj-$(CONFIG_PERF_EVENTS) += perf_event.o
 
 ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
diff --git a/kernel/slow-work-proc.c b/kernel/slow-work-proc.c
new file mode 100644
index 000000000000..3988032571f5
--- /dev/null
+++ b/kernel/slow-work-proc.c
@@ -0,0 +1,227 @@
+/* Slow work debugging
+ *
+ * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/slow-work.h>
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/seq_file.h>
+#include "slow-work.h"
+
+#define ITERATOR_SHIFT		(BITS_PER_LONG - 4)
+#define ITERATOR_SELECTOR	(0xfUL << ITERATOR_SHIFT)
+#define ITERATOR_COUNTER	(~ITERATOR_SELECTOR)
+
+void slow_work_new_thread_desc(struct slow_work *work, struct seq_file *m)
+{
+	seq_puts(m, "Slow-work: New thread");
+}
+
+/*
+ * Render the time mark field on a work item into a 5-char time with units plus
+ * a space
+ */
+static void slow_work_print_mark(struct seq_file *m, struct slow_work *work)
+{
+	struct timespec now, diff;
+
+	now = CURRENT_TIME;
+	diff = timespec_sub(now, work->mark);
+
+	if (diff.tv_sec < 0)
+		seq_puts(m, "  -ve ");
+	else if (diff.tv_sec == 0 && diff.tv_nsec < 1000)
+		seq_printf(m, "%3luns ", diff.tv_nsec);
+	else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000)
+		seq_printf(m, "%3luus ", diff.tv_nsec / 1000);
+	else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000000)
+		seq_printf(m, "%3lums ", diff.tv_nsec / 1000000);
+	else if (diff.tv_sec <= 1)
+		seq_puts(m, "   1s ");
+	else if (diff.tv_sec < 60)
+		seq_printf(m, "%4lus ", diff.tv_sec);
+	else if (diff.tv_sec < 60 * 60)
+		seq_printf(m, "%4lum ", diff.tv_sec / 60);
+	else if (diff.tv_sec < 60 * 60 * 24)
+		seq_printf(m, "%4luh ", diff.tv_sec / 3600);
+	else
+		seq_puts(m, "exces ");
+}
+
+/*
+ * Describe a slow work item for /proc
+ */
+static int slow_work_runqueue_show(struct seq_file *m, void *v)
+{
+	struct slow_work *work;
+	struct list_head *p = v;
+	unsigned long id;
+
+	switch ((unsigned long) v) {
+	case 1:
+		seq_puts(m, "THR PID   ITEM ADDR        FL MARK  DESC\n");
+		return 0;
+	case 2:
+		seq_puts(m, "=== ===== ================ == ===== ==========\n");
+		return 0;
+
+	case 3 ... 3 + SLOW_WORK_THREAD_LIMIT - 1:
+		id = (unsigned long) v - 3;
+
+		read_lock(&slow_work_execs_lock);
+		work = slow_work_execs[id];
+		if (work) {
+			smp_read_barrier_depends();
+
+			seq_printf(m, "%3lu %5d %16p %2lx ",
+				   id, slow_work_pids[id], work, work->flags);
+			slow_work_print_mark(m, work);
+
+			if (work->ops->desc)
+				work->ops->desc(work, m);
+			seq_putc(m, '\n');
+		}
+		read_unlock(&slow_work_execs_lock);
+		return 0;
+
+	default:
+		work = list_entry(p, struct slow_work, link);
+		seq_printf(m, "%3s     - %16p %2lx ",
+			   work->flags & SLOW_WORK_VERY_SLOW ? "vsq" : "sq",
+			   work, work->flags);
+		slow_work_print_mark(m, work);
+
+		if (work->ops->desc)
+			work->ops->desc(work, m);
+		seq_putc(m, '\n');
+		return 0;
+	}
+}
+
+/*
+ * map the iterator to a work item
+ */
+static void *slow_work_runqueue_index(struct seq_file *m, loff_t *_pos)
+{
+	struct list_head *p;
+	unsigned long count, id;
+
+	switch (*_pos >> ITERATOR_SHIFT) {
+	case 0x0:
+		if (*_pos == 0)
+			*_pos = 1;
+		if (*_pos < 3)
+			return (void *)(unsigned long) *_pos;
+		if (*_pos < 3 + SLOW_WORK_THREAD_LIMIT)
+			for (id = *_pos - 3;
+			     id < SLOW_WORK_THREAD_LIMIT;
+			     id++, (*_pos)++)
+				if (slow_work_execs[id])
+					return (void *)(unsigned long) *_pos;
+		*_pos = 0x1UL << ITERATOR_SHIFT;
+
+	case 0x1:
+		count = *_pos & ITERATOR_COUNTER;
+		list_for_each(p, &slow_work_queue) {
+			if (count == 0)
+				return p;
+			count--;
+		}
+		*_pos = 0x2UL << ITERATOR_SHIFT;
+
+	case 0x2:
+		count = *_pos & ITERATOR_COUNTER;
+		list_for_each(p, &vslow_work_queue) {
+			if (count == 0)
+				return p;
+			count--;
+		}
+		*_pos = 0x3UL << ITERATOR_SHIFT;
+
+	default:
+		return NULL;
+	}
+}
+
+/*
+ * set up the iterator to start reading from the first line
+ */
+static void *slow_work_runqueue_start(struct seq_file *m, loff_t *_pos)
+{
+	spin_lock_irq(&slow_work_queue_lock);
+	return slow_work_runqueue_index(m, _pos);
+}
+
+/*
+ * move to the next line
+ */
+static void *slow_work_runqueue_next(struct seq_file *m, void *v, loff_t *_pos)
+{
+	struct list_head *p = v;
+	unsigned long selector = *_pos >> ITERATOR_SHIFT;
+
+	(*_pos)++;
+	switch (selector) {
+	case 0x0:
+		return slow_work_runqueue_index(m, _pos);
+
+	case 0x1:
+		if (*_pos >> ITERATOR_SHIFT == 0x1) {
+			p = p->next;
+			if (p != &slow_work_queue)
+				return p;
+		}
+		*_pos = 0x2UL << ITERATOR_SHIFT;
+		p = &vslow_work_queue;
+
+	case 0x2:
+		if (*_pos >> ITERATOR_SHIFT == 0x2) {
+			p = p->next;
+			if (p != &vslow_work_queue)
+				return p;
+		}
+		*_pos = 0x3UL << ITERATOR_SHIFT;
+
+	default:
+		return NULL;
+	}
+}
+
+/*
+ * clean up after reading
+ */
+static void slow_work_runqueue_stop(struct seq_file *m, void *v)
+{
+	spin_unlock_irq(&slow_work_queue_lock);
+}
+
+static const struct seq_operations slow_work_runqueue_ops = {
+	.start		= slow_work_runqueue_start,
+	.stop		= slow_work_runqueue_stop,
+	.next		= slow_work_runqueue_next,
+	.show		= slow_work_runqueue_show,
+};
+
+/*
+ * open "/proc/slow_work_rq" to list queue contents
+ */
+static int slow_work_runqueue_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &slow_work_runqueue_ops);
+}
+
+const struct file_operations slow_work_runqueue_fops = {
+	.owner		= THIS_MODULE,
+	.open		= slow_work_runqueue_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index f67e1daae93d..b763bc2d2670 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -16,13 +16,8 @@
 #include <linux/kthread.h>
 #include <linux/freezer.h>
 #include <linux/wait.h>
-
-#define SLOW_WORK_CULL_TIMEOUT (5 * HZ)	/* cull threads 5s after running out of
-					 * things to do */
-#define SLOW_WORK_OOM_TIMEOUT (5 * HZ)	/* can't start new threads for 5s after
-					 * OOM */
-
-#define SLOW_WORK_THREAD_LIMIT	255	/* abs maximum number of slow-work threads */
+#include <linux/proc_fs.h>
+#include "slow-work.h"
 
 static void slow_work_cull_timeout(unsigned long);
 static void slow_work_oom_timeout(unsigned long);
@@ -116,6 +111,15 @@ static DECLARE_WAIT_QUEUE_HEAD(slow_work_unreg_wq);
 static DEFINE_MUTEX(slow_work_unreg_sync_lock);
 #endif
 
+/*
+ * Data for tracking currently executing items for indication through /proc
+ */
+#ifdef CONFIG_SLOW_WORK_PROC
+struct slow_work *slow_work_execs[SLOW_WORK_THREAD_LIMIT];
+pid_t slow_work_pids[SLOW_WORK_THREAD_LIMIT];
+DEFINE_RWLOCK(slow_work_execs_lock);
+#endif
+
 /*
  * The queues of work items and the lock governing access to them.  These are
  * shared between all the CPUs.  It doesn't make sense to have per-CPU queues
@@ -124,9 +128,9 @@ static DEFINE_MUTEX(slow_work_unreg_sync_lock);
  * There are two queues of work items: one for slow work items, and one for
  * very slow work items.
  */
-static LIST_HEAD(slow_work_queue);
-static LIST_HEAD(vslow_work_queue);
-static DEFINE_SPINLOCK(slow_work_queue_lock);
+LIST_HEAD(slow_work_queue);
+LIST_HEAD(vslow_work_queue);
+DEFINE_SPINLOCK(slow_work_queue_lock);
 
 /*
  * The thread controls.  A variable used to signal to the threads that they
@@ -182,7 +186,7 @@ static unsigned slow_work_calc_vsmax(void)
  * Attempt to execute stuff queued on a slow thread.  Return true if we managed
  * it, false if there was nothing to do.
  */
-static bool slow_work_execute(int id)
+static noinline bool slow_work_execute(int id)
 {
 #ifdef CONFIG_MODULES
 	struct module *module;
@@ -227,6 +231,10 @@ static bool slow_work_execute(int id)
 	if (work)
 		slow_work_thread_processing[id] = work->owner;
 #endif
+	if (work) {
+		slow_work_mark_time(work);
+		slow_work_begin_exec(id, work);
+	}
 
 	spin_unlock_irq(&slow_work_queue_lock);
 
@@ -247,6 +255,8 @@ static bool slow_work_execute(int id)
 	/* wake up anyone waiting for this work to be complete */
 	wake_up_bit(&work->flags, SLOW_WORK_EXECUTING);
 
+	slow_work_end_exec(id, work);
+
 	/* if someone tried to enqueue the item whilst we were executing it,
 	 * then it'll be left unenqueued to avoid multiple threads trying to
 	 * execute it simultaneously
@@ -285,6 +295,7 @@ auto_requeue:
 	 * - we transfer our ref on the item back to the appropriate queue
 	 * - don't wake another thread up as we're awake already
 	 */
+	slow_work_mark_time(work);
 	if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
 		list_add_tail(&work->link, &vslow_work_queue);
 	else
@@ -368,6 +379,7 @@ int slow_work_enqueue(struct slow_work *work)
 			ret = slow_work_get_ref(work);
 			if (ret < 0)
 				goto failed;
+			slow_work_mark_time(work);
 			if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
 				list_add_tail(&work->link, &vslow_work_queue);
 			else
@@ -489,6 +501,7 @@ static void delayed_slow_work_timer(unsigned long data)
 			set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags);
 			put = true;
 		} else {
+			slow_work_mark_time(work);
 			if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
 				list_add_tail(&work->link, &vslow_work_queue);
 			else
@@ -627,6 +640,7 @@ static int slow_work_thread(void *_data)
 	id = find_first_zero_bit(slow_work_ids, SLOW_WORK_THREAD_LIMIT);
 	BUG_ON(id < 0 || id >= SLOW_WORK_THREAD_LIMIT);
 	__set_bit(id, slow_work_ids);
+	slow_work_set_thread_pid(id, current->pid);
 	spin_unlock_irq(&slow_work_queue_lock);
 
 	sprintf(current->comm, "kslowd%03u", id);
@@ -669,6 +683,7 @@ static int slow_work_thread(void *_data)
 	}
 
 	spin_lock_irq(&slow_work_queue_lock);
+	slow_work_set_thread_pid(id, 0);
 	__clear_bit(id, slow_work_ids);
 	spin_unlock_irq(&slow_work_queue_lock);
 
@@ -722,6 +737,9 @@ static void slow_work_new_thread_execute(struct slow_work *work)
 static const struct slow_work_ops slow_work_new_thread_ops = {
 	.owner		= THIS_MODULE,
 	.execute	= slow_work_new_thread_execute,
+#ifdef CONFIG_SLOW_WORK_PROC
+	.desc		= slow_work_new_thread_desc,
+#endif
 };
 
 /*
@@ -948,6 +966,10 @@ static int __init init_slow_work(void)
 #ifdef CONFIG_SYSCTL
 	if (slow_work_max_max_threads < nr_cpus * 2)
 		slow_work_max_max_threads = nr_cpus * 2;
+#endif
+#ifdef CONFIG_SLOW_WORK_PROC
+	proc_create("slow_work_rq", S_IFREG | 0400, NULL,
+		    &slow_work_runqueue_fops);
 #endif
 	return 0;
 }
diff --git a/kernel/slow-work.h b/kernel/slow-work.h
new file mode 100644
index 000000000000..3c2f007f3ad6
--- /dev/null
+++ b/kernel/slow-work.h
@@ -0,0 +1,72 @@
+/* Slow work private definitions
+ *
+ * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#define SLOW_WORK_CULL_TIMEOUT (5 * HZ)	/* cull threads 5s after running out of
+					 * things to do */
+#define SLOW_WORK_OOM_TIMEOUT (5 * HZ)	/* can't start new threads for 5s after
+					 * OOM */
+
+#define SLOW_WORK_THREAD_LIMIT	255	/* abs maximum number of slow-work threads */
+
+/*
+ * slow-work.c
+ */
+#ifdef CONFIG_SLOW_WORK_PROC
+extern struct slow_work *slow_work_execs[];
+extern pid_t slow_work_pids[];
+extern rwlock_t slow_work_execs_lock;
+#endif
+
+extern struct list_head slow_work_queue;
+extern struct list_head vslow_work_queue;
+extern spinlock_t slow_work_queue_lock;
+
+/*
+ * slow-work-proc.c
+ */
+#ifdef CONFIG_SLOW_WORK_PROC
+extern const struct file_operations slow_work_runqueue_fops;
+
+extern void slow_work_new_thread_desc(struct slow_work *, struct seq_file *);
+#endif
+
+/*
+ * Helper functions
+ */
+static inline void slow_work_set_thread_pid(int id, pid_t pid)
+{
+#ifdef CONFIG_SLOW_WORK_PROC
+	slow_work_pids[id] = pid;
+#endif
+}
+
+static inline void slow_work_mark_time(struct slow_work *work)
+{
+#ifdef CONFIG_SLOW_WORK_PROC
+	work->mark = CURRENT_TIME;
+#endif
+}
+
+static inline void slow_work_begin_exec(int id, struct slow_work *work)
+{
+#ifdef CONFIG_SLOW_WORK_PROC
+	slow_work_execs[id] = work;
+#endif
+}
+
+static inline void slow_work_end_exec(int id, struct slow_work *work)
+{
+#ifdef CONFIG_SLOW_WORK_PROC
+	write_lock(&slow_work_execs_lock);
+	slow_work_execs[id] = NULL;
+	write_unlock(&slow_work_execs_lock);
+#endif
+}
-- 
cgit v1.2.3


From 3bde31a4ac225cb5805be02eff6eaaf7e0766ccd Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 19 Nov 2009 18:10:57 +0000
Subject: SLOW_WORK: Allow a requeueable work item to sleep till the thread is
 needed

Add a function to allow a requeueable work item to sleep till the thread
processing it is needed by the slow-work facility to perform other work.

Sometimes a work item can't progress immediately, but must wait for the
completion of another work item that's currently being processed by another
slow-work thread.

In some circumstances, the waiting item could instead - theoretically - put
itself back on the queue and yield its thread back to the slow-work facility,
thus waiting till it gets processing time again before attempting to progress.
This would allow other work items processing time on that thread.

However, this only works if there is something on the queue for it to queue
behind - otherwise it will just get a thread again immediately, and will end
up cycling between the queue and the thread, eating up valuable CPU time.

So, slow_work_sleep_till_thread_needed() is provided such that an item can put
itself on a wait queue that will wake it up when the event it is actually
interested in occurs, then call this function in lieu of calling schedule().

This function will then sleep until either the item's event occurs or another
work item appears on the queue.  If another work item is queued, but the
item's event hasn't occurred, then the work item should requeue itself and
yield the thread back to the slow-work facility by returning.

This can be used by CacheFiles for an object that is being created on one
thread to wait for an object being deleted on another thread where there is
nothing on the queue for the creation to go and wait behind.  As soon as an
item appears on the queue that could be given thread time instead, CacheFiles
can stick the creating object back on the queue and return to the slow-work
facility - assuming the object deletion didn't also complete.

Signed-off-by: David Howells <dhowells@redhat.com>
---
 Documentation/slow-work.txt | 44 +++++++++++++++++++++
 include/linux/slow-work.h   |  3 ++
 kernel/slow-work.c          | 94 ++++++++++++++++++++++++++++++++++++++++-----
 3 files changed, 132 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/slow-work.txt b/Documentation/slow-work.txt
index 0169c9d9dd16..52bc31433723 100644
--- a/Documentation/slow-work.txt
+++ b/Documentation/slow-work.txt
@@ -158,6 +158,50 @@ with a requeue pending).  This can be used to work out whether an item on which
 another depends is on the queue, thus allowing a dependent item to be queued
 after it.
 
+If the above shows an item on which another depends not to be queued, then the
+owner of the dependent item might need to wait.  However, to avoid locking up
+the threads unnecessarily be sleeping in them, it can make sense under some
+circumstances to return the work item to the queue, thus deferring it until
+some other items have had a chance to make use of the yielded thread.
+
+To yield a thread and defer an item, the work function should simply enqueue
+the work item again and return.  However, this doesn't work if there's nothing
+actually on the queue, as the thread just vacated will jump straight back into
+the item's work function, thus busy waiting on a CPU.
+
+Instead, the item should use the thread to wait for the dependency to go away,
+but rather than using schedule() or schedule_timeout() to sleep, it should use
+the following function:
+
+	bool requeue = slow_work_sleep_till_thread_needed(
+			struct slow_work *work,
+			signed long *_timeout);
+
+This will add a second wait and then sleep, such that it will be woken up if
+either something appears on the queue that could usefully make use of the
+thread - and behind which this item can be queued, or if the event the caller
+set up to wait for happens.  True will be returned if something else appeared
+on the queue and this work function should perhaps return, of false if
+something else woke it up.  The timeout is as for schedule_timeout().
+
+For example:
+
+	wq = bit_waitqueue(&my_flags, MY_BIT);
+	init_wait(&wait);
+	requeue = false;
+	do {
+		prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
+		if (!test_bit(MY_BIT, &my_flags))
+			break;
+		requeue = slow_work_sleep_till_thread_needed(&my_work,
+							     &timeout);
+	} while (timeout > 0 && !requeue);
+	finish_wait(wq, &wait);
+	if (!test_bit(MY_BIT, &my_flags)
+		goto do_my_thing;
+	if (requeue)
+		return; // to slow_work
+
 
 ===============
 ITEM OPERATIONS
diff --git a/include/linux/slow-work.h b/include/linux/slow-work.h
index bfd3ab4c8898..5035a2691739 100644
--- a/include/linux/slow-work.h
+++ b/include/linux/slow-work.h
@@ -152,6 +152,9 @@ static inline void delayed_slow_work_cancel(struct delayed_slow_work *dwork)
 	slow_work_cancel(&dwork->work);
 }
 
+extern bool slow_work_sleep_till_thread_needed(struct slow_work *work,
+					       signed long *_timeout);
+
 #ifdef CONFIG_SYSCTL
 extern ctl_table slow_work_sysctls[];
 #endif
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index b763bc2d2670..da94f3c101af 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -132,6 +132,15 @@ LIST_HEAD(slow_work_queue);
 LIST_HEAD(vslow_work_queue);
 DEFINE_SPINLOCK(slow_work_queue_lock);
 
+/*
+ * The following are two wait queues that get pinged when a work item is placed
+ * on an empty queue.  These allow work items that are hogging a thread by
+ * sleeping in a way that could be deferred to yield their thread and enqueue
+ * themselves.
+ */
+static DECLARE_WAIT_QUEUE_HEAD(slow_work_queue_waits_for_occupation);
+static DECLARE_WAIT_QUEUE_HEAD(vslow_work_queue_waits_for_occupation);
+
 /*
  * The thread controls.  A variable used to signal to the threads that they
  * should exit when the queue is empty, a waitqueue used by the threads to wait
@@ -305,6 +314,50 @@ auto_requeue:
 	return true;
 }
 
+/**
+ * slow_work_sleep_till_thread_needed - Sleep till thread needed by other work
+ * work: The work item under execution that wants to sleep
+ * _timeout: Scheduler sleep timeout
+ *
+ * Allow a requeueable work item to sleep on a slow-work processor thread until
+ * that thread is needed to do some other work or the sleep is interrupted by
+ * some other event.
+ *
+ * The caller must set up a wake up event before calling this and must have set
+ * the appropriate sleep mode (such as TASK_UNINTERRUPTIBLE) and tested its own
+ * condition before calling this function as no test is made here.
+ *
+ * False is returned if there is nothing on the queue; true is returned if the
+ * work item should be requeued
+ */
+bool slow_work_sleep_till_thread_needed(struct slow_work *work,
+					signed long *_timeout)
+{
+	wait_queue_head_t *wfo_wq;
+	struct list_head *queue;
+
+	DEFINE_WAIT(wait);
+
+	if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) {
+		wfo_wq = &vslow_work_queue_waits_for_occupation;
+		queue = &vslow_work_queue;
+	} else {
+		wfo_wq = &slow_work_queue_waits_for_occupation;
+		queue = &slow_work_queue;
+	}
+
+	if (!list_empty(queue))
+		return true;
+
+	add_wait_queue_exclusive(wfo_wq, &wait);
+	if (list_empty(queue))
+		*_timeout = schedule_timeout(*_timeout);
+	finish_wait(wfo_wq, &wait);
+
+	return !list_empty(queue);
+}
+EXPORT_SYMBOL(slow_work_sleep_till_thread_needed);
+
 /**
  * slow_work_enqueue - Schedule a slow work item for processing
  * @work: The work item to queue
@@ -335,6 +388,8 @@ auto_requeue:
  */
 int slow_work_enqueue(struct slow_work *work)
 {
+	wait_queue_head_t *wfo_wq;
+	struct list_head *queue;
 	unsigned long flags;
 	int ret;
 
@@ -354,6 +409,14 @@ int slow_work_enqueue(struct slow_work *work)
 	 * maintaining our promise
 	 */
 	if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) {
+		if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) {
+			wfo_wq = &vslow_work_queue_waits_for_occupation;
+			queue = &vslow_work_queue;
+		} else {
+			wfo_wq = &slow_work_queue_waits_for_occupation;
+			queue = &slow_work_queue;
+		}
+
 		spin_lock_irqsave(&slow_work_queue_lock, flags);
 
 		if (unlikely(test_bit(SLOW_WORK_CANCELLING, &work->flags)))
@@ -380,11 +443,13 @@ int slow_work_enqueue(struct slow_work *work)
 			if (ret < 0)
 				goto failed;
 			slow_work_mark_time(work);
-			if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
-				list_add_tail(&work->link, &vslow_work_queue);
-			else
-				list_add_tail(&work->link, &slow_work_queue);
+			list_add_tail(&work->link, queue);
 			wake_up(&slow_work_thread_wq);
+
+			/* if someone who could be requeued is sleeping on a
+			 * thread, then ask them to yield their thread */
+			if (work->link.prev == queue)
+				wake_up(wfo_wq);
 		}
 
 		spin_unlock_irqrestore(&slow_work_queue_lock, flags);
@@ -487,9 +552,19 @@ EXPORT_SYMBOL(slow_work_cancel);
  */
 static void delayed_slow_work_timer(unsigned long data)
 {
+	wait_queue_head_t *wfo_wq;
+	struct list_head *queue;
 	struct slow_work *work = (struct slow_work *) data;
 	unsigned long flags;
-	bool queued = false, put = false;
+	bool queued = false, put = false, first = false;
+
+	if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags)) {
+		wfo_wq = &vslow_work_queue_waits_for_occupation;
+		queue = &vslow_work_queue;
+	} else {
+		wfo_wq = &slow_work_queue_waits_for_occupation;
+		queue = &slow_work_queue;
+	}
 
 	spin_lock_irqsave(&slow_work_queue_lock, flags);
 	if (likely(!test_bit(SLOW_WORK_CANCELLING, &work->flags))) {
@@ -502,17 +577,18 @@ static void delayed_slow_work_timer(unsigned long data)
 			put = true;
 		} else {
 			slow_work_mark_time(work);
-			if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
-				list_add_tail(&work->link, &vslow_work_queue);
-			else
-				list_add_tail(&work->link, &slow_work_queue);
+			list_add_tail(&work->link, queue);
 			queued = true;
+			if (work->link.prev == queue)
+				first = true;
 		}
 	}
 
 	spin_unlock_irqrestore(&slow_work_queue_lock, flags);
 	if (put)
 		slow_work_put_ref(work);
+	if (first)
+		wake_up(wfo_wq);
 	if (queued)
 		wake_up(&slow_work_thread_wq);
 }
-- 
cgit v1.2.3


From fa1dae4906982b5d896c07613b1fe42456133b1c Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 1 Dec 2009 13:52:08 +0000
Subject: SLOW_WORK: Fix the CONFIG_MODULES=n case

Commits 3d7a641 ("SLOW_WORK: Wait for outstanding work items belonging to a
module to clear") introduced some code to make sure that all of a module's
slow-work items were complete before that module was removed, and commit
3bde31a ("SLOW_WORK: Allow a requeueable work item to sleep till the thread is
needed") further extended that, breaking it in the process if CONFIG_MODULES=n:

    CC      kernel/slow-work.o
  kernel/slow-work.c: In function 'slow_work_execute':
  kernel/slow-work.c:313: error: 'slow_work_thread_processing' undeclared (first use in this function)
  kernel/slow-work.c:313: error: (Each undeclared identifier is reported only once
  kernel/slow-work.c:313: error: for each function it appears in.)
  kernel/slow-work.c: In function 'slow_work_wait_for_items':
  kernel/slow-work.c:950: error: 'slow_work_unreg_sync_lock' undeclared (first use in this function)
  kernel/slow-work.c:951: error: 'slow_work_unreg_wq' undeclared (first use in this function)
  kernel/slow-work.c:961: error: 'slow_work_unreg_work_item' undeclared (first use in this function)
  kernel/slow-work.c:974: error: 'slow_work_unreg_module' undeclared (first use in this function)
  kernel/slow-work.c:977: error: 'slow_work_thread_processing' undeclared (first use in this function)
  make[1]: *** [kernel/slow-work.o] Error 1

Fix this by:

 (1) Extracting the bits of slow_work_execute() that are contingent on
     CONFIG_MODULES, and the bits that should be, into inline functions and
     placing them into the #ifdef'd section that defines the relevant variables
     and adding stubs for moduleless kernels.  This allows the removal of some
     #ifdefs.

 (2) #ifdef'ing out the contents of slow_work_wait_for_items() in moduleless
     kernels.

The four functions related to handling module unloading synchronisation (and
their associated variables) could be offloaded into a separate .c file, but
each function is only used once and three of them are tiny, so doing so would
prevent them from being inlined.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/slow-work.c | 46 +++++++++++++++++++++++++++++-----------------
 1 file changed, 29 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index da94f3c101af..b5c17f15f9de 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -109,6 +109,30 @@ static struct module *slow_work_unreg_module;
 static struct slow_work *slow_work_unreg_work_item;
 static DECLARE_WAIT_QUEUE_HEAD(slow_work_unreg_wq);
 static DEFINE_MUTEX(slow_work_unreg_sync_lock);
+
+static void slow_work_set_thread_processing(int id, struct slow_work *work)
+{
+	if (work)
+		slow_work_thread_processing[id] = work->owner;
+}
+static void slow_work_done_thread_processing(int id, struct slow_work *work)
+{
+	struct module *module = slow_work_thread_processing[id];
+
+	slow_work_thread_processing[id] = NULL;
+	smp_mb();
+	if (slow_work_unreg_work_item == work ||
+	    slow_work_unreg_module == module)
+		wake_up_all(&slow_work_unreg_wq);
+}
+static void slow_work_clear_thread_processing(int id)
+{
+	slow_work_thread_processing[id] = NULL;
+}
+#else
+static void slow_work_set_thread_processing(int id, struct slow_work *work) {}
+static void slow_work_done_thread_processing(int id, struct slow_work *work) {}
+static void slow_work_clear_thread_processing(int id) {}
 #endif
 
 /*
@@ -197,9 +221,6 @@ static unsigned slow_work_calc_vsmax(void)
  */
 static noinline bool slow_work_execute(int id)
 {
-#ifdef CONFIG_MODULES
-	struct module *module;
-#endif
 	struct slow_work *work = NULL;
 	unsigned vsmax;
 	bool very_slow;
@@ -236,10 +257,7 @@ static noinline bool slow_work_execute(int id)
 		very_slow = false; /* avoid the compiler warning */
 	}
 
-#ifdef CONFIG_MODULES
-	if (work)
-		slow_work_thread_processing[id] = work->owner;
-#endif
+	slow_work_set_thread_processing(id, work);
 	if (work) {
 		slow_work_mark_time(work);
 		slow_work_begin_exec(id, work);
@@ -287,15 +305,7 @@ static noinline bool slow_work_execute(int id)
 
 	/* sort out the race between module unloading and put_ref() */
 	slow_work_put_ref(work);
-
-#ifdef CONFIG_MODULES
-	module = slow_work_thread_processing[id];
-	slow_work_thread_processing[id] = NULL;
-	smp_mb();
-	if (slow_work_unreg_work_item == work ||
-	    slow_work_unreg_module == module)
-		wake_up_all(&slow_work_unreg_wq);
-#endif
+	slow_work_done_thread_processing(id, work);
 
 	return true;
 
@@ -310,7 +320,7 @@ auto_requeue:
 	else
 		list_add_tail(&work->link, &slow_work_queue);
 	spin_unlock_irq(&slow_work_queue_lock);
-	slow_work_thread_processing[id] = NULL;
+	slow_work_clear_thread_processing(id);
 	return true;
 }
 
@@ -943,6 +953,7 @@ EXPORT_SYMBOL(slow_work_register_user);
  */
 static void slow_work_wait_for_items(struct module *module)
 {
+#ifdef CONFIG_MODULES
 	DECLARE_WAITQUEUE(myself, current);
 	struct slow_work *work;
 	int loop;
@@ -989,6 +1000,7 @@ static void slow_work_wait_for_items(struct module *module)
 
 	remove_wait_queue(&slow_work_unreg_wq, &myself);
 	mutex_unlock(&slow_work_unreg_sync_lock);
+#endif /* CONFIG_MODULES */
 }
 
 /**
-- 
cgit v1.2.3


From f13a48bd798a159291ca583b95453171b88b7448 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Tue, 1 Dec 2009 15:36:11 +0000
Subject: SLOW_WORK: Move slow_work's proc file to debugfs

Move slow_work's debugging proc file to debugfs.

Signed-off-by: David Howells <dhowells@redhat.com>
Requested-and-acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/slow-work.txt |   4 +-
 include/linux/slow-work.h   |   8 +-
 init/Kconfig                |   8 +-
 kernel/Makefile             |   2 +-
 kernel/slow-work-debugfs.c  | 227 ++++++++++++++++++++++++++++++++++++++++++++
 kernel/slow-work-proc.c     | 227 --------------------------------------------
 kernel/slow-work.c          |  18 ++--
 kernel/slow-work.h          |   6 +-
 8 files changed, 253 insertions(+), 247 deletions(-)
 create mode 100644 kernel/slow-work-debugfs.c
 delete mode 100644 kernel/slow-work-proc.c

(limited to 'kernel')

diff --git a/Documentation/slow-work.txt b/Documentation/slow-work.txt
index 52bc31433723..9dbf4470c7e1 100644
--- a/Documentation/slow-work.txt
+++ b/Documentation/slow-work.txt
@@ -279,9 +279,9 @@ The slow-work thread pool has a number of configurables:
 VIEWING EXECUTING AND QUEUED ITEMS
 ==================================
 
-If CONFIG_SLOW_WORK_PROC is enabled, a proc file is made available:
+If CONFIG_SLOW_WORK_DEBUG is enabled, a debugfs file is made available:
 
-	/proc/slow_work_rq
+	/sys/kernel/debug/slow_work/runqueue
 
 through which the list of work items being executed and the queues of items to
 be executed may be viewed.  The owner of a work item is given the chance to
diff --git a/include/linux/slow-work.h b/include/linux/slow-work.h
index 5035a2691739..13337bf6c3f5 100644
--- a/include/linux/slow-work.h
+++ b/include/linux/slow-work.h
@@ -20,7 +20,7 @@
 #include <linux/timer.h>
 
 struct slow_work;
-#ifdef CONFIG_SLOW_WORK_PROC
+#ifdef CONFIG_SLOW_WORK_DEBUG
 struct seq_file;
 #endif
 
@@ -42,8 +42,8 @@ struct slow_work_ops {
 	/* execute a work item */
 	void (*execute)(struct slow_work *work);
 
-#ifdef CONFIG_SLOW_WORK_PROC
-	/* describe a work item for /proc */
+#ifdef CONFIG_SLOW_WORK_DEBUG
+	/* describe a work item for debugfs */
 	void (*desc)(struct slow_work *work, struct seq_file *m);
 #endif
 };
@@ -64,7 +64,7 @@ struct slow_work {
 #define SLOW_WORK_DELAYED	5	/* item is struct delayed_slow_work with active timer */
 	const struct slow_work_ops *ops; /* operations table for this item */
 	struct list_head	link;	/* link in queue */
-#ifdef CONFIG_SLOW_WORK_PROC
+#ifdef CONFIG_SLOW_WORK_DEBUG
 	struct timespec		mark;	/* jiffies at which queued or exec begun */
 #endif
 };
diff --git a/init/Kconfig b/init/Kconfig
index ab5c64801fe5..39923ccc287b 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1098,12 +1098,12 @@ config SLOW_WORK
 
 	  See Documentation/slow-work.txt.
 
-config SLOW_WORK_PROC
-	bool "Slow work debugging through /proc"
+config SLOW_WORK_DEBUG
+	bool "Slow work debugging through debugfs"
 	default n
-	depends on SLOW_WORK && PROC_FS
+	depends on SLOW_WORK && DEBUG_FS
 	help
-	  Display the contents of the slow work run queue through /proc,
+	  Display the contents of the slow work run queue through debugfs,
 	  including items currently executing.
 
 	  See Documentation/slow-work.txt.
diff --git a/kernel/Makefile b/kernel/Makefile
index 776ffed1556d..d7c13d249b2d 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -94,7 +94,7 @@ obj-$(CONFIG_X86_DS) += trace/
 obj-$(CONFIG_RING_BUFFER) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
 obj-$(CONFIG_SLOW_WORK) += slow-work.o
-obj-$(CONFIG_SLOW_WORK_PROC) += slow-work-proc.o
+obj-$(CONFIG_SLOW_WORK_DEBUG) += slow-work-debugfs.o
 obj-$(CONFIG_PERF_EVENTS) += perf_event.o
 
 ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
diff --git a/kernel/slow-work-debugfs.c b/kernel/slow-work-debugfs.c
new file mode 100644
index 000000000000..e45c43645298
--- /dev/null
+++ b/kernel/slow-work-debugfs.c
@@ -0,0 +1,227 @@
+/* Slow work debugging
+ *
+ * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/slow-work.h>
+#include <linux/fs.h>
+#include <linux/time.h>
+#include <linux/seq_file.h>
+#include "slow-work.h"
+
+#define ITERATOR_SHIFT		(BITS_PER_LONG - 4)
+#define ITERATOR_SELECTOR	(0xfUL << ITERATOR_SHIFT)
+#define ITERATOR_COUNTER	(~ITERATOR_SELECTOR)
+
+void slow_work_new_thread_desc(struct slow_work *work, struct seq_file *m)
+{
+	seq_puts(m, "Slow-work: New thread");
+}
+
+/*
+ * Render the time mark field on a work item into a 5-char time with units plus
+ * a space
+ */
+static void slow_work_print_mark(struct seq_file *m, struct slow_work *work)
+{
+	struct timespec now, diff;
+
+	now = CURRENT_TIME;
+	diff = timespec_sub(now, work->mark);
+
+	if (diff.tv_sec < 0)
+		seq_puts(m, "  -ve ");
+	else if (diff.tv_sec == 0 && diff.tv_nsec < 1000)
+		seq_printf(m, "%3luns ", diff.tv_nsec);
+	else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000)
+		seq_printf(m, "%3luus ", diff.tv_nsec / 1000);
+	else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000000)
+		seq_printf(m, "%3lums ", diff.tv_nsec / 1000000);
+	else if (diff.tv_sec <= 1)
+		seq_puts(m, "   1s ");
+	else if (diff.tv_sec < 60)
+		seq_printf(m, "%4lus ", diff.tv_sec);
+	else if (diff.tv_sec < 60 * 60)
+		seq_printf(m, "%4lum ", diff.tv_sec / 60);
+	else if (diff.tv_sec < 60 * 60 * 24)
+		seq_printf(m, "%4luh ", diff.tv_sec / 3600);
+	else
+		seq_puts(m, "exces ");
+}
+
+/*
+ * Describe a slow work item for debugfs
+ */
+static int slow_work_runqueue_show(struct seq_file *m, void *v)
+{
+	struct slow_work *work;
+	struct list_head *p = v;
+	unsigned long id;
+
+	switch ((unsigned long) v) {
+	case 1:
+		seq_puts(m, "THR PID   ITEM ADDR        FL MARK  DESC\n");
+		return 0;
+	case 2:
+		seq_puts(m, "=== ===== ================ == ===== ==========\n");
+		return 0;
+
+	case 3 ... 3 + SLOW_WORK_THREAD_LIMIT - 1:
+		id = (unsigned long) v - 3;
+
+		read_lock(&slow_work_execs_lock);
+		work = slow_work_execs[id];
+		if (work) {
+			smp_read_barrier_depends();
+
+			seq_printf(m, "%3lu %5d %16p %2lx ",
+				   id, slow_work_pids[id], work, work->flags);
+			slow_work_print_mark(m, work);
+
+			if (work->ops->desc)
+				work->ops->desc(work, m);
+			seq_putc(m, '\n');
+		}
+		read_unlock(&slow_work_execs_lock);
+		return 0;
+
+	default:
+		work = list_entry(p, struct slow_work, link);
+		seq_printf(m, "%3s     - %16p %2lx ",
+			   work->flags & SLOW_WORK_VERY_SLOW ? "vsq" : "sq",
+			   work, work->flags);
+		slow_work_print_mark(m, work);
+
+		if (work->ops->desc)
+			work->ops->desc(work, m);
+		seq_putc(m, '\n');
+		return 0;
+	}
+}
+
+/*
+ * map the iterator to a work item
+ */
+static void *slow_work_runqueue_index(struct seq_file *m, loff_t *_pos)
+{
+	struct list_head *p;
+	unsigned long count, id;
+
+	switch (*_pos >> ITERATOR_SHIFT) {
+	case 0x0:
+		if (*_pos == 0)
+			*_pos = 1;
+		if (*_pos < 3)
+			return (void *)(unsigned long) *_pos;
+		if (*_pos < 3 + SLOW_WORK_THREAD_LIMIT)
+			for (id = *_pos - 3;
+			     id < SLOW_WORK_THREAD_LIMIT;
+			     id++, (*_pos)++)
+				if (slow_work_execs[id])
+					return (void *)(unsigned long) *_pos;
+		*_pos = 0x1UL << ITERATOR_SHIFT;
+
+	case 0x1:
+		count = *_pos & ITERATOR_COUNTER;
+		list_for_each(p, &slow_work_queue) {
+			if (count == 0)
+				return p;
+			count--;
+		}
+		*_pos = 0x2UL << ITERATOR_SHIFT;
+
+	case 0x2:
+		count = *_pos & ITERATOR_COUNTER;
+		list_for_each(p, &vslow_work_queue) {
+			if (count == 0)
+				return p;
+			count--;
+		}
+		*_pos = 0x3UL << ITERATOR_SHIFT;
+
+	default:
+		return NULL;
+	}
+}
+
+/*
+ * set up the iterator to start reading from the first line
+ */
+static void *slow_work_runqueue_start(struct seq_file *m, loff_t *_pos)
+{
+	spin_lock_irq(&slow_work_queue_lock);
+	return slow_work_runqueue_index(m, _pos);
+}
+
+/*
+ * move to the next line
+ */
+static void *slow_work_runqueue_next(struct seq_file *m, void *v, loff_t *_pos)
+{
+	struct list_head *p = v;
+	unsigned long selector = *_pos >> ITERATOR_SHIFT;
+
+	(*_pos)++;
+	switch (selector) {
+	case 0x0:
+		return slow_work_runqueue_index(m, _pos);
+
+	case 0x1:
+		if (*_pos >> ITERATOR_SHIFT == 0x1) {
+			p = p->next;
+			if (p != &slow_work_queue)
+				return p;
+		}
+		*_pos = 0x2UL << ITERATOR_SHIFT;
+		p = &vslow_work_queue;
+
+	case 0x2:
+		if (*_pos >> ITERATOR_SHIFT == 0x2) {
+			p = p->next;
+			if (p != &vslow_work_queue)
+				return p;
+		}
+		*_pos = 0x3UL << ITERATOR_SHIFT;
+
+	default:
+		return NULL;
+	}
+}
+
+/*
+ * clean up after reading
+ */
+static void slow_work_runqueue_stop(struct seq_file *m, void *v)
+{
+	spin_unlock_irq(&slow_work_queue_lock);
+}
+
+static const struct seq_operations slow_work_runqueue_ops = {
+	.start		= slow_work_runqueue_start,
+	.stop		= slow_work_runqueue_stop,
+	.next		= slow_work_runqueue_next,
+	.show		= slow_work_runqueue_show,
+};
+
+/*
+ * open "/sys/kernel/debug/slow_work/runqueue" to list queue contents
+ */
+static int slow_work_runqueue_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &slow_work_runqueue_ops);
+}
+
+const struct file_operations slow_work_runqueue_fops = {
+	.owner		= THIS_MODULE,
+	.open		= slow_work_runqueue_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
diff --git a/kernel/slow-work-proc.c b/kernel/slow-work-proc.c
deleted file mode 100644
index 3988032571f5..000000000000
--- a/kernel/slow-work-proc.c
+++ /dev/null
@@ -1,227 +0,0 @@
-/* Slow work debugging
- *
- * Copyright (C) 2009 Red Hat, Inc. All Rights Reserved.
- * Written by David Howells (dhowells@redhat.com)
- *
- * This program is free software; you can redistribute it and/or
- * modify it under the terms of the GNU General Public Licence
- * as published by the Free Software Foundation; either version
- * 2 of the Licence, or (at your option) any later version.
- */
-
-#include <linux/module.h>
-#include <linux/slow-work.h>
-#include <linux/fs.h>
-#include <linux/time.h>
-#include <linux/seq_file.h>
-#include "slow-work.h"
-
-#define ITERATOR_SHIFT		(BITS_PER_LONG - 4)
-#define ITERATOR_SELECTOR	(0xfUL << ITERATOR_SHIFT)
-#define ITERATOR_COUNTER	(~ITERATOR_SELECTOR)
-
-void slow_work_new_thread_desc(struct slow_work *work, struct seq_file *m)
-{
-	seq_puts(m, "Slow-work: New thread");
-}
-
-/*
- * Render the time mark field on a work item into a 5-char time with units plus
- * a space
- */
-static void slow_work_print_mark(struct seq_file *m, struct slow_work *work)
-{
-	struct timespec now, diff;
-
-	now = CURRENT_TIME;
-	diff = timespec_sub(now, work->mark);
-
-	if (diff.tv_sec < 0)
-		seq_puts(m, "  -ve ");
-	else if (diff.tv_sec == 0 && diff.tv_nsec < 1000)
-		seq_printf(m, "%3luns ", diff.tv_nsec);
-	else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000)
-		seq_printf(m, "%3luus ", diff.tv_nsec / 1000);
-	else if (diff.tv_sec == 0 && diff.tv_nsec < 1000000000)
-		seq_printf(m, "%3lums ", diff.tv_nsec / 1000000);
-	else if (diff.tv_sec <= 1)
-		seq_puts(m, "   1s ");
-	else if (diff.tv_sec < 60)
-		seq_printf(m, "%4lus ", diff.tv_sec);
-	else if (diff.tv_sec < 60 * 60)
-		seq_printf(m, "%4lum ", diff.tv_sec / 60);
-	else if (diff.tv_sec < 60 * 60 * 24)
-		seq_printf(m, "%4luh ", diff.tv_sec / 3600);
-	else
-		seq_puts(m, "exces ");
-}
-
-/*
- * Describe a slow work item for /proc
- */
-static int slow_work_runqueue_show(struct seq_file *m, void *v)
-{
-	struct slow_work *work;
-	struct list_head *p = v;
-	unsigned long id;
-
-	switch ((unsigned long) v) {
-	case 1:
-		seq_puts(m, "THR PID   ITEM ADDR        FL MARK  DESC\n");
-		return 0;
-	case 2:
-		seq_puts(m, "=== ===== ================ == ===== ==========\n");
-		return 0;
-
-	case 3 ... 3 + SLOW_WORK_THREAD_LIMIT - 1:
-		id = (unsigned long) v - 3;
-
-		read_lock(&slow_work_execs_lock);
-		work = slow_work_execs[id];
-		if (work) {
-			smp_read_barrier_depends();
-
-			seq_printf(m, "%3lu %5d %16p %2lx ",
-				   id, slow_work_pids[id], work, work->flags);
-			slow_work_print_mark(m, work);
-
-			if (work->ops->desc)
-				work->ops->desc(work, m);
-			seq_putc(m, '\n');
-		}
-		read_unlock(&slow_work_execs_lock);
-		return 0;
-
-	default:
-		work = list_entry(p, struct slow_work, link);
-		seq_printf(m, "%3s     - %16p %2lx ",
-			   work->flags & SLOW_WORK_VERY_SLOW ? "vsq" : "sq",
-			   work, work->flags);
-		slow_work_print_mark(m, work);
-
-		if (work->ops->desc)
-			work->ops->desc(work, m);
-		seq_putc(m, '\n');
-		return 0;
-	}
-}
-
-/*
- * map the iterator to a work item
- */
-static void *slow_work_runqueue_index(struct seq_file *m, loff_t *_pos)
-{
-	struct list_head *p;
-	unsigned long count, id;
-
-	switch (*_pos >> ITERATOR_SHIFT) {
-	case 0x0:
-		if (*_pos == 0)
-			*_pos = 1;
-		if (*_pos < 3)
-			return (void *)(unsigned long) *_pos;
-		if (*_pos < 3 + SLOW_WORK_THREAD_LIMIT)
-			for (id = *_pos - 3;
-			     id < SLOW_WORK_THREAD_LIMIT;
-			     id++, (*_pos)++)
-				if (slow_work_execs[id])
-					return (void *)(unsigned long) *_pos;
-		*_pos = 0x1UL << ITERATOR_SHIFT;
-
-	case 0x1:
-		count = *_pos & ITERATOR_COUNTER;
-		list_for_each(p, &slow_work_queue) {
-			if (count == 0)
-				return p;
-			count--;
-		}
-		*_pos = 0x2UL << ITERATOR_SHIFT;
-
-	case 0x2:
-		count = *_pos & ITERATOR_COUNTER;
-		list_for_each(p, &vslow_work_queue) {
-			if (count == 0)
-				return p;
-			count--;
-		}
-		*_pos = 0x3UL << ITERATOR_SHIFT;
-
-	default:
-		return NULL;
-	}
-}
-
-/*
- * set up the iterator to start reading from the first line
- */
-static void *slow_work_runqueue_start(struct seq_file *m, loff_t *_pos)
-{
-	spin_lock_irq(&slow_work_queue_lock);
-	return slow_work_runqueue_index(m, _pos);
-}
-
-/*
- * move to the next line
- */
-static void *slow_work_runqueue_next(struct seq_file *m, void *v, loff_t *_pos)
-{
-	struct list_head *p = v;
-	unsigned long selector = *_pos >> ITERATOR_SHIFT;
-
-	(*_pos)++;
-	switch (selector) {
-	case 0x0:
-		return slow_work_runqueue_index(m, _pos);
-
-	case 0x1:
-		if (*_pos >> ITERATOR_SHIFT == 0x1) {
-			p = p->next;
-			if (p != &slow_work_queue)
-				return p;
-		}
-		*_pos = 0x2UL << ITERATOR_SHIFT;
-		p = &vslow_work_queue;
-
-	case 0x2:
-		if (*_pos >> ITERATOR_SHIFT == 0x2) {
-			p = p->next;
-			if (p != &vslow_work_queue)
-				return p;
-		}
-		*_pos = 0x3UL << ITERATOR_SHIFT;
-
-	default:
-		return NULL;
-	}
-}
-
-/*
- * clean up after reading
- */
-static void slow_work_runqueue_stop(struct seq_file *m, void *v)
-{
-	spin_unlock_irq(&slow_work_queue_lock);
-}
-
-static const struct seq_operations slow_work_runqueue_ops = {
-	.start		= slow_work_runqueue_start,
-	.stop		= slow_work_runqueue_stop,
-	.next		= slow_work_runqueue_next,
-	.show		= slow_work_runqueue_show,
-};
-
-/*
- * open "/proc/slow_work_rq" to list queue contents
- */
-static int slow_work_runqueue_open(struct inode *inode, struct file *file)
-{
-	return seq_open(file, &slow_work_runqueue_ops);
-}
-
-const struct file_operations slow_work_runqueue_fops = {
-	.owner		= THIS_MODULE,
-	.open		= slow_work_runqueue_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index b5c17f15f9de..00889bd3c590 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -16,7 +16,7 @@
 #include <linux/kthread.h>
 #include <linux/freezer.h>
 #include <linux/wait.h>
-#include <linux/proc_fs.h>
+#include <linux/debugfs.h>
 #include "slow-work.h"
 
 static void slow_work_cull_timeout(unsigned long);
@@ -138,7 +138,7 @@ static void slow_work_clear_thread_processing(int id) {}
 /*
  * Data for tracking currently executing items for indication through /proc
  */
-#ifdef CONFIG_SLOW_WORK_PROC
+#ifdef CONFIG_SLOW_WORK_DEBUG
 struct slow_work *slow_work_execs[SLOW_WORK_THREAD_LIMIT];
 pid_t slow_work_pids[SLOW_WORK_THREAD_LIMIT];
 DEFINE_RWLOCK(slow_work_execs_lock);
@@ -823,7 +823,7 @@ static void slow_work_new_thread_execute(struct slow_work *work)
 static const struct slow_work_ops slow_work_new_thread_ops = {
 	.owner		= THIS_MODULE,
 	.execute	= slow_work_new_thread_execute,
-#ifdef CONFIG_SLOW_WORK_PROC
+#ifdef CONFIG_SLOW_WORK_DEBUG
 	.desc		= slow_work_new_thread_desc,
 #endif
 };
@@ -1055,9 +1055,15 @@ static int __init init_slow_work(void)
 	if (slow_work_max_max_threads < nr_cpus * 2)
 		slow_work_max_max_threads = nr_cpus * 2;
 #endif
-#ifdef CONFIG_SLOW_WORK_PROC
-	proc_create("slow_work_rq", S_IFREG | 0400, NULL,
-		    &slow_work_runqueue_fops);
+#ifdef CONFIG_SLOW_WORK_DEBUG
+	{
+		struct dentry *dbdir;
+
+		dbdir = debugfs_create_dir("slow_work", NULL);
+		if (dbdir && !IS_ERR(dbdir))
+			debugfs_create_file("runqueue", S_IFREG | 0400, dbdir,
+					    NULL, &slow_work_runqueue_fops);
+	}
 #endif
 	return 0;
 }
diff --git a/kernel/slow-work.h b/kernel/slow-work.h
index 3c2f007f3ad6..321f3c59d732 100644
--- a/kernel/slow-work.h
+++ b/kernel/slow-work.h
@@ -19,7 +19,7 @@
 /*
  * slow-work.c
  */
-#ifdef CONFIG_SLOW_WORK_PROC
+#ifdef CONFIG_SLOW_WORK_DEBUG
 extern struct slow_work *slow_work_execs[];
 extern pid_t slow_work_pids[];
 extern rwlock_t slow_work_execs_lock;
@@ -30,9 +30,9 @@ extern struct list_head vslow_work_queue;
 extern spinlock_t slow_work_queue_lock;
 
 /*
- * slow-work-proc.c
+ * slow-work-debugfs.c
  */
-#ifdef CONFIG_SLOW_WORK_PROC
+#ifdef CONFIG_SLOW_WORK_DEBUG
 extern const struct file_operations slow_work_runqueue_fops;
 
 extern void slow_work_new_thread_desc(struct slow_work *, struct seq_file *);
-- 
cgit v1.2.3


From 35dead4235e2b67da7275b4122fed37099c2f462 Mon Sep 17 00:00:00 2001
From: Helge Deller <deller@gmx.de>
Date: Thu, 3 Dec 2009 00:29:15 +0100
Subject: modules: don't export section names of empty sections via sysfs

On the parisc architecture we face for each and every loaded kernel module
this kernel "badness warning":
  sysfs: cannot create duplicate filename '/module/ac97_bus/sections/.text'
  Badness at fs/sysfs/dir.c:487

Reason for that is, that on parisc all kernel modules do have multiple
.text sections due to the usage of the -ffunction-sections compiler flag
which is needed to reach all jump targets on this platform.

An objdump on such a kernel module gives:
Sections:
Idx Name          Size      VMA       LMA       File off  Algn
  0 .note.gnu.build-id 00000024  00000000  00000000  00000034  2**2
                  CONTENTS, ALLOC, LOAD, READONLY, DATA
  1 .text         00000000  00000000  00000000  00000058  2**0
                  CONTENTS, ALLOC, LOAD, READONLY, CODE
  2 .text.ac97_bus_match 0000001c  00000000  00000000  00000058  2**2
                  CONTENTS, ALLOC, LOAD, READONLY, CODE
  3 .text         00000000  00000000  00000000  000000d4  2**0
                  CONTENTS, ALLOC, LOAD, READONLY, CODE
...
Since the .text sections are empty (size of 0 bytes) and won't be
loaded by the kernel module loader anyway, I don't see a reason
why such sections need to be listed under
/sys/module/<module_name>/sections/<section_name> either.

The attached patch does solve this issue by not exporting section
names which are empty.

This fixes bugzilla http://bugzilla.kernel.org/show_bug.cgi?id=14703

Signed-off-by: Helge Deller <deller@gmx.de>
CC: rusty@rustcorp.com.au
CC: akpm@linux-foundation.org
CC: James.Bottomley@HansenPartnership.com
CC: roland@redhat.com
CC: dave@hiauly1.hia.nrc.ca
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/module.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 8b7d8805819d..5842a71cf052 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1187,7 +1187,8 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
 
 	/* Count loaded sections and allocate structures */
 	for (i = 0; i < nsect; i++)
-		if (sechdrs[i].sh_flags & SHF_ALLOC)
+		if (sechdrs[i].sh_flags & SHF_ALLOC
+		    && sechdrs[i].sh_size)
 			nloaded++;
 	size[0] = ALIGN(sizeof(*sect_attrs)
 			+ nloaded * sizeof(sect_attrs->attrs[0]),
@@ -1207,6 +1208,8 @@ static void add_sect_attrs(struct module *mod, unsigned int nsect,
 	for (i = 0; i < nsect; i++) {
 		if (! (sechdrs[i].sh_flags & SHF_ALLOC))
 			continue;
+		if (!sechdrs[i].sh_size)
+			continue;
 		sattr->address = sechdrs[i].sh_addr;
 		sattr->name = kstrdup(secstrings + sechdrs[i].sh_name,
 					GFP_KERNEL);
-- 
cgit v1.2.3