From 517a92c4e19fcea815332d3155e9fb7723251274 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 14 Feb 2008 09:02:13 +0100
Subject: panic: print more informative messages on stackprotect failure

pointed out by pageexec@freemail.hu:

we just simply panic() when there's a stackprotector attack - giving
the attacked person no information about what kernel code the attack went
against.

print out the attacked function.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/panic.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/panic.c b/kernel/panic.c
index 425567f45b9f..f236001cc4db 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -327,7 +327,8 @@ EXPORT_SYMBOL(warn_on_slowpath);
  */
 void __stack_chk_fail(void)
 {
-	panic("stack-protector: Kernel stack is corrupted");
+	panic("stack-protector: Kernel stack is corrupted in: %p\n",
+		__builtin_return_address(0));
 }
 EXPORT_SYMBOL(__stack_chk_fail);
 #endif
-- 
cgit v1.2.3


From 5cb273013e182a35e7db614d3e20a144cba71e53 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 14 Feb 2008 09:07:01 +0100
Subject: panic: print out stacktrace if DEBUG_BUGVERBOSE

if CONFIG_DEBUG_BUGVERBOSE is set then the user most definitely wanted
to see as much information about kernel crashes as possible - so give
them at least a stack dump.

this is particularly useful for stackprotector related panics, where
the stacktrace can give us the exact location of the (attempted)
attack.

Pointed out by pageexec@freemail.hu in the stackprotector breakage
threads.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/panic.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/panic.c b/kernel/panic.c
index f236001cc4db..17aad578a2f2 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -80,6 +80,9 @@ NORET_TYPE void panic(const char * fmt, ...)
 	vsnprintf(buf, sizeof(buf), fmt, args);
 	va_end(args);
 	printk(KERN_EMERG "Kernel panic - not syncing: %s\n",buf);
+#ifdef CONFIG_DEBUG_BUGVERBOSE
+	dump_stack();
+#endif
 	bust_spinlocks(0);
 
 	/*
-- 
cgit v1.2.3


From 54371a43a66f4477889769b4fa00df936855dc8f Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Fri, 15 Feb 2008 15:33:12 -0800
Subject: x86: add CONFIG_CC_STACKPROTECTOR self-test

This patch adds a simple self-test capability to the stackprotector
feature. The test deliberately overflows a stack buffer and then
checks if the canary trap function gets called.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/panic.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)

(limited to 'kernel')

diff --git a/kernel/panic.c b/kernel/panic.c
index 17aad578a2f2..50cf9257b234 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -324,14 +324,82 @@ EXPORT_SYMBOL(warn_on_slowpath);
 #endif
 
 #ifdef CONFIG_CC_STACKPROTECTOR
+
+static unsigned long __stack_check_testing;
+/*
+ * Self test function for the stack-protector feature.
+ * This test requires that the local variable absolutely has
+ * a stack slot, hence the barrier()s.
+ */
+static noinline void __stack_chk_test_func(void)
+{
+	unsigned long foo;
+	barrier();
+	/*
+	 * we need to make sure we're not about to clobber the return address,
+	 * while real exploits do this, it's unhealthy on a running system.
+	 * Besides, if we would, the test is already failed anyway so
+	 * time to pull the emergency brake on it.
+	 */
+	if ((unsigned long)__builtin_return_address(0) == 
+					*(((unsigned long *)&foo)+1)) {
+		printk(KERN_ERR "No -fstack-protector-stack-frame!\n");
+		return;
+	}
+#ifdef CONFIG_FRAME_POINTER
+	/* We also don't want to clobber the frame pointer */
+	if ((unsigned long)__builtin_return_address(0) == 
+					*(((unsigned long *)&foo)+2)) {
+		printk(KERN_ERR "No -fstack-protector-stack-frame!\n");
+		return;
+	}
+#endif
+	barrier();
+	if (current->stack_canary == *(((unsigned long *)&foo)+1))
+		*(((unsigned long *)&foo)+1) = 0;
+	else
+		printk(KERN_ERR "No -fstack-protector canary found\n");
+	barrier();
+}
+
+static int __stack_chk_test(void)
+{
+	printk(KERN_INFO "Testing -fstack-protector-all feature\n");
+	__stack_check_testing = (unsigned long)&__stack_chk_test_func;
+	__stack_chk_test_func();
+	if (__stack_check_testing) {
+		printk(KERN_ERR "-fstack-protector-all test failed\n");
+		WARN_ON(1);
+	}
+	return 0;
+}
 /*
  * Called when gcc's -fstack-protector feature is used, and
  * gcc detects corruption of the on-stack canary value
  */
 void __stack_chk_fail(void)
 {
+	if (__stack_check_testing == (unsigned long)&__stack_chk_test_func) {
+		long delta;
+
+		delta = (unsigned long)__builtin_return_address(0) -
+				__stack_check_testing;
+		/*
+		 * The test needs to happen inside the test function, so
+		 * check if the return address is close to that function.
+		 * The function is only 2 dozen bytes long, but keep a wide
+		 * safety margin to avoid panic()s for normal users regardless
+		 * of the quality of the compiler.
+		 */
+		if (delta >= 0 && delta <= 400) {
+			__stack_check_testing = 0;
+			return;
+		}
+	}
 	panic("stack-protector: Kernel stack is corrupted in: %p\n",
 		__builtin_return_address(0));
 }
 EXPORT_SYMBOL(__stack_chk_fail);
+
+late_initcall(__stack_chk_test);
 #endif
-- 
cgit v1.2.3


From b719ac56c0032bc1602914c6ea70b0f1581b08c7 Mon Sep 17 00:00:00 2001
From: Daniel Walker <dwalker@mvista.com>
Date: Mon, 14 Apr 2008 10:03:50 -0700
Subject: panic.c: fix whitespace additions

trivial: remove white space addition in stack protector

Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/panic.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/panic.c b/kernel/panic.c
index 50cf9257b234..866be9b72e4f 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -341,14 +341,14 @@ static noinline void __stack_chk_test_func(void)
 	 * Besides, if we would, the test is already failed anyway so
 	 * time to pull the emergency brake on it.
 	 */
-	if ((unsigned long)__builtin_return_address(0) == 
+	if ((unsigned long)__builtin_return_address(0) ==
 					*(((unsigned long *)&foo)+1)) {
 		printk(KERN_ERR "No -fstack-protector-stack-frame!\n");
 		return;
 	}
 #ifdef CONFIG_FRAME_POINTER
 	/* We also don't want to clobber the frame pointer */
-	if ((unsigned long)__builtin_return_address(0) == 
+	if ((unsigned long)__builtin_return_address(0) ==
 					*(((unsigned long *)&foo)+2)) {
 		printk(KERN_ERR "No -fstack-protector-stack-frame!\n");
 		return;
-- 
cgit v1.2.3


From b40a4392a3c262e0d1b5379b4e142a8eefa63439 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Fri, 18 Apr 2008 06:16:45 -0700
Subject: stackprotector: turn not having the right gcc into a #warning

If the user selects the stack-protector config option, but does not have
a gcc that has the right bits enabled (for example because it isn't build
with a glibc that supports TLS, as is common for cross-compilers, but also
because it may be too old), then the runtime test fails right now.

This patch adds a warning message for this scenario. This warning accomplishes
two goals
1) the user is informed that the security option he selective isn't available
2) the user is suggested to turn of the CONFIG option that won't work for him,
   and would make the runtime test fail anyway.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/Makefile | 2 +-
 kernel/panic.c    | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index 3cff3c894cf3..c3e0eeeb1dd2 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -73,7 +73,7 @@ else
 
         stackp := $(CONFIG_SHELL) $(srctree)/scripts/gcc-x86_64-has-stack-protector.sh
         stackp-$(CONFIG_CC_STACKPROTECTOR) := $(shell $(stackp) \
-                "$(CC)" -fstack-protector )
+                "$(CC)" "-fstack-protector -DGCC_HAS_SP" )
         stackp-$(CONFIG_CC_STACKPROTECTOR_ALL) += $(shell $(stackp) \
                 "$(CC)" -fstack-protector-all )
 
diff --git a/kernel/panic.c b/kernel/panic.c
index 866be9b72e4f..6729e3f4ebcb 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -325,6 +325,9 @@ EXPORT_SYMBOL(warn_on_slowpath);
 
 #ifdef CONFIG_CC_STACKPROTECTOR
 
+#ifndef GCC_HAS_SP
+#warning You have selected the CONFIG_CC_STACKPROTECTOR option, but the gcc used does not support this.
+#endif
 static unsigned long __stack_check_testing;
 /*
  * Self test function for the stack-protector feature.
-- 
cgit v1.2.3


From 7c9f8861e6c9c839f913e49b98c3854daca18f27 Mon Sep 17 00:00:00 2001
From: Eric Sandeen <sandeen@sandeen.net>
Date: Tue, 22 Apr 2008 16:38:23 -0500
Subject: stackprotector: use canary at end of stack to indicate overruns at
 oops time

(Updated with a common max-stack-used checker that knows about
the canary, as suggested by Joe Perches)

Use a canary at the end of the stack to clearly indicate
at oops time whether the stack has ever overflowed.

This is a very simple implementation with a couple of
drawbacks:

1) a thread may legitimately use exactly up to the last
   word on the stack

 -- but the chances of doing this and then oopsing later seem slim

2) it's possible that the stack usage isn't dense enough
   that the canary location could get skipped over

 -- but the worst that happens is that we don't flag the overrun
 -- though this happens fairly often in my testing :(

With the code in place, an intentionally-bloated stack oops might
do:

BUG: unable to handle kernel paging request at ffff8103f84cc680
IP: [<ffffffff810253df>] update_curr+0x9a/0xa8
PGD 8063 PUD 0
Thread overran stack or stack corrupted
Oops: 0000 [1] SMP
CPU 0
...

... unless the stack overrun is so bad that it corrupts some other
thread.

Signed-off-by: Eric Sandeen <sandeen@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 arch/x86/mm/fault.c   |  7 +++++++
 include/linux/magic.h |  1 +
 include/linux/sched.h | 13 +++++++++++++
 kernel/exit.c         |  5 +----
 kernel/fork.c         |  5 +++++
 kernel/sched.c        |  7 +------
 6 files changed, 28 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index fd7e1798c75a..1f524df68b96 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -25,6 +25,7 @@
 #include <linux/kprobes.h>
 #include <linux/uaccess.h>
 #include <linux/kdebug.h>
+#include <linux/magic.h>
 
 #include <asm/system.h>
 #include <asm/desc.h>
@@ -581,6 +582,8 @@ void __kprobes do_page_fault(struct pt_regs *regs, unsigned long error_code)
 	unsigned long address;
 	int write, si_code;
 	int fault;
+	unsigned long *stackend;
+
 #ifdef CONFIG_X86_64
 	unsigned long flags;
 #endif
@@ -850,6 +853,10 @@ no_context:
 
 	show_fault_oops(regs, error_code, address);
 
+ 	stackend = end_of_stack(tsk);
+	if (*stackend != STACK_END_MAGIC)
+		printk(KERN_ALERT "Thread overran stack, or stack corrupted\n");
+
 	tsk->thread.cr2 = address;
 	tsk->thread.trap_no = 14;
 	tsk->thread.error_code = error_code;
diff --git a/include/linux/magic.h b/include/linux/magic.h
index 1fa0c2ce4dec..74e68e201166 100644
--- a/include/linux/magic.h
+++ b/include/linux/magic.h
@@ -42,4 +42,5 @@
 #define FUTEXFS_SUPER_MAGIC	0xBAD1DEA
 #define INOTIFYFS_SUPER_MAGIC	0x2BAD1DEA
 
+#define STACK_END_MAGIC		0x57AC6E9D
 #endif /* __LINUX_MAGIC_H__ */
diff --git a/include/linux/sched.h b/include/linux/sched.h
index d6a515158783..c5181e77f305 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1969,6 +1969,19 @@ static inline unsigned long *end_of_stack(struct task_struct *p)
 
 extern void thread_info_cache_init(void);
 
+#ifdef CONFIG_DEBUG_STACK_USAGE
+static inline unsigned long stack_not_used(struct task_struct *p)
+{
+	unsigned long *n = end_of_stack(p);
+
+	do { 	/* Skip over canary */
+		n++;
+	} while (!*n);
+
+	return (unsigned long)n - (unsigned long)end_of_stack(p);
+}
+#endif
+
 /* set thread flags in other task's structures
  * - see asm/thread_info.h for TIF_xxxx flags available
  */
diff --git a/kernel/exit.c b/kernel/exit.c
index 8f6185e69b69..fb8de6cbf2c7 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -899,12 +899,9 @@ static void check_stack_usage(void)
 {
 	static DEFINE_SPINLOCK(low_water_lock);
 	static int lowest_to_date = THREAD_SIZE;
-	unsigned long *n = end_of_stack(current);
 	unsigned long free;
 
-	while (*n == 0)
-		n++;
-	free = (unsigned long)n - (unsigned long)end_of_stack(current);
+	free = stack_not_used(current);
 
 	if (free >= lowest_to_date)
 		return;
diff --git a/kernel/fork.c b/kernel/fork.c
index 19908b26cf80..d428336e7aa1 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -54,6 +54,7 @@
 #include <linux/tty.h>
 #include <linux/proc_fs.h>
 #include <linux/blkdev.h>
+#include <linux/magic.h>
 
 #include <asm/pgtable.h>
 #include <asm/pgalloc.h>
@@ -186,6 +187,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 {
 	struct task_struct *tsk;
 	struct thread_info *ti;
+	unsigned long *stackend;
+
 	int err;
 
 	prepare_to_copy(orig);
@@ -211,6 +214,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
 		goto out;
 
 	setup_thread_stack(tsk, orig);
+	stackend = end_of_stack(tsk);
+	*stackend = STACK_END_MAGIC;	/* for overflow detection */
 
 #ifdef CONFIG_CC_STACKPROTECTOR
 	tsk->stack_canary = get_random_int();
diff --git a/kernel/sched.c b/kernel/sched.c
index cfa222a91539..a964ed945094 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5748,12 +5748,7 @@ void sched_show_task(struct task_struct *p)
 		printk(KERN_CONT " %016lx ", thread_saved_pc(p));
 #endif
 #ifdef CONFIG_DEBUG_STACK_USAGE
-	{
-		unsigned long *n = end_of_stack(p);
-		while (!*n)
-			n++;
-		free = (unsigned long)n - (unsigned long)end_of_stack(p);
-	}
+	free = stack_not_used(p);
 #endif
 	printk(KERN_CONT "%5lu %5d %6d\n", free,
 		task_pid_nr(p), task_pid_nr(p->real_parent));
-- 
cgit v1.2.3


From aa92db14270b79f0f91a9060b547a46f9e2639da Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Fri, 11 Jul 2008 05:09:55 -0700
Subject: stackprotector: better self-test

check stackprotector functionality by manipulating the canary briefly
during bootup.

far more robust than trying to overflow the stack. (which is architecture
dependent, etc.)

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/panic.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/panic.c b/kernel/panic.c
index 6729e3f4ebcb..28153aec7100 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -347,22 +347,18 @@ static noinline void __stack_chk_test_func(void)
 	if ((unsigned long)__builtin_return_address(0) ==
 					*(((unsigned long *)&foo)+1)) {
 		printk(KERN_ERR "No -fstack-protector-stack-frame!\n");
-		return;
 	}
 #ifdef CONFIG_FRAME_POINTER
 	/* We also don't want to clobber the frame pointer */
 	if ((unsigned long)__builtin_return_address(0) ==
 					*(((unsigned long *)&foo)+2)) {
 		printk(KERN_ERR "No -fstack-protector-stack-frame!\n");
-		return;
 	}
 #endif
-	barrier();
-	if (current->stack_canary == *(((unsigned long *)&foo)+1))
-		*(((unsigned long *)&foo)+1) = 0;
-	else
+	if (current->stack_canary != *(((unsigned long *)&foo)+1))
 		printk(KERN_ERR "No -fstack-protector canary found\n");
-	barrier();
+
+	current->stack_canary = ~current->stack_canary;
 }
 
 static int __stack_chk_test(void)
@@ -373,7 +369,8 @@ static int __stack_chk_test(void)
 	if (__stack_check_testing) {
 		printk(KERN_ERR "-fstack-protector-all test failed\n");
 		WARN_ON(1);
-	}
+	};
+	current->stack_canary = ~current->stack_canary;
 	return 0;
 }
 /*
-- 
cgit v1.2.3


From af9ff7868f0f76d3364351b1641b9dfa99588e77 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sat, 12 Jul 2008 09:36:38 -0700
Subject: x86: simplify stackprotector self-check

Clean up the code by removing no longer needed code;
make sure the pda is updated and kept in sync

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/asm-x86/pda.h |  1 +
 kernel/panic.c        | 29 +++++++----------------------
 2 files changed, 8 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/include/asm-x86/pda.h b/include/asm-x86/pda.h
index 62b734986a44..a5ff5bb76299 100644
--- a/include/asm-x86/pda.h
+++ b/include/asm-x86/pda.h
@@ -131,4 +131,5 @@ do {									\
 
 #define PDA_STACKOFFSET (5*8)
 
+#define refresh_stack_canary() write_pda(stack_canary, current->stack_canary)
 #endif
diff --git a/kernel/panic.c b/kernel/panic.c
index 28153aec7100..87445a894c3a 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -328,37 +328,21 @@ EXPORT_SYMBOL(warn_on_slowpath);
 #ifndef GCC_HAS_SP
 #warning You have selected the CONFIG_CC_STACKPROTECTOR option, but the gcc used does not support this.
 #endif
+
 static unsigned long __stack_check_testing;
+
 /*
  * Self test function for the stack-protector feature.
  * This test requires that the local variable absolutely has
- * a stack slot, hence the barrier()s.
+ * a stack slot.
  */
 static noinline void __stack_chk_test_func(void)
 {
-	unsigned long foo;
-	barrier();
-	/*
-	 * we need to make sure we're not about to clobber the return address,
-	 * while real exploits do this, it's unhealthy on a running system.
-	 * Besides, if we would, the test is already failed anyway so
-	 * time to pull the emergency brake on it.
-	 */
-	if ((unsigned long)__builtin_return_address(0) ==
-					*(((unsigned long *)&foo)+1)) {
-		printk(KERN_ERR "No -fstack-protector-stack-frame!\n");
-	}
-#ifdef CONFIG_FRAME_POINTER
-	/* We also don't want to clobber the frame pointer */
-	if ((unsigned long)__builtin_return_address(0) ==
-					*(((unsigned long *)&foo)+2)) {
-		printk(KERN_ERR "No -fstack-protector-stack-frame!\n");
-	}
-#endif
-	if (current->stack_canary != *(((unsigned long *)&foo)+1))
-		printk(KERN_ERR "No -fstack-protector canary found\n");
+	unsigned long dummy_buffer[64]; /* force gcc to use the canary */
 
 	current->stack_canary = ~current->stack_canary;
+	refresh_stack_canary();
+	dummy_buffer[3] = 1; /* fool gcc into keeping the variable */
 }
 
 static int __stack_chk_test(void)
@@ -371,6 +355,7 @@ static int __stack_chk_test(void)
 		WARN_ON(1);
 	};
 	current->stack_canary = ~current->stack_canary;
+	refresh_stack_canary();
 	return 0;
 }
 /*
-- 
cgit v1.2.3


From 4f962d4d65923d7b722192e729840cfb79af0a5a Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 13 Jul 2008 21:42:44 +0200
Subject: stackprotector: remove self-test

turns out gcc generates such stackprotector-failure sequences
in certain circumstances:

        movq    -8(%rbp), %rax  # D.16032,
        xorq    %gs:40, %rax    #,
        jne     .L17    #,
        leave
        ret
.L17:
        call    __stack_chk_fail        #
        .size   __stack_chk_test_func, .-__stack_chk_test_func
        .section        .init.text,"ax",@progbits
        .type   panic_setup, @function
panic_setup:
        pushq   %rbp    #

note that there's no jump back to the failing context after the
call to __stack_chk_fail - i.e. it has a ((noreturn)) attribute.

Which is fair enough in the normal case but kills the self-test.
(as we cannot reliably return in the self-test)

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/panic.c | 47 -----------------------------------------------
 1 file changed, 47 deletions(-)

(limited to 'kernel')

diff --git a/kernel/panic.c b/kernel/panic.c
index 87445a894c3a..c35c9eca3eb2 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -329,62 +329,15 @@ EXPORT_SYMBOL(warn_on_slowpath);
 #warning You have selected the CONFIG_CC_STACKPROTECTOR option, but the gcc used does not support this.
 #endif
 
-static unsigned long __stack_check_testing;
-
-/*
- * Self test function for the stack-protector feature.
- * This test requires that the local variable absolutely has
- * a stack slot.
- */
-static noinline void __stack_chk_test_func(void)
-{
-	unsigned long dummy_buffer[64]; /* force gcc to use the canary */
-
-	current->stack_canary = ~current->stack_canary;
-	refresh_stack_canary();
-	dummy_buffer[3] = 1; /* fool gcc into keeping the variable */
-}
-
-static int __stack_chk_test(void)
-{
-	printk(KERN_INFO "Testing -fstack-protector-all feature\n");
-	__stack_check_testing = (unsigned long)&__stack_chk_test_func;
-	__stack_chk_test_func();
-	if (__stack_check_testing) {
-		printk(KERN_ERR "-fstack-protector-all test failed\n");
-		WARN_ON(1);
-	};
-	current->stack_canary = ~current->stack_canary;
-	refresh_stack_canary();
-	return 0;
-}
 /*
  * Called when gcc's -fstack-protector feature is used, and
  * gcc detects corruption of the on-stack canary value
  */
 void __stack_chk_fail(void)
 {
-	if (__stack_check_testing == (unsigned long)&__stack_chk_test_func) {
-		long delta;
-
-		delta = (unsigned long)__builtin_return_address(0) -
-				__stack_check_testing;
-		/*
-		 * The test needs to happen inside the test function, so
-		 * check if the return address is close to that function.
-		 * The function is only 2 dozen bytes long, but keep a wide
-		 * safety margin to avoid panic()s for normal users regardless
-		 * of the quality of the compiler.
-		 */
-		if (delta >= 0 && delta <= 400) {
-			__stack_check_testing = 0;
-			return;
-		}
-	}
 	panic("stack-protector: Kernel stack is corrupted in: %p\n",
 		__builtin_return_address(0));
 }
 EXPORT_SYMBOL(__stack_chk_fail);
 
-late_initcall(__stack_chk_test);
 #endif
-- 
cgit v1.2.3


From 412d0bb553c0227191f1bfd06100f561600bff22 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 24 Dec 2008 01:43:25 +0100
Subject: tracing/function-graph-tracer: strip ending newlines on comments

Impact: tracer output improvement

Ending newlines are appended automatically on comments by the function
graph tracer because the newline needs to be placed after the "*/"
comment characters.

So if the user puts an ending newline, we want to strip it.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_functions_graph.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 4bf39fcae97a..bc7d90850be5 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -592,6 +592,12 @@ print_graph_comment(struct print_entry *trace, struct trace_seq *s,
 	if (ent->flags & TRACE_FLAG_CONT)
 		trace_seq_print_cont(s, iter);
 
+	/* Strip ending newline */
+	if (s->buffer[s->len - 1] == '\n') {
+		s->buffer[s->len - 1] = '\0';
+		s->len--;
+	}
+
 	ret = trace_seq_printf(s, " */\n");
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
-- 
cgit v1.2.3


From 1cc4fff0b360aeffeedb7d6db5089d88dd861700 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Mon, 22 Dec 2008 02:24:48 +0100
Subject: hrtimers: increase clock min delta threshold while interrupt hanging

Impact: avoid timer IRQ hanging slow systems

While using the function graph tracer on a virtualized system, the
hrtimer_interrupt can hang the system on an infinite loop.

This can be caused in several situations:

 - the hardware is very slow and HZ is set too high

 - something intrusive is slowing the system down (tracing under emulation)

... and the next clock events to program are always before the current time.

This patch implements a reasonable compromise: if such a situation is
detected, we share the CPUs time in 1/4 to process the hrtimer interrupts.
This is enough to let the system running without serious starvation.

It has been successfully tested under VirtualBox with 1000 HZ and 100 HZ
with function graph tracer launched. On both cases, the clock events were
increased until about 25 ms periodic ticks, which means 40 HZ.

So we change a hard to debug hang into a warning message and a system that
still manages to limp along.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/hrtimer.c | 30 +++++++++++++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index bda9cb924276..c2a69b89ac61 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1171,6 +1171,29 @@ static void __run_hrtimer(struct hrtimer *timer)
 
 #ifdef CONFIG_HIGH_RES_TIMERS
 
+static int force_clock_reprogram;
+
+/*
+ * After 5 iteration's attempts, we consider that hrtimer_interrupt()
+ * is hanging, which could happen with something that slows the interrupt
+ * such as the tracing. Then we force the clock reprogramming for each future
+ * hrtimer interrupts to avoid infinite loops and use the min_delta_ns
+ * threshold that we will overwrite.
+ * The next tick event will be scheduled to 3 times we currently spend on
+ * hrtimer_interrupt(). This gives a good compromise, the cpus will spend
+ * 1/4 of their time to process the hrtimer interrupts. This is enough to
+ * let it running without serious starvation.
+ */
+
+static inline void
+hrtimer_interrupt_hanging(struct clock_event_device *dev,
+			ktime_t try_time)
+{
+	force_clock_reprogram = 1;
+	dev->min_delta_ns = (unsigned long)try_time.tv64 * 3;
+	printk(KERN_WARNING "hrtimer: interrupt too slow, "
+		"forcing clock min delta to %lu ns\n", dev->min_delta_ns);
+}
 /*
  * High resolution timer interrupt
  * Called with interrupts disabled
@@ -1180,6 +1203,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
 	struct hrtimer_cpu_base *cpu_base = &__get_cpu_var(hrtimer_bases);
 	struct hrtimer_clock_base *base;
 	ktime_t expires_next, now;
+	int nr_retries = 0;
 	int i;
 
 	BUG_ON(!cpu_base->hres_active);
@@ -1187,6 +1211,10 @@ void hrtimer_interrupt(struct clock_event_device *dev)
 	dev->next_event.tv64 = KTIME_MAX;
 
  retry:
+	/* 5 retries is enough to notice a hang */
+	if (!(++nr_retries % 5))
+		hrtimer_interrupt_hanging(dev, ktime_sub(ktime_get(), now));
+
 	now = ktime_get();
 
 	expires_next.tv64 = KTIME_MAX;
@@ -1239,7 +1267,7 @@ void hrtimer_interrupt(struct clock_event_device *dev)
 
 	/* Reprogramming necessary ? */
 	if (expires_next.tv64 != KTIME_MAX) {
-		if (tick_program_event(expires_next, 0))
+		if (tick_program_event(expires_next, force_clock_reprogram))
 			goto retry;
 	}
 }
-- 
cgit v1.2.3


From efdc64f0c792ea744bcc9203f35b908e66d42f41 Mon Sep 17 00:00:00 2001
From: Wang Chen <wangchen@cn.fujitsu.com>
Date: Mon, 29 Dec 2008 13:35:11 +0800
Subject: genirq: check chip->ack before calling

Impact: fix theoretical NULL dereference

The generic irq layer doesn't know whether irq_chip has ack routine on some
architectures or not. Upon that, before calling chip->ack, we should check
that it's not NULL.

Signed-off-by: Wang Chen <wangchen@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/irq/chip.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index 6eb3c7952b64..0ad02d76a0c4 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -290,7 +290,8 @@ static inline void mask_ack_irq(struct irq_desc *desc, int irq)
 		desc->chip->mask_ack(irq);
 	else {
 		desc->chip->mask(irq);
-		desc->chip->ack(irq);
+		if (desc->chip->ack)
+			desc->chip->ack(irq);
 	}
 }
 
@@ -475,7 +476,8 @@ handle_edge_irq(unsigned int irq, struct irq_desc *desc)
 	kstat_incr_irqs_this_cpu(irq, desc);
 
 	/* Start handling the irq */
-	desc->chip->ack(irq);
+	if (desc->chip->ack)
+		desc->chip->ack(irq);
 	desc = irq_remap_to_desc(irq, desc);
 
 	/* Mark the IRQ currently in progress.*/
-- 
cgit v1.2.3


From c47956d9ae3341d2d1998bff26620fa3338c01e4 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 23 Dec 2008 23:24:11 -0500
Subject: ftrace: remove obsolete print continue functionality

Impact: cleanup, remove obsolete code

Now that the ring buffer used by ftrace allows for variable length
entries, we do not need the 'cont' feature of the buffer.  This code
makes other parts of ftrace more complex and by removing this it
simplifies the ftrace code.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c                 | 61 ------------------------------------
 kernel/trace/trace.h                 |  7 -----
 kernel/trace/trace_functions_graph.c |  3 --
 kernel/trace/trace_mmiotrace.c       |  3 --
 4 files changed, 74 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index f4bb3800318b..fca0233f1d73 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1765,43 +1765,6 @@ static int task_state_char(unsigned long state)
 	return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?';
 }
 
-/*
- * The message is supposed to contain an ending newline.
- * If the printing stops prematurely, try to add a newline of our own.
- */
-void trace_seq_print_cont(struct trace_seq *s, struct trace_iterator *iter)
-{
-	struct trace_entry *ent;
-	struct trace_field_cont *cont;
-	bool ok = true;
-
-	ent = peek_next_entry(iter, iter->cpu, NULL);
-	if (!ent || ent->type != TRACE_CONT) {
-		trace_seq_putc(s, '\n');
-		return;
-	}
-
-	do {
-		cont = (struct trace_field_cont *)ent;
-		if (ok)
-			ok = (trace_seq_printf(s, "%s", cont->buf) > 0);
-
-		ftrace_disable_cpu();
-
-		if (iter->buffer_iter[iter->cpu])
-			ring_buffer_read(iter->buffer_iter[iter->cpu], NULL);
-		else
-			ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL);
-
-		ftrace_enable_cpu();
-
-		ent = peek_next_entry(iter, iter->cpu, NULL);
-	} while (ent && ent->type == TRACE_CONT);
-
-	if (!ok)
-		trace_seq_putc(s, '\n');
-}
-
 static void test_cpu_buff_start(struct trace_iterator *iter)
 {
 	struct trace_seq *s = &iter->seq;
@@ -1834,9 +1797,6 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
 	int S, T;
 	int i;
 
-	if (entry->type == TRACE_CONT)
-		return TRACE_TYPE_HANDLED;
-
 	test_cpu_buff_start(iter);
 
 	next_entry = find_next_entry(iter, NULL, &next_ts);
@@ -1922,8 +1882,6 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
 
 		seq_print_ip_sym(s, field->ip, sym_flags);
 		trace_seq_printf(s, ": %s", field->buf);
-		if (entry->flags & TRACE_FLAG_CONT)
-			trace_seq_print_cont(s, iter);
 		break;
 	}
 	case TRACE_BRANCH: {
@@ -1968,9 +1926,6 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
 
 	entry = iter->ent;
 
-	if (entry->type == TRACE_CONT)
-		return TRACE_TYPE_HANDLED;
-
 	test_cpu_buff_start(iter);
 
 	comm = trace_find_cmdline(iter->ent->pid);
@@ -2076,8 +2031,6 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
 
 		seq_print_ip_sym(s, field->ip, sym_flags);
 		trace_seq_printf(s, ": %s", field->buf);
-		if (entry->flags & TRACE_FLAG_CONT)
-			trace_seq_print_cont(s, iter);
 		break;
 	}
 	case TRACE_GRAPH_RET: {
@@ -2124,9 +2077,6 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
 
 	entry = iter->ent;
 
-	if (entry->type == TRACE_CONT)
-		return TRACE_TYPE_HANDLED;
-
 	ret = trace_seq_printf(s, "%d %d %llu ",
 		entry->pid, iter->cpu, iter->ts);
 	if (!ret)
@@ -2187,8 +2137,6 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
 		trace_assign_type(field, entry);
 
 		trace_seq_printf(s, "# %lx %s", field->ip, field->buf);
-		if (entry->flags & TRACE_FLAG_CONT)
-			trace_seq_print_cont(s, iter);
 		break;
 	}
 	}
@@ -2217,9 +2165,6 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
 
 	entry = iter->ent;
 
-	if (entry->type == TRACE_CONT)
-		return TRACE_TYPE_HANDLED;
-
 	SEQ_PUT_HEX_FIELD_RET(s, entry->pid);
 	SEQ_PUT_HEX_FIELD_RET(s, iter->cpu);
 	SEQ_PUT_HEX_FIELD_RET(s, iter->ts);
@@ -2283,9 +2228,6 @@ static enum print_line_t print_printk_msg_only(struct trace_iterator *iter)
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
-	if (entry->flags & TRACE_FLAG_CONT)
-		trace_seq_print_cont(s, iter);
-
 	return TRACE_TYPE_HANDLED;
 }
 
@@ -2296,9 +2238,6 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
 
 	entry = iter->ent;
 
-	if (entry->type == TRACE_CONT)
-		return TRACE_TYPE_HANDLED;
-
 	SEQ_PUT_FIELD_RET(s, entry->pid);
 	SEQ_PUT_FIELD_RET(s, entry->cpu);
 	SEQ_PUT_FIELD_RET(s, iter->ts);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index cc7a4f864036..3a357382cce6 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -16,7 +16,6 @@ enum trace_type {
 	TRACE_FN,
 	TRACE_CTX,
 	TRACE_WAKE,
-	TRACE_CONT,
 	TRACE_STACK,
 	TRACE_PRINT,
 	TRACE_SPECIAL,
@@ -178,7 +177,6 @@ struct trace_power {
  *  NEED_RESCED		- reschedule is requested
  *  HARDIRQ		- inside an interrupt handler
  *  SOFTIRQ		- inside a softirq handler
- *  CONT		- multiple entries hold the trace item
  */
 enum trace_flag_type {
 	TRACE_FLAG_IRQS_OFF		= 0x01,
@@ -186,7 +184,6 @@ enum trace_flag_type {
 	TRACE_FLAG_NEED_RESCHED		= 0x04,
 	TRACE_FLAG_HARDIRQ		= 0x08,
 	TRACE_FLAG_SOFTIRQ		= 0x10,
-	TRACE_FLAG_CONT			= 0x20,
 };
 
 #define TRACE_BUF_SIZE		1024
@@ -262,7 +259,6 @@ extern void __ftrace_bad_type(void);
 	do {								\
 		IF_ASSIGN(var, ent, struct ftrace_entry, TRACE_FN);	\
 		IF_ASSIGN(var, ent, struct ctx_switch_entry, 0);	\
-		IF_ASSIGN(var, ent, struct trace_field_cont, TRACE_CONT); \
 		IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK);	\
 		IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\
 		IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT);	\
@@ -489,9 +485,6 @@ extern int trace_selftest_startup_branch(struct tracer *trace,
 
 extern void *head_page(struct trace_array_cpu *data);
 extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...);
-extern void trace_seq_print_cont(struct trace_seq *s,
-				 struct trace_iterator *iter);
-
 extern int
 seq_print_ip_sym(struct trace_seq *s, unsigned long ip,
 		unsigned long sym_flags);
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index bc7d90850be5..f261966e5b6c 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -589,9 +589,6 @@ print_graph_comment(struct print_entry *trace, struct trace_seq *s,
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
-	if (ent->flags & TRACE_FLAG_CONT)
-		trace_seq_print_cont(s, iter);
-
 	/* Strip ending newline */
 	if (s->buffer[s->len - 1] == '\n') {
 		s->buffer[s->len - 1] = '\0';
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index fffcb069f1dc..83f20ae6bd68 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -262,9 +262,6 @@ static enum print_line_t mmio_print_mark(struct trace_iterator *iter)
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
-	if (entry->flags & TRACE_FLAG_CONT)
-		trace_seq_print_cont(s, iter);
-
 	return TRACE_TYPE_HANDLED;
 }
 
-- 
cgit v1.2.3


From f0868d1e23a8efec33beb3aa688aab7fdb1ae093 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 23 Dec 2008 23:24:12 -0500
Subject: ftrace: set up trace event hash infrastructure

Impact: simplify/generalize/refactor trace.c

The trace.c file is becoming more difficult to maintain due to the
growing number of events. There is several formats that an event may
be printed. This patch sets up the infrastructure of an event hash to
allow for events to register how they should be printed.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/Makefile                |   1 +
 kernel/trace/trace.c                 | 275 +-------------------------
 kernel/trace/trace.h                 |   8 +-
 kernel/trace/trace_boot.c            |   1 +
 kernel/trace/trace_functions_graph.c |   1 +
 kernel/trace/trace_hw_branches.c     |   1 +
 kernel/trace/trace_mmiotrace.c       |   1 +
 kernel/trace/trace_output.c          | 365 +++++++++++++++++++++++++++++++++++
 kernel/trace/trace_output.h          |  43 +++++
 kernel/trace/trace_power.c           |   1 +
 10 files changed, 416 insertions(+), 281 deletions(-)
 create mode 100644 kernel/trace/trace_output.c
 create mode 100644 kernel/trace/trace_output.h

(limited to 'kernel')

diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 349d5a93653f..549f93c9b393 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -19,6 +19,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o
 obj-$(CONFIG_RING_BUFFER) += ring_buffer.o
 
 obj-$(CONFIG_TRACING) += trace.o
+obj-$(CONFIG_TRACING) += trace_output.o
 obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
 obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o
 obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index fca0233f1d73..90ce0c1d437b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -38,6 +38,7 @@
 #include <linux/irqflags.h>
 
 #include "trace.h"
+#include "trace_output.h"
 
 #define TRACE_BUFFER_FLAGS	(RB_FL_OVERWRITE)
 
@@ -330,132 +331,6 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
 	tracing_record_cmdline(current);
 }
 
-/**
- * trace_seq_printf - sequence printing of trace information
- * @s: trace sequence descriptor
- * @fmt: printf format string
- *
- * The tracer may use either sequence operations or its own
- * copy to user routines. To simplify formating of a trace
- * trace_seq_printf is used to store strings into a special
- * buffer (@s). Then the output may be either used by
- * the sequencer or pulled into another buffer.
- */
-int
-trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
-{
-	int len = (PAGE_SIZE - 1) - s->len;
-	va_list ap;
-	int ret;
-
-	if (!len)
-		return 0;
-
-	va_start(ap, fmt);
-	ret = vsnprintf(s->buffer + s->len, len, fmt, ap);
-	va_end(ap);
-
-	/* If we can't write it all, don't bother writing anything */
-	if (ret >= len)
-		return 0;
-
-	s->len += ret;
-
-	return len;
-}
-
-/**
- * trace_seq_puts - trace sequence printing of simple string
- * @s: trace sequence descriptor
- * @str: simple string to record
- *
- * The tracer may use either the sequence operations or its own
- * copy to user routines. This function records a simple string
- * into a special buffer (@s) for later retrieval by a sequencer
- * or other mechanism.
- */
-static int
-trace_seq_puts(struct trace_seq *s, const char *str)
-{
-	int len = strlen(str);
-
-	if (len > ((PAGE_SIZE - 1) - s->len))
-		return 0;
-
-	memcpy(s->buffer + s->len, str, len);
-	s->len += len;
-
-	return len;
-}
-
-static int
-trace_seq_putc(struct trace_seq *s, unsigned char c)
-{
-	if (s->len >= (PAGE_SIZE - 1))
-		return 0;
-
-	s->buffer[s->len++] = c;
-
-	return 1;
-}
-
-static int
-trace_seq_putmem(struct trace_seq *s, void *mem, size_t len)
-{
-	if (len > ((PAGE_SIZE - 1) - s->len))
-		return 0;
-
-	memcpy(s->buffer + s->len, mem, len);
-	s->len += len;
-
-	return len;
-}
-
-#define MAX_MEMHEX_BYTES	8
-#define HEX_CHARS		(MAX_MEMHEX_BYTES*2 + 1)
-
-static int
-trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len)
-{
-	unsigned char hex[HEX_CHARS];
-	unsigned char *data = mem;
-	int i, j;
-
-#ifdef __BIG_ENDIAN
-	for (i = 0, j = 0; i < len; i++) {
-#else
-	for (i = len-1, j = 0; i >= 0; i--) {
-#endif
-		hex[j++] = hex_asc_hi(data[i]);
-		hex[j++] = hex_asc_lo(data[i]);
-	}
-	hex[j++] = ' ';
-
-	return trace_seq_putmem(s, hex, j);
-}
-
-static int
-trace_seq_path(struct trace_seq *s, struct path *path)
-{
-	unsigned char *p;
-
-	if (s->len >= (PAGE_SIZE - 1))
-		return 0;
-	p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len);
-	if (!IS_ERR(p)) {
-		p = mangle_path(s->buffer + s->len, p, "\n");
-		if (p) {
-			s->len = p - s->buffer;
-			return 1;
-		}
-	} else {
-		s->buffer[s->len++] = '?';
-		return 1;
-	}
-
-	return 0;
-}
-
 static void
 trace_seq_reset(struct trace_seq *s)
 {
@@ -1473,154 +1348,6 @@ static void s_stop(struct seq_file *m, void *p)
 	mutex_unlock(&trace_types_lock);
 }
 
-#ifdef CONFIG_KRETPROBES
-static inline const char *kretprobed(const char *name)
-{
-	static const char tramp_name[] = "kretprobe_trampoline";
-	int size = sizeof(tramp_name);
-
-	if (strncmp(tramp_name, name, size) == 0)
-		return "[unknown/kretprobe'd]";
-	return name;
-}
-#else
-static inline const char *kretprobed(const char *name)
-{
-	return name;
-}
-#endif /* CONFIG_KRETPROBES */
-
-static int
-seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
-{
-#ifdef CONFIG_KALLSYMS
-	char str[KSYM_SYMBOL_LEN];
-	const char *name;
-
-	kallsyms_lookup(address, NULL, NULL, NULL, str);
-
-	name = kretprobed(str);
-
-	return trace_seq_printf(s, fmt, name);
-#endif
-	return 1;
-}
-
-static int
-seq_print_sym_offset(struct trace_seq *s, const char *fmt,
-		     unsigned long address)
-{
-#ifdef CONFIG_KALLSYMS
-	char str[KSYM_SYMBOL_LEN];
-	const char *name;
-
-	sprint_symbol(str, address);
-	name = kretprobed(str);
-
-	return trace_seq_printf(s, fmt, name);
-#endif
-	return 1;
-}
-
-#ifndef CONFIG_64BIT
-# define IP_FMT "%08lx"
-#else
-# define IP_FMT "%016lx"
-#endif
-
-int
-seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
-{
-	int ret;
-
-	if (!ip)
-		return trace_seq_printf(s, "0");
-
-	if (sym_flags & TRACE_ITER_SYM_OFFSET)
-		ret = seq_print_sym_offset(s, "%s", ip);
-	else
-		ret = seq_print_sym_short(s, "%s", ip);
-
-	if (!ret)
-		return 0;
-
-	if (sym_flags & TRACE_ITER_SYM_ADDR)
-		ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
-	return ret;
-}
-
-static inline int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
-				    unsigned long ip, unsigned long sym_flags)
-{
-	struct file *file = NULL;
-	unsigned long vmstart = 0;
-	int ret = 1;
-
-	if (mm) {
-		const struct vm_area_struct *vma;
-
-		down_read(&mm->mmap_sem);
-		vma = find_vma(mm, ip);
-		if (vma) {
-			file = vma->vm_file;
-			vmstart = vma->vm_start;
-		}
-		if (file) {
-			ret = trace_seq_path(s, &file->f_path);
-			if (ret)
-				ret = trace_seq_printf(s, "[+0x%lx]", ip - vmstart);
-		}
-		up_read(&mm->mmap_sem);
-	}
-	if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file))
-		ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
-	return ret;
-}
-
-static int
-seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
-		      unsigned long sym_flags)
-{
-	struct mm_struct *mm = NULL;
-	int ret = 1;
-	unsigned int i;
-
-	if (trace_flags & TRACE_ITER_SYM_USEROBJ) {
-		struct task_struct *task;
-		/*
-		 * we do the lookup on the thread group leader,
-		 * since individual threads might have already quit!
-		 */
-		rcu_read_lock();
-		task = find_task_by_vpid(entry->ent.tgid);
-		if (task)
-			mm = get_task_mm(task);
-		rcu_read_unlock();
-	}
-
-	for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
-		unsigned long ip = entry->caller[i];
-
-		if (ip == ULONG_MAX || !ret)
-			break;
-		if (i && ret)
-			ret = trace_seq_puts(s, " <- ");
-		if (!ip) {
-			if (ret)
-				ret = trace_seq_puts(s, "??");
-			continue;
-		}
-		if (!ret)
-			break;
-		if (ret)
-			ret = seq_print_user_ip(s, mm, ip, sym_flags);
-	}
-
-	if (mm)
-		mmput(mm);
-	return ret;
-}
-
 static void print_lat_help_header(struct seq_file *m)
 {
 	seq_puts(m, "#                  _------=> CPU#            \n");
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 3a357382cce6..6bd71fa1e1c7 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -30,7 +30,7 @@ enum trace_type {
 	TRACE_HW_BRANCHES,
 	TRACE_POWER,
 
-	__TRACE_LAST_TYPE
+	__TRACE_LAST_TYPE,
 };
 
 /*
@@ -484,12 +484,6 @@ extern int trace_selftest_startup_branch(struct tracer *trace,
 #endif /* CONFIG_FTRACE_STARTUP_TEST */
 
 extern void *head_page(struct trace_array_cpu *data);
-extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...);
-extern int
-seq_print_ip_sym(struct trace_seq *s, unsigned long ip,
-		unsigned long sym_flags);
-extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
-				 size_t cnt);
 extern long ns2usecs(cycle_t nsec);
 extern int
 trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args);
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index 3ccebde28482..cb2ff3e297b1 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -11,6 +11,7 @@
 #include <linux/kallsyms.h>
 
 #include "trace.h"
+#include "trace_output.h"
 
 static struct trace_array *boot_trace;
 static bool pre_initcalls_finished;
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index f261966e5b6c..f8ac5417afc8 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -12,6 +12,7 @@
 #include <linux/fs.h>
 
 #include "trace.h"
+#include "trace_output.h"
 
 #define TRACE_GRAPH_INDENT	2
 
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index b6a3e20a49a9..879752b006b3 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -14,6 +14,7 @@
 #include <asm/ds.h>
 
 #include "trace.h"
+#include "trace_output.h"
 
 
 #define SIZEOF_BTS (1 << 13)
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 83f20ae6bd68..fcec59ff2355 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -11,6 +11,7 @@
 #include <linux/pci.h>
 
 #include "trace.h"
+#include "trace_output.h"
 
 struct header_iter {
 	struct pci_dev *dev;
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
new file mode 100644
index 000000000000..1f3f80002b5e
--- /dev/null
+++ b/kernel/trace/trace_output.c
@@ -0,0 +1,365 @@
+/*
+ * trace_output.c
+ *
+ * Copyright (C) 2008 Red Hat Inc, Steven Rostedt <srostedt@redhat.com>
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/ftrace.h>
+
+#include "trace_output.h"
+
+/* must be a power of 2 */
+#define EVENT_HASHSIZE	128
+
+static DEFINE_MUTEX(trace_event_mutex);
+static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
+
+static int next_event_type = __TRACE_LAST_TYPE + 1;
+
+/**
+ * trace_seq_printf - sequence printing of trace information
+ * @s: trace sequence descriptor
+ * @fmt: printf format string
+ *
+ * The tracer may use either sequence operations or its own
+ * copy to user routines. To simplify formating of a trace
+ * trace_seq_printf is used to store strings into a special
+ * buffer (@s). Then the output may be either used by
+ * the sequencer or pulled into another buffer.
+ */
+int
+trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
+{
+	int len = (PAGE_SIZE - 1) - s->len;
+	va_list ap;
+	int ret;
+
+	if (!len)
+		return 0;
+
+	va_start(ap, fmt);
+	ret = vsnprintf(s->buffer + s->len, len, fmt, ap);
+	va_end(ap);
+
+	/* If we can't write it all, don't bother writing anything */
+	if (ret >= len)
+		return 0;
+
+	s->len += ret;
+
+	return len;
+}
+
+/**
+ * trace_seq_puts - trace sequence printing of simple string
+ * @s: trace sequence descriptor
+ * @str: simple string to record
+ *
+ * The tracer may use either the sequence operations or its own
+ * copy to user routines. This function records a simple string
+ * into a special buffer (@s) for later retrieval by a sequencer
+ * or other mechanism.
+ */
+int trace_seq_puts(struct trace_seq *s, const char *str)
+{
+	int len = strlen(str);
+
+	if (len > ((PAGE_SIZE - 1) - s->len))
+		return 0;
+
+	memcpy(s->buffer + s->len, str, len);
+	s->len += len;
+
+	return len;
+}
+
+int trace_seq_putc(struct trace_seq *s, unsigned char c)
+{
+	if (s->len >= (PAGE_SIZE - 1))
+		return 0;
+
+	s->buffer[s->len++] = c;
+
+	return 1;
+}
+
+int trace_seq_putmem(struct trace_seq *s, void *mem, size_t len)
+{
+	if (len > ((PAGE_SIZE - 1) - s->len))
+		return 0;
+
+	memcpy(s->buffer + s->len, mem, len);
+	s->len += len;
+
+	return len;
+}
+
+int trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len)
+{
+	unsigned char hex[HEX_CHARS];
+	unsigned char *data = mem;
+	int i, j;
+
+#ifdef __BIG_ENDIAN
+	for (i = 0, j = 0; i < len; i++) {
+#else
+	for (i = len-1, j = 0; i >= 0; i--) {
+#endif
+		hex[j++] = hex_asc_hi(data[i]);
+		hex[j++] = hex_asc_lo(data[i]);
+	}
+	hex[j++] = ' ';
+
+	return trace_seq_putmem(s, hex, j);
+}
+
+int trace_seq_path(struct trace_seq *s, struct path *path)
+{
+	unsigned char *p;
+
+	if (s->len >= (PAGE_SIZE - 1))
+		return 0;
+	p = d_path(path, s->buffer + s->len, PAGE_SIZE - s->len);
+	if (!IS_ERR(p)) {
+		p = mangle_path(s->buffer + s->len, p, "\n");
+		if (p) {
+			s->len = p - s->buffer;
+			return 1;
+		}
+	} else {
+		s->buffer[s->len++] = '?';
+		return 1;
+	}
+
+	return 0;
+}
+
+#ifdef CONFIG_KRETPROBES
+static inline const char *kretprobed(const char *name)
+{
+	static const char tramp_name[] = "kretprobe_trampoline";
+	int size = sizeof(tramp_name);
+
+	if (strncmp(tramp_name, name, size) == 0)
+		return "[unknown/kretprobe'd]";
+	return name;
+}
+#else
+static inline const char *kretprobed(const char *name)
+{
+	return name;
+}
+#endif /* CONFIG_KRETPROBES */
+
+static int
+seq_print_sym_short(struct trace_seq *s, const char *fmt, unsigned long address)
+{
+#ifdef CONFIG_KALLSYMS
+	char str[KSYM_SYMBOL_LEN];
+	const char *name;
+
+	kallsyms_lookup(address, NULL, NULL, NULL, str);
+
+	name = kretprobed(str);
+
+	return trace_seq_printf(s, fmt, name);
+#endif
+	return 1;
+}
+
+static int
+seq_print_sym_offset(struct trace_seq *s, const char *fmt,
+		     unsigned long address)
+{
+#ifdef CONFIG_KALLSYMS
+	char str[KSYM_SYMBOL_LEN];
+	const char *name;
+
+	sprint_symbol(str, address);
+	name = kretprobed(str);
+
+	return trace_seq_printf(s, fmt, name);
+#endif
+	return 1;
+}
+
+#ifndef CONFIG_64BIT
+# define IP_FMT "%08lx"
+#else
+# define IP_FMT "%016lx"
+#endif
+
+int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
+		      unsigned long ip, unsigned long sym_flags)
+{
+	struct file *file = NULL;
+	unsigned long vmstart = 0;
+	int ret = 1;
+
+	if (mm) {
+		const struct vm_area_struct *vma;
+
+		down_read(&mm->mmap_sem);
+		vma = find_vma(mm, ip);
+		if (vma) {
+			file = vma->vm_file;
+			vmstart = vma->vm_start;
+		}
+		if (file) {
+			ret = trace_seq_path(s, &file->f_path);
+			if (ret)
+				ret = trace_seq_printf(s, "[+0x%lx]",
+						       ip - vmstart);
+		}
+		up_read(&mm->mmap_sem);
+	}
+	if (ret && ((sym_flags & TRACE_ITER_SYM_ADDR) || !file))
+		ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
+	return ret;
+}
+
+int
+seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s,
+		      unsigned long sym_flags)
+{
+	struct mm_struct *mm = NULL;
+	int ret = 1;
+	unsigned int i;
+
+	if (trace_flags & TRACE_ITER_SYM_USEROBJ) {
+		struct task_struct *task;
+		/*
+		 * we do the lookup on the thread group leader,
+		 * since individual threads might have already quit!
+		 */
+		rcu_read_lock();
+		task = find_task_by_vpid(entry->ent.tgid);
+		if (task)
+			mm = get_task_mm(task);
+		rcu_read_unlock();
+	}
+
+	for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
+		unsigned long ip = entry->caller[i];
+
+		if (ip == ULONG_MAX || !ret)
+			break;
+		if (i && ret)
+			ret = trace_seq_puts(s, " <- ");
+		if (!ip) {
+			if (ret)
+				ret = trace_seq_puts(s, "??");
+			continue;
+		}
+		if (!ret)
+			break;
+		if (ret)
+			ret = seq_print_user_ip(s, mm, ip, sym_flags);
+	}
+
+	if (mm)
+		mmput(mm);
+	return ret;
+}
+
+int
+seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
+{
+	int ret;
+
+	if (!ip)
+		return trace_seq_printf(s, "0");
+
+	if (sym_flags & TRACE_ITER_SYM_OFFSET)
+		ret = seq_print_sym_offset(s, "%s", ip);
+	else
+		ret = seq_print_sym_short(s, "%s", ip);
+
+	if (!ret)
+		return 0;
+
+	if (sym_flags & TRACE_ITER_SYM_ADDR)
+		ret = trace_seq_printf(s, " <" IP_FMT ">", ip);
+	return ret;
+}
+
+/**
+ * ftrace_find_event - find a registered event
+ * @type: the type of event to look for
+ *
+ * Returns an event of type @type otherwise NULL
+ */
+struct trace_event *ftrace_find_event(int type)
+{
+	struct trace_event *event;
+	struct hlist_node *n;
+	unsigned key;
+
+	key = type & (EVENT_HASHSIZE - 1);
+
+	hlist_for_each_entry_rcu(event, n, &event_hash[key], node) {
+		if (event->type == type)
+			return event;
+	}
+
+	return NULL;
+}
+
+/**
+ * register_ftrace_event - register output for an event type
+ * @event: the event type to register
+ *
+ * Event types are stored in a hash and this hash is used to
+ * find a way to print an event. If the @event->type is set
+ * then it will use that type, otherwise it will assign a
+ * type to use.
+ *
+ * If you assign your own type, please make sure it is added
+ * to the trace_type enum in trace.h, to avoid collisions
+ * with the dynamic types.
+ *
+ * Returns the event type number or zero on error.
+ */
+int register_ftrace_event(struct trace_event *event)
+{
+	unsigned key;
+	int ret = 0;
+
+	mutex_lock(&trace_event_mutex);
+
+	if (!event->type)
+		event->type = next_event_type++;
+	else if (event->type > __TRACE_LAST_TYPE) {
+		printk(KERN_WARNING "Need to add type to trace.h\n");
+		WARN_ON(1);
+	}
+
+	if (ftrace_find_event(event->type))
+		goto out;
+
+	key = event->type & (EVENT_HASHSIZE - 1);
+
+	hlist_add_head_rcu(&event->node, &event_hash[key]);
+
+	ret = event->type;
+ out:
+	mutex_unlock(&trace_event_mutex);
+
+	return ret;
+}
+
+/**
+ * unregister_ftrace_event - remove a no longer used event
+ * @event: the event to remove
+ */
+int unregister_ftrace_event(struct trace_event *event)
+{
+	mutex_lock(&trace_event_mutex);
+	hlist_del(&event->node);
+	mutex_unlock(&trace_event_mutex);
+
+	return 0;
+}
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
new file mode 100644
index 000000000000..1fcc76e1378e
--- /dev/null
+++ b/kernel/trace/trace_output.h
@@ -0,0 +1,43 @@
+#ifndef __TRACE_EVENTS_H
+#define __TRACE_EVENTS_H
+
+#include "trace.h"
+
+typedef int (*trace_print_func)(struct trace_seq *s, struct trace_entry *entry,
+				int flags);
+
+struct trace_event {
+	struct hlist_node	node;
+	int			type;
+	trace_print_func	trace;
+	trace_print_func	latency_trace;
+	trace_print_func	raw;
+	trace_print_func	hex;
+	trace_print_func	binary;
+};
+
+extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...);
+extern int
+seq_print_ip_sym(struct trace_seq *s, unsigned long ip,
+		unsigned long sym_flags);
+extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
+				 size_t cnt);
+int trace_seq_puts(struct trace_seq *s, const char *str);
+int trace_seq_putc(struct trace_seq *s, unsigned char c);
+int trace_seq_putmem(struct trace_seq *s, void *mem, size_t len);
+int trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len);
+int trace_seq_path(struct trace_seq *s, struct path *path);
+int seq_print_userip_objs(const struct userstack_entry *entry,
+			  struct trace_seq *s, unsigned long sym_flags);
+int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
+		      unsigned long ip, unsigned long sym_flags);
+
+struct trace_event *ftrace_find_event(int type);
+int register_ftrace_event(struct trace_event *event);
+int unregister_ftrace_event(struct trace_event *event);
+
+#define MAX_MEMHEX_BYTES	8
+#define HEX_CHARS		(MAX_MEMHEX_BYTES*2 + 1)
+
+#endif
+
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
index a7172a352f62..b9b13c39b4bb 100644
--- a/kernel/trace/trace_power.c
+++ b/kernel/trace/trace_power.c
@@ -16,6 +16,7 @@
 #include <linux/module.h>
 
 #include "trace.h"
+#include "trace_output.h"
 
 static struct trace_array *power_trace;
 static int __read_mostly trace_power_enabled;
-- 
cgit v1.2.3


From f633cef0200bbaec539e2dbb0bc4bed7f022f98b Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 23 Dec 2008 23:24:13 -0500
Subject: ftrace: change trace.c to use registered events

Impact: rework trace.c to use new event register API

Almost every ftrace event has to implement its output display in
trace.c through a different function. Some events did not handle
all the formats (trace, latency-trace, raw, hex, binary), and
this method does not scale well.

This patch converts the format functions to use the event API to
find the event and and print its format. Currently, we have
a print function for trace, latency_trace, raw, hex and binary.
A trace_nop_print is available if the event wants to avoid output
on a particular format.

Perhaps other tracers could use this in the future (like mmiotrace and
function_graph).

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c        | 402 ++++----------------------------------
 kernel/trace/trace_branch.c |  53 +++++
 kernel/trace/trace_output.c | 467 ++++++++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace_output.h |  16 ++
 4 files changed, 574 insertions(+), 364 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 90ce0c1d437b..3f0317586cfd 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1483,15 +1483,6 @@ lat_print_timestamp(struct trace_seq *s, u64 abs_usecs,
 		trace_seq_puts(s, " : ");
 }
 
-static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
-
-static int task_state_char(unsigned long state)
-{
-	int bit = state ? __ffs(state) + 1 : 0;
-
-	return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?';
-}
-
 static void test_cpu_buff_start(struct trace_iterator *iter)
 {
 	struct trace_seq *s = &iter->seq;
@@ -1515,14 +1506,14 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
 	struct trace_seq *s = &iter->seq;
 	unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
 	struct trace_entry *next_entry;
+	struct trace_event *event;
 	unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE);
 	struct trace_entry *entry = iter->ent;
 	unsigned long abs_usecs;
 	unsigned long rel_usecs;
 	u64 next_ts;
 	char *comm;
-	int S, T;
-	int i;
+	int ret;
 
 	test_cpu_buff_start(iter);
 
@@ -1547,94 +1538,16 @@ print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
 		lat_print_generic(s, entry, cpu);
 		lat_print_timestamp(s, abs_usecs, rel_usecs);
 	}
-	switch (entry->type) {
-	case TRACE_FN: {
-		struct ftrace_entry *field;
-
-		trace_assign_type(field, entry);
-
-		seq_print_ip_sym(s, field->ip, sym_flags);
-		trace_seq_puts(s, " (");
-		seq_print_ip_sym(s, field->parent_ip, sym_flags);
-		trace_seq_puts(s, ")\n");
-		break;
-	}
-	case TRACE_CTX:
-	case TRACE_WAKE: {
-		struct ctx_switch_entry *field;
-
-		trace_assign_type(field, entry);
-
-		T = task_state_char(field->next_state);
-		S = task_state_char(field->prev_state);
-		comm = trace_find_cmdline(field->next_pid);
-		trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
-				 field->prev_pid,
-				 field->prev_prio,
-				 S, entry->type == TRACE_CTX ? "==>" : "  +",
-				 field->next_cpu,
-				 field->next_pid,
-				 field->next_prio,
-				 T, comm);
-		break;
-	}
-	case TRACE_SPECIAL: {
-		struct special_entry *field;
-
-		trace_assign_type(field, entry);
-
-		trace_seq_printf(s, "# %ld %ld %ld\n",
-				 field->arg1,
-				 field->arg2,
-				 field->arg3);
-		break;
-	}
-	case TRACE_STACK: {
-		struct stack_entry *field;
-
-		trace_assign_type(field, entry);
-
-		for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
-			if (i)
-				trace_seq_puts(s, " <= ");
-			seq_print_ip_sym(s, field->caller[i], sym_flags);
-		}
-		trace_seq_puts(s, "\n");
-		break;
-	}
-	case TRACE_PRINT: {
-		struct print_entry *field;
-
-		trace_assign_type(field, entry);
 
-		seq_print_ip_sym(s, field->ip, sym_flags);
-		trace_seq_printf(s, ": %s", field->buf);
-		break;
-	}
-	case TRACE_BRANCH: {
-		struct trace_branch *field;
-
-		trace_assign_type(field, entry);
-
-		trace_seq_printf(s, "[%s] %s:%s:%d\n",
-				 field->correct ? "  ok  " : " MISS ",
-				 field->func,
-				 field->file,
-				 field->line);
-		break;
+	event = ftrace_find_event(entry->type);
+	if (event && event->latency_trace) {
+		ret = event->latency_trace(s, entry, sym_flags);
+		if (ret)
+			return ret;
+		return TRACE_TYPE_HANDLED;
 	}
-	case TRACE_USER_STACK: {
-		struct userstack_entry *field;
 
-		trace_assign_type(field, entry);
-
-		seq_print_userip_objs(field, s, sym_flags);
-		trace_seq_putc(s, '\n');
-		break;
-	}
-	default:
-		trace_seq_printf(s, "Unknown type %d\n", entry->type);
-	}
+	trace_seq_printf(s, "Unknown type %d\n", entry->type);
 	return TRACE_TYPE_HANDLED;
 }
 
@@ -1643,13 +1556,12 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
 	struct trace_seq *s = &iter->seq;
 	unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
 	struct trace_entry *entry;
+	struct trace_event *event;
 	unsigned long usec_rem;
 	unsigned long long t;
 	unsigned long secs;
 	char *comm;
 	int ret;
-	int S, T;
-	int i;
 
 	entry = iter->ent;
 
@@ -1671,127 +1583,17 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
-	switch (entry->type) {
-	case TRACE_FN: {
-		struct ftrace_entry *field;
-
-		trace_assign_type(field, entry);
-
-		ret = seq_print_ip_sym(s, field->ip, sym_flags);
-		if (!ret)
-			return TRACE_TYPE_PARTIAL_LINE;
-		if ((sym_flags & TRACE_ITER_PRINT_PARENT) &&
-						field->parent_ip) {
-			ret = trace_seq_printf(s, " <-");
-			if (!ret)
-				return TRACE_TYPE_PARTIAL_LINE;
-			ret = seq_print_ip_sym(s,
-					       field->parent_ip,
-					       sym_flags);
-			if (!ret)
-				return TRACE_TYPE_PARTIAL_LINE;
-		}
-		ret = trace_seq_printf(s, "\n");
-		if (!ret)
-			return TRACE_TYPE_PARTIAL_LINE;
-		break;
-	}
-	case TRACE_CTX:
-	case TRACE_WAKE: {
-		struct ctx_switch_entry *field;
-
-		trace_assign_type(field, entry);
-
-		T = task_state_char(field->next_state);
-		S = task_state_char(field->prev_state);
-		ret = trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c\n",
-				       field->prev_pid,
-				       field->prev_prio,
-				       S,
-				       entry->type == TRACE_CTX ? "==>" : "  +",
-				       field->next_cpu,
-				       field->next_pid,
-				       field->next_prio,
-				       T);
-		if (!ret)
-			return TRACE_TYPE_PARTIAL_LINE;
-		break;
-	}
-	case TRACE_SPECIAL: {
-		struct special_entry *field;
-
-		trace_assign_type(field, entry);
-
-		ret = trace_seq_printf(s, "# %ld %ld %ld\n",
-				 field->arg1,
-				 field->arg2,
-				 field->arg3);
-		if (!ret)
-			return TRACE_TYPE_PARTIAL_LINE;
-		break;
-	}
-	case TRACE_STACK: {
-		struct stack_entry *field;
-
-		trace_assign_type(field, entry);
-
-		for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
-			if (i) {
-				ret = trace_seq_puts(s, " <= ");
-				if (!ret)
-					return TRACE_TYPE_PARTIAL_LINE;
-			}
-			ret = seq_print_ip_sym(s, field->caller[i],
-					       sym_flags);
-			if (!ret)
-				return TRACE_TYPE_PARTIAL_LINE;
-		}
-		ret = trace_seq_puts(s, "\n");
-		if (!ret)
-			return TRACE_TYPE_PARTIAL_LINE;
-		break;
-	}
-	case TRACE_PRINT: {
-		struct print_entry *field;
-
-		trace_assign_type(field, entry);
-
-		seq_print_ip_sym(s, field->ip, sym_flags);
-		trace_seq_printf(s, ": %s", field->buf);
-		break;
-	}
-	case TRACE_GRAPH_RET: {
-		return print_graph_function(iter);
-	}
-	case TRACE_GRAPH_ENT: {
-		return print_graph_function(iter);
-	}
-	case TRACE_BRANCH: {
-		struct trace_branch *field;
-
-		trace_assign_type(field, entry);
-
-		trace_seq_printf(s, "[%s] %s:%s:%d\n",
-				 field->correct ? "  ok  " : " MISS ",
-				 field->func,
-				 field->file,
-				 field->line);
-		break;
+	event = ftrace_find_event(entry->type);
+	if (event && event->trace) {
+		ret = event->trace(s, entry, sym_flags);
+		if (ret)
+			return ret;
+		return TRACE_TYPE_HANDLED;
 	}
-	case TRACE_USER_STACK: {
-		struct userstack_entry *field;
-
-		trace_assign_type(field, entry);
+	ret = trace_seq_printf(s, "Unknown type %d\n", entry->type);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
 
-		ret = seq_print_userip_objs(field, s, sym_flags);
-		if (!ret)
-			return TRACE_TYPE_PARTIAL_LINE;
-		ret = trace_seq_putc(s, '\n');
-		if (!ret)
-			return TRACE_TYPE_PARTIAL_LINE;
-		break;
-	}
-	}
 	return TRACE_TYPE_HANDLED;
 }
 
@@ -1799,8 +1601,8 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
 {
 	struct trace_seq *s = &iter->seq;
 	struct trace_entry *entry;
+	struct trace_event *event;
 	int ret;
-	int S, T;
 
 	entry = iter->ent;
 
@@ -1809,86 +1611,26 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
-	switch (entry->type) {
-	case TRACE_FN: {
-		struct ftrace_entry *field;
-
-		trace_assign_type(field, entry);
-
-		ret = trace_seq_printf(s, "%x %x\n",
-					field->ip,
-					field->parent_ip);
-		if (!ret)
-			return TRACE_TYPE_PARTIAL_LINE;
-		break;
-	}
-	case TRACE_CTX:
-	case TRACE_WAKE: {
-		struct ctx_switch_entry *field;
-
-		trace_assign_type(field, entry);
-
-		T = task_state_char(field->next_state);
-		S = entry->type == TRACE_WAKE ? '+' :
-			task_state_char(field->prev_state);
-		ret = trace_seq_printf(s, "%d %d %c %d %d %d %c\n",
-				       field->prev_pid,
-				       field->prev_prio,
-				       S,
-				       field->next_cpu,
-				       field->next_pid,
-				       field->next_prio,
-				       T);
-		if (!ret)
-			return TRACE_TYPE_PARTIAL_LINE;
-		break;
-	}
-	case TRACE_SPECIAL:
-	case TRACE_USER_STACK:
-	case TRACE_STACK: {
-		struct special_entry *field;
-
-		trace_assign_type(field, entry);
-
-		ret = trace_seq_printf(s, "# %ld %ld %ld\n",
-				 field->arg1,
-				 field->arg2,
-				 field->arg3);
-		if (!ret)
-			return TRACE_TYPE_PARTIAL_LINE;
-		break;
+	event = ftrace_find_event(entry->type);
+	if (event && event->raw) {
+		ret = event->raw(s, entry, 0);
+		if (ret)
+			return ret;
+		return TRACE_TYPE_HANDLED;
 	}
-	case TRACE_PRINT: {
-		struct print_entry *field;
-
-		trace_assign_type(field, entry);
+	ret = trace_seq_printf(s, "%d ?\n", entry->type);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
 
-		trace_seq_printf(s, "# %lx %s", field->ip, field->buf);
-		break;
-	}
-	}
 	return TRACE_TYPE_HANDLED;
 }
 
-#define SEQ_PUT_FIELD_RET(s, x)				\
-do {							\
-	if (!trace_seq_putmem(s, &(x), sizeof(x)))	\
-		return 0;				\
-} while (0)
-
-#define SEQ_PUT_HEX_FIELD_RET(s, x)			\
-do {							\
-	BUILD_BUG_ON(sizeof(x) > MAX_MEMHEX_BYTES);	\
-	if (!trace_seq_putmem_hex(s, &(x), sizeof(x)))	\
-		return 0;				\
-} while (0)
-
 static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
 {
 	struct trace_seq *s = &iter->seq;
 	unsigned char newline = '\n';
 	struct trace_entry *entry;
-	int S, T;
+	struct trace_event *event;
 
 	entry = iter->ent;
 
@@ -1896,47 +1638,10 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
 	SEQ_PUT_HEX_FIELD_RET(s, iter->cpu);
 	SEQ_PUT_HEX_FIELD_RET(s, iter->ts);
 
-	switch (entry->type) {
-	case TRACE_FN: {
-		struct ftrace_entry *field;
-
-		trace_assign_type(field, entry);
-
-		SEQ_PUT_HEX_FIELD_RET(s, field->ip);
-		SEQ_PUT_HEX_FIELD_RET(s, field->parent_ip);
-		break;
-	}
-	case TRACE_CTX:
-	case TRACE_WAKE: {
-		struct ctx_switch_entry *field;
-
-		trace_assign_type(field, entry);
-
-		T = task_state_char(field->next_state);
-		S = entry->type == TRACE_WAKE ? '+' :
-			task_state_char(field->prev_state);
-		SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid);
-		SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio);
-		SEQ_PUT_HEX_FIELD_RET(s, S);
-		SEQ_PUT_HEX_FIELD_RET(s, field->next_cpu);
-		SEQ_PUT_HEX_FIELD_RET(s, field->next_pid);
-		SEQ_PUT_HEX_FIELD_RET(s, field->next_prio);
-		SEQ_PUT_HEX_FIELD_RET(s, T);
-		break;
-	}
-	case TRACE_SPECIAL:
-	case TRACE_USER_STACK:
-	case TRACE_STACK: {
-		struct special_entry *field;
-
-		trace_assign_type(field, entry);
+	event = ftrace_find_event(entry->type);
+	if (event && event->hex)
+		event->hex(s, entry, 0);
 
-		SEQ_PUT_HEX_FIELD_RET(s, field->arg1);
-		SEQ_PUT_HEX_FIELD_RET(s, field->arg2);
-		SEQ_PUT_HEX_FIELD_RET(s, field->arg3);
-		break;
-	}
-	}
 	SEQ_PUT_FIELD_RET(s, newline);
 
 	return TRACE_TYPE_HANDLED;
@@ -1962,6 +1667,7 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
 {
 	struct trace_seq *s = &iter->seq;
 	struct trace_entry *entry;
+	struct trace_event *event;
 
 	entry = iter->ent;
 
@@ -1969,43 +1675,11 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
 	SEQ_PUT_FIELD_RET(s, entry->cpu);
 	SEQ_PUT_FIELD_RET(s, iter->ts);
 
-	switch (entry->type) {
-	case TRACE_FN: {
-		struct ftrace_entry *field;
-
-		trace_assign_type(field, entry);
+	event = ftrace_find_event(entry->type);
+	if (event && event->binary)
+		event->binary(s, entry, 0);
 
-		SEQ_PUT_FIELD_RET(s, field->ip);
-		SEQ_PUT_FIELD_RET(s, field->parent_ip);
-		break;
-	}
-	case TRACE_CTX: {
-		struct ctx_switch_entry *field;
-
-		trace_assign_type(field, entry);
-
-		SEQ_PUT_FIELD_RET(s, field->prev_pid);
-		SEQ_PUT_FIELD_RET(s, field->prev_prio);
-		SEQ_PUT_FIELD_RET(s, field->prev_state);
-		SEQ_PUT_FIELD_RET(s, field->next_pid);
-		SEQ_PUT_FIELD_RET(s, field->next_prio);
-		SEQ_PUT_FIELD_RET(s, field->next_state);
-		break;
-	}
-	case TRACE_SPECIAL:
-	case TRACE_USER_STACK:
-	case TRACE_STACK: {
-		struct special_entry *field;
-
-		trace_assign_type(field, entry);
-
-		SEQ_PUT_FIELD_RET(s, field->arg1);
-		SEQ_PUT_FIELD_RET(s, field->arg2);
-		SEQ_PUT_FIELD_RET(s, field->arg3);
-		break;
-	}
-	}
-	return 1;
+	return TRACE_TYPE_HANDLED;
 }
 
 static int trace_empty(struct trace_iterator *iter)
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 6c00feb3bac7..c15222a01073 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -14,7 +14,9 @@
 #include <linux/hash.h>
 #include <linux/fs.h>
 #include <asm/local.h>
+
 #include "trace.h"
+#include "trace_output.h"
 
 #ifdef CONFIG_BRANCH_TRACER
 
@@ -142,6 +144,49 @@ static void branch_trace_reset(struct trace_array *tr)
 	stop_branch_trace(tr);
 }
 
+static int
+trace_print_print(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+	struct print_entry *field;
+
+	trace_assign_type(field, entry);
+
+	if (seq_print_ip_sym(s, field->ip, flags))
+		goto partial;
+
+	if (trace_seq_printf(s, ": %s", field->buf))
+		goto partial;
+
+ partial:
+	return TRACE_TYPE_PARTIAL_LINE;
+}
+
+static int
+trace_branch_print(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+	struct trace_branch *field;
+
+	trace_assign_type(field, entry);
+
+	if (trace_seq_printf(s, "[%s] %s:%s:%d\n",
+			     field->correct ? "  ok  " : " MISS ",
+			     field->func,
+			     field->file,
+			     field->line))
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	return 0;
+}
+
+static struct trace_event trace_branch_event = {
+	.type	 	= TRACE_BRANCH,
+	.trace		= trace_branch_print,
+	.latency_trace	= trace_branch_print,
+	.raw		= trace_nop_print,
+	.hex		= trace_nop_print,
+	.binary		= trace_nop_print,
+};
+
 struct tracer branch_trace __read_mostly =
 {
 	.name		= "branch",
@@ -154,6 +199,14 @@ struct tracer branch_trace __read_mostly =
 
 __init static int init_branch_trace(void)
 {
+	int ret;
+
+	ret = register_ftrace_event(&trace_branch_event);
+	if (!ret) {
+		printk(KERN_WARNING "Warning: could not register branch events\n");
+		return 1;
+	}
+
 	return register_tracer(&branch_trace);
 }
 
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 1f3f80002b5e..df0c25cbed30 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -286,6 +286,15 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
 	return ret;
 }
 
+static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
+
+static int task_state_char(unsigned long state)
+{
+	int bit = state ? __ffs(state) + 1 : 0;
+
+	return bit < sizeof(state_to_char) - 1 ? state_to_char[bit] : '?';
+}
+
 /**
  * ftrace_find_event - find a registered event
  * @type: the type of event to look for
@@ -363,3 +372,461 @@ int unregister_ftrace_event(struct trace_event *event)
 
 	return 0;
 }
+
+/*
+ * Standard events
+ */
+
+int
+trace_nop_print(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+	return 0;
+}
+
+/* TRACE_FN */
+static int
+trace_fn_latency(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+	struct ftrace_entry *field;
+
+	trace_assign_type(field, entry);
+
+	if (!seq_print_ip_sym(s, field->ip, flags))
+		goto partial;
+	if (!trace_seq_puts(s, " ("))
+		goto partial;
+	if (!seq_print_ip_sym(s, field->parent_ip, flags))
+		goto partial;
+	if (!trace_seq_puts(s, ")\n"))
+		goto partial;
+
+	return 0;
+
+ partial:
+	return TRACE_TYPE_PARTIAL_LINE;
+}
+
+static int
+trace_fn_trace(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+	struct ftrace_entry *field;
+
+	trace_assign_type(field, entry);
+
+	if (!seq_print_ip_sym(s, field->ip, flags))
+		goto partial;
+
+	if ((flags & TRACE_ITER_PRINT_PARENT) && field->parent_ip) {
+		if (!trace_seq_printf(s, " <-"))
+			goto partial;
+		if (!seq_print_ip_sym(s,
+				      field->parent_ip,
+				      flags))
+			goto partial;
+	}
+	if (!trace_seq_printf(s, "\n"))
+		goto partial;
+
+	return 0;
+
+ partial:
+	return TRACE_TYPE_PARTIAL_LINE;
+}
+
+static int
+trace_fn_raw(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+	struct ftrace_entry *field;
+
+	trace_assign_type(field, entry);
+
+	if (trace_seq_printf(s, "%x %x\n",
+			     field->ip,
+			     field->parent_ip))
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	return 0;
+}
+
+static int
+trace_fn_hex(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+	struct ftrace_entry *field;
+
+	trace_assign_type(field, entry);
+
+	SEQ_PUT_HEX_FIELD_RET(s, field->ip);
+	SEQ_PUT_HEX_FIELD_RET(s, field->parent_ip);
+
+	return 0;
+}
+
+static int
+trace_fn_bin(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+	struct ftrace_entry *field;
+
+	trace_assign_type(field, entry);
+
+	SEQ_PUT_FIELD_RET(s, field->ip);
+	SEQ_PUT_FIELD_RET(s, field->parent_ip);
+
+	return 0;
+}
+
+static struct trace_event trace_fn_event = {
+	.type	 	= TRACE_FN,
+	.trace		= trace_fn_trace,
+	.latency_trace	= trace_fn_latency,
+	.raw		= trace_fn_raw,
+	.hex		= trace_fn_hex,
+	.binary		= trace_fn_bin,
+};
+
+/* TRACE_CTX an TRACE_WAKE */
+static int
+trace_ctxwake_print(struct trace_seq *s, struct trace_entry *entry, int flags,
+		    char *delim)
+{
+	struct ctx_switch_entry *field;
+	char *comm;
+	int S, T;
+
+	trace_assign_type(field, entry);
+
+	T = task_state_char(field->next_state);
+	S = task_state_char(field->prev_state);
+	comm = trace_find_cmdline(field->next_pid);
+	if (trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
+			     field->prev_pid,
+			     field->prev_prio,
+			     S, delim,
+			     field->next_cpu,
+			     field->next_pid,
+			     field->next_prio,
+			     T, comm))
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	return 0;
+}
+
+static int
+trace_ctx_print(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+	return trace_ctxwake_print(s, entry, flags, "==>");
+}
+
+static int
+trace_wake_print(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+	return trace_ctxwake_print(s, entry, flags, "  +");
+}
+
+static int
+trace_ctxwake_raw(struct trace_seq *s, struct trace_entry *entry, int flags,
+		  char S)
+{
+	struct ctx_switch_entry *field;
+	int T;
+
+	trace_assign_type(field, entry);
+
+	if (!S)
+		task_state_char(field->prev_state);
+	T = task_state_char(field->next_state);
+	if (trace_seq_printf(s, "%d %d %c %d %d %d %c\n",
+			     field->prev_pid,
+			     field->prev_prio,
+			     S,
+			     field->next_cpu,
+			     field->next_pid,
+			     field->next_prio,
+			     T))
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	return 0;
+}
+
+static int
+trace_ctx_raw(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+	return trace_ctxwake_raw(s, entry, flags, 0);
+}
+
+static int
+trace_wake_raw(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+	return trace_ctxwake_raw(s, entry, flags, '+');
+}
+
+
+static int
+trace_ctxwake_hex(struct trace_seq *s, struct trace_entry *entry, int flags,
+		  char S)
+{
+	struct ctx_switch_entry *field;
+	int T;
+
+	trace_assign_type(field, entry);
+
+	if (!S)
+		task_state_char(field->prev_state);
+	T = task_state_char(field->next_state);
+
+	SEQ_PUT_HEX_FIELD_RET(s, field->prev_pid);
+	SEQ_PUT_HEX_FIELD_RET(s, field->prev_prio);
+	SEQ_PUT_HEX_FIELD_RET(s, S);
+	SEQ_PUT_HEX_FIELD_RET(s, field->next_cpu);
+	SEQ_PUT_HEX_FIELD_RET(s, field->next_pid);
+	SEQ_PUT_HEX_FIELD_RET(s, field->next_prio);
+	SEQ_PUT_HEX_FIELD_RET(s, T);
+
+	return 0;
+}
+
+static int
+trace_ctx_hex(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+	return trace_ctxwake_hex(s, entry, flags, 0);
+}
+
+static int
+trace_wake_hex(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+	return trace_ctxwake_hex(s, entry, flags, '+');
+}
+
+static int
+trace_ctxwake_bin(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+	struct ctx_switch_entry *field;
+
+	trace_assign_type(field, entry);
+
+	SEQ_PUT_FIELD_RET(s, field->prev_pid);
+	SEQ_PUT_FIELD_RET(s, field->prev_prio);
+	SEQ_PUT_FIELD_RET(s, field->prev_state);
+	SEQ_PUT_FIELD_RET(s, field->next_pid);
+	SEQ_PUT_FIELD_RET(s, field->next_prio);
+	SEQ_PUT_FIELD_RET(s, field->next_state);
+
+	return 0;
+}
+
+static struct trace_event trace_ctx_event = {
+	.type	 	= TRACE_CTX,
+	.trace		= trace_ctx_print,
+	.latency_trace	= trace_ctx_print,
+	.raw		= trace_ctx_raw,
+	.hex		= trace_ctx_hex,
+	.binary		= trace_ctxwake_bin,
+};
+
+static struct trace_event trace_wake_event = {
+	.type	 	= TRACE_WAKE,
+	.trace		= trace_wake_print,
+	.latency_trace	= trace_wake_print,
+	.raw		= trace_wake_raw,
+	.hex		= trace_wake_hex,
+	.binary		= trace_ctxwake_bin,
+};
+
+/* TRACE_SPECIAL */
+static int
+trace_special_print(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+	struct special_entry *field;
+
+	trace_assign_type(field, entry);
+
+	if (trace_seq_printf(s, "# %ld %ld %ld\n",
+			     field->arg1,
+			     field->arg2,
+			     field->arg3))
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	return 0;
+}
+
+static int
+trace_special_hex(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+	struct special_entry *field;
+
+	trace_assign_type(field, entry);
+
+	SEQ_PUT_HEX_FIELD_RET(s, field->arg1);
+	SEQ_PUT_HEX_FIELD_RET(s, field->arg2);
+	SEQ_PUT_HEX_FIELD_RET(s, field->arg3);
+
+	return 0;
+}
+
+static int
+trace_special_bin(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+	struct special_entry *field;
+
+	trace_assign_type(field, entry);
+
+	SEQ_PUT_FIELD_RET(s, field->arg1);
+	SEQ_PUT_FIELD_RET(s, field->arg2);
+	SEQ_PUT_FIELD_RET(s, field->arg3);
+
+	return 0;
+}
+
+static struct trace_event trace_special_event = {
+	.type	 	= TRACE_SPECIAL,
+	.trace		= trace_special_print,
+	.latency_trace	= trace_special_print,
+	.raw		= trace_special_print,
+	.hex		= trace_special_hex,
+	.binary		= trace_special_bin,
+};
+
+/* TRACE_STACK */
+
+static int
+trace_stack_print(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+	struct stack_entry *field;
+	int i;
+
+	trace_assign_type(field, entry);
+
+	for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
+		if (i) {
+			if (trace_seq_puts(s, " <= "))
+				goto partial;
+
+			if (seq_print_ip_sym(s, field->caller[i], flags))
+				goto partial;
+		}
+		if (trace_seq_puts(s, "\n"))
+			goto partial;
+	}
+
+	return 0;
+
+ partial:
+	return TRACE_TYPE_PARTIAL_LINE;
+}
+
+static struct trace_event trace_stack_event = {
+	.type	 	= TRACE_STACK,
+	.trace		= trace_stack_print,
+	.latency_trace	= trace_stack_print,
+	.raw		= trace_special_print,
+	.hex		= trace_special_hex,
+	.binary		= trace_special_bin,
+};
+
+/* TRACE_USER_STACK */
+static int
+trace_user_stack_print(struct trace_seq *s, struct trace_entry *entry,
+		       int flags)
+{
+	struct userstack_entry *field;
+
+	trace_assign_type(field, entry);
+
+	if (seq_print_userip_objs(field, s, flags))
+		goto partial;
+
+	if (trace_seq_putc(s, '\n'))
+		goto partial;
+
+	return 0;
+
+ partial:
+	return TRACE_TYPE_PARTIAL_LINE;
+}
+
+static struct trace_event trace_user_stack_event = {
+	.type	 	= TRACE_USER_STACK,
+	.trace		= trace_user_stack_print,
+	.latency_trace	= trace_user_stack_print,
+	.raw		= trace_special_print,
+	.hex		= trace_special_hex,
+	.binary		= trace_special_bin,
+};
+
+/* TRACE_PRINT */
+static int
+trace_print_print(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+	struct print_entry *field;
+
+	trace_assign_type(field, entry);
+
+	if (seq_print_ip_sym(s, field->ip, flags))
+		goto partial;
+
+	if (trace_seq_printf(s, ": %s", field->buf))
+		goto partial;
+
+	return 0;
+
+ partial:
+	return TRACE_TYPE_PARTIAL_LINE;
+}
+
+static int
+trace_print_raw(struct trace_seq *s, struct trace_entry *entry, int flags)
+{
+	struct print_entry *field;
+
+	trace_assign_type(field, entry);
+
+	if (seq_print_ip_sym(s, field->ip, flags))
+		goto partial;
+
+	if (trace_seq_printf(s, "# %lx %s", field->ip, field->buf))
+		goto partial;
+
+	return 0;
+
+ partial:
+	return TRACE_TYPE_PARTIAL_LINE;
+}
+
+static struct trace_event trace_print_event = {
+	.type	 	= TRACE_PRINT,
+	.trace		= trace_print_print,
+	.latency_trace	= trace_print_print,
+	.raw		= trace_print_raw,
+	.hex		= trace_nop_print,
+	.binary		= trace_nop_print,
+};
+
+static struct trace_event *events[] __initdata = {
+	&trace_fn_event,
+	&trace_ctx_event,
+	&trace_wake_event,
+	&trace_special_event,
+	&trace_stack_event,
+	&trace_user_stack_event,
+	&trace_print_event,
+	NULL
+};
+
+__init static int init_events(void)
+{
+	struct trace_event *event;
+	int i, ret;
+
+	for (i = 0; events[i]; i++) {
+		event = events[i];
+
+		ret = register_ftrace_event(event);
+		if (!ret) {
+			printk(KERN_WARNING "event %d failed to register\n",
+			       event->type);
+			WARN_ON_ONCE(1);
+		}
+	}
+
+	return 0;
+}
+device_initcall(init_events);
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 1fcc76e1378e..ecab4ea4a4fd 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -36,8 +36,24 @@ struct trace_event *ftrace_find_event(int type);
 int register_ftrace_event(struct trace_event *event);
 int unregister_ftrace_event(struct trace_event *event);
 
+int
+trace_nop_print(struct trace_seq *s, struct trace_entry *entry, int flags);
+
 #define MAX_MEMHEX_BYTES	8
 #define HEX_CHARS		(MAX_MEMHEX_BYTES*2 + 1)
 
+#define SEQ_PUT_FIELD_RET(s, x)				\
+do {							\
+	if (!trace_seq_putmem(s, &(x), sizeof(x)))	\
+		return 0;				\
+} while (0)
+
+#define SEQ_PUT_HEX_FIELD_RET(s, x)			\
+do {							\
+	BUILD_BUG_ON(sizeof(x) > MAX_MEMHEX_BYTES);	\
+	if (!trace_seq_putmem_hex(s, &(x), sizeof(x)))	\
+		return 0;				\
+} while (0)
+
 #endif
 
-- 
cgit v1.2.3


From dbd0b4b33074aa6b7832a9d9a5bd985eca5c1aa2 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 28 Dec 2008 20:44:51 -0800
Subject: tracing/ftrace: provide the base infrastructure for histogram tracing

Impact: extend the tracing API

The goal of this patch is to normalize and make more easy the
implementation of statistical (histogram) tracing.

It implements a trace_stat file into the /debugfs/tracing directory where
one can print a one-shot output of statistics/histogram entries.

A tracer has to provide two basic iterator callbacks:

  stat_start() => the first entry
  stat_next(prev, idx) => the next one.

Note that it is adapted for arrays or hash tables or lists.... since it
provides a pointer to the previous entry and the current index of the
iterator.

These two callbacks are called to get a snapshot of the statistics at each
opening of the trace_stat file because. The values are so updated between
two "cat trace_stat". And the tracer is free to lock its datas during the
iteration to keep consistent values.

Since it is almost always interesting to sort statisticals values to
address the problems by priority, this infrastructure provides a "sorting"
of the stat entries too if desired. A tracer has just to provide a
stat_cmp callback to compare two entries and the stat tracing
infrastructure will build a sorted list of the given entries.

A last callback, called stat_headers, can be implemented by a tracer to
output headers on its trace.

If one of these callbacks is changed on runtime, it just have to signal it
to the stat tracing API by calling the init_tracer_stat() helper.

Changes in V2:

- Fix a memory leak if the user opens multiple times the trace_stat file
  without closing it. Now we always free our list before rebuilding it.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/Makefile     |   1 +
 kernel/trace/trace.c      |   3 +-
 kernel/trace/trace.h      |  17 ++++
 kernel/trace/trace_stat.c | 251 ++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 271 insertions(+), 1 deletion(-)
 create mode 100644 kernel/trace/trace_stat.c

(limited to 'kernel')

diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 549f93c9b393..31cd5fbc0eed 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -20,6 +20,7 @@ obj-$(CONFIG_RING_BUFFER) += ring_buffer.o
 
 obj-$(CONFIG_TRACING) += trace.o
 obj-$(CONFIG_TRACING) += trace_output.o
+obj-$(CONFIG_TRACING) += trace_stat.o
 obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
 obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o
 obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 3f0317586cfd..b789c010512c 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2354,6 +2354,7 @@ static int tracing_set_tracer(char *buf)
 		if (ret)
 			goto out;
 	}
+	init_tracer_stat(t);
 
 	trace_branch_enable(tr);
  out:
@@ -3206,7 +3207,7 @@ __init static int tracer_alloc_buffers(void)
 #else
 	current_trace = &nop_trace;
 #endif
-
+	init_tracer_stat(current_trace);
 	/* All seems OK, enable tracing */
 	tracing_disabled = 0;
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 6bd71fa1e1c7..05fa804d1c16 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -336,6 +336,21 @@ struct tracer {
 	struct tracer		*next;
 	int			print_max;
 	struct tracer_flags 	*flags;
+
+	/*
+	 * If you change one of the following on tracing runtime, recall
+	 * init_tracer_stat()
+	 */
+
+	/* Iteration over statistic entries */
+	void			*(*stat_start)(void);
+	void			*(*stat_next)(void *prev, int idx);
+	/* Compare two entries for sorting (optional) for stats */
+	int			(*stat_cmp)(void *p1, void *p2);
+	/* Print a stat entry */
+	int			(*stat_show)(struct seq_file *s, void *p);
+	/* Print the headers of your stat entries */
+	int			(*stat_headers)(struct seq_file *s);
 };
 
 struct trace_seq {
@@ -421,6 +436,8 @@ void tracing_start_sched_switch_record(void);
 int register_tracer(struct tracer *type);
 void unregister_tracer(struct tracer *type);
 
+void init_tracer_stat(struct tracer *trace);
+
 extern unsigned long nsecs_to_usecs(unsigned long nsecs);
 
 extern unsigned long tracing_max_latency;
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
new file mode 100644
index 000000000000..6f194a33a64a
--- /dev/null
+++ b/kernel/trace/trace_stat.c
@@ -0,0 +1,251 @@
+/*
+ * Infrastructure for statistic tracing (histogram output).
+ *
+ * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
+ *
+ * Based on the code from trace_branch.c which is
+ * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
+ *
+ */
+
+
+#include <linux/list.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include "trace.h"
+
+
+/* List of stat entries from a tracer */
+struct trace_stat_list {
+	struct list_head list;
+	void *stat;
+};
+
+static struct trace_stat_list stat_list;
+
+/*
+ * This is a copy of the current tracer to avoid racy
+ * and dangerous output while the current tracer is
+ * switched.
+ */
+static struct tracer current_tracer;
+
+/*
+ * Protect both the current tracer and the global
+ * stat list.
+ */
+static DEFINE_MUTEX(stat_list_mutex);
+
+
+static void reset_stat_list(void)
+{
+	struct trace_stat_list *node;
+	struct list_head *next;
+
+	if (list_empty(&stat_list.list))
+		return;
+
+	node = list_entry(stat_list.list.next, struct trace_stat_list, list);
+	next = node->list.next;
+
+	while (&node->list != next) {
+		kfree(node);
+		node = list_entry(next, struct trace_stat_list, list);
+	}
+	kfree(node);
+
+	INIT_LIST_HEAD(&stat_list.list);
+}
+
+void init_tracer_stat(struct tracer *trace)
+{
+	mutex_lock(&stat_list_mutex);
+	current_tracer = *trace;
+	mutex_unlock(&stat_list_mutex);
+}
+
+/*
+ * For tracers that don't provide a stat_cmp callback.
+ * This one will force an immediate insertion on tail of
+ * the list.
+ */
+static int dummy_cmp(void *p1, void *p2)
+{
+	return 1;
+}
+
+/*
+ * Initialize the stat list at each trace_stat file opening.
+ * All of these copies and sorting are required on all opening
+ * since the stats could have changed between two file sessions.
+ */
+static int stat_seq_init(void)
+{
+	struct trace_stat_list *iter_entry, *new_entry;
+	void *prev_stat;
+	int ret = 0;
+	int i;
+
+	mutex_lock(&stat_list_mutex);
+	reset_stat_list();
+
+	if (!current_tracer.stat_start || !current_tracer.stat_next ||
+					!current_tracer.stat_show)
+		goto exit;
+
+	if (!current_tracer.stat_cmp)
+		current_tracer.stat_cmp = dummy_cmp;
+
+	/*
+	 * The first entry. Actually this is the second, but the first
+	 * one (the stat_list head) is pointless.
+	 */
+	new_entry = kmalloc(sizeof(struct trace_stat_list), GFP_KERNEL);
+	if (!new_entry) {
+		ret = -ENOMEM;
+		goto exit;
+	}
+
+	INIT_LIST_HEAD(&new_entry->list);
+	list_add(&new_entry->list, &stat_list.list);
+	new_entry->stat = current_tracer.stat_start();
+
+	prev_stat = new_entry->stat;
+
+	/*
+	 * Iterate over the tracer stat entries and store them in a sorted
+	 * list.
+	 */
+	for (i = 1; ; i++) {
+		new_entry = kmalloc(sizeof(struct trace_stat_list), GFP_KERNEL);
+		if (!new_entry) {
+			ret = -ENOMEM;
+			goto exit_free_list;
+		}
+
+		INIT_LIST_HEAD(&new_entry->list);
+		new_entry->stat = current_tracer.stat_next(prev_stat, i);
+
+		/* End of insertion */
+		if (!new_entry->stat)
+			break;
+
+		list_for_each_entry(iter_entry, &stat_list.list, list) {
+			/* Insertion with a descendent sorting */
+			if (current_tracer.stat_cmp(new_entry->stat,
+						iter_entry->stat) > 0) {
+
+				list_add_tail(&new_entry->list,
+						&iter_entry->list);
+				break;
+
+			/* The current smaller value */
+			} else if (list_is_last(&iter_entry->list,
+						&stat_list.list)) {
+				list_add(&new_entry->list, &iter_entry->list);
+				break;
+			}
+		}
+
+		prev_stat = new_entry->stat;
+	}
+exit:
+	mutex_unlock(&stat_list_mutex);
+	return ret;
+
+exit_free_list:
+	reset_stat_list();
+	mutex_unlock(&stat_list_mutex);
+	return ret;
+}
+
+
+static void *stat_seq_start(struct seq_file *s, loff_t *pos)
+{
+	struct trace_stat_list *l = (struct trace_stat_list *)s->private;
+
+	/* Prevent from tracer switch or stat_list modification */
+	mutex_lock(&stat_list_mutex);
+
+	/* If we are in the beginning of the file, print the headers */
+	if (!*pos && current_tracer.stat_headers)
+		current_tracer.stat_headers(s);
+
+	return seq_list_start(&l->list, *pos);
+}
+
+static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos)
+{
+	struct trace_stat_list *l = (struct trace_stat_list *)s->private;
+
+	return seq_list_next(p, &l->list, pos);
+}
+
+static void stat_seq_stop(struct seq_file *m, void *p)
+{
+	mutex_unlock(&stat_list_mutex);
+}
+
+static int stat_seq_show(struct seq_file *s, void *v)
+{
+	struct trace_stat_list *l = list_entry(v, struct trace_stat_list, list);
+	return current_tracer.stat_show(s, l->stat);
+}
+
+static const struct seq_operations trace_stat_seq_ops = {
+	.start = stat_seq_start,
+	.next = stat_seq_next,
+	.stop = stat_seq_stop,
+	.show = stat_seq_show
+};
+
+static int tracing_stat_open(struct inode *inode, struct file *file)
+{
+	int ret;
+
+	ret = seq_open(file, &trace_stat_seq_ops);
+	if (!ret) {
+		struct seq_file *m = file->private_data;
+		m->private = &stat_list;
+		ret = stat_seq_init();
+	}
+
+	return ret;
+}
+
+
+/*
+ * Avoid consuming memory with our now useless list.
+ */
+static int tracing_stat_release(struct inode *i, struct file *f)
+{
+	mutex_lock(&stat_list_mutex);
+	reset_stat_list();
+	mutex_unlock(&stat_list_mutex);
+	return 0;
+}
+
+static const struct file_operations tracing_stat_fops = {
+	.open		= tracing_stat_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= tracing_stat_release
+};
+
+static int __init tracing_stat_init(void)
+{
+	struct dentry *d_tracing;
+	struct dentry *entry;
+
+	INIT_LIST_HEAD(&stat_list.list);
+	d_tracing = tracing_init_dentry();
+
+	entry = debugfs_create_file("trace_stat", 0444, d_tracing,
+					NULL,
+				    &tracing_stat_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'trace_stat' entry\n");
+	return 0;
+}
+fs_initcall(tracing_stat_init);
-- 
cgit v1.2.3


From e302cf3f961ceb54c1dd0aff7ba8531df83be07a Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 27 Dec 2008 23:25:38 +0100
Subject: tracing/branch-tracer: adapt to the stat tracing API

Impact: refactor the branch tracer

This patch adapts the branch tracer to the tracing API.

This is a proof of concept because the branch tracer implements two
"stat tracing" that were split in two files.

So I added an option to the branch tracer: stat_all_branch.
If it is set, then trace_stat will output all of the branches
entries stats. Otherwise, it will print the annotated branches.

Its is a kind of quick trick, waiting for a better solution.

By default, the annotated branches stat are sorted by incorrect branch
prediction percentage.

Ie:

 correct incorrect  %        Function                  File              Line
 ------- ---------  -        --------                  ----              ----
       0        1 100 native_smp_prepare_cpus        smpboot.c            1228
       0        1 100 hpet_rtc_timer_reinit          hpet.c               1057
       0    18032 100 sched_info_queued              sched_stats.h        223
       0      684 100 yield_task_fair                sched_fair.c         984
       0      282 100 pre_schedule_rt                sched_rt.c           1263
       0    13414 100 sched_info_dequeued            sched_stats.h        178
       0    21724 100 sched_info_switch              sched_stats.h        270
       0        1 100 get_signal_to_deliver          signal.c             1820
       0        8 100 __cancel_work_timer            workqueue.c          560
       0      212 100 verify_export_symbols          module.c             1509
       0       17 100 __rmqueue_fallback             page_alloc.c         793
       0       43 100 clear_page_mlock               internal.h           129
       0      124 100 try_to_unmap_anon              rmap.c               1021
       0       53 100 try_to_unmap_anon              rmap.c               1013
       0        6 100 vma_address                    rmap.c               232
       0     3301 100 try_to_unmap_file              rmap.c               1082
       0      466 100 try_to_unmap_file              rmap.c               1077
       0        1 100 mem_cgroup_create              memcontrol.c         1090
       0        3 100 inotify_find_update_watch      inotify.c            726
       2    30163  99 perf_counter_task_sched_out    perf_counter.c       385
       1     2935  99 percpu_free                    allocpercpu.c        138
    1544   297672  99 dentry_lru_del_init            dcache.c             153
       8     1074  99 input_pass_event               input.c              86
    1390    76781  98 mapping_unevictable            pagemap.h            50
     280     6665  95 pick_next_task_rt              sched_rt.c           889
     750     4826  86 next_pidmap                    pid.c                194
       2        8  80 blocking_notifier_chain_regist notifier.c           220
      36      130  78 ioremap_pte_range              ioremap.c            22
    1093     3247  74 IS_ERR                         err.h                34
    1023     2908  73 sched_slice                    sched_fair.c         445
      22       60  73 disk_put_part                  genhd.h              206
[...]

It enables a developer to quickly address the source of incorrect branch
predictions.  Note that this sorting would be better with a second sort on
the number of incorrect predictions.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_branch.c | 268 +++++++++++++++++++++++---------------------
 1 file changed, 142 insertions(+), 126 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index c15222a01073..4785a3b9bc4a 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -18,10 +18,13 @@
 #include "trace.h"
 #include "trace_output.h"
 
+static struct tracer branch_trace;
+
 #ifdef CONFIG_BRANCH_TRACER
 
 static int branch_tracing_enabled __read_mostly;
 static DEFINE_MUTEX(branch_tracing_mutex);
+
 static struct trace_array *branch_tracer;
 
 static void
@@ -178,6 +181,7 @@ trace_branch_print(struct trace_seq *s, struct trace_entry *entry, int flags)
 	return 0;
 }
 
+
 static struct trace_event trace_branch_event = {
 	.type	 	= TRACE_BRANCH,
 	.trace		= trace_branch_print,
@@ -187,30 +191,6 @@ static struct trace_event trace_branch_event = {
 	.binary		= trace_nop_print,
 };
 
-struct tracer branch_trace __read_mostly =
-{
-	.name		= "branch",
-	.init		= branch_trace_init,
-	.reset		= branch_trace_reset,
-#ifdef CONFIG_FTRACE_SELFTEST
-	.selftest	= trace_selftest_startup_branch,
-#endif
-};
-
-__init static int init_branch_trace(void)
-{
-	int ret;
-
-	ret = register_ftrace_event(&trace_branch_event);
-	if (!ret) {
-		printk(KERN_WARNING "Warning: could not register branch events\n");
-		return 1;
-	}
-
-	return register_tracer(&branch_trace);
-}
-
-device_initcall(init_branch_trace);
 #else
 static inline
 void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect)
@@ -236,66 +216,39 @@ void ftrace_likely_update(struct ftrace_branch_data *f, int val, int expect)
 }
 EXPORT_SYMBOL(ftrace_likely_update);
 
-struct ftrace_pointer {
-	void		*start;
-	void		*stop;
-	int		hit;
-};
+extern unsigned long __start_annotated_branch_profile[];
+extern unsigned long __stop_annotated_branch_profile[];
 
-static void *
-t_next(struct seq_file *m, void *v, loff_t *pos)
+static int annotated_branch_stat_headers(struct seq_file *m)
 {
-	const struct ftrace_pointer *f = m->private;
-	struct ftrace_branch_data *p = v;
-
-	(*pos)++;
-
-	if (v == (void *)1)
-		return f->start;
-
-	++p;
-
-	if ((void *)p >= (void *)f->stop)
-		return NULL;
-
-	return p;
+	seq_printf(m, " correct incorrect  %% ");
+	seq_printf(m, "       Function                "
+			      "  File              Line\n"
+			      " ------- ---------  - "
+			      "       --------                "
+			      "  ----              ----\n");
+	return 0;
 }
 
-static void *t_start(struct seq_file *m, loff_t *pos)
+static inline long get_incorrect_percent(struct ftrace_branch_data *p)
 {
-	void *t = (void *)1;
-	loff_t l = 0;
-
-	for (; t && l < *pos; t = t_next(m, t, &l))
-		;
+	long percent;
 
-	return t;
-}
+	if (p->correct) {
+		percent = p->incorrect * 100;
+		percent /= p->correct + p->incorrect;
+	} else
+		percent = p->incorrect ? 100 : -1;
 
-static void t_stop(struct seq_file *m, void *p)
-{
+	return percent;
 }
 
-static int t_show(struct seq_file *m, void *v)
+static int branch_stat_show(struct seq_file *m, void *v)
 {
-	const struct ftrace_pointer *fp = m->private;
 	struct ftrace_branch_data *p = v;
 	const char *f;
 	long percent;
 
-	if (v == (void *)1) {
-		if (fp->hit)
-			seq_printf(m, "   miss      hit    %% ");
-		else
-			seq_printf(m, " correct incorrect  %% ");
-		seq_printf(m, "       Function                "
-			      "  File              Line\n"
-			      " ------- ---------  - "
-			      "       --------                "
-			      "  ----              ----\n");
-		return 0;
-	}
-
 	/* Only print the file, not the path */
 	f = p->file + strlen(p->file);
 	while (f >= p->file && *f != '/')
@@ -305,11 +258,7 @@ static int t_show(struct seq_file *m, void *v)
 	/*
 	 * The miss is overlayed on correct, and hit on incorrect.
 	 */
-	if (p->correct) {
-		percent = p->incorrect * 100;
-		percent /= p->correct + p->incorrect;
-	} else
-		percent = p->incorrect ? 100 : -1;
+	percent = get_incorrect_percent(p);
 
 	seq_printf(m, "%8lu %8lu ",  p->correct, p->incorrect);
 	if (percent < 0)
@@ -320,76 +269,143 @@ static int t_show(struct seq_file *m, void *v)
 	return 0;
 }
 
-static struct seq_operations tracing_likely_seq_ops = {
-	.start		= t_start,
-	.next		= t_next,
-	.stop		= t_stop,
-	.show		= t_show,
-};
+static void *annotated_branch_stat_start(void)
+{
+	return __start_annotated_branch_profile;
+}
 
-static int tracing_branch_open(struct inode *inode, struct file *file)
+static void *
+annotated_branch_stat_next(void *v, int idx)
 {
-	int ret;
+	struct ftrace_branch_data *p = v;
 
-	ret = seq_open(file, &tracing_likely_seq_ops);
-	if (!ret) {
-		struct seq_file *m = file->private_data;
-		m->private = (void *)inode->i_private;
-	}
+	++p;
 
-	return ret;
+	if ((void *)p >= (void *)__stop_annotated_branch_profile)
+		return NULL;
+
+	return p;
 }
 
-static const struct file_operations tracing_branch_fops = {
-	.open		= tracing_branch_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-};
+static int annotated_branch_stat_cmp(void *p1, void *p2)
+{
+	struct ftrace_branch_data *a = p1;
+	struct ftrace_branch_data *b = p2;
+
+	long percent_a, percent_b;
+
+	percent_a = get_incorrect_percent(a);
+	percent_b = get_incorrect_percent(b);
+
+	if (percent_a < percent_b)
+		return -1;
+	if (percent_a > percent_b)
+		return 1;
+	else
+		return 0;
+}
 
 #ifdef CONFIG_PROFILE_ALL_BRANCHES
-extern unsigned long __start_branch_profile[];
-extern unsigned long __stop_branch_profile[];
+enum {
+	TRACE_BRANCH_OPT_ALL = 0x1
+};
 
-static const struct ftrace_pointer ftrace_branch_pos = {
-	.start			= __start_branch_profile,
-	.stop			= __stop_branch_profile,
-	.hit			= 1,
+static struct tracer_opt branch_opts[] = {
+	{ TRACER_OPT(stat_all_branch, TRACE_BRANCH_OPT_ALL) },
+	{ }
 };
 
-#endif /* CONFIG_PROFILE_ALL_BRANCHES */
+static struct tracer_flags branch_flags = {
+	.val = 0,
+	.opts = branch_opts
+};
 
-extern unsigned long __start_annotated_branch_profile[];
-extern unsigned long __stop_annotated_branch_profile[];
+extern unsigned long __start_branch_profile[];
+extern unsigned long __stop_branch_profile[];
 
-static const struct ftrace_pointer ftrace_annotated_branch_pos = {
-	.start			= __start_annotated_branch_profile,
-	.stop			= __stop_annotated_branch_profile,
-};
+static int all_branch_stat_headers(struct seq_file *m)
+{
+	seq_printf(m, "   miss      hit    %% ");
+	seq_printf(m, "       Function                "
+			      "  File              Line\n"
+			      " ------- ---------  - "
+			      "       --------                "
+			      "  ----              ----\n");
+	return 0;
+}
 
-static __init int ftrace_branch_init(void)
+static void *all_branch_stat_start(void)
 {
-	struct dentry *d_tracer;
-	struct dentry *entry;
+	return __start_branch_profile;
+}
+
+static void *
+all_branch_stat_next(void *v, int idx)
+{
+	struct ftrace_branch_data *p = v;
 
-	d_tracer = tracing_init_dentry();
+	++p;
 
-	entry = debugfs_create_file("profile_annotated_branch", 0444, d_tracer,
-				    (void *)&ftrace_annotated_branch_pos,
-				    &tracing_branch_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'profile_annotatet_branch' entry\n");
+	if ((void *)p >= (void *)__stop_branch_profile)
+		return NULL;
 
-#ifdef CONFIG_PROFILE_ALL_BRANCHES
-	entry = debugfs_create_file("profile_branch", 0444, d_tracer,
-				    (void *)&ftrace_branch_pos,
-				    &tracing_branch_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs"
-			   " 'profile_branch' entry\n");
-#endif
+	return p;
+}
 
+static int branch_set_flag(u32 old_flags, u32 bit, int set)
+{
+	if (bit == TRACE_BRANCH_OPT_ALL) {
+		if (set) {
+			branch_trace.stat_headers = all_branch_stat_headers;
+			branch_trace.stat_start = all_branch_stat_start;
+			branch_trace.stat_next = all_branch_stat_next;
+			branch_trace.stat_cmp = NULL;
+		} else {
+			branch_trace.stat_headers =
+				annotated_branch_stat_headers;
+			branch_trace.stat_start = annotated_branch_stat_start;
+			branch_trace.stat_next = annotated_branch_stat_next;
+			branch_trace.stat_cmp = annotated_branch_stat_cmp;
+		}
+		init_tracer_stat(&branch_trace);
+	}
 	return 0;
 }
 
-device_initcall(ftrace_branch_init);
+#endif /* CONFIG_PROFILE_ALL_BRANCHES */
+
+static struct tracer branch_trace __read_mostly =
+{
+	.name		= "branch",
+#ifdef CONFIG_BRANCH_TRACER
+	.init		= branch_trace_init,
+	.reset		= branch_trace_reset,
+#ifdef CONFIG_FTRACE_SELFTEST
+	.selftest	= trace_selftest_startup_branch,
+#endif /* CONFIG_FTRACE_SELFTEST */
+#endif /* CONFIG_BRANCH_TRACER */
+	.stat_start	=	annotated_branch_stat_start,
+	.stat_next	= annotated_branch_stat_next,
+	.stat_show	= branch_stat_show,
+	.stat_headers	= annotated_branch_stat_headers,
+	.stat_cmp	= annotated_branch_stat_cmp,
+#ifdef CONFIG_PROFILE_ALL_BRANCHES
+	.flags	= &branch_flags,
+	.set_flag	= branch_set_flag,
+#endif
+};
+
+__init static int init_branch_trace(void)
+{
+#ifdef CONFIG_BRANCH_TRACER
+	int ret;
+	ret = register_ftrace_event(&trace_branch_event);
+	if (!ret) {
+		printk(KERN_WARNING "Warning: could not register branch events\n");
+		return 1;
+	}
+#endif
+
+	return register_tracer(&branch_trace);
+}
+device_initcall(init_branch_trace);
-- 
cgit v1.2.3


From f7d48cbde5c0710008caeaf7dbf14f4a9b064940 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 29 Dec 2008 13:02:17 +0100
Subject: tracing/ftrace: make trace_find_cmdline() generally available

Impact: build fix

On !CONFIG_CONTEXT_SWITCH_TRACER trace_find_cmdline() is not defined:

 kernel/trace/trace_output.c: In function 'trace_ctxwake_print':
 kernel/trace/trace_output.c:499: error: implicit declaration of function 'trace_find_cmdline'
 kernel/trace/trace_output.c:499: warning: assignment makes pointer from integer without a cast

Move it to the generic section in trace.h.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 05fa804d1c16..a8b624ccd4d6 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -469,10 +469,10 @@ struct tracer_switch_ops {
 	void				*private;
 	struct tracer_switch_ops	*next;
 };
-
-char *trace_find_cmdline(int pid);
 #endif /* CONFIG_CONTEXT_SWITCH_TRACER */
 
+extern char *trace_find_cmdline(int pid);
+
 #ifdef CONFIG_DYNAMIC_FTRACE
 extern unsigned long ftrace_update_tot_cnt;
 #define DYN_FTRACE_TEST_NAME trace_selftest_dynamic_test_func
-- 
cgit v1.2.3


From a103e2ab7377dbbef2506be59c49a3f2ae10b60b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 29 Dec 2008 15:07:47 +0100
Subject: tracing/selftest: remove TRACE_CONT reference

Impact: build fix

TRACE_CONT is gone - fix up the self-test too.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_selftest.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 88c8eb70f54a..5013812578b1 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -9,7 +9,6 @@ static inline int trace_valid_entry(struct trace_entry *entry)
 	case TRACE_FN:
 	case TRACE_CTX:
 	case TRACE_WAKE:
-	case TRACE_CONT:
 	case TRACE_STACK:
 	case TRACE_PRINT:
 	case TRACE_SPECIAL:
-- 
cgit v1.2.3


From 4d9842776a23e52ec4c60e0a79f5e1bbe91e463e Mon Sep 17 00:00:00 2001
From: Gregory Haskins <ghaskins@novell.com>
Date: Mon, 29 Dec 2008 09:39:49 -0500
Subject: sched: cleanup inc/dec_rt_tasks

Move some common definitions up to the function prologe to simplify the
body logic.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---
 kernel/sched_rt.c | 40 ++++++++++++++++------------------------
 1 file changed, 16 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 1bbd99014011..0a5277233452 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -550,30 +550,28 @@ static void update_curr_rt(struct rq *rq)
 static inline
 void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 {
-	WARN_ON(!rt_prio(rt_se_prio(rt_se)));
-	rt_rq->rt_nr_running++;
-#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-	if (rt_se_prio(rt_se) < rt_rq->highest_prio) {
+	int prio = rt_se_prio(rt_se);
 #ifdef CONFIG_SMP
-		struct rq *rq = rq_of_rt_rq(rt_rq);
+	struct rq *rq = rq_of_rt_rq(rt_rq);
 #endif
 
-		rt_rq->highest_prio = rt_se_prio(rt_se);
+	WARN_ON(!rt_prio(prio));
+	rt_rq->rt_nr_running++;
+#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
+	if (prio < rt_rq->highest_prio) {
+
+		rt_rq->highest_prio = prio;
 #ifdef CONFIG_SMP
 		if (rq->online)
-			cpupri_set(&rq->rd->cpupri, rq->cpu,
-				   rt_se_prio(rt_se));
+			cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
 #endif
 	}
 #endif
 #ifdef CONFIG_SMP
-	if (rt_se->nr_cpus_allowed > 1) {
-		struct rq *rq = rq_of_rt_rq(rt_rq);
-
+	if (rt_se->nr_cpus_allowed > 1)
 		rq->rt.rt_nr_migratory++;
-	}
 
-	update_rt_migration(rq_of_rt_rq(rt_rq));
+	update_rt_migration(rq);
 #endif
 #ifdef CONFIG_RT_GROUP_SCHED
 	if (rt_se_boosted(rt_se))
@@ -590,6 +588,7 @@ static inline
 void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 {
 #ifdef CONFIG_SMP
+	struct rq *rq = rq_of_rt_rq(rt_rq);
 	int highest_prio = rt_rq->highest_prio;
 #endif
 
@@ -611,20 +610,13 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 		rt_rq->highest_prio = MAX_RT_PRIO;
 #endif
 #ifdef CONFIG_SMP
-	if (rt_se->nr_cpus_allowed > 1) {
-		struct rq *rq = rq_of_rt_rq(rt_rq);
+	if (rt_se->nr_cpus_allowed > 1)
 		rq->rt.rt_nr_migratory--;
-	}
 
-	if (rt_rq->highest_prio != highest_prio) {
-		struct rq *rq = rq_of_rt_rq(rt_rq);
-
-		if (rq->online)
-			cpupri_set(&rq->rd->cpupri, rq->cpu,
-				   rt_rq->highest_prio);
-	}
+	if (rq->online && rt_rq->highest_prio != highest_prio)
+		cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio);
 
-	update_rt_migration(rq_of_rt_rq(rt_rq));
+	update_rt_migration(rq);
 #endif /* CONFIG_SMP */
 #ifdef CONFIG_RT_GROUP_SCHED
 	if (rt_se_boosted(rt_se))
-- 
cgit v1.2.3


From e864c499d9e57805ae1f9e7ea404dd223759cd53 Mon Sep 17 00:00:00 2001
From: Gregory Haskins <ghaskins@novell.com>
Date: Mon, 29 Dec 2008 09:39:49 -0500
Subject: sched: track the next-highest priority on each runqueue

We will use this later in the series to reduce the amount of rq-lock
contention during a pull operation

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---
 kernel/sched.c    |  8 ++++--
 kernel/sched_rt.c | 81 +++++++++++++++++++++++++++++++++++++++++--------------
 2 files changed, 67 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 756d981d91a4..7729f9a45a8b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -463,7 +463,10 @@ struct rt_rq {
 	struct rt_prio_array active;
 	unsigned long rt_nr_running;
 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-	int highest_prio; /* highest queued rt task prio */
+	struct {
+		int curr; /* highest queued rt task prio */
+		int next; /* next highest */
+	} highest_prio;
 #endif
 #ifdef CONFIG_SMP
 	unsigned long rt_nr_migratory;
@@ -8169,7 +8172,8 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
 	__set_bit(MAX_RT_PRIO, array->bitmap);
 
 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-	rt_rq->highest_prio = MAX_RT_PRIO;
+	rt_rq->highest_prio.curr = MAX_RT_PRIO;
+	rt_rq->highest_prio.next = MAX_RT_PRIO;
 #endif
 #ifdef CONFIG_SMP
 	rt_rq->rt_nr_migratory = 0;
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 0a5277233452..ad36d7232236 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -108,7 +108,7 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
 	if (rt_rq->rt_nr_running) {
 		if (rt_se && !on_rt_rq(rt_se))
 			enqueue_rt_entity(rt_se);
-		if (rt_rq->highest_prio < curr->prio)
+		if (rt_rq->highest_prio.curr < curr->prio)
 			resched_task(curr);
 	}
 }
@@ -473,7 +473,7 @@ static inline int rt_se_prio(struct sched_rt_entity *rt_se)
 	struct rt_rq *rt_rq = group_rt_rq(rt_se);
 
 	if (rt_rq)
-		return rt_rq->highest_prio;
+		return rt_rq->highest_prio.curr;
 #endif
 
 	return rt_task_of(rt_se)->prio;
@@ -547,6 +547,21 @@ static void update_curr_rt(struct rq *rq)
 	}
 }
 
+#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
+
+static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu);
+
+static inline int next_prio(struct rq *rq)
+{
+	struct task_struct *next = pick_next_highest_task_rt(rq, rq->cpu);
+
+	if (next && rt_prio(next->prio))
+		return next->prio;
+	else
+		return MAX_RT_PRIO;
+}
+#endif
+
 static inline
 void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 {
@@ -558,14 +573,32 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 	WARN_ON(!rt_prio(prio));
 	rt_rq->rt_nr_running++;
 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-	if (prio < rt_rq->highest_prio) {
+	if (prio < rt_rq->highest_prio.curr) {
 
-		rt_rq->highest_prio = prio;
+		/*
+		 * If the new task is higher in priority than anything on the
+		 * run-queue, we have a new high that must be published to
+		 * the world.  We also know that the previous high becomes
+		 * our next-highest.
+		 */
+		rt_rq->highest_prio.next = rt_rq->highest_prio.curr;
+		rt_rq->highest_prio.curr = prio;
 #ifdef CONFIG_SMP
 		if (rq->online)
 			cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
 #endif
-	}
+	} else if (prio == rt_rq->highest_prio.curr)
+		/*
+		 * If the next task is equal in priority to the highest on
+		 * the run-queue, then we implicitly know that the next highest
+		 * task cannot be any lower than current
+		 */
+		rt_rq->highest_prio.next = prio;
+	else if (prio < rt_rq->highest_prio.next)
+		/*
+		 * Otherwise, we need to recompute next-highest
+		 */
+		rt_rq->highest_prio.next = next_prio(rq);
 #endif
 #ifdef CONFIG_SMP
 	if (rt_se->nr_cpus_allowed > 1)
@@ -589,7 +622,7 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 {
 #ifdef CONFIG_SMP
 	struct rq *rq = rq_of_rt_rq(rt_rq);
-	int highest_prio = rt_rq->highest_prio;
+	int highest_prio = rt_rq->highest_prio.curr;
 #endif
 
 	WARN_ON(!rt_prio(rt_se_prio(rt_se)));
@@ -597,24 +630,32 @@ void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 	rt_rq->rt_nr_running--;
 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
 	if (rt_rq->rt_nr_running) {
-		struct rt_prio_array *array;
+		int prio = rt_se_prio(rt_se);
+
+		WARN_ON(prio < rt_rq->highest_prio.curr);
 
-		WARN_ON(rt_se_prio(rt_se) < rt_rq->highest_prio);
-		if (rt_se_prio(rt_se) == rt_rq->highest_prio) {
-			/* recalculate */
-			array = &rt_rq->active;
-			rt_rq->highest_prio =
+		/*
+		 * This may have been our highest or next-highest priority
+		 * task and therefore we may have some recomputation to do
+		 */
+		if (prio == rt_rq->highest_prio.curr) {
+			struct rt_prio_array *array = &rt_rq->active;
+
+			rt_rq->highest_prio.curr =
 				sched_find_first_bit(array->bitmap);
-		} /* otherwise leave rq->highest prio alone */
+		}
+
+		if (prio <= rt_rq->highest_prio.next)
+			rt_rq->highest_prio.next = next_prio(rq);
 	} else
-		rt_rq->highest_prio = MAX_RT_PRIO;
+		rt_rq->highest_prio.curr = MAX_RT_PRIO;
 #endif
 #ifdef CONFIG_SMP
 	if (rt_se->nr_cpus_allowed > 1)
 		rq->rt.rt_nr_migratory--;
 
-	if (rq->online && rt_rq->highest_prio != highest_prio)
-		cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio);
+	if (rq->online && rt_rq->highest_prio.curr != highest_prio)
+		cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
 
 	update_rt_migration(rq);
 #endif /* CONFIG_SMP */
@@ -1064,7 +1105,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
 		}
 
 		/* If this rq is still suitable use it. */
-		if (lowest_rq->rt.highest_prio > task->prio)
+		if (lowest_rq->rt.highest_prio.curr > task->prio)
 			break;
 
 		/* try again */
@@ -1252,7 +1293,7 @@ static int pull_rt_task(struct rq *this_rq)
 static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
 {
 	/* Try to pull RT tasks here if we lower this rq's prio */
-	if (unlikely(rt_task(prev)) && rq->rt.highest_prio > prev->prio)
+	if (unlikely(rt_task(prev)) && rq->rt.highest_prio.curr > prev->prio)
 		pull_rt_task(rq);
 }
 
@@ -1338,7 +1379,7 @@ static void rq_online_rt(struct rq *rq)
 
 	__enable_runtime(rq);
 
-	cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio);
+	cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
 }
 
 /* Assumes rq->lock is held */
@@ -1429,7 +1470,7 @@ static void prio_changed_rt(struct rq *rq, struct task_struct *p,
 		 * can release the rq lock and p could migrate.
 		 * Only reschedule if p is still on the same runqueue.
 		 */
-		if (p->prio > rq->rt.highest_prio && rq->curr == p)
+		if (p->prio > rq->rt.highest_prio.curr && rq->curr == p)
 			resched_task(p);
 #else
 		/* For UP simply resched on drop of prio */
-- 
cgit v1.2.3


From a8728944efe23417e38bf22063f06d9d8ee21d59 Mon Sep 17 00:00:00 2001
From: Gregory Haskins <ghaskins@novell.com>
Date: Mon, 29 Dec 2008 09:39:49 -0500
Subject: sched: use highest_prio.curr for pull threshold

highest_prio.curr is actually a more accurate way to keep track of
the pull_rt_task() threshold since it is always up to date, even
if the "next" task migrates during double_lock.  Therefore, stop
looking at the "next" task object and simply use the highest_prio.curr.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---
 kernel/sched_rt.c | 31 ++++++-------------------------
 1 file changed, 6 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index ad36d7232236..f8fb3edadcaa 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1207,14 +1207,12 @@ static void push_rt_tasks(struct rq *rq)
 static int pull_rt_task(struct rq *this_rq)
 {
 	int this_cpu = this_rq->cpu, ret = 0, cpu;
-	struct task_struct *p, *next;
+	struct task_struct *p;
 	struct rq *src_rq;
 
 	if (likely(!rt_overloaded(this_rq)))
 		return 0;
 
-	next = pick_next_task_rt(this_rq);
-
 	for_each_cpu(cpu, this_rq->rd->rto_mask) {
 		if (this_cpu == cpu)
 			continue;
@@ -1223,17 +1221,9 @@ static int pull_rt_task(struct rq *this_rq)
 		/*
 		 * We can potentially drop this_rq's lock in
 		 * double_lock_balance, and another CPU could
-		 * steal our next task - hence we must cause
-		 * the caller to recalculate the next task
-		 * in that case:
+		 * alter this_rq
 		 */
-		if (double_lock_balance(this_rq, src_rq)) {
-			struct task_struct *old_next = next;
-
-			next = pick_next_task_rt(this_rq);
-			if (next != old_next)
-				ret = 1;
-		}
+		double_lock_balance(this_rq, src_rq);
 
 		/*
 		 * Are there still pullable RT tasks?
@@ -1247,7 +1237,7 @@ static int pull_rt_task(struct rq *this_rq)
 		 * Do we have an RT task that preempts
 		 * the to-be-scheduled task?
 		 */
-		if (p && (!next || (p->prio < next->prio))) {
+		if (p && (p->prio < this_rq->rt.highest_prio.curr)) {
 			WARN_ON(p == src_rq->curr);
 			WARN_ON(!p->se.on_rq);
 
@@ -1257,12 +1247,9 @@ static int pull_rt_task(struct rq *this_rq)
 			 * This is just that p is wakeing up and hasn't
 			 * had a chance to schedule. We only pull
 			 * p if it is lower in priority than the
-			 * current task on the run queue or
-			 * this_rq next task is lower in prio than
-			 * the current task on that rq.
+			 * current task on the run queue
 			 */
-			if (p->prio < src_rq->curr->prio ||
-			    (next && next->prio < src_rq->curr->prio))
+			if (p->prio < src_rq->curr->prio)
 				goto skip;
 
 			ret = 1;
@@ -1275,13 +1262,7 @@ static int pull_rt_task(struct rq *this_rq)
 			 * case there's an even higher prio task
 			 * in another runqueue. (low likelyhood
 			 * but possible)
-			 *
-			 * Update next so that we won't pick a task
-			 * on another cpu with a priority lower (or equal)
-			 * than the one we just picked.
 			 */
-			next = p;
-
 		}
  skip:
 		double_unlock_balance(this_rq, src_rq);
-- 
cgit v1.2.3


From 74ab8e4f6412c0b2d730fe5de28dc21de8b92c01 Mon Sep 17 00:00:00 2001
From: Gregory Haskins <ghaskins@novell.com>
Date: Mon, 29 Dec 2008 09:39:50 -0500
Subject: sched: use highest_prio.next to optimize pull operations

We currently take the rq->lock for every cpu in an overload state during
pull_rt_tasks().  However, we now have enough information via the
highest_prio.[curr|next] fields to determine if there is any tasks of
interest to warrant the overhead of the rq->lock, before we actually take
it.  So we use this information to reduce lock contention during the
pull for the case where the source-rq doesnt have tasks that preempt
the current task.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---
 kernel/sched_rt.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index f8fb3edadcaa..d047f288c411 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1218,6 +1218,18 @@ static int pull_rt_task(struct rq *this_rq)
 			continue;
 
 		src_rq = cpu_rq(cpu);
+
+		/*
+		 * Don't bother taking the src_rq->lock if the next highest
+		 * task is known to be lower-priority than our current task.
+		 * This may look racy, but if this value is about to go
+		 * logically higher, the src_rq will push this task away.
+		 * And if its going logically lower, we do not care
+		 */
+		if (src_rq->rt.highest_prio.next >=
+		    this_rq->rt.highest_prio.curr)
+			continue;
+
 		/*
 		 * We can potentially drop this_rq's lock in
 		 * double_lock_balance, and another CPU could
-- 
cgit v1.2.3


From 777c2f389e463428fd7e2871051a84d7fe84b172 Mon Sep 17 00:00:00 2001
From: Gregory Haskins <ghaskins@novell.com>
Date: Mon, 29 Dec 2008 09:39:50 -0500
Subject: sched: only try to push a task on wakeup if it is migratable

There is no sense in wasting time trying to push a task away that
cannot move anywhere else.  We gain no benefit from trying to push
other tasks at this point, so if the task being woken up is non
migratable, just skip the whole operation.  This reduces overhead
in the wakeup path for certain tasks.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---
 kernel/sched_rt.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index d047f288c411..8d33843cb2c4 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1314,7 +1314,8 @@ static void task_wake_up_rt(struct rq *rq, struct task_struct *p)
 {
 	if (!task_running(rq, p) &&
 	    !test_tsk_need_resched(rq->curr) &&
-	    rq->rt.overloaded)
+	    rq->rt.overloaded &&
+	    p->rt.nr_cpus_allowed > 1)
 		push_rt_tasks(rq);
 }
 
-- 
cgit v1.2.3


From 7e96fa5875d4a9be18d74d3ca7b90518d05bc426 Mon Sep 17 00:00:00 2001
From: Gregory Haskins <ghaskins@novell.com>
Date: Mon, 29 Dec 2008 09:39:50 -0500
Subject: sched: pull only one task during NEWIDLE balancing to limit critical
 section

git-id c4acb2c0669c5c5c9b28e9d02a34b5c67edf7092 attempted to limit
newidle critical section length by stopping after at least one task
was moved.  Further investigation has shown that there are other
paths nested further inside the algorithm which still remain that allow
long latencies to occur with newidle balancing.  This patch applies
the same technique inside balance_tasks() to limit the duration of
this optional balancing operation.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
CC: Nick Piggin <npiggin@suse.de>
---
 kernel/sched.c | 18 +++++++++++++++++-
 1 file changed, 17 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 7729f9a45a8b..94d9a6c5ff94 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2984,6 +2984,16 @@ next:
 	pulled++;
 	rem_load_move -= p->se.load.weight;
 
+#ifdef CONFIG_PREEMPT
+	/*
+	 * NEWIDLE balancing is a source of latency, so preemptible kernels
+	 * will stop after the first task is pulled to minimize the critical
+	 * section.
+	 */
+	if (idle == CPU_NEWLY_IDLE)
+		goto out;
+#endif
+
 	/*
 	 * We only want to steal up to the prescribed amount of weighted load.
 	 */
@@ -3030,9 +3040,15 @@ static int move_tasks(struct rq *this_rq, int this_cpu, struct rq *busiest,
 				sd, idle, all_pinned, &this_best_prio);
 		class = class->next;
 
+#ifdef CONFIG_PREEMPT
+		/*
+		 * NEWIDLE balancing is a source of latency, so preemptible
+		 * kernels will stop after the first task is pulled to minimize
+		 * the critical section.
+		 */
 		if (idle == CPU_NEWLY_IDLE && this_rq->nr_running)
 			break;
-
+#endif
 	} while (class && max_load_move > total_load_moved);
 
 	return total_load_moved > 0;
-- 
cgit v1.2.3


From 8f45e2b516201d1bf681e6026fa5276385def565 Mon Sep 17 00:00:00 2001
From: Gregory Haskins <ghaskins@novell.com>
Date: Mon, 29 Dec 2008 09:39:51 -0500
Subject: sched: make double-lock-balance fair

double_lock balance() currently favors logically lower cpus since they
often do not have to release their own lock to acquire a second lock.
The result is that logically higher cpus can get starved when there is
a lot of pressure on the RQs.  This can result in higher latencies on
higher cpu-ids.

This patch makes the algorithm more fair by forcing all paths to have
to release both locks before acquiring them again.  Since callsites to
double_lock_balance already consider it a potential preemption/reschedule
point, they have the proper logic to recheck for atomicity violations.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---
 kernel/sched.c | 51 ++++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 44 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 94d9a6c5ff94..8fca364f3593 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1608,21 +1608,42 @@ static inline void update_shares_locked(struct rq *rq, struct sched_domain *sd)
 
 #endif
 
+#ifdef CONFIG_PREEMPT
+
 /*
- * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+ * fair double_lock_balance: Safely acquires both rq->locks in a fair
+ * way at the expense of forcing extra atomic operations in all
+ * invocations.  This assures that the double_lock is acquired using the
+ * same underlying policy as the spinlock_t on this architecture, which
+ * reduces latency compared to the unfair variant below.  However, it
+ * also adds more overhead and therefore may reduce throughput.
  */
-static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+static inline int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
+	__releases(this_rq->lock)
+	__acquires(busiest->lock)
+	__acquires(this_rq->lock)
+{
+	spin_unlock(&this_rq->lock);
+	double_rq_lock(this_rq, busiest);
+
+	return 1;
+}
+
+#else
+/*
+ * Unfair double_lock_balance: Optimizes throughput at the expense of
+ * latency by eliminating extra atomic operations when the locks are
+ * already in proper order on entry.  This favors lower cpu-ids and will
+ * grant the double lock to lower cpus over higher ids under contention,
+ * regardless of entry order into the function.
+ */
+static int _double_lock_balance(struct rq *this_rq, struct rq *busiest)
 	__releases(this_rq->lock)
 	__acquires(busiest->lock)
 	__acquires(this_rq->lock)
 {
 	int ret = 0;
 
-	if (unlikely(!irqs_disabled())) {
-		/* printk() doesn't work good under rq->lock */
-		spin_unlock(&this_rq->lock);
-		BUG_ON(1);
-	}
 	if (unlikely(!spin_trylock(&busiest->lock))) {
 		if (busiest < this_rq) {
 			spin_unlock(&this_rq->lock);
@@ -1635,6 +1656,22 @@ static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
 	return ret;
 }
 
+#endif /* CONFIG_PREEMPT */
+
+/*
+ * double_lock_balance - lock the busiest runqueue, this_rq is locked already.
+ */
+static int double_lock_balance(struct rq *this_rq, struct rq *busiest)
+{
+	if (unlikely(!irqs_disabled())) {
+		/* printk() doesn't work good under rq->lock */
+		spin_unlock(&this_rq->lock);
+		BUG_ON(1);
+	}
+
+	return _double_lock_balance(this_rq, busiest);
+}
+
 static inline void double_unlock_balance(struct rq *this_rq, struct rq *busiest)
 	__releases(busiest->lock)
 {
-- 
cgit v1.2.3


From 967fc04671feea4dbf780c9e55a0bc8fcf68a14e Mon Sep 17 00:00:00 2001
From: Gregory Haskins <ghaskins@novell.com>
Date: Mon, 29 Dec 2008 09:39:52 -0500
Subject: sched: add sched_class->needs_post_schedule() member

We currently run class->post_schedule() outside of the rq->lock, which
means that we need to test for the need to post_schedule outside of
the lock to avoid a forced reacquistion.  This is currently not a problem
as we only look at rq->rt.overloaded.  However, we want to enhance this
going forward to look at more state to reduce the need to post_schedule to
a bare minimum set.  Therefore, we introduce a new member-func called
needs_post_schedule() which tests for the post_schedule condtion without
actually performing the work.  Therefore it is safe to call this
function before the rq->lock is released, because we are guaranteed not
to drop the lock at an intermediate point (such as what post_schedule()
may do).

We will use this later in the series

[ rostedt: removed paranoid BUG_ON ]

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---
 include/linux/sched.h |  1 +
 kernel/sched.c        |  8 +++++++-
 kernel/sched_rt.c     | 24 ++++++++++++++----------
 3 files changed, 22 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index e5f928a079e8..836a86c32a65 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1012,6 +1012,7 @@ struct sched_class {
 			      struct rq *busiest, struct sched_domain *sd,
 			      enum cpu_idle_type idle);
 	void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
+	int (*needs_post_schedule) (struct rq *this_rq);
 	void (*post_schedule) (struct rq *this_rq);
 	void (*task_wake_up) (struct rq *this_rq, struct task_struct *task);
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 8fca364f3593..3acbad8991a2 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2621,6 +2621,12 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 {
 	struct mm_struct *mm = rq->prev_mm;
 	long prev_state;
+#ifdef CONFIG_SMP
+	int post_schedule = 0;
+
+	if (current->sched_class->needs_post_schedule)
+		post_schedule = current->sched_class->needs_post_schedule(rq);
+#endif
 
 	rq->prev_mm = NULL;
 
@@ -2639,7 +2645,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
 	finish_arch_switch(prev);
 	finish_lock_switch(rq, prev);
 #ifdef CONFIG_SMP
-	if (current->sched_class->post_schedule)
+	if (post_schedule)
 		current->sched_class->post_schedule(rq);
 #endif
 
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 8d33843cb2c4..b0b6ea4ed674 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1290,20 +1290,23 @@ static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
 		pull_rt_task(rq);
 }
 
+/*
+ * assumes rq->lock is held
+ */
+static int needs_post_schedule_rt(struct rq *rq)
+{
+	return rq->rt.overloaded ? 1 : 0;
+}
+
 static void post_schedule_rt(struct rq *rq)
 {
 	/*
-	 * If we have more than one rt_task queued, then
-	 * see if we can push the other rt_tasks off to other CPUS.
-	 * Note we may release the rq lock, and since
-	 * the lock was owned by prev, we need to release it
-	 * first via finish_lock_switch and then reaquire it here.
+	 * This is only called if needs_post_schedule_rt() indicates that
+	 * we need to push tasks away
 	 */
-	if (unlikely(rq->rt.overloaded)) {
-		spin_lock_irq(&rq->lock);
-		push_rt_tasks(rq);
-		spin_unlock_irq(&rq->lock);
-	}
+	spin_lock_irq(&rq->lock);
+	push_rt_tasks(rq);
+	spin_unlock_irq(&rq->lock);
 }
 
 /*
@@ -1557,6 +1560,7 @@ static const struct sched_class rt_sched_class = {
 	.rq_online              = rq_online_rt,
 	.rq_offline             = rq_offline_rt,
 	.pre_schedule		= pre_schedule_rt,
+	.needs_post_schedule	= needs_post_schedule_rt,
 	.post_schedule		= post_schedule_rt,
 	.task_wake_up		= task_wake_up_rt,
 	.switched_from		= switched_from_rt,
-- 
cgit v1.2.3


From 917b627d4d981dc614519d7b34ea31a976b14e12 Mon Sep 17 00:00:00 2001
From: Gregory Haskins <ghaskins@novell.com>
Date: Mon, 29 Dec 2008 09:39:53 -0500
Subject: sched: create "pushable_tasks" list to limit pushing to one attempt

The RT scheduler employs a "push/pull" design to actively balance tasks
within the system (on a per disjoint cpuset basis).  When a task is
awoken, it is immediately determined if there are any lower priority
cpus which should be preempted.  This is opposed to the way normal
SCHED_OTHER tasks behave, which will wait for a periodic rebalancing
operation to occur before spreading out load.

When a particular RQ has more than 1 active RT task, it is said to
be in an "overloaded" state.  Once this occurs, the system enters
the active balancing mode, where it will try to push the task away,
or persuade a different cpu to pull it over.  The system will stay
in this state until the system falls back below the <= 1 queued RT
task per RQ.

However, the current implementation suffers from a limitation in the
push logic.  Once overloaded, all tasks (other than current) on the
RQ are analyzed on every push operation, even if it was previously
unpushable (due to affinity, etc).  Whats more, the operation stops
at the first task that is unpushable and will not look at items
lower in the queue.  This causes two problems:

1) We can have the same tasks analyzed over and over again during each
   push, which extends out the fast path in the scheduler for no
   gain.  Consider a RQ that has dozens of tasks that are bound to a
   core.  Each one of those tasks will be encountered and skipped
   for each push operation while they are queued.

2) There may be lower-priority tasks under the unpushable task that
   could have been successfully pushed, but will never be considered
   until either the unpushable task is cleared, or a pull operation
   succeeds.  The net result is a potential latency source for mid
   priority tasks.

This patch aims to rectify these two conditions by introducing a new
priority sorted list: "pushable_tasks".  A task is added to the list
each time a task is activated or preempted.  It is removed from the
list any time it is deactivated, made current, or fails to push.

This works because a task only needs to be attempted to push once.
After an initial failure to push, the other cpus will eventually try to
pull the task when the conditions are proper.  This also solves the
problem that we don't completely analyze all tasks due to encountering
an unpushable tasks.  Now every task will have a push attempted (when
appropriate).

This reduces latency both by shorting the critical section of the
rq->lock for certain workloads, and by making sure the algorithm
considers all eligible tasks in the system.

[ rostedt: added a couple more BUG_ONs ]

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
Acked-by: Steven Rostedt <srostedt@redhat.com>
---
 include/linux/init_task.h |   1 +
 include/linux/sched.h     |   1 +
 kernel/sched.c            |   4 ++
 kernel/sched_rt.c         | 119 +++++++++++++++++++++++++++++++++++++++-------
 4 files changed, 107 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index 23fd8909b9e5..6851225f44a7 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -140,6 +140,7 @@ extern struct group_info init_groups;
 		.nr_cpus_allowed = NR_CPUS,				\
 	},								\
 	.tasks		= LIST_HEAD_INIT(tsk.tasks),			\
+	.pushable_tasks = PLIST_NODE_INIT(tsk.pushable_tasks, MAX_PRIO), \
 	.ptraced	= LIST_HEAD_INIT(tsk.ptraced),			\
 	.ptrace_entry	= LIST_HEAD_INIT(tsk.ptrace_entry),		\
 	.real_parent	= &tsk,						\
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 836a86c32a65..440cabb2d432 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1179,6 +1179,7 @@ struct task_struct {
 #endif
 
 	struct list_head tasks;
+	struct plist_node pushable_tasks;
 
 	struct mm_struct *mm, *active_mm;
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 3acbad8991a2..24ab80c28765 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -471,6 +471,7 @@ struct rt_rq {
 #ifdef CONFIG_SMP
 	unsigned long rt_nr_migratory;
 	int overloaded;
+	struct plist_head pushable_tasks;
 #endif
 	int rt_throttled;
 	u64 rt_time;
@@ -2481,6 +2482,8 @@ void sched_fork(struct task_struct *p, int clone_flags)
 	/* Want to start with kernel preemption disabled. */
 	task_thread_info(p)->preempt_count = 1;
 #endif
+	plist_node_init(&p->pushable_tasks, MAX_PRIO);
+
 	put_cpu();
 }
 
@@ -8237,6 +8240,7 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
 #ifdef CONFIG_SMP
 	rt_rq->rt_nr_migratory = 0;
 	rt_rq->overloaded = 0;
+	plist_head_init(&rq->rt.pushable_tasks, &rq->lock);
 #endif
 
 	rt_rq->rt_time = 0;
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index b0b6ea4ed674..fe9da6084c87 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -49,6 +49,24 @@ static void update_rt_migration(struct rq *rq)
 		rq->rt.overloaded = 0;
 	}
 }
+
+static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
+{
+	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
+	plist_node_init(&p->pushable_tasks, p->prio);
+	plist_add(&p->pushable_tasks, &rq->rt.pushable_tasks);
+}
+
+static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
+{
+	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
+}
+
+#else
+
+#define enqueue_pushable_task(rq, p) do { } while (0)
+#define dequeue_pushable_task(rq, p) do { } while (0)
+
 #endif /* CONFIG_SMP */
 
 static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
@@ -751,6 +769,9 @@ static void enqueue_task_rt(struct rq *rq, struct task_struct *p, int wakeup)
 
 	enqueue_rt_entity(rt_se);
 
+	if (!task_current(rq, p) && p->rt.nr_cpus_allowed > 1)
+		enqueue_pushable_task(rq, p);
+
 	inc_cpu_load(rq, p->se.load.weight);
 }
 
@@ -761,6 +782,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int sleep)
 	update_curr_rt(rq);
 	dequeue_rt_entity(rt_se);
 
+	dequeue_pushable_task(rq, p);
+
 	dec_cpu_load(rq, p->se.load.weight);
 }
 
@@ -911,7 +934,7 @@ static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
 	return next;
 }
 
-static struct task_struct *pick_next_task_rt(struct rq *rq)
+static struct task_struct *_pick_next_task_rt(struct rq *rq)
 {
 	struct sched_rt_entity *rt_se;
 	struct task_struct *p;
@@ -933,6 +956,18 @@ static struct task_struct *pick_next_task_rt(struct rq *rq)
 
 	p = rt_task_of(rt_se);
 	p->se.exec_start = rq->clock;
+
+	return p;
+}
+
+static struct task_struct *pick_next_task_rt(struct rq *rq)
+{
+	struct task_struct *p = _pick_next_task_rt(rq);
+
+	/* The running task is never eligible for pushing */
+	if (p)
+		dequeue_pushable_task(rq, p);
+
 	return p;
 }
 
@@ -940,6 +975,13 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
 {
 	update_curr_rt(rq);
 	p->se.exec_start = 0;
+
+	/*
+	 * The previous task needs to be made eligible for pushing
+	 * if it is still active
+	 */
+	if (p->se.on_rq && p->rt.nr_cpus_allowed > 1)
+		enqueue_pushable_task(rq, p);
 }
 
 #ifdef CONFIG_SMP
@@ -1116,6 +1158,31 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq)
 	return lowest_rq;
 }
 
+static inline int has_pushable_tasks(struct rq *rq)
+{
+	return !plist_head_empty(&rq->rt.pushable_tasks);
+}
+
+static struct task_struct *pick_next_pushable_task(struct rq *rq)
+{
+	struct task_struct *p;
+
+	if (!has_pushable_tasks(rq))
+		return NULL;
+
+	p = plist_first_entry(&rq->rt.pushable_tasks,
+			      struct task_struct, pushable_tasks);
+
+	BUG_ON(rq->cpu != task_cpu(p));
+	BUG_ON(task_current(rq, p));
+	BUG_ON(p->rt.nr_cpus_allowed <= 1);
+
+	BUG_ON(!p->se.on_rq);
+	BUG_ON(!rt_task(p));
+
+	return p;
+}
+
 /*
  * If the current CPU has more than one RT task, see if the non
  * running task can migrate over to a CPU that is running a task
@@ -1125,13 +1192,12 @@ static int push_rt_task(struct rq *rq)
 {
 	struct task_struct *next_task;
 	struct rq *lowest_rq;
-	int ret = 0;
 	int paranoid = RT_MAX_TRIES;
 
 	if (!rq->rt.overloaded)
 		return 0;
 
-	next_task = pick_next_highest_task_rt(rq, -1);
+	next_task = pick_next_pushable_task(rq);
 	if (!next_task)
 		return 0;
 
@@ -1163,12 +1229,19 @@ static int push_rt_task(struct rq *rq)
 		 * so it is possible that next_task has changed.
 		 * If it has, then try again.
 		 */
-		task = pick_next_highest_task_rt(rq, -1);
+		task = pick_next_pushable_task(rq);
 		if (unlikely(task != next_task) && task && paranoid--) {
 			put_task_struct(next_task);
 			next_task = task;
 			goto retry;
 		}
+
+		/*
+		 * Once we have failed to push this task, we will not
+		 * try again, since the other cpus will pull from us
+		 * when they are ready
+		 */
+		dequeue_pushable_task(rq, next_task);
 		goto out;
 	}
 
@@ -1180,23 +1253,12 @@ static int push_rt_task(struct rq *rq)
 
 	double_unlock_balance(rq, lowest_rq);
 
-	ret = 1;
 out:
 	put_task_struct(next_task);
 
-	return ret;
+	return 1;
 }
 
-/*
- * TODO: Currently we just use the second highest prio task on
- *       the queue, and stop when it can't migrate (or there's
- *       no more RT tasks).  There may be a case where a lower
- *       priority RT task has a different affinity than the
- *       higher RT task. In this case the lower RT task could
- *       possibly be able to migrate where as the higher priority
- *       RT task could not.  We currently ignore this issue.
- *       Enhancements are welcome!
- */
 static void push_rt_tasks(struct rq *rq)
 {
 	/* push_rt_task will return true if it moved an RT */
@@ -1295,7 +1357,7 @@ static void pre_schedule_rt(struct rq *rq, struct task_struct *prev)
  */
 static int needs_post_schedule_rt(struct rq *rq)
 {
-	return rq->rt.overloaded ? 1 : 0;
+	return has_pushable_tasks(rq);
 }
 
 static void post_schedule_rt(struct rq *rq)
@@ -1317,7 +1379,7 @@ static void task_wake_up_rt(struct rq *rq, struct task_struct *p)
 {
 	if (!task_running(rq, p) &&
 	    !test_tsk_need_resched(rq->curr) &&
-	    rq->rt.overloaded &&
+	    has_pushable_tasks(rq) &&
 	    p->rt.nr_cpus_allowed > 1)
 		push_rt_tasks(rq);
 }
@@ -1354,6 +1416,24 @@ static void set_cpus_allowed_rt(struct task_struct *p,
 	if (p->se.on_rq && (weight != p->rt.nr_cpus_allowed)) {
 		struct rq *rq = task_rq(p);
 
+		if (!task_current(rq, p)) {
+			/*
+			 * Make sure we dequeue this task from the pushable list
+			 * before going further.  It will either remain off of
+			 * the list because we are no longer pushable, or it
+			 * will be requeued.
+			 */
+			if (p->rt.nr_cpus_allowed > 1)
+				dequeue_pushable_task(rq, p);
+
+			/*
+			 * Requeue if our weight is changing and still > 1
+			 */
+			if (weight > 1)
+				enqueue_pushable_task(rq, p);
+
+		}
+
 		if ((p->rt.nr_cpus_allowed <= 1) && (weight > 1)) {
 			rq->rt.rt_nr_migratory++;
 		} else if ((p->rt.nr_cpus_allowed > 1) && (weight <= 1)) {
@@ -1538,6 +1618,9 @@ static void set_curr_task_rt(struct rq *rq)
 	struct task_struct *p = rq->curr;
 
 	p->se.exec_start = rq->clock;
+
+	/* The running task is never eligible for pushing */
+	dequeue_pushable_task(rq, p);
 }
 
 static const struct sched_class rt_sched_class = {
-- 
cgit v1.2.3


From 1563513d34ed4b12ef32bc2adde4a53ce05701a1 Mon Sep 17 00:00:00 2001
From: Gregory Haskins <ghaskins@novell.com>
Date: Mon, 29 Dec 2008 09:39:53 -0500
Subject: RT: fix push_rt_task() to handle dequeue_pushable properly

A panic was discovered by Chirag Jog where a BUG_ON sanity check
in the new "pushable_task" logic would trigger a panic under
certain circumstances:

http://lkml.org/lkml/2008/9/25/189

Gilles Carry discovered that the root cause was attributed to the
pushable_tasks list getting corrupted in the push_rt_task logic.
This was the result of a dropped rq lock in double_lock_balance
allowing a task in the process of being pushed to potentially migrate
away, and thus corrupt the pushable_tasks() list.

I traced back the problem as introduced by the pushable_tasks patch
that went in recently.   There is a "retry" path in push_rt_task()
that actually had a compound conditional to decide whether to
retry or exit.  I missed the meaning behind the rationale for the
virtual "if(!task) goto out;" portion of the compound statement and
thus did not handle it properly.  The new pushable_tasks logic
actually creates three distinct conditions:

1) an untouched and unpushable task should be dequeued
2) a migrated task where more pushable tasks remain should be retried
3) a migrated task where no more pushable tasks exist should exit

The original logic mushed (1) and (3) together, resulting in the
system dequeuing a migrated task (against an unlocked foreign run-queue
nonetheless).

To fix this, we get rid of the notion of "paranoid" and we support the
three unique conditions properly.  The paranoid feature is no longer
relevant with the new pushable logic (since pushable naturally limits
the loop) anyway, so lets just remove it.

Reported-By: Chirag Jog <chirag@linux.vnet.ibm.com>
Found-by: Gilles Carry <gilles.carry@bull.net>
Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---
 kernel/sched_rt.c | 34 ++++++++++++++++++++++------------
 1 file changed, 22 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index fe9da6084c87..64a8f0aa117b 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1192,7 +1192,6 @@ static int push_rt_task(struct rq *rq)
 {
 	struct task_struct *next_task;
 	struct rq *lowest_rq;
-	int paranoid = RT_MAX_TRIES;
 
 	if (!rq->rt.overloaded)
 		return 0;
@@ -1226,23 +1225,34 @@ static int push_rt_task(struct rq *rq)
 		struct task_struct *task;
 		/*
 		 * find lock_lowest_rq releases rq->lock
-		 * so it is possible that next_task has changed.
-		 * If it has, then try again.
+		 * so it is possible that next_task has migrated.
+		 *
+		 * We need to make sure that the task is still on the same
+		 * run-queue and is also still the next task eligible for
+		 * pushing.
 		 */
 		task = pick_next_pushable_task(rq);
-		if (unlikely(task != next_task) && task && paranoid--) {
-			put_task_struct(next_task);
-			next_task = task;
-			goto retry;
+		if (task_cpu(next_task) == rq->cpu && task == next_task) {
+			/*
+			 * If we get here, the task hasnt moved at all, but
+			 * it has failed to push.  We will not try again,
+			 * since the other cpus will pull from us when they
+			 * are ready.
+			 */
+			dequeue_pushable_task(rq, next_task);
+			goto out;
 		}
 
+		if (!task)
+			/* No more tasks, just exit */
+			goto out;
+
 		/*
-		 * Once we have failed to push this task, we will not
-		 * try again, since the other cpus will pull from us
-		 * when they are ready
+		 * Something has shifted, try again.
 		 */
-		dequeue_pushable_task(rq, next_task);
-		goto out;
+		put_task_struct(next_task);
+		next_task = task;
+		goto retry;
 	}
 
 	deactivate_task(rq, next_task, 0);
-- 
cgit v1.2.3


From 7a51cffbd10886c0557677dd916c090097c691ef Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 29 Dec 2008 16:03:40 +0100
Subject: relayfs: replace BUG() with WARN_ON() in relay_late_setup_files()

Impact: turn boot crash into boot warning

This BUG() can trigger:

[   16.684131] initcall fail_page_alloc_debugfs+0x0/0xc1 returned 0 after 0 usecs
[   16.692035] calling  kmemtrace_setup_late+0x0/0xd5 @ 1
[   16.700087] relay_late_setup_files: CPU 1 has no buffer, it must have!
[   16.704044] ------------[ cut here ]------------
[   16.708030] kernel BUG at kernel/relay.c:680!
[   16.708030] invalid opcode: 0000 [#1] SMP DEBUG_PAGEALLOC
[   16.708030] last sysfs file:
[   16.708030]
[   16.708030] Pid: 1, comm: swapper Not tainted (2.6.28-tip-03903-g9a39f58-dirty #13207) System Product Name
[   16.708030] EIP: 0060:[<c01604ae>] EFLAGS: 00010246 CPU: 1
[   16.708030] EIP is at relay_late_setup_files+0x8c/0x176

Reduce it to a more reportable WARN_ONCE().

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/relay.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/relay.c b/kernel/relay.c
index 09ac2008f77b..d06450670c86 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -675,9 +675,7 @@ int relay_late_setup_files(struct rchan *chan,
 	 */
 	for_each_online_cpu(i) {
 		if (unlikely(!chan->buf[i])) {
-			printk(KERN_ERR "relay_late_setup_files: CPU %u "
-					"has no buffer, it must have!\n", i);
-			BUG();
+			WARN_ONCE(1, KERN_ERR "CPU has no buffer!\n");
 			err = -EINVAL;
 			break;
 		}
-- 
cgit v1.2.3


From 5762ba1873b0bb9faa631aaa02f533c2b9837f82 Mon Sep 17 00:00:00 2001
From: Sebastien Dugue <sebastien.dugue@bull.net>
Date: Mon, 1 Dec 2008 14:09:07 +0100
Subject: hrtimers: allow the hot-unplugging of all cpus

Impact: fix CPU hotplug hang on Power6 testbox

On architectures that support offlining all cpus (at least powerpc/pseries),
hot-unpluging the tick_do_timer_cpu can result in a system hang.

This comes from the fact that if the cpu going down happens to be the
cpu doing the tick, then as the tick_do_timer_cpu handover happens after the
cpu is dead (via the CPU_DEAD notification), we're left without ticks,
jiffies are frozen and any task relying on timers (msleep, ...) is stuck.
That's particularly the case for the cpu looping in __cpu_die() waiting
for the dying cpu to be dead.

This patch addresses this by having the tick_do_timer_cpu handover happen
earlier during the CPU_DYING notification. For this, a new clockevent
notification type is introduced (CLOCK_EVT_NOTIFY_CPU_DYING) which is triggered
in hrtimer_cpu_notify().

Signed-off-by: Sebastien Dugue <sebastien.dugue@bull.net>
Cc: <stable@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/clockchips.h |  1 +
 kernel/hrtimer.c           |  4 ++++
 kernel/time/tick-common.c  | 26 +++++++++++++++++++-------
 3 files changed, 24 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h
index ed3a5d473e52..c6de413c5dd1 100644
--- a/include/linux/clockchips.h
+++ b/include/linux/clockchips.h
@@ -36,6 +36,7 @@ enum clock_event_nofitiers {
 	CLOCK_EVT_NOTIFY_BROADCAST_EXIT,
 	CLOCK_EVT_NOTIFY_SUSPEND,
 	CLOCK_EVT_NOTIFY_RESUME,
+	CLOCK_EVT_NOTIFY_CPU_DYING,
 	CLOCK_EVT_NOTIFY_CPU_DEAD,
 };
 
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index c2a69b89ac61..61cb933395ba 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -1609,6 +1609,10 @@ static int __cpuinit hrtimer_cpu_notify(struct notifier_block *self,
 		break;
 
 #ifdef CONFIG_HOTPLUG_CPU
+	case CPU_DYING:
+	case CPU_DYING_FROZEN:
+		clockevents_notify(CLOCK_EVT_NOTIFY_CPU_DYING, &scpu);
+		break;
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN:
 	{
diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c
index df12434b43ca..457d281258ee 100644
--- a/kernel/time/tick-common.c
+++ b/kernel/time/tick-common.c
@@ -273,6 +273,21 @@ out_bc:
 	return ret;
 }
 
+/*
+ * Transfer the do_timer job away from a dying cpu.
+ *
+ * Called with interrupts disabled.
+ */
+static void tick_handover_do_timer(int *cpup)
+{
+	if (*cpup == tick_do_timer_cpu) {
+		int cpu = first_cpu(cpu_online_map);
+
+		tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu :
+			TICK_DO_TIMER_NONE;
+	}
+}
+
 /*
  * Shutdown an event device on a given cpu:
  *
@@ -297,13 +312,6 @@ static void tick_shutdown(unsigned int *cpup)
 		clockevents_exchange_device(dev, NULL);
 		td->evtdev = NULL;
 	}
-	/* Transfer the do_timer job away from this cpu */
-	if (*cpup == tick_do_timer_cpu) {
-		int cpu = first_cpu(cpu_online_map);
-
-		tick_do_timer_cpu = (cpu != NR_CPUS) ? cpu :
-			TICK_DO_TIMER_NONE;
-	}
 	spin_unlock_irqrestore(&tick_device_lock, flags);
 }
 
@@ -357,6 +365,10 @@ static int tick_notify(struct notifier_block *nb, unsigned long reason,
 		tick_broadcast_oneshot_control(reason);
 		break;
 
+	case CLOCK_EVT_NOTIFY_CPU_DYING:
+		tick_handover_do_timer(dev);
+		break;
+
 	case CLOCK_EVT_NOTIFY_CPU_DEAD:
 		tick_shutdown_broadcast_oneshot(dev);
 		tick_shutdown_broadcast(dev);
-- 
cgit v1.2.3


From 36994e58a48fb8f9651c7dc845a6de298aba5bfc Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Mon, 29 Dec 2008 13:42:23 -0800
Subject: tracing/kmemtrace: normalize the raw tracer event to the unified
 tracing API

Impact: new tracer plugin

This patch adapts kmemtrace raw events tracing to the unified tracing API.

To enable and use this tracer, just do the following:

 echo kmemtrace > /debugfs/tracing/current_tracer
 cat /debugfs/tracing/trace

You will have the following output:

 # tracer: kmemtrace
 #
 #
 # ALLOC  TYPE  REQ   GIVEN  FLAGS           POINTER         NODE    CALLER
 # FREE   |      |     |       |              |   |            |        |
 # |

type_id 1 call_site 18446744071565527833 ptr 18446612134395152256
type_id 0 call_site 18446744071565585597 ptr 18446612134405955584 bytes_req 4096 bytes_alloc 4096 gfp_flags 208 node -1
type_id 1 call_site 18446744071565585534 ptr 18446612134405955584
type_id 0 call_site 18446744071565585597 ptr 18446612134405955584 bytes_req 4096 bytes_alloc 4096 gfp_flags 208 node -1
type_id 0 call_site 18446744071565636711 ptr 18446612134345164672 bytes_req 240 bytes_alloc 240 gfp_flags 208 node -1
type_id 1 call_site 18446744071565585534 ptr 18446612134405955584
type_id 0 call_site 18446744071565585597 ptr 18446612134405955584 bytes_req 4096 bytes_alloc 4096 gfp_flags 208 node -1
type_id 0 call_site 18446744071565636711 ptr 18446612134345164912 bytes_req 240 bytes_alloc 240 gfp_flags 208 node -1
type_id 1 call_site 18446744071565585534 ptr 18446612134405955584
type_id 0 call_site 18446744071565585597 ptr 18446612134405955584 bytes_req 4096 bytes_alloc 4096 gfp_flags 208 node -1
type_id 0 call_site 18446744071565636711 ptr 18446612134345165152 bytes_req 240 bytes_alloc 240 gfp_flags 208 node -1
type_id 0 call_site 18446744071566144042 ptr 18446612134346191680 bytes_req 1304 bytes_alloc 1312 gfp_flags 208 node -1
type_id 1 call_site 18446744071565585534 ptr 18446612134405955584
type_id 0 call_site 18446744071565585597 ptr 18446612134405955584 bytes_req 4096 bytes_alloc 4096 gfp_flags 208 node -1
type_id 1 call_site 18446744071565585534 ptr 18446612134405955584

That was to stay backward compatible with the format output produced in
inux/tracepoint.h.

This is the default ouput, but note that I tried something else.

If you change an option:

echo kmem_minimalistic > /debugfs/trace_options

and then cat /debugfs/trace, you will have the following output:

 # tracer: kmemtrace
 #
 #
 # ALLOC  TYPE  REQ   GIVEN  FLAGS           POINTER         NODE    CALLER
 # FREE   |      |     |       |              |   |            |        |
 # |

   -      C                            0xffff88007c088780          file_free_rcu
   +      K   4096   4096   000000d0   0xffff88007cad6000     -1   getname
   -      C                            0xffff88007cad6000          putname
   +      K   4096   4096   000000d0   0xffff88007cad6000     -1   getname
   +      K    240    240   000000d0   0xffff8800790dc780     -1   d_alloc
   -      C                            0xffff88007cad6000          putname
   +      K   4096   4096   000000d0   0xffff88007cad6000     -1   getname
   +      K    240    240   000000d0   0xffff8800790dc870     -1   d_alloc
   -      C                            0xffff88007cad6000          putname
   +      K   4096   4096   000000d0   0xffff88007cad6000     -1   getname
   +      K    240    240   000000d0   0xffff8800790dc960     -1   d_alloc
   +      K   1304   1312   000000d0   0xffff8800791d7340     -1   reiserfs_alloc_inode
   -      C                            0xffff88007cad6000          putname
   +      K   4096   4096   000000d0   0xffff88007cad6000     -1   getname
   -      C                            0xffff88007cad6000          putname
   +      K    992   1000   000000d0   0xffff880079045b58     -1   alloc_inode
   +      K    768   1024   000080d0   0xffff88007c096400     -1   alloc_pipe_info
   +      K    240    240   000000d0   0xffff8800790dca50     -1   d_alloc
   +      K    272    320   000080d0   0xffff88007c088780     -1   get_empty_filp
   +      K    272    320   000080d0   0xffff88007c088000     -1   get_empty_filp

Yeah I shall confess kmem_minimalistic should be: kmem_alternative.

Whatever, I find it more readable but this a personal opinion of course.
We can drop it if you want.

On the ALLOC/FREE column, + means an allocation and - a free.

On the type column, you have K = kmalloc, C = cache, P = page

I would like the flags to be GFP_* strings but that would not be easy to not
break the column with strings....

About the node...it seems to always be -1. I don't know why but that shouldn't
be difficult to find.

I moved linux/tracepoint.h to trace/tracepoint.h as well. I think that would
be more easy to find the tracer headers if they are all in their common
directory.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/kmemtrace.h |  86 ------------
 include/linux/slab_def.h  |   2 +-
 include/linux/slub_def.h  |   2 +-
 include/trace/kmemtrace.h |  75 ++++++++++
 init/main.c               |   2 +-
 kernel/trace/Kconfig      |  22 +++
 kernel/trace/Makefile     |   1 +
 kernel/trace/kmemtrace.c  | 343 ++++++++++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace.h      |  25 ++++
 lib/Kconfig.debug         |  20 ---
 mm/kmemtrace.c            |   2 +-
 mm/slob.c                 |   2 +-
 mm/slub.c                 |   2 +-
 13 files changed, 472 insertions(+), 112 deletions(-)
 delete mode 100644 include/linux/kmemtrace.h
 create mode 100644 include/trace/kmemtrace.h
 create mode 100644 kernel/trace/kmemtrace.c

(limited to 'kernel')

diff --git a/include/linux/kmemtrace.h b/include/linux/kmemtrace.h
deleted file mode 100644
index 5bea8ead6a6b..000000000000
--- a/include/linux/kmemtrace.h
+++ /dev/null
@@ -1,86 +0,0 @@
-/*
- * Copyright (C) 2008 Eduard - Gabriel Munteanu
- *
- * This file is released under GPL version 2.
- */
-
-#ifndef _LINUX_KMEMTRACE_H
-#define _LINUX_KMEMTRACE_H
-
-#ifdef __KERNEL__
-
-#include <linux/types.h>
-#include <linux/marker.h>
-
-enum kmemtrace_type_id {
-	KMEMTRACE_TYPE_KMALLOC = 0,	/* kmalloc() or kfree(). */
-	KMEMTRACE_TYPE_CACHE,		/* kmem_cache_*(). */
-	KMEMTRACE_TYPE_PAGES,		/* __get_free_pages() and friends. */
-};
-
-#ifdef CONFIG_KMEMTRACE
-
-extern void kmemtrace_init(void);
-
-static inline void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id,
-					     unsigned long call_site,
-					     const void *ptr,
-					     size_t bytes_req,
-					     size_t bytes_alloc,
-					     gfp_t gfp_flags,
-					     int node)
-{
-	trace_mark(kmemtrace_alloc, "type_id %d call_site %lu ptr %lu "
-		   "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d",
-		   type_id, call_site, (unsigned long) ptr,
-		   (unsigned long) bytes_req, (unsigned long) bytes_alloc,
-		   (unsigned long) gfp_flags, node);
-}
-
-static inline void kmemtrace_mark_free(enum kmemtrace_type_id type_id,
-				       unsigned long call_site,
-				       const void *ptr)
-{
-	trace_mark(kmemtrace_free, "type_id %d call_site %lu ptr %lu",
-		   type_id, call_site, (unsigned long) ptr);
-}
-
-#else /* CONFIG_KMEMTRACE */
-
-static inline void kmemtrace_init(void)
-{
-}
-
-static inline void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id,
-					     unsigned long call_site,
-					     const void *ptr,
-					     size_t bytes_req,
-					     size_t bytes_alloc,
-					     gfp_t gfp_flags,
-					     int node)
-{
-}
-
-static inline void kmemtrace_mark_free(enum kmemtrace_type_id type_id,
-				       unsigned long call_site,
-				       const void *ptr)
-{
-}
-
-#endif /* CONFIG_KMEMTRACE */
-
-static inline void kmemtrace_mark_alloc(enum kmemtrace_type_id type_id,
-					unsigned long call_site,
-					const void *ptr,
-					size_t bytes_req,
-					size_t bytes_alloc,
-					gfp_t gfp_flags)
-{
-	kmemtrace_mark_alloc_node(type_id, call_site, ptr,
-				  bytes_req, bytes_alloc, gfp_flags, -1);
-}
-
-#endif /* __KERNEL__ */
-
-#endif /* _LINUX_KMEMTRACE_H */
-
diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h
index 7555ce99f6d2..455f9affea9a 100644
--- a/include/linux/slab_def.h
+++ b/include/linux/slab_def.h
@@ -14,7 +14,7 @@
 #include <asm/page.h>		/* kmalloc_sizes.h needs PAGE_SIZE */
 #include <asm/cache.h>		/* kmalloc_sizes.h needs L1_CACHE_BYTES */
 #include <linux/compiler.h>
-#include <linux/kmemtrace.h>
+#include <trace/kmemtrace.h>
 
 /* Size description struct for general caches. */
 struct cache_sizes {
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index dc28432b5b9a..6b657f7dcb2b 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -10,7 +10,7 @@
 #include <linux/gfp.h>
 #include <linux/workqueue.h>
 #include <linux/kobject.h>
-#include <linux/kmemtrace.h>
+#include <trace/kmemtrace.h>
 
 enum stat_item {
 	ALLOC_FASTPATH,		/* Allocation from cpu slab */
diff --git a/include/trace/kmemtrace.h b/include/trace/kmemtrace.h
new file mode 100644
index 000000000000..ad8b7857855a
--- /dev/null
+++ b/include/trace/kmemtrace.h
@@ -0,0 +1,75 @@
+/*
+ * Copyright (C) 2008 Eduard - Gabriel Munteanu
+ *
+ * This file is released under GPL version 2.
+ */
+
+#ifndef _LINUX_KMEMTRACE_H
+#define _LINUX_KMEMTRACE_H
+
+#ifdef __KERNEL__
+
+#include <linux/types.h>
+#include <linux/marker.h>
+
+enum kmemtrace_type_id {
+	KMEMTRACE_TYPE_KMALLOC = 0,	/* kmalloc() or kfree(). */
+	KMEMTRACE_TYPE_CACHE,		/* kmem_cache_*(). */
+	KMEMTRACE_TYPE_PAGES,		/* __get_free_pages() and friends. */
+};
+
+#ifdef CONFIG_KMEMTRACE
+
+extern void kmemtrace_init(void);
+
+extern void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id,
+					     unsigned long call_site,
+					     const void *ptr,
+					     size_t bytes_req,
+					     size_t bytes_alloc,
+					     gfp_t gfp_flags,
+					     int node);
+
+extern void kmemtrace_mark_free(enum kmemtrace_type_id type_id,
+				       unsigned long call_site,
+				       const void *ptr);
+
+#else /* CONFIG_KMEMTRACE */
+
+static inline void kmemtrace_init(void)
+{
+}
+
+static inline void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id,
+					     unsigned long call_site,
+					     const void *ptr,
+					     size_t bytes_req,
+					     size_t bytes_alloc,
+					     gfp_t gfp_flags,
+					     int node)
+{
+}
+
+static inline void kmemtrace_mark_free(enum kmemtrace_type_id type_id,
+				       unsigned long call_site,
+				       const void *ptr)
+{
+}
+
+#endif /* CONFIG_KMEMTRACE */
+
+static inline void kmemtrace_mark_alloc(enum kmemtrace_type_id type_id,
+					unsigned long call_site,
+					const void *ptr,
+					size_t bytes_req,
+					size_t bytes_alloc,
+					gfp_t gfp_flags)
+{
+	kmemtrace_mark_alloc_node(type_id, call_site, ptr,
+				  bytes_req, bytes_alloc, gfp_flags, -1);
+}
+
+#endif /* __KERNEL__ */
+
+#endif /* _LINUX_KMEMTRACE_H */
+
diff --git a/init/main.c b/init/main.c
index 9711586aa7c9..beca7aaddb22 100644
--- a/init/main.c
+++ b/init/main.c
@@ -70,7 +70,7 @@
 #include <asm/setup.h>
 #include <asm/sections.h>
 #include <asm/cacheflush.h>
-#include <linux/kmemtrace.h>
+#include <trace/kmemtrace.h>
 
 #ifdef CONFIG_X86_LOCAL_APIC
 #include <asm/smp.h>
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index e2a4ff6fc3a6..27fb74b06b3c 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -264,6 +264,28 @@ config HW_BRANCH_TRACER
 	  This tracer records all branches on the system in a circular
 	  buffer giving access to the last N branches for each cpu.
 
+config KMEMTRACE
+	bool "Trace SLAB allocations"
+	select TRACING
+	depends on RELAY
+	help
+	  kmemtrace provides tracing for slab allocator functions, such as
+	  kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected
+	  data is then fed to the userspace application in order to analyse
+	  allocation hotspots, internal fragmentation and so on, making it
+	  possible to see how well an allocator performs, as well as debug
+	  and profile kernel code.
+
+	  This requires an userspace application to use. See
+	  Documentation/vm/kmemtrace.txt for more information.
+
+	  Saying Y will make the kernel somewhat larger and slower. However,
+	  if you disable kmemtrace at run-time or boot-time, the performance
+	  impact is minimal (depending on the arch the kernel is built for).
+
+	  If unsure, say N.
+
+
 config DYNAMIC_FTRACE
 	bool "enable/disable ftrace tracepoints dynamically"
 	depends on FUNCTION_TRACER
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 349d5a93653f..513dc86b5dfa 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -33,5 +33,6 @@ obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
 obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
 obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
 obj-$(CONFIG_POWER_TRACER) += trace_power.o
+obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
 
 libftrace-y := ftrace.o
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
new file mode 100644
index 000000000000..d69cbe3c2a4b
--- /dev/null
+++ b/kernel/trace/kmemtrace.c
@@ -0,0 +1,343 @@
+/*
+ * Memory allocator tracing
+ *
+ * Copyright (C) 2008 Eduard - Gabriel Munteanu
+ * Copyright (C) 2008 Pekka Enberg <penberg@cs.helsinki.fi>
+ * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
+ */
+
+#include <linux/dcache.h>
+#include <linux/debugfs.h>
+#include <linux/fs.h>
+#include <linux/seq_file.h>
+#include <trace/kmemtrace.h>
+
+#include "trace.h"
+#include "trace_output.h"
+
+/* Select an alternative, minimalistic output than the original one */
+#define TRACE_KMEM_OPT_MINIMAL	0x1
+
+static struct tracer_opt kmem_opts[] = {
+	/* Default disable the minimalistic output */
+	{ TRACER_OPT(kmem_minimalistic, TRACE_KMEM_OPT_MINIMAL) },
+	{ }
+};
+
+static struct tracer_flags kmem_tracer_flags = {
+	.val = 0,
+	.opts = kmem_opts
+};
+
+
+static bool kmem_tracing_enabled __read_mostly;
+static struct trace_array *kmemtrace_array;
+
+static int kmem_trace_init(struct trace_array *tr)
+{
+	int cpu;
+	kmemtrace_array = tr;
+
+	for_each_cpu_mask(cpu, cpu_possible_map)
+		tracing_reset(tr, cpu);
+
+	kmem_tracing_enabled = true;
+
+	return 0;
+}
+
+static void kmem_trace_reset(struct trace_array *tr)
+{
+	kmem_tracing_enabled = false;
+}
+
+static void kmemtrace_headers(struct seq_file *s)
+{
+	/* Don't need headers for the original kmemtrace output */
+	if (!(kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL))
+		return;
+
+	seq_printf(s, "#\n");
+	seq_printf(s, "# ALLOC  TYPE  REQ   GIVEN  FLAGS     "
+			"      POINTER         NODE    CALLER\n");
+	seq_printf(s, "# FREE   |      |     |       |       "
+			"       |   |            |        |\n");
+	seq_printf(s, "# |\n\n");
+}
+
+/*
+ * The two following functions give the original output from kmemtrace,
+ * or something close to....perhaps they need some missing things
+ */
+static enum print_line_t
+kmemtrace_print_alloc_original(struct trace_iterator *iter,
+				struct kmemtrace_alloc_entry *entry)
+{
+	struct trace_seq *s = &iter->seq;
+	int ret;
+
+	/* Taken from the old linux/kmemtrace.h */
+	ret = trace_seq_printf(s, "type_id %d call_site %lu ptr %lu "
+	  "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d\n",
+	   entry->type_id, entry->call_site, (unsigned long) entry->ptr,
+	   (unsigned long) entry->bytes_req, (unsigned long) entry->bytes_alloc,
+	   (unsigned long) entry->gfp_flags, entry->node);
+
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t
+kmemtrace_print_free_original(struct trace_iterator *iter,
+				struct kmemtrace_free_entry *entry)
+{
+	struct trace_seq *s = &iter->seq;
+	int ret;
+
+	/* Taken from the old linux/kmemtrace.h */
+	ret = trace_seq_printf(s, "type_id %d call_site %lu ptr %lu\n",
+	   entry->type_id, entry->call_site, (unsigned long) entry->ptr);
+
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	return TRACE_TYPE_HANDLED;
+}
+
+
+/* The two other following provide a more minimalistic output */
+static enum print_line_t
+kmemtrace_print_alloc_compress(struct trace_iterator *iter,
+					struct kmemtrace_alloc_entry *entry)
+{
+	struct trace_seq *s = &iter->seq;
+	int ret;
+
+	/* Alloc entry */
+	ret = trace_seq_printf(s, "  +      ");
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	/* Type */
+	switch (entry->type_id) {
+	case KMEMTRACE_TYPE_KMALLOC:
+		ret = trace_seq_printf(s, "K   ");
+		break;
+	case KMEMTRACE_TYPE_CACHE:
+		ret = trace_seq_printf(s, "C   ");
+		break;
+	case KMEMTRACE_TYPE_PAGES:
+		ret = trace_seq_printf(s, "P   ");
+		break;
+	default:
+		ret = trace_seq_printf(s, "?   ");
+	}
+
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	/* Requested */
+	ret = trace_seq_printf(s, "%4d   ", entry->bytes_req);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	/* Allocated */
+	ret = trace_seq_printf(s, "%4d   ", entry->bytes_alloc);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	/* Flags
+	 * TODO: would be better to see the name of the GFP flag names
+	 */
+	ret = trace_seq_printf(s, "%08x   ", entry->gfp_flags);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	/* Pointer to allocated */
+	ret = trace_seq_printf(s, "0x%tx   ", (ptrdiff_t)entry->ptr);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	/* Node */
+	ret = trace_seq_printf(s, "%4d   ", entry->node);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	/* Call site */
+	ret = seq_print_ip_sym(s, entry->call_site, 0);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	if (!trace_seq_printf(s, "\n"))
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t
+kmemtrace_print_free_compress(struct trace_iterator *iter,
+				struct kmemtrace_free_entry *entry)
+{
+	struct trace_seq *s = &iter->seq;
+	int ret;
+
+	/* Free entry */
+	ret = trace_seq_printf(s, "  -      ");
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	/* Type */
+	switch (entry->type_id) {
+	case KMEMTRACE_TYPE_KMALLOC:
+		ret = trace_seq_printf(s, "K     ");
+		break;
+	case KMEMTRACE_TYPE_CACHE:
+		ret = trace_seq_printf(s, "C     ");
+		break;
+	case KMEMTRACE_TYPE_PAGES:
+		ret = trace_seq_printf(s, "P     ");
+		break;
+	default:
+		ret = trace_seq_printf(s, "?     ");
+	}
+
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	/* Skip requested/allocated/flags */
+	ret = trace_seq_printf(s, "                       ");
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	/* Pointer to allocated */
+	ret = trace_seq_printf(s, "0x%tx   ", (ptrdiff_t)entry->ptr);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	/* Skip node */
+	ret = trace_seq_printf(s, "       ");
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	/* Call site */
+	ret = seq_print_ip_sym(s, entry->call_site, 0);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	if (!trace_seq_printf(s, "\n"))
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	return TRACE_TYPE_HANDLED;
+}
+
+static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter)
+{
+	struct trace_entry *entry = iter->ent;
+
+	switch (entry->type) {
+	case TRACE_KMEM_ALLOC: {
+		struct kmemtrace_alloc_entry *field;
+		trace_assign_type(field, entry);
+		if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)
+			return kmemtrace_print_alloc_compress(iter, field);
+		else
+			return kmemtrace_print_alloc_original(iter, field);
+	}
+
+	case TRACE_KMEM_FREE: {
+		struct kmemtrace_free_entry *field;
+		trace_assign_type(field, entry);
+		if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)
+			return kmemtrace_print_free_compress(iter, field);
+		else
+			return kmemtrace_print_free_original(iter, field);
+	}
+
+	default:
+		return TRACE_TYPE_UNHANDLED;
+	}
+}
+
+/* Trace allocations */
+void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id,
+			     unsigned long call_site,
+			     const void *ptr,
+			     size_t bytes_req,
+			     size_t bytes_alloc,
+			     gfp_t gfp_flags,
+			     int node)
+{
+	struct ring_buffer_event *event;
+	struct kmemtrace_alloc_entry *entry;
+	struct trace_array *tr = kmemtrace_array;
+	unsigned long irq_flags;
+
+	if (!kmem_tracing_enabled)
+		return;
+
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
+					 &irq_flags);
+	if (!event)
+		return;
+	entry	= ring_buffer_event_data(event);
+	tracing_generic_entry_update(&entry->ent, 0, 0);
+
+	entry->ent.type = TRACE_KMEM_ALLOC;
+	entry->call_site = call_site;
+	entry->ptr = ptr;
+	entry->bytes_req = bytes_req;
+	entry->bytes_alloc = bytes_alloc;
+	entry->gfp_flags = gfp_flags;
+	entry->node	=	node;
+
+	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+
+	trace_wake_up();
+}
+
+void kmemtrace_mark_free(enum kmemtrace_type_id type_id,
+		       unsigned long call_site,
+		       const void *ptr)
+{
+	struct ring_buffer_event *event;
+	struct kmemtrace_free_entry *entry;
+	struct trace_array *tr = kmemtrace_array;
+	unsigned long irq_flags;
+
+	if (!kmem_tracing_enabled)
+		return;
+
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
+					 &irq_flags);
+	if (!event)
+		return;
+	entry	= ring_buffer_event_data(event);
+	tracing_generic_entry_update(&entry->ent, 0, 0);
+
+	entry->ent.type = TRACE_KMEM_FREE;
+	entry->type_id	= type_id;
+	entry->call_site = call_site;
+	entry->ptr = ptr;
+
+	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+
+	trace_wake_up();
+}
+
+static struct tracer kmem_tracer __read_mostly = {
+	.name		= "kmemtrace",
+	.init		= kmem_trace_init,
+	.reset		= kmem_trace_reset,
+	.print_line	= kmemtrace_print_line,
+	.print_header = kmemtrace_headers,
+	.flags		= &kmem_tracer_flags
+};
+
+static int __init init_kmem_tracer(void)
+{
+	return register_tracer(&kmem_tracer);
+}
+
+device_initcall(init_kmem_tracer);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index cc7a4f864036..534505bb39b0 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -9,6 +9,7 @@
 #include <linux/mmiotrace.h>
 #include <linux/ftrace.h>
 #include <trace/boot.h>
+#include <trace/kmemtrace.h>
 
 enum trace_type {
 	__TRACE_FIRST_TYPE = 0,
@@ -29,6 +30,8 @@ enum trace_type {
 	TRACE_GRAPH_ENT,
 	TRACE_USER_STACK,
 	TRACE_HW_BRANCHES,
+	TRACE_KMEM_ALLOC,
+	TRACE_KMEM_FREE,
 	TRACE_POWER,
 
 	__TRACE_LAST_TYPE
@@ -170,6 +173,24 @@ struct trace_power {
 	struct power_trace	state_data;
 };
 
+struct kmemtrace_alloc_entry {
+	struct trace_entry	ent;
+	enum kmemtrace_type_id type_id;
+	unsigned long call_site;
+	const void *ptr;
+	size_t bytes_req;
+	size_t bytes_alloc;
+	gfp_t gfp_flags;
+	int node;
+};
+
+struct kmemtrace_free_entry {
+	struct trace_entry	ent;
+	enum kmemtrace_type_id type_id;
+	unsigned long call_site;
+	const void *ptr;
+};
+
 /*
  * trace_flag_type is an enumeration that holds different
  * states when a trace occurs. These are:
@@ -280,6 +301,10 @@ extern void __ftrace_bad_type(void);
 			  TRACE_GRAPH_RET);		\
 		IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\
  		IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \
+		IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry,	\
+			  TRACE_KMEM_ALLOC);	\
+		IF_ASSIGN(var, ent, struct kmemtrace_free_entry,	\
+			  TRACE_KMEM_FREE);	\
 		__ftrace_bad_type();					\
 	} while (0)
 
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index b5417e23ba94..b0f239e443bc 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -803,26 +803,6 @@ config FIREWIRE_OHCI_REMOTE_DMA
 
 	  If unsure, say N.
 
-config KMEMTRACE
-	bool "Kernel memory tracer (kmemtrace)"
-	depends on RELAY && DEBUG_FS && MARKERS
-	help
-	  kmemtrace provides tracing for slab allocator functions, such as
-	  kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected
-	  data is then fed to the userspace application in order to analyse
-	  allocation hotspots, internal fragmentation and so on, making it
-	  possible to see how well an allocator performs, as well as debug
-	  and profile kernel code.
-
-	  This requires an userspace application to use. See
-	  Documentation/vm/kmemtrace.txt for more information.
-
-	  Saying Y will make the kernel somewhat larger and slower. However,
-	  if you disable kmemtrace at run-time or boot-time, the performance
-	  impact is minimal (depending on the arch the kernel is built for).
-
-	  If unsure, say N.
-
 menuconfig BUILD_DOCSRC
 	bool "Build targets in Documentation/ tree"
 	depends on HEADERS_CHECK
diff --git a/mm/kmemtrace.c b/mm/kmemtrace.c
index 2a70a805027c..0573b5080cc4 100644
--- a/mm/kmemtrace.c
+++ b/mm/kmemtrace.c
@@ -10,7 +10,7 @@
 #include <linux/module.h>
 #include <linux/marker.h>
 #include <linux/gfp.h>
-#include <linux/kmemtrace.h>
+#include <trace/kmemtrace.h>
 
 #define KMEMTRACE_SUBBUF_SIZE		524288
 #define KMEMTRACE_DEF_N_SUBBUFS		20
diff --git a/mm/slob.c b/mm/slob.c
index 0f1a49f40690..4d1c0fc33b6b 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -65,7 +65,7 @@
 #include <linux/module.h>
 #include <linux/rcupdate.h>
 #include <linux/list.h>
-#include <linux/kmemtrace.h>
+#include <trace/kmemtrace.h>
 #include <asm/atomic.h>
 
 /*
diff --git a/mm/slub.c b/mm/slub.c
index cc4001fee7ac..7bf8cf8ec082 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -16,7 +16,7 @@
 #include <linux/slab.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
-#include <linux/kmemtrace.h>
+#include <trace/kmemtrace.h>
 #include <linux/cpu.h>
 #include <linux/cpuset.h>
 #include <linux/mempolicy.h>
-- 
cgit v1.2.3


From 3fd4bc015ef879a7d2b955ce97fb125e3a51ba7e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 30 Dec 2008 12:07:27 +0100
Subject: tracing/kmemtrace: export kmemtrace_mark_alloc_node() /
 kmemtrace_mark_free()

Impact: build fix

Also fix up Kconfig dependencies and include files.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/Kconfig     | 3 ++-
 kernel/trace/kmemtrace.c | 2 ++
 mm/slab.c                | 2 +-
 3 files changed, 5 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 27fb74b06b3c..cc9f91e7daf4 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -267,7 +267,8 @@ config HW_BRANCH_TRACER
 config KMEMTRACE
 	bool "Trace SLAB allocations"
 	select TRACING
-	depends on RELAY
+	select MARKERS
+	select RELAY
 	help
 	  kmemtrace provides tracing for slab allocator functions, such as
 	  kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index d69cbe3c2a4b..2bfdcd326226 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -296,6 +296,7 @@ void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id,
 
 	trace_wake_up();
 }
+EXPORT_SYMBOL(kmemtrace_mark_alloc_node);
 
 void kmemtrace_mark_free(enum kmemtrace_type_id type_id,
 		       unsigned long call_site,
@@ -325,6 +326,7 @@ void kmemtrace_mark_free(enum kmemtrace_type_id type_id,
 
 	trace_wake_up();
 }
+EXPORT_SYMBOL(kmemtrace_mark_free);
 
 static struct tracer kmem_tracer __read_mostly = {
 	.name		= "kmemtrace",
diff --git a/mm/slab.c b/mm/slab.c
index bcf08ea88380..7f72bb386a09 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -102,7 +102,7 @@
 #include	<linux/cpu.h>
 #include	<linux/sysctl.h>
 #include	<linux/module.h>
-#include	<linux/kmemtrace.h>
+#include	<tracing/kmemtrace.h>
 #include	<linux/rcupdate.h>
 #include	<linux/string.h>
 #include	<linux/uaccess.h>
-- 
cgit v1.2.3


From 723cbe0775514853c22dc45005af59c360916af1 Mon Sep 17 00:00:00 2001
From: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Date: Mon, 5 Jan 2009 22:09:58 +0200
Subject: kmemtrace: Remove the relay version of kmemtrace

Impact: cleanup

kmemtrace now uses ftrace. This patch removes the relay version.

Signed-off-by: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/Kconfig |   2 -
 mm/Makefile          |   1 -
 mm/kmemtrace.c       | 333 ---------------------------------------------------
 3 files changed, 336 deletions(-)
 delete mode 100644 mm/kmemtrace.c

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index cc9f91e7daf4..1c0b7504cab3 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -267,8 +267,6 @@ config HW_BRANCH_TRACER
 config KMEMTRACE
 	bool "Trace SLAB allocations"
 	select TRACING
-	select MARKERS
-	select RELAY
 	help
 	  kmemtrace provides tracing for slab allocator functions, such as
 	  kmalloc, kfree, kmem_cache_alloc, kmem_cache_free etc.. Collected
diff --git a/mm/Makefile b/mm/Makefile
index c92e8af13206..51c27709cc7c 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -35,4 +35,3 @@ obj-$(CONFIG_MIGRATION) += migrate.o
 obj-$(CONFIG_SMP) += allocpercpu.o
 obj-$(CONFIG_QUICKLIST) += quicklist.o
 obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
-obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
diff --git a/mm/kmemtrace.c b/mm/kmemtrace.c
deleted file mode 100644
index 0573b5080cc4..000000000000
--- a/mm/kmemtrace.c
+++ /dev/null
@@ -1,333 +0,0 @@
-/*
- * Copyright (C) 2008 Pekka Enberg, Eduard - Gabriel Munteanu
- *
- * This file is released under GPL version 2.
- */
-
-#include <linux/string.h>
-#include <linux/debugfs.h>
-#include <linux/relay.h>
-#include <linux/module.h>
-#include <linux/marker.h>
-#include <linux/gfp.h>
-#include <trace/kmemtrace.h>
-
-#define KMEMTRACE_SUBBUF_SIZE		524288
-#define KMEMTRACE_DEF_N_SUBBUFS		20
-
-static struct rchan *kmemtrace_chan;
-static u32 kmemtrace_buf_overruns;
-
-static unsigned int kmemtrace_n_subbufs;
-
-/* disabled by default */
-static unsigned int kmemtrace_enabled;
-
-/*
- * The sequence number is used for reordering kmemtrace packets
- * in userspace, since they are logged as per-CPU data.
- *
- * atomic_t should always be a 32-bit signed integer. Wraparound is not
- * likely to occur, but userspace can deal with it by expecting a certain
- * sequence number in the next packet that will be read.
- */
-static atomic_t kmemtrace_seq_num;
-
-#define KMEMTRACE_ABI_VERSION		1
-
-static u32 kmemtrace_abi_version __read_mostly = KMEMTRACE_ABI_VERSION;
-
-enum kmemtrace_event_id {
-	KMEMTRACE_EVENT_ALLOC = 0,
-	KMEMTRACE_EVENT_FREE,
-};
-
-struct kmemtrace_event {
-	u8		event_id;
-	u8		type_id;
-	u16		event_size;
-	s32		seq_num;
-	u64		call_site;
-	u64		ptr;
-} __attribute__ ((__packed__));
-
-struct kmemtrace_stats_alloc {
-	u64		bytes_req;
-	u64		bytes_alloc;
-	u32		gfp_flags;
-	s32		numa_node;
-} __attribute__ ((__packed__));
-
-static void kmemtrace_probe_alloc(void *probe_data, void *call_data,
-				  const char *format, va_list *args)
-{
-	unsigned long flags;
-	struct kmemtrace_event *ev;
-	struct kmemtrace_stats_alloc *stats;
-	void *buf;
-
-	local_irq_save(flags);
-
-	buf = relay_reserve(kmemtrace_chan,
-			    sizeof(struct kmemtrace_event) +
-			    sizeof(struct kmemtrace_stats_alloc));
-	if (!buf)
-		goto failed;
-
-	/*
-	 * Don't convert this to use structure initializers,
-	 * C99 does not guarantee the rvalues evaluation order.
-	 */
-
-	ev = buf;
-	ev->event_id = KMEMTRACE_EVENT_ALLOC;
-	ev->type_id = va_arg(*args, int);
-	ev->event_size = sizeof(struct kmemtrace_event) +
-			 sizeof(struct kmemtrace_stats_alloc);
-	ev->seq_num = atomic_add_return(1, &kmemtrace_seq_num);
-	ev->call_site = va_arg(*args, unsigned long);
-	ev->ptr = va_arg(*args, unsigned long);
-
-	stats = buf + sizeof(struct kmemtrace_event);
-	stats->bytes_req = va_arg(*args, unsigned long);
-	stats->bytes_alloc = va_arg(*args, unsigned long);
-	stats->gfp_flags = va_arg(*args, unsigned long);
-	stats->numa_node = va_arg(*args, int);
-
-failed:
-	local_irq_restore(flags);
-}
-
-static void kmemtrace_probe_free(void *probe_data, void *call_data,
-				 const char *format, va_list *args)
-{
-	unsigned long flags;
-	struct kmemtrace_event *ev;
-
-	local_irq_save(flags);
-
-	ev = relay_reserve(kmemtrace_chan, sizeof(struct kmemtrace_event));
-	if (!ev)
-		goto failed;
-
-	/*
-	 * Don't convert this to use structure initializers,
-	 * C99 does not guarantee the rvalues evaluation order.
-	 */
-	ev->event_id = KMEMTRACE_EVENT_FREE;
-	ev->type_id = va_arg(*args, int);
-	ev->event_size = sizeof(struct kmemtrace_event);
-	ev->seq_num = atomic_add_return(1, &kmemtrace_seq_num);
-	ev->call_site = va_arg(*args, unsigned long);
-	ev->ptr = va_arg(*args, unsigned long);
-
-failed:
-	local_irq_restore(flags);
-}
-
-static struct dentry *
-kmemtrace_create_buf_file(const char *filename, struct dentry *parent,
-			  int mode, struct rchan_buf *buf, int *is_global)
-{
-	return debugfs_create_file(filename, mode, parent, buf,
-				   &relay_file_operations);
-}
-
-static int kmemtrace_remove_buf_file(struct dentry *dentry)
-{
-	debugfs_remove(dentry);
-
-	return 0;
-}
-
-static int kmemtrace_subbuf_start(struct rchan_buf *buf,
-				  void *subbuf,
-				  void *prev_subbuf,
-				  size_t prev_padding)
-{
-	if (relay_buf_full(buf)) {
-		/*
-		 * We know it's not SMP-safe, but neither
-		 * debugfs_create_u32() is.
-		 */
-		kmemtrace_buf_overruns++;
-		return 0;
-	}
-
-	return 1;
-}
-
-static struct rchan_callbacks relay_callbacks = {
-	.create_buf_file = kmemtrace_create_buf_file,
-	.remove_buf_file = kmemtrace_remove_buf_file,
-	.subbuf_start = kmemtrace_subbuf_start,
-};
-
-static struct dentry *kmemtrace_dir;
-static struct dentry *kmemtrace_overruns_dentry;
-static struct dentry *kmemtrace_abi_version_dentry;
-
-static struct dentry *kmemtrace_enabled_dentry;
-
-static int kmemtrace_start_probes(void)
-{
-	int err;
-
-	err = marker_probe_register("kmemtrace_alloc", "type_id %d "
-				    "call_site %lu ptr %lu "
-				    "bytes_req %lu bytes_alloc %lu "
-				    "gfp_flags %lu node %d",
-				    kmemtrace_probe_alloc, NULL);
-	if (err)
-		return err;
-	err = marker_probe_register("kmemtrace_free", "type_id %d "
-				    "call_site %lu ptr %lu",
-				    kmemtrace_probe_free, NULL);
-
-	return err;
-}
-
-static void kmemtrace_stop_probes(void)
-{
-	marker_probe_unregister("kmemtrace_alloc",
-				kmemtrace_probe_alloc, NULL);
-	marker_probe_unregister("kmemtrace_free",
-				kmemtrace_probe_free, NULL);
-}
-
-static int kmemtrace_enabled_get(void *data, u64 *val)
-{
-	*val = *((int *) data);
-
-	return 0;
-}
-
-static int kmemtrace_enabled_set(void *data, u64 val)
-{
-	u64 old_val = kmemtrace_enabled;
-
-	*((int *) data) = !!val;
-
-	if (old_val == val)
-		return 0;
-	if (val)
-		kmemtrace_start_probes();
-	else
-		kmemtrace_stop_probes();
-
-	return 0;
-}
-
-DEFINE_SIMPLE_ATTRIBUTE(kmemtrace_enabled_fops,
-			kmemtrace_enabled_get,
-			kmemtrace_enabled_set, "%llu\n");
-
-static void kmemtrace_cleanup(void)
-{
-	if (kmemtrace_enabled_dentry)
-		debugfs_remove(kmemtrace_enabled_dentry);
-
-	kmemtrace_stop_probes();
-
-	if (kmemtrace_abi_version_dentry)
-		debugfs_remove(kmemtrace_abi_version_dentry);
-	if (kmemtrace_overruns_dentry)
-		debugfs_remove(kmemtrace_overruns_dentry);
-
-	relay_close(kmemtrace_chan);
-	kmemtrace_chan = NULL;
-
-	if (kmemtrace_dir)
-		debugfs_remove(kmemtrace_dir);
-}
-
-static int __init kmemtrace_setup_late(void)
-{
-	if (!kmemtrace_chan)
-		goto failed;
-
-	kmemtrace_dir = debugfs_create_dir("kmemtrace", NULL);
-	if (!kmemtrace_dir)
-		goto cleanup;
-
-	kmemtrace_abi_version_dentry =
-		debugfs_create_u32("abi_version", S_IRUSR,
-				   kmemtrace_dir, &kmemtrace_abi_version);
-	kmemtrace_overruns_dentry =
-		debugfs_create_u32("total_overruns", S_IRUSR,
-				   kmemtrace_dir, &kmemtrace_buf_overruns);
-	if (!kmemtrace_overruns_dentry || !kmemtrace_abi_version_dentry)
-		goto cleanup;
-
-	kmemtrace_enabled_dentry =
-		debugfs_create_file("enabled", S_IRUSR | S_IWUSR,
-				    kmemtrace_dir, &kmemtrace_enabled,
-				    &kmemtrace_enabled_fops);
-	if (!kmemtrace_enabled_dentry)
-		goto cleanup;
-
-	if (relay_late_setup_files(kmemtrace_chan, "cpu", kmemtrace_dir))
-		goto cleanup;
-
-	printk(KERN_INFO "kmemtrace: fully up.\n");
-
-	return 0;
-
-cleanup:
-	kmemtrace_cleanup();
-failed:
-	return 1;
-}
-late_initcall(kmemtrace_setup_late);
-
-static int __init kmemtrace_set_boot_enabled(char *str)
-{
-	if (!str)
-		return -EINVAL;
-
-	if (!strcmp(str, "yes"))
-		kmemtrace_enabled = 1;
-	else if (!strcmp(str, "no"))
-		kmemtrace_enabled = 0;
-	else
-		return -EINVAL;
-
-	return 0;
-}
-early_param("kmemtrace.enable", kmemtrace_set_boot_enabled);
-
-static int __init kmemtrace_set_subbufs(char *str)
-{
-	get_option(&str, &kmemtrace_n_subbufs);
-	return 0;
-}
-early_param("kmemtrace.subbufs", kmemtrace_set_subbufs);
-
-void kmemtrace_init(void)
-{
-	if (!kmemtrace_n_subbufs)
-		kmemtrace_n_subbufs = KMEMTRACE_DEF_N_SUBBUFS;
-
-	kmemtrace_chan = relay_open(NULL, NULL, KMEMTRACE_SUBBUF_SIZE,
-				    kmemtrace_n_subbufs, &relay_callbacks,
-				    NULL);
-	if (!kmemtrace_chan) {
-		printk(KERN_ERR "kmemtrace: could not open relay channel.\n");
-		return;
-	}
-
-	if (!kmemtrace_enabled) {
-		printk(KERN_INFO "kmemtrace: disabled. Pass "
-			"kemtrace.enable=yes as kernel parameter for "
-			"boot-time tracing.\n");
-		return;
-	}
-	if (kmemtrace_start_probes()) {
-		printk(KERN_ERR "kmemtrace: could not register marker probes!\n");
-		kmemtrace_cleanup();
-		return;
-	}
-
-	printk(KERN_INFO "kmemtrace: enabled.\n");
-}
-
-- 
cgit v1.2.3


From 3e80680208ba6ce9635ca7c21ad0019442ea166a Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 6 Jan 2009 10:16:35 +0100
Subject: kmemtrace: add kmemtrace_init()

Impact: build fix

leftover from the relayfs version - but we want to keep it because
this call is the earliest opportunity when we can start kmemtrace
tracing. (after kmem_cache_init()).

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/kmemtrace.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index 2bfdcd326226..faaa5ae7e75a 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -337,6 +337,11 @@ static struct tracer kmem_tracer __read_mostly = {
 	.flags		= &kmem_tracer_flags
 };
 
+void kmemtrace_init(void)
+{
+	/* earliest opportunity to start kmem tracing */
+}
+
 static int __init init_kmem_tracer(void)
 {
 	return register_tracer(&kmem_tracer);
-- 
cgit v1.2.3


From 431aa3fbf5bbe3be79809c7e603c2ed2ac64b015 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 6 Jan 2009 12:43:01 -0500
Subject: ftrace: convert unsigned index to signed

Impact: fix to unsigned compared to less than zero

Roel Kluin pointed out that there is a compare of an unsigned number
to less than zero. A previous clean up had the unsigned index set
to -1 for certain cases, but never converted it to signed.

Frederic Weisbecker noticed that another index is used to compare
the above index to and it also needs to be converted to signed.

[
  Converted ftrace_page->index to int from unsigned long as
  Andrew Morton pointed out that there's no need for it to
  stay a long.
]

Reported-by: Roel Kluin <roel.kluin@gmail.com>
Reported-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 2f32969c09df..9e54a6ccdb93 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -289,7 +289,7 @@ static DEFINE_MUTEX(ftrace_regex_lock);
 
 struct ftrace_page {
 	struct ftrace_page	*next;
-	unsigned long		index;
+	int			index;
 	struct dyn_ftrace	records[];
 };
 
@@ -786,7 +786,7 @@ enum {
 
 struct ftrace_iterator {
 	struct ftrace_page	*pg;
-	unsigned		idx;
+	int			idx;
 	unsigned		flags;
 	unsigned char		buffer[FTRACE_BUFF_MAX+1];
 	unsigned		buffer_idx;
-- 
cgit v1.2.3


From ff288b274a9b383046fdbda4be3067daba4d5fe8 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 6 Jan 2009 21:33:30 +0100
Subject: tracing/ftrace: fix a memory leak in stat tracing

Impact: fix memory leak

This patch fixes a memory leak inside reset_stat_list(). The freeing
loop iterated only once.

Also turn the stat_list into a simple struct list_head, which
simplify the code and avoid an unused static pointer.

Reported-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_stat.c | 39 +++++++++++++++------------------------
 1 file changed, 15 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index 6f194a33a64a..4cb4ff27646d 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -21,7 +21,7 @@ struct trace_stat_list {
 	void *stat;
 };
 
-static struct trace_stat_list stat_list;
+static LIST_HEAD(stat_list);
 
 /*
  * This is a copy of the current tracer to avoid racy
@@ -39,22 +39,12 @@ static DEFINE_MUTEX(stat_list_mutex);
 
 static void reset_stat_list(void)
 {
-	struct trace_stat_list *node;
-	struct list_head *next;
+	struct trace_stat_list *node, *next;
 
-	if (list_empty(&stat_list.list))
-		return;
-
-	node = list_entry(stat_list.list.next, struct trace_stat_list, list);
-	next = node->list.next;
-
-	while (&node->list != next) {
+	list_for_each_entry_safe(node, next, &stat_list, list)
 		kfree(node);
-		node = list_entry(next, struct trace_stat_list, list);
-	}
-	kfree(node);
 
-	INIT_LIST_HEAD(&stat_list.list);
+	INIT_LIST_HEAD(&stat_list);
 }
 
 void init_tracer_stat(struct tracer *trace)
@@ -107,7 +97,7 @@ static int stat_seq_init(void)
 	}
 
 	INIT_LIST_HEAD(&new_entry->list);
-	list_add(&new_entry->list, &stat_list.list);
+	list_add(&new_entry->list, &stat_list);
 	new_entry->stat = current_tracer.stat_start();
 
 	prev_stat = new_entry->stat;
@@ -130,7 +120,7 @@ static int stat_seq_init(void)
 		if (!new_entry->stat)
 			break;
 
-		list_for_each_entry(iter_entry, &stat_list.list, list) {
+		list_for_each_entry(iter_entry, &stat_list, list) {
 			/* Insertion with a descendent sorting */
 			if (current_tracer.stat_cmp(new_entry->stat,
 						iter_entry->stat) > 0) {
@@ -141,7 +131,7 @@ static int stat_seq_init(void)
 
 			/* The current smaller value */
 			} else if (list_is_last(&iter_entry->list,
-						&stat_list.list)) {
+						&stat_list)) {
 				list_add(&new_entry->list, &iter_entry->list);
 				break;
 			}
@@ -162,7 +152,7 @@ exit_free_list:
 
 static void *stat_seq_start(struct seq_file *s, loff_t *pos)
 {
-	struct trace_stat_list *l = (struct trace_stat_list *)s->private;
+	struct list_head *l = (struct list_head *)s->private;
 
 	/* Prevent from tracer switch or stat_list modification */
 	mutex_lock(&stat_list_mutex);
@@ -171,14 +161,14 @@ static void *stat_seq_start(struct seq_file *s, loff_t *pos)
 	if (!*pos && current_tracer.stat_headers)
 		current_tracer.stat_headers(s);
 
-	return seq_list_start(&l->list, *pos);
+	return seq_list_start(l, *pos);
 }
 
 static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos)
 {
-	struct trace_stat_list *l = (struct trace_stat_list *)s->private;
+	struct list_head *l = (struct list_head *)s->private;
 
-	return seq_list_next(p, &l->list, pos);
+	return seq_list_next(p, l, pos);
 }
 
 static void stat_seq_stop(struct seq_file *m, void *p)
@@ -188,8 +178,10 @@ static void stat_seq_stop(struct seq_file *m, void *p)
 
 static int stat_seq_show(struct seq_file *s, void *v)
 {
-	struct trace_stat_list *l = list_entry(v, struct trace_stat_list, list);
-	return current_tracer.stat_show(s, l->stat);
+	struct trace_stat_list *entry =
+		list_entry(v, struct trace_stat_list, list);
+
+	return current_tracer.stat_show(s, entry->stat);
 }
 
 static const struct seq_operations trace_stat_seq_ops = {
@@ -237,7 +229,6 @@ static int __init tracing_stat_init(void)
 	struct dentry *d_tracing;
 	struct dentry *entry;
 
-	INIT_LIST_HEAD(&stat_list.list);
 	d_tracing = tracing_init_dentry();
 
 	entry = debugfs_create_file("trace_stat", 0444, d_tracing,
-- 
cgit v1.2.3


From e8a9cbf6ae620d9e5ba9cb42001c033287a284a3 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 6 Jan 2009 22:02:35 -0500
Subject: trace: clean up funny line breaks in stat_seq_show

Impact: clean up

Andrew Morton pointed out that the entry assignment in stat_seq_show
did not need to be done in the declaration, causing funny line breaks.

This patch makes it a bit more pleasing on the eyes.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_stat.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index 4cb4ff27646d..f110ce9ce7fb 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -178,8 +178,9 @@ static void stat_seq_stop(struct seq_file *m, void *p)
 
 static int stat_seq_show(struct seq_file *s, void *v)
 {
-	struct trace_stat_list *entry =
-		list_entry(v, struct trace_stat_list, list);
+	struct trace_stat_list *entry;
+
+	entry =	list_entry(v, struct trace_stat_list, list);
 
 	return current_tracer.stat_show(s, entry->stat);
 }
-- 
cgit v1.2.3


From 34a148bf0911a4a1cae85f8ecba57affb4d76aee Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Fri, 9 Jan 2009 12:27:09 -0800
Subject: kernel/trace/ring_buffer.c: reduce inlining

text    data     bss     dec     hex filename
before:  11320     228       8   11556    2d24 kernel/trace/ring_buffer.o
after:   10592     228       8   10828    2a4c kernel/trace/ring_buffer.o

Also: free_page(0) is legal.

Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ring_buffer.c | 23 +++++++++++------------
 1 file changed, 11 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 8b0daf0662ef..9542990f515b 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -133,7 +133,7 @@ enum {
 };
 
 /* inline for ring buffer fast paths */
-static inline unsigned
+static unsigned
 rb_event_length(struct ring_buffer_event *event)
 {
 	unsigned length;
@@ -179,7 +179,7 @@ unsigned ring_buffer_event_length(struct ring_buffer_event *event)
 EXPORT_SYMBOL_GPL(ring_buffer_event_length);
 
 /* inline for ring buffer fast paths */
-static inline void *
+static void *
 rb_event_data(struct ring_buffer_event *event)
 {
 	BUG_ON(event->type != RINGBUF_TYPE_DATA);
@@ -229,10 +229,9 @@ static void rb_init_page(struct buffer_data_page *bpage)
  * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing
  * this issue out.
  */
-static inline void free_buffer_page(struct buffer_page *bpage)
+static void free_buffer_page(struct buffer_page *bpage)
 {
-	if (bpage->page)
-		free_page((unsigned long)bpage->page);
+	free_page((unsigned long)bpage->page);
 	kfree(bpage);
 }
 
@@ -811,7 +810,7 @@ rb_event_index(struct ring_buffer_event *event)
 	return (addr & ~PAGE_MASK) - (PAGE_SIZE - BUF_PAGE_SIZE);
 }
 
-static inline int
+static int
 rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
 	     struct ring_buffer_event *event)
 {
@@ -825,7 +824,7 @@ rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer,
 		rb_commit_index(cpu_buffer) == index;
 }
 
-static inline void
+static void
 rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer,
 		    struct ring_buffer_event *event)
 {
@@ -850,7 +849,7 @@ rb_set_commit_event(struct ring_buffer_per_cpu *cpu_buffer,
 	local_set(&cpu_buffer->commit_page->page->commit, index);
 }
 
-static inline void
+static void
 rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
 {
 	/*
@@ -896,7 +895,7 @@ static void rb_reset_reader_page(struct ring_buffer_per_cpu *cpu_buffer)
 	cpu_buffer->reader_page->read = 0;
 }
 
-static inline void rb_inc_iter(struct ring_buffer_iter *iter)
+static void rb_inc_iter(struct ring_buffer_iter *iter)
 {
 	struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
 
@@ -926,7 +925,7 @@ static inline void rb_inc_iter(struct ring_buffer_iter *iter)
  * and with this, we can determine what to place into the
  * data field.
  */
-static inline void
+static void
 rb_update_event(struct ring_buffer_event *event,
 			 unsigned type, unsigned length)
 {
@@ -964,7 +963,7 @@ rb_update_event(struct ring_buffer_event *event,
 	}
 }
 
-static inline unsigned rb_calculate_event_length(unsigned length)
+static unsigned rb_calculate_event_length(unsigned length)
 {
 	struct ring_buffer_event event; /* Used only for sizeof array */
 
@@ -1438,7 +1437,7 @@ int ring_buffer_write(struct ring_buffer *buffer,
 }
 EXPORT_SYMBOL_GPL(ring_buffer_write);
 
-static inline int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
+static int rb_per_cpu_empty(struct ring_buffer_per_cpu *cpu_buffer)
 {
 	struct buffer_page *reader = cpu_buffer->reader_page;
 	struct buffer_page *head = cpu_buffer->head_page;
-- 
cgit v1.2.3


From 67d347245f76a149c45bffb1a10145d31d61d1da Mon Sep 17 00:00:00 2001
From: Andrew Morton <akpm@linux-foundation.org>
Date: Fri, 9 Jan 2009 12:27:09 -0800
Subject: kernel/trace/ring_buffer.c: use DIV_ROUND_UP

Instead of open-coding it.

Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ring_buffer.c | 17 +++++------------
 1 file changed, 5 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 9542990f515b..4832ffa5d937 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -123,8 +123,7 @@ void ring_buffer_normalize_time_stamp(int cpu, u64 *ts)
 EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp);
 
 #define RB_EVNT_HDR_SIZE (sizeof(struct ring_buffer_event))
-#define RB_ALIGNMENT_SHIFT	2
-#define RB_ALIGNMENT		(1 << RB_ALIGNMENT_SHIFT)
+#define RB_ALIGNMENT		4U
 #define RB_MAX_SMALL_DATA	28
 
 enum {
@@ -151,7 +150,7 @@ rb_event_length(struct ring_buffer_event *event)
 
 	case RINGBUF_TYPE_DATA:
 		if (event->len)
-			length = event->len << RB_ALIGNMENT_SHIFT;
+			length = event->len * RB_ALIGNMENT;
 		else
 			length = event->array[0];
 		return length + RB_EVNT_HDR_SIZE;
@@ -937,15 +936,11 @@ rb_update_event(struct ring_buffer_event *event,
 		break;
 
 	case RINGBUF_TYPE_TIME_EXTEND:
-		event->len =
-			(RB_LEN_TIME_EXTEND + (RB_ALIGNMENT-1))
-			>> RB_ALIGNMENT_SHIFT;
+		event->len = DIV_ROUND_UP(RB_LEN_TIME_EXTEND, RB_ALIGNMENT);
 		break;
 
 	case RINGBUF_TYPE_TIME_STAMP:
-		event->len =
-			(RB_LEN_TIME_STAMP + (RB_ALIGNMENT-1))
-			>> RB_ALIGNMENT_SHIFT;
+		event->len = DIV_ROUND_UP(RB_LEN_TIME_STAMP, RB_ALIGNMENT);
 		break;
 
 	case RINGBUF_TYPE_DATA:
@@ -954,9 +949,7 @@ rb_update_event(struct ring_buffer_event *event,
 			event->len = 0;
 			event->array[0] = length;
 		} else
-			event->len =
-				(length + (RB_ALIGNMENT-1))
-				>> RB_ALIGNMENT_SHIFT;
+			event->len = DIV_ROUND_UP(length, RB_ALIGNMENT);
 		break;
 	default:
 		BUG();
-- 
cgit v1.2.3


From 034939b65ad5ff64b9709210b3469a95153c51a3 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 8 Jan 2009 10:03:56 -0800
Subject: tracing/ftrace: handle more than one stat file per tracer

Impact: new API for tracers

Make the stat tracing API reentrant. And also provide the new directory
/debugfs/tracing/trace_stat which will contain all the stat files for the
current active tracer.

Now a tracer will, if desired, want to provide a zero terminated array of
tracer_stat structures.
Each one contains the callbacks necessary for one stat file.
It have to provide at least a name for its stat file, an iterator with
stat_start/start_next callback and an output callback for one stat entry.

Also adapt the branch tracer to this new API.
We create two files "all" and "annotated" inside the /debugfs/tracing/trace_stat
directory, making the both stats simultaneously available instead of needing
to change an option to switch from one stat file to another.

The output of these stats haven't changed.

Changes in v2:

_ Apply the previous memory leak fix (rebase against tip/master)

Changes in v3:

_ Merge the patch that adapted the branch tracer to this Api in this patch to
  not break the kernel build.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.h        |  35 ++++---
 kernel/trace/trace_branch.c |  69 ++++++-------
 kernel/trace/trace_stat.c   | 230 ++++++++++++++++++++++++++++++++------------
 3 files changed, 217 insertions(+), 117 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 94ed45e93a80..b3f9ad1b4d84 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -334,6 +334,25 @@ struct tracer_flags {
 /* Makes more easy to define a tracer opt */
 #define TRACER_OPT(s, b)	.name = #s, .bit = b
 
+/*
+ * If you want to provide a stat file (one-shot statistics), fill
+ * an iterator with stat_start/stat_next and a stat_show callbacks.
+ * The others callbacks are optional.
+ */
+struct tracer_stat {
+	/* The name of your stat file */
+	const char		*name;
+	/* Iteration over statistic entries */
+	void			*(*stat_start)(void);
+	void			*(*stat_next)(void *prev, int idx);
+	/* Compare two entries for sorting (optional) for stats */
+	int			(*stat_cmp)(void *p1, void *p2);
+	/* Print a stat entry */
+	int			(*stat_show)(struct seq_file *s, void *p);
+	/* Print the headers of your stat entries */
+	int			(*stat_headers)(struct seq_file *s);
+};
+
 /*
  * A specific tracer, represented by methods that operate on a trace array:
  */
@@ -361,21 +380,7 @@ struct tracer {
 	struct tracer		*next;
 	int			print_max;
 	struct tracer_flags 	*flags;
-
-	/*
-	 * If you change one of the following on tracing runtime, recall
-	 * init_tracer_stat()
-	 */
-
-	/* Iteration over statistic entries */
-	void			*(*stat_start)(void);
-	void			*(*stat_next)(void *prev, int idx);
-	/* Compare two entries for sorting (optional) for stats */
-	int			(*stat_cmp)(void *p1, void *p2);
-	/* Print a stat entry */
-	int			(*stat_show)(struct seq_file *s, void *p);
-	/* Print the headers of your stat entries */
-	int			(*stat_headers)(struct seq_file *s);
+	struct tracer_stat	*stats;
 };
 
 struct trace_seq {
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 4785a3b9bc4a..da5cf3e5581b 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -306,19 +306,6 @@ static int annotated_branch_stat_cmp(void *p1, void *p2)
 }
 
 #ifdef CONFIG_PROFILE_ALL_BRANCHES
-enum {
-	TRACE_BRANCH_OPT_ALL = 0x1
-};
-
-static struct tracer_opt branch_opts[] = {
-	{ TRACER_OPT(stat_all_branch, TRACE_BRANCH_OPT_ALL) },
-	{ }
-};
-
-static struct tracer_flags branch_flags = {
-	.val = 0,
-	.opts = branch_opts
-};
 
 extern unsigned long __start_branch_profile[];
 extern unsigned long __stop_branch_profile[];
@@ -352,28 +339,36 @@ all_branch_stat_next(void *v, int idx)
 	return p;
 }
 
-static int branch_set_flag(u32 old_flags, u32 bit, int set)
-{
-	if (bit == TRACE_BRANCH_OPT_ALL) {
-		if (set) {
-			branch_trace.stat_headers = all_branch_stat_headers;
-			branch_trace.stat_start = all_branch_stat_start;
-			branch_trace.stat_next = all_branch_stat_next;
-			branch_trace.stat_cmp = NULL;
-		} else {
-			branch_trace.stat_headers =
-				annotated_branch_stat_headers;
-			branch_trace.stat_start = annotated_branch_stat_start;
-			branch_trace.stat_next = annotated_branch_stat_next;
-			branch_trace.stat_cmp = annotated_branch_stat_cmp;
-		}
-		init_tracer_stat(&branch_trace);
-	}
-	return 0;
-}
+static struct tracer_stat branch_stats[] = {
+	{.name = "annotated",
+	.stat_start = annotated_branch_stat_start,
+	.stat_next = annotated_branch_stat_next,
+	.stat_cmp = annotated_branch_stat_cmp,
+	.stat_headers = annotated_branch_stat_headers,
+	.stat_show = branch_stat_show},
 
+	{.name = "all",
+	.stat_start = all_branch_stat_start,
+	.stat_next = all_branch_stat_next,
+	.stat_headers = all_branch_stat_headers,
+	.stat_show = branch_stat_show},
+
+	{ }
+};
+#else
+static struct tracer_stat branch_stats[] = {
+	{.name = "annotated",
+	.stat_start = annotated_branch_stat_start,
+	.stat_next = annotated_branch_stat_next,
+	.stat_cmp = annotated_branch_stat_cmp,
+	.stat_headers = annotated_branch_stat_headers,
+	.stat_show = branch_stat_show},
+
+	{ }
+};
 #endif /* CONFIG_PROFILE_ALL_BRANCHES */
 
+
 static struct tracer branch_trace __read_mostly =
 {
 	.name		= "branch",
@@ -383,16 +378,8 @@ static struct tracer branch_trace __read_mostly =
 #ifdef CONFIG_FTRACE_SELFTEST
 	.selftest	= trace_selftest_startup_branch,
 #endif /* CONFIG_FTRACE_SELFTEST */
-#endif /* CONFIG_BRANCH_TRACER */
-	.stat_start	=	annotated_branch_stat_start,
-	.stat_next	= annotated_branch_stat_next,
-	.stat_show	= branch_stat_show,
-	.stat_headers	= annotated_branch_stat_headers,
-	.stat_cmp	= annotated_branch_stat_cmp,
-#ifdef CONFIG_PROFILE_ALL_BRANCHES
-	.flags	= &branch_flags,
-	.set_flag	= branch_set_flag,
 #endif
+	.stats		= branch_stats
 };
 
 __init static int init_branch_trace(void)
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index f110ce9ce7fb..1515f9e7adfc 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -21,37 +21,87 @@ struct trace_stat_list {
 	void *stat;
 };
 
-static LIST_HEAD(stat_list);
-
-/*
- * This is a copy of the current tracer to avoid racy
- * and dangerous output while the current tracer is
- * switched.
- */
-static struct tracer current_tracer;
+/* A stat session is the stats output in one file */
+struct tracer_stat_session {
+	struct tracer_stat *ts;
+	struct list_head stat_list;
+	struct mutex stat_mutex;
+};
 
-/*
- * Protect both the current tracer and the global
- * stat list.
- */
-static DEFINE_MUTEX(stat_list_mutex);
+/* All of the sessions currently in use. Each stat file embeed one session */
+static struct tracer_stat_session **all_stat_sessions;
+static int nb_sessions;
+static struct dentry *stat_dir, **stat_files;
 
 
-static void reset_stat_list(void)
+static void reset_stat_session(struct tracer_stat_session *session)
 {
 	struct trace_stat_list *node, *next;
 
-	list_for_each_entry_safe(node, next, &stat_list, list)
+	list_for_each_entry_safe(node, next, &session->stat_list, list)
 		kfree(node);
 
-	INIT_LIST_HEAD(&stat_list);
+	INIT_LIST_HEAD(&session->stat_list);
 }
 
-void init_tracer_stat(struct tracer *trace)
+/* Called when a tracer is initialized */
+static int init_all_sessions(int nb, struct tracer_stat *ts)
 {
-	mutex_lock(&stat_list_mutex);
-	current_tracer = *trace;
-	mutex_unlock(&stat_list_mutex);
+	int i, j;
+	struct tracer_stat_session *session;
+
+	nb_sessions = 0;
+
+	if (all_stat_sessions) {
+		for (i = 0; i < nb_sessions; i++) {
+			session = all_stat_sessions[i];
+			reset_stat_session(session);
+			mutex_destroy(&session->stat_mutex);
+			kfree(session);
+		}
+	}
+	all_stat_sessions = kmalloc(sizeof(struct tracer_stat_session *) * nb,
+				    GFP_KERNEL);
+	if (!all_stat_sessions)
+		return -ENOMEM;
+
+	for (i = 0; i < nb; i++) {
+		session = kmalloc(sizeof(struct tracer_stat_session) * nb,
+				  GFP_KERNEL);
+		if (!session)
+			goto free_sessions;
+
+		INIT_LIST_HEAD(&session->stat_list);
+		mutex_init(&session->stat_mutex);
+		session->ts = &ts[i];
+		all_stat_sessions[i] = session;
+	}
+	nb_sessions = nb;
+	return 0;
+
+free_sessions:
+
+	for (j = 0; j < i; j++)
+		kfree(all_stat_sessions[i]);
+
+	kfree(all_stat_sessions);
+	all_stat_sessions = NULL;
+
+	return -ENOMEM;
+}
+
+static int basic_tracer_stat_checks(struct tracer_stat *ts)
+{
+	int i;
+
+	if (!ts)
+		return 0;
+
+	for (i = 0; ts[i].name; i++) {
+		if (!ts[i].stat_start || !ts[i].stat_next || !ts[i].stat_show)
+			return -EBUSY;
+	}
+	return i;
 }
 
 /*
@@ -69,22 +119,19 @@ static int dummy_cmp(void *p1, void *p2)
  * All of these copies and sorting are required on all opening
  * since the stats could have changed between two file sessions.
  */
-static int stat_seq_init(void)
+static int stat_seq_init(struct tracer_stat_session *session)
 {
 	struct trace_stat_list *iter_entry, *new_entry;
+	struct tracer_stat *ts = session->ts;
 	void *prev_stat;
 	int ret = 0;
 	int i;
 
-	mutex_lock(&stat_list_mutex);
-	reset_stat_list();
-
-	if (!current_tracer.stat_start || !current_tracer.stat_next ||
-					!current_tracer.stat_show)
-		goto exit;
+	mutex_lock(&session->stat_mutex);
+	reset_stat_session(session);
 
-	if (!current_tracer.stat_cmp)
-		current_tracer.stat_cmp = dummy_cmp;
+	if (!ts->stat_cmp)
+		ts->stat_cmp = dummy_cmp;
 
 	/*
 	 * The first entry. Actually this is the second, but the first
@@ -97,9 +144,10 @@ static int stat_seq_init(void)
 	}
 
 	INIT_LIST_HEAD(&new_entry->list);
-	list_add(&new_entry->list, &stat_list);
-	new_entry->stat = current_tracer.stat_start();
 
+	list_add(&new_entry->list, &session->stat_list);
+
+	new_entry->stat = ts->stat_start();
 	prev_stat = new_entry->stat;
 
 	/*
@@ -114,15 +162,16 @@ static int stat_seq_init(void)
 		}
 
 		INIT_LIST_HEAD(&new_entry->list);
-		new_entry->stat = current_tracer.stat_next(prev_stat, i);
+		new_entry->stat = ts->stat_next(prev_stat, i);
 
 		/* End of insertion */
 		if (!new_entry->stat)
 			break;
 
-		list_for_each_entry(iter_entry, &stat_list, list) {
+		list_for_each_entry(iter_entry, &session->stat_list, list) {
+
 			/* Insertion with a descendent sorting */
-			if (current_tracer.stat_cmp(new_entry->stat,
+			if (ts->stat_cmp(new_entry->stat,
 						iter_entry->stat) > 0) {
 
 				list_add_tail(&new_entry->list,
@@ -131,7 +180,7 @@ static int stat_seq_init(void)
 
 			/* The current smaller value */
 			} else if (list_is_last(&iter_entry->list,
-						&stat_list)) {
+						&session->stat_list)) {
 				list_add(&new_entry->list, &iter_entry->list);
 				break;
 			}
@@ -140,49 +189,49 @@ static int stat_seq_init(void)
 		prev_stat = new_entry->stat;
 	}
 exit:
-	mutex_unlock(&stat_list_mutex);
+	mutex_unlock(&session->stat_mutex);
 	return ret;
 
 exit_free_list:
-	reset_stat_list();
-	mutex_unlock(&stat_list_mutex);
+	reset_stat_session(session);
+	mutex_unlock(&session->stat_mutex);
 	return ret;
 }
 
 
 static void *stat_seq_start(struct seq_file *s, loff_t *pos)
 {
-	struct list_head *l = (struct list_head *)s->private;
+	struct tracer_stat_session *session = s->private;
 
 	/* Prevent from tracer switch or stat_list modification */
-	mutex_lock(&stat_list_mutex);
+	mutex_lock(&session->stat_mutex);
 
 	/* If we are in the beginning of the file, print the headers */
-	if (!*pos && current_tracer.stat_headers)
-		current_tracer.stat_headers(s);
+	if (!*pos && session->ts->stat_headers)
+		session->ts->stat_headers(s);
 
-	return seq_list_start(l, *pos);
+	return seq_list_start(&session->stat_list, *pos);
 }
 
 static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos)
 {
-	struct list_head *l = (struct list_head *)s->private;
+	struct tracer_stat_session *session = s->private;
 
-	return seq_list_next(p, l, pos);
+	return seq_list_next(p, &session->stat_list, pos);
 }
 
-static void stat_seq_stop(struct seq_file *m, void *p)
+static void stat_seq_stop(struct seq_file *s, void *p)
 {
-	mutex_unlock(&stat_list_mutex);
+	struct tracer_stat_session *session = s->private;
+	mutex_unlock(&session->stat_mutex);
 }
 
 static int stat_seq_show(struct seq_file *s, void *v)
 {
-	struct trace_stat_list *entry;
-
-	entry =	list_entry(v, struct trace_stat_list, list);
+	struct tracer_stat_session *session = s->private;
+	struct trace_stat_list *l = list_entry(v, struct trace_stat_list, list);
 
-	return current_tracer.stat_show(s, entry->stat);
+	return session->ts->stat_show(s, l->stat);
 }
 
 static const struct seq_operations trace_stat_seq_ops = {
@@ -192,15 +241,18 @@ static const struct seq_operations trace_stat_seq_ops = {
 	.show = stat_seq_show
 };
 
+/* The session stat is refilled and resorted at each stat file opening */
 static int tracing_stat_open(struct inode *inode, struct file *file)
 {
 	int ret;
 
+	struct tracer_stat_session *session = inode->i_private;
+
 	ret = seq_open(file, &trace_stat_seq_ops);
 	if (!ret) {
 		struct seq_file *m = file->private_data;
-		m->private = &stat_list;
-		ret = stat_seq_init();
+		m->private = session;
+		ret = stat_seq_init(session);
 	}
 
 	return ret;
@@ -212,9 +264,12 @@ static int tracing_stat_open(struct inode *inode, struct file *file)
  */
 static int tracing_stat_release(struct inode *i, struct file *f)
 {
-	mutex_lock(&stat_list_mutex);
-	reset_stat_list();
-	mutex_unlock(&stat_list_mutex);
+	struct tracer_stat_session *session = i->i_private;
+
+	mutex_lock(&session->stat_mutex);
+	reset_stat_session(session);
+	mutex_unlock(&session->stat_mutex);
+
 	return 0;
 }
 
@@ -225,17 +280,70 @@ static const struct file_operations tracing_stat_fops = {
 	.release	= tracing_stat_release
 };
 
+
+static void destroy_trace_stat_files(void)
+{
+	int i;
+
+	if (stat_files) {
+		for (i = 0; i < nb_sessions; i++)
+			debugfs_remove(stat_files[i]);
+		kfree(stat_files);
+		stat_files = NULL;
+	}
+}
+
+static void init_trace_stat_files(void)
+{
+	int i;
+
+	if (!stat_dir || !nb_sessions)
+		return;
+
+	stat_files = kmalloc(sizeof(struct dentry *) * nb_sessions, GFP_KERNEL);
+
+	if (!stat_files) {
+		pr_warning("trace stat: not enough memory\n");
+		return;
+	}
+
+	for (i = 0; i < nb_sessions; i++) {
+		struct tracer_stat_session *session = all_stat_sessions[i];
+		stat_files[i] = debugfs_create_file(session->ts->name, 0644,
+						stat_dir,
+						session, &tracing_stat_fops);
+		if (!stat_files[i])
+			pr_warning("cannot create %s entry\n",
+				   session->ts->name);
+	}
+}
+
+void init_tracer_stat(struct tracer *trace)
+{
+	int nb = basic_tracer_stat_checks(trace->stats);
+
+	destroy_trace_stat_files();
+
+	if (nb < 0) {
+		pr_warning("stat tracing: missing stat callback on %s\n",
+			   trace->name);
+		return;
+	}
+	if (!nb)
+		return;
+
+	init_all_sessions(nb, trace->stats);
+	init_trace_stat_files();
+}
+
 static int __init tracing_stat_init(void)
 {
 	struct dentry *d_tracing;
-	struct dentry *entry;
 
 	d_tracing = tracing_init_dentry();
 
-	entry = debugfs_create_file("trace_stat", 0444, d_tracing,
-					NULL,
-				    &tracing_stat_fops);
-	if (!entry)
+	stat_dir = debugfs_create_dir("trace_stat", d_tracing);
+	if (!stat_dir)
 		pr_warning("Could not create debugfs "
 			   "'trace_stat' entry\n");
 	return 0;
-- 
cgit v1.2.3


From fe6f90e57fd31af8daca534ea01db2e5666c15da Mon Sep 17 00:00:00 2001
From: Pekka Paalanen <pq@iki.fi>
Date: Sat, 3 Jan 2009 21:23:51 +0200
Subject: trace: mmiotrace to the tracer menu in Kconfig

Impact: cosmetic change in Kconfig menu layout

This patch was originally suggested by Peter Zijlstra, but seems it
was forgotten.

CONFIG_MMIOTRACE and CONFIG_MMIOTRACE_TEST were selectable
directly under the Kernel hacking / debugging menu in the kernel
configuration system. They were present only for x86 and x86_64.

Other tracers that use the ftrace tracing framework are in their own
sub-menu. This patch moves the mmiotrace configuration options there.
Since the Kconfig file, where the tracer menu is, is not architecture
specific, HAVE_MMIOTRACE_SUPPORT is introduced and provided only by
x86/x86_64. CONFIG_MMIOTRACE now depends on it.

Signed-off-by: Pekka Paalanen <pq@iki.fi>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig.debug | 24 ++----------------------
 kernel/trace/Kconfig   | 23 +++++++++++++++++++++++
 2 files changed, 25 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 10d6cc3fd052..e1983fa025d2 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -174,28 +174,8 @@ config IOMMU_LEAK
 	  Add a simple leak tracer to the IOMMU code. This is useful when you
 	  are debugging a buggy device driver that leaks IOMMU mappings.
 
-config MMIOTRACE
-	bool "Memory mapped IO tracing"
-	depends on DEBUG_KERNEL && PCI
-	select TRACING
-	help
-	  Mmiotrace traces Memory Mapped I/O access and is meant for
-	  debugging and reverse engineering. It is called from the ioremap
-	  implementation and works via page faults. Tracing is disabled by
-	  default and can be enabled at run-time.
-
-	  See Documentation/tracers/mmiotrace.txt.
-	  If you are not helping to develop drivers, say N.
-
-config MMIOTRACE_TEST
-	tristate "Test module for mmiotrace"
-	depends on MMIOTRACE && m
-	help
-	  This is a dumb module for testing mmiotrace. It is very dangerous
-	  as it will write garbage to IO memory starting at a given address.
-	  However, it should be safe to use on e.g. unused portion of VRAM.
-
-	  Say N, unless you absolutely know what you are doing.
+config HAVE_MMIOTRACE_SUPPORT
+	def_bool y
 
 #
 # IO delay types:
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 1c0b7504cab3..944239296f13 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -323,4 +323,27 @@ config FTRACE_STARTUP_TEST
 	  functioning properly. It will do tests on all the configured
 	  tracers of ftrace.
 
+config MMIOTRACE
+	bool "Memory mapped IO tracing"
+	depends on HAVE_MMIOTRACE_SUPPORT && DEBUG_KERNEL && PCI
+	select TRACING
+	help
+	  Mmiotrace traces Memory Mapped I/O access and is meant for
+	  debugging and reverse engineering. It is called from the ioremap
+	  implementation and works via page faults. Tracing is disabled by
+	  default and can be enabled at run-time.
+
+	  See Documentation/tracers/mmiotrace.txt.
+	  If you are not helping to develop drivers, say N.
+
+config MMIOTRACE_TEST
+	tristate "Test module for mmiotrace"
+	depends on MMIOTRACE && m
+	help
+	  This is a dumb module for testing mmiotrace. It is very dangerous
+	  as it will write garbage to IO memory starting at a given address.
+	  However, it should be safe to use on e.g. unused portion of VRAM.
+
+	  Say N, unless you absolutely know what you are doing.
+
 endmenu
-- 
cgit v1.2.3


From 173ed24ee2d64f5de28654eb456ec1ee18a142e5 Mon Sep 17 00:00:00 2001
From: Pekka Paalanen <pq@iki.fi>
Date: Tue, 6 Jan 2009 13:57:11 +0200
Subject: mmiotrace: count events lost due to not recording

Impact: enhances lost events counting in mmiotrace

The tracing framework, or the ring buffer facility it uses, has a switch
to stop recording data. When recording is off, the trace events will be
lost. The framework does not count these, so mmiotrace has to count them
itself.

Signed-off-by: Pekka Paalanen <pq@iki.fi>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_mmiotrace.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index fcec59ff2355..621c8c3f3139 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -9,6 +9,7 @@
 #include <linux/kernel.h>
 #include <linux/mmiotrace.h>
 #include <linux/pci.h>
+#include <asm/atomic.h>
 
 #include "trace.h"
 #include "trace_output.h"
@@ -20,6 +21,7 @@ struct header_iter {
 static struct trace_array *mmio_trace_array;
 static bool overrun_detected;
 static unsigned long prev_overruns;
+static atomic_t dropped_count;
 
 static void mmio_reset_data(struct trace_array *tr)
 {
@@ -122,11 +124,11 @@ static void mmio_close(struct trace_iterator *iter)
 
 static unsigned long count_overruns(struct trace_iterator *iter)
 {
-	unsigned long cnt = 0;
+	unsigned long cnt = atomic_xchg(&dropped_count, 0);
 	unsigned long over = ring_buffer_overruns(iter->tr->buffer);
 
 	if (over > prev_overruns)
-		cnt = over - prev_overruns;
+		cnt += over - prev_overruns;
 	prev_overruns = over;
 	return cnt;
 }
@@ -308,8 +310,10 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
 
 	event	= ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
 					   &irq_flags);
-	if (!event)
+	if (!event) {
+		atomic_inc(&dropped_count);
 		return;
+	}
 	entry	= ring_buffer_event_data(event);
 	tracing_generic_entry_update(&entry->ent, 0, preempt_count());
 	entry->ent.type			= TRACE_MMIO_RW;
@@ -336,8 +340,10 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
 
 	event	= ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
 					   &irq_flags);
-	if (!event)
+	if (!event) {
+		atomic_inc(&dropped_count);
 		return;
+	}
 	entry	= ring_buffer_event_data(event);
 	tracing_generic_entry_update(&entry->ent, 0, preempt_count());
 	entry->ent.type			= TRACE_MMIO_MAP;
-- 
cgit v1.2.3


From d7e51e66899f95dabc89b4d4c6674a6e50fa37fc Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Wed, 7 Jan 2009 15:03:13 -0800
Subject: sparseirq: make some func to be used with genirq

Impact: clean up sparseirq fallout on random.c

Ingo suggested to change some ifdef from SPARSE_IRQ to GENERIC_HARDIRQS
so we could some #ifdef later if all arch support genirq

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Acked-by: Matt Mackall <mpm@selenic.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 drivers/char/random.c        | 2 +-
 drivers/pci/intr_remapping.c | 2 +-
 include/linux/irq.h          | 6 ++----
 include/linux/kernel_stat.h  | 6 +++---
 kernel/irq/handle.c          | 7 ++++---
 5 files changed, 11 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/drivers/char/random.c b/drivers/char/random.c
index 7c13581ca9cd..a778918c8f42 100644
--- a/drivers/char/random.c
+++ b/drivers/char/random.c
@@ -558,7 +558,7 @@ struct timer_rand_state {
 	unsigned dont_count_entropy:1;
 };
 
-#ifndef CONFIG_SPARSE_IRQ
+#ifndef CONFIG_GENERIC_HARDIRQS
 
 static struct timer_rand_state *irq_timer_state[NR_IRQS];
 
diff --git a/drivers/pci/intr_remapping.c b/drivers/pci/intr_remapping.c
index f78371b22529..3d604132a04f 100644
--- a/drivers/pci/intr_remapping.c
+++ b/drivers/pci/intr_remapping.c
@@ -20,7 +20,7 @@ struct irq_2_iommu {
 	u8  irte_mask;
 };
 
-#ifdef CONFIG_SPARSE_IRQ
+#ifdef CONFIG_GENERIC_HARDIRQS
 static struct irq_2_iommu *get_one_free_irq_2_iommu(int cpu)
 {
 	struct irq_2_iommu *iommu;
diff --git a/include/linux/irq.h b/include/linux/irq.h
index f899b502f186..e9a878978c85 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -160,12 +160,10 @@ struct irq_2_iommu;
  */
 struct irq_desc {
 	unsigned int		irq;
-#ifdef CONFIG_SPARSE_IRQ
 	struct timer_rand_state *timer_rand_state;
 	unsigned int            *kstat_irqs;
-# ifdef CONFIG_INTR_REMAP
+#ifdef CONFIG_INTR_REMAP
 	struct irq_2_iommu      *irq_2_iommu;
-# endif
 #endif
 	irq_flow_handler_t	handle_irq;
 	struct irq_chip		*chip;
@@ -202,13 +200,13 @@ extern void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc
 extern struct irq_desc irq_desc[NR_IRQS];
 #else /* CONFIG_SPARSE_IRQ */
 extern struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int cpu);
+#endif /* CONFIG_SPARSE_IRQ */
 
 #define kstat_irqs_this_cpu(DESC) \
 	((DESC)->kstat_irqs[smp_processor_id()])
 #define kstat_incr_irqs_this_cpu(irqno, DESC) \
 	((DESC)->kstat_irqs[smp_processor_id()]++)
 
-#endif /* CONFIG_SPARSE_IRQ */
 
 extern struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu);
 
diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h
index 570d20413119..a3431b164bea 100644
--- a/include/linux/kernel_stat.h
+++ b/include/linux/kernel_stat.h
@@ -28,7 +28,7 @@ struct cpu_usage_stat {
 
 struct kernel_stat {
 	struct cpu_usage_stat	cpustat;
-#ifndef CONFIG_SPARSE_IRQ
+#ifndef CONFIG_GENERIC_HARDIRQS
        unsigned int irqs[NR_IRQS];
 #endif
 };
@@ -41,7 +41,7 @@ DECLARE_PER_CPU(struct kernel_stat, kstat);
 
 extern unsigned long long nr_context_switches(void);
 
-#ifndef CONFIG_SPARSE_IRQ
+#ifndef CONFIG_GENERIC_HARDIRQS
 #define kstat_irqs_this_cpu(irq) \
 	(kstat_this_cpu.irqs[irq])
 
@@ -55,7 +55,7 @@ static inline void kstat_incr_irqs_this_cpu(unsigned int irq,
 #endif
 
 
-#ifndef CONFIG_SPARSE_IRQ
+#ifndef CONFIG_GENERIC_HARDIRQS
 static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
 {
        return kstat_cpu(cpu).irqs[irq];
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index c20db0be9173..48299a8a22f8 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -213,6 +213,7 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
 	}
 };
 
+static unsigned int kstat_irqs_all[NR_IRQS][NR_CPUS];
 int __init early_irq_init(void)
 {
 	struct irq_desc *desc;
@@ -222,8 +223,10 @@ int __init early_irq_init(void)
 	desc = irq_desc;
 	count = ARRAY_SIZE(irq_desc);
 
-	for (i = 0; i < count; i++)
+	for (i = 0; i < count; i++) {
 		desc[i].irq = i;
+		desc[i].kstat_irqs = kstat_irqs_all[i];
+	}
 
 	return arch_early_irq_init();
 }
@@ -451,12 +454,10 @@ void early_init_irq_lock_class(void)
 	}
 }
 
-#ifdef CONFIG_SPARSE_IRQ
 unsigned int kstat_irqs_cpu(unsigned int irq, int cpu)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 	return desc ? desc->kstat_irqs[cpu] : 0;
 }
-#endif
 EXPORT_SYMBOL(kstat_irqs_cpu);
 
-- 
cgit v1.2.3


From 7f7ace0cda64c99599c23785f8979a072e118058 Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Sat, 10 Jan 2009 21:58:08 -0800
Subject: cpumask: update irq_desc to use cpumask_var_t

Impact: reduce memory usage, use new cpumask API.

Replace the affinity and pending_masks with cpumask_var_t's.  This adds
to the significant size reduction done with the SPARSE_IRQS changes.

The added functions (init_alloc_desc_masks & init_copy_desc_masks) are
in the include file so they can be inlined (and optimized out for the
!CONFIG_CPUMASKS_OFFSTACK case.)  [Naming chosen to be consistent with
the other init*irq functions, as well as the backwards arg declaration
of "from, to" instead of the more common "to, from" standard.]

Includes a slight change to the declaration of struct irq_desc to embed
the pending_mask within ifdef(CONFIG_SMP) to be consistent with other
references, and some small changes to Xen.

Tested: sparse/non-sparse/cpumask_offstack/non-cpumask_offstack/nonuma/nosmp on x86_64

Signed-off-by: Mike Travis <travis@sgi.com>
Cc: Chris Wright <chrisw@sous-sol.org>
Cc: Jeremy Fitzhardinge <jeremy@xensource.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
Cc: virtualization@lists.osdl.org
Cc: xen-devel@lists.xensource.com
Cc: Yinghai Lu <yhlu.kernel@gmail.com>
---
 arch/x86/kernel/io_apic.c | 20 ++++++------
 arch/x86/kernel/irq_32.c  |  2 +-
 arch/x86/kernel/irq_64.c  |  2 +-
 drivers/xen/events.c      |  4 +--
 include/linux/irq.h       | 81 +++++++++++++++++++++++++++++++++++++++++++++--
 kernel/irq/chip.c         |  5 ++-
 kernel/irq/handle.c       | 26 ++++++++-------
 kernel/irq/manage.c       | 12 +++----
 kernel/irq/migration.c    | 12 +++----
 kernel/irq/numa_migrate.c | 12 ++++++-
 kernel/irq/proc.c         |  4 +--
 11 files changed, 135 insertions(+), 45 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c
index 1c4a1302536c..1337eab60ecc 100644
--- a/arch/x86/kernel/io_apic.c
+++ b/arch/x86/kernel/io_apic.c
@@ -356,7 +356,7 @@ set_extra_move_desc(struct irq_desc *desc, const struct cpumask *mask)
 
 	if (!cfg->move_in_progress) {
 		/* it means that domain is not changed */
-		if (!cpumask_intersects(&desc->affinity, mask))
+		if (!cpumask_intersects(desc->affinity, mask))
 			cfg->move_desc_pending = 1;
 	}
 }
@@ -579,9 +579,9 @@ set_desc_affinity(struct irq_desc *desc, const struct cpumask *mask)
 	if (assign_irq_vector(irq, cfg, mask))
 		return BAD_APICID;
 
-	cpumask_and(&desc->affinity, cfg->domain, mask);
+	cpumask_and(desc->affinity, cfg->domain, mask);
 	set_extra_move_desc(desc, mask);
-	return cpu_mask_to_apicid_and(&desc->affinity, cpu_online_mask);
+	return cpu_mask_to_apicid_and(desc->affinity, cpu_online_mask);
 }
 
 static void
@@ -2383,7 +2383,7 @@ migrate_ioapic_irq_desc(struct irq_desc *desc, const struct cpumask *mask)
 	if (cfg->move_in_progress)
 		send_cleanup_vector(cfg);
 
-	cpumask_copy(&desc->affinity, mask);
+	cpumask_copy(desc->affinity, mask);
 }
 
 static int migrate_irq_remapped_level_desc(struct irq_desc *desc)
@@ -2405,11 +2405,11 @@ static int migrate_irq_remapped_level_desc(struct irq_desc *desc)
 	}
 
 	/* everthing is clear. we have right of way */
-	migrate_ioapic_irq_desc(desc, &desc->pending_mask);
+	migrate_ioapic_irq_desc(desc, desc->pending_mask);
 
 	ret = 0;
 	desc->status &= ~IRQ_MOVE_PENDING;
-	cpumask_clear(&desc->pending_mask);
+	cpumask_clear(desc->pending_mask);
 
 unmask:
 	unmask_IO_APIC_irq_desc(desc);
@@ -2434,7 +2434,7 @@ static void ir_irq_migration(struct work_struct *work)
 				continue;
 			}
 
-			desc->chip->set_affinity(irq, &desc->pending_mask);
+			desc->chip->set_affinity(irq, desc->pending_mask);
 			spin_unlock_irqrestore(&desc->lock, flags);
 		}
 	}
@@ -2448,7 +2448,7 @@ static void set_ir_ioapic_affinity_irq_desc(struct irq_desc *desc,
 {
 	if (desc->status & IRQ_LEVEL) {
 		desc->status |= IRQ_MOVE_PENDING;
-		cpumask_copy(&desc->pending_mask, mask);
+		cpumask_copy(desc->pending_mask, mask);
 		migrate_irq_remapped_level_desc(desc);
 		return;
 	}
@@ -2516,7 +2516,7 @@ static void irq_complete_move(struct irq_desc **descp)
 
 		/* domain has not changed, but affinity did */
 		me = smp_processor_id();
-		if (cpu_isset(me, desc->affinity)) {
+		if (cpumask_test_cpu(me, desc->affinity)) {
 			*descp = desc = move_irq_desc(desc, me);
 			/* get the new one */
 			cfg = desc->chip_data;
@@ -4039,7 +4039,7 @@ void __init setup_ioapic_dest(void)
 			 */
 			if (desc->status &
 			    (IRQ_NO_BALANCING | IRQ_AFFINITY_SET))
-				mask = &desc->affinity;
+				mask = desc->affinity;
 			else
 				mask = TARGET_CPUS;
 
diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c
index 74b9ff7341e9..e0f29be8ab0b 100644
--- a/arch/x86/kernel/irq_32.c
+++ b/arch/x86/kernel/irq_32.c
@@ -248,7 +248,7 @@ void fixup_irqs(void)
 		if (irq == 2)
 			continue;
 
-		affinity = &desc->affinity;
+		affinity = desc->affinity;
 		if (cpumask_any_and(affinity, cpu_online_mask) >= nr_cpu_ids) {
 			printk("Breaking affinity for irq %i\n", irq);
 			affinity = cpu_all_mask;
diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c
index 63c88e6ec025..0b21cb1ea11f 100644
--- a/arch/x86/kernel/irq_64.c
+++ b/arch/x86/kernel/irq_64.c
@@ -100,7 +100,7 @@ void fixup_irqs(void)
 		/* interrupt's are disabled at this point */
 		spin_lock(&desc->lock);
 
-		affinity = &desc->affinity;
+		affinity = desc->affinity;
 		if (!irq_has_action(irq) ||
 		    cpumask_equal(affinity, cpu_online_mask)) {
 			spin_unlock(&desc->lock);
diff --git a/drivers/xen/events.c b/drivers/xen/events.c
index eb0dfdeaa949..e0767ff35d6c 100644
--- a/drivers/xen/events.c
+++ b/drivers/xen/events.c
@@ -125,7 +125,7 @@ static void bind_evtchn_to_cpu(unsigned int chn, unsigned int cpu)
 
 	BUG_ON(irq == -1);
 #ifdef CONFIG_SMP
-	irq_to_desc(irq)->affinity = cpumask_of_cpu(cpu);
+	cpumask_copy(irq_to_desc(irq)->affinity, cpumask_of(cpu));
 #endif
 
 	__clear_bit(chn, cpu_evtchn_mask[cpu_evtchn[chn]]);
@@ -142,7 +142,7 @@ static void init_evtchn_cpu_bindings(void)
 
 	/* By default all event channels notify CPU#0. */
 	for_each_irq_desc(i, desc) {
-		desc->affinity = cpumask_of_cpu(0);
+		cpumask_copy(desc->affinity, cpumask_of(0));
 	}
 #endif
 
diff --git a/include/linux/irq.h b/include/linux/irq.h
index f899b502f186..fa27210f1dfd 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -182,11 +182,11 @@ struct irq_desc {
 	unsigned int		irqs_unhandled;
 	spinlock_t		lock;
 #ifdef CONFIG_SMP
-	cpumask_t		affinity;
+	cpumask_var_t		affinity;
 	unsigned int		cpu;
-#endif
 #ifdef CONFIG_GENERIC_PENDING_IRQ
-	cpumask_t		pending_mask;
+	cpumask_var_t		pending_mask;
+#endif
 #endif
 #ifdef CONFIG_PROC_FS
 	struct proc_dir_entry	*dir;
@@ -422,4 +422,79 @@ extern int set_irq_msi(unsigned int irq, struct msi_desc *entry);
 
 #endif /* !CONFIG_S390 */
 
+#ifdef CONFIG_SMP
+/**
+ * init_alloc_desc_masks - allocate cpumasks for irq_desc
+ * @desc:	pointer to irq_desc struct
+ * @boot:	true if need bootmem
+ *
+ * Allocates affinity and pending_mask cpumask if required.
+ * Returns true if successful (or not required).
+ * Side effect: affinity has all bits set, pending_mask has all bits clear.
+ */
+static inline bool init_alloc_desc_masks(struct irq_desc *desc, int node,
+								bool boot)
+{
+	if (boot) {
+		alloc_bootmem_cpumask_var(&desc->affinity);
+		cpumask_setall(desc->affinity);
+
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+		alloc_bootmem_cpumask_var(&desc->pending_mask);
+		cpumask_clear(desc->pending_mask);
+#endif
+		return true;
+	}
+
+	if (!alloc_cpumask_var_node(&desc->affinity, GFP_ATOMIC, node))
+		return false;
+	cpumask_setall(desc->affinity);
+
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+	if (!alloc_cpumask_var_node(&desc->pending_mask, GFP_ATOMIC, node)) {
+		free_cpumask_var(desc->affinity);
+		return false;
+	}
+	cpumask_clear(desc->pending_mask);
+#endif
+	return true;
+}
+
+/**
+ * init_copy_desc_masks - copy cpumasks for irq_desc
+ * @old_desc:	pointer to old irq_desc struct
+ * @new_desc:	pointer to new irq_desc struct
+ *
+ * Insures affinity and pending_masks are copied to new irq_desc.
+ * If !CONFIG_CPUMASKS_OFFSTACK the cpumasks are embedded in the
+ * irq_desc struct so the copy is redundant.
+ */
+
+static inline void init_copy_desc_masks(struct irq_desc *old_desc,
+					struct irq_desc *new_desc)
+{
+#ifdef CONFIG_CPUMASKS_OFFSTACK
+	cpumask_copy(new_desc->affinity, old_desc->affinity);
+
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+	cpumask_copy(new_desc->pending_mask, old_desc->pending_mask);
+#endif
+#endif
+}
+
+#else /* !CONFIG_SMP */
+
+static inline bool init_alloc_desc_masks(struct irq_desc *desc, int node,
+								bool boot)
+{
+	return true;
+}
+
+static inline void init_copy_desc_masks(struct irq_desc *old_desc,
+					struct irq_desc *new_desc)
+{
+}
+
+#endif	/* CONFIG_SMP */
+
 #endif /* _LINUX_IRQ_H */
diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index f63c706d25e1..c248eba98b43 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -46,7 +46,10 @@ void dynamic_irq_init(unsigned int irq)
 	desc->irq_count = 0;
 	desc->irqs_unhandled = 0;
 #ifdef CONFIG_SMP
-	cpumask_setall(&desc->affinity);
+	cpumask_setall(desc->affinity);
+#ifdef CONFIG_GENERIC_PENDING_IRQ
+	cpumask_clear(desc->pending_mask);
+#endif
 #endif
 	spin_unlock_irqrestore(&desc->lock, flags);
 }
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index c20db0be9173..b8fa1354f01c 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -64,9 +64,6 @@ static struct irq_desc irq_desc_init = {
 	.handle_irq = handle_bad_irq,
 	.depth      = 1,
 	.lock       = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
-#ifdef CONFIG_SMP
-	.affinity   = CPU_MASK_ALL
-#endif
 };
 
 void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr)
@@ -88,6 +85,8 @@ void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr)
 
 static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
 {
+	int node = cpu_to_node(cpu);
+
 	memcpy(desc, &irq_desc_init, sizeof(struct irq_desc));
 
 	spin_lock_init(&desc->lock);
@@ -101,6 +100,10 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
 		printk(KERN_ERR "can not alloc kstat_irqs\n");
 		BUG_ON(1);
 	}
+	if (!init_alloc_desc_masks(desc, node, false)) {
+		printk(KERN_ERR "can not alloc irq_desc cpumasks\n");
+		BUG_ON(1);
+	}
 	arch_init_chip_data(desc, cpu);
 }
 
@@ -119,9 +122,6 @@ static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_sm
 		.handle_irq = handle_bad_irq,
 		.depth	    = 1,
 		.lock	    = __SPIN_LOCK_UNLOCKED(irq_desc_init.lock),
-#ifdef CONFIG_SMP
-		.affinity   = CPU_MASK_ALL
-#endif
 	}
 };
 
@@ -141,7 +141,7 @@ int __init early_irq_init(void)
 		desc[i].irq = i;
 		desc[i].kstat_irqs = kstat_irqs_legacy[i];
 		lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
-
+		init_alloc_desc_masks(&desc[i], 0, true);
 		irq_desc_ptrs[i] = desc + i;
 	}
 
@@ -188,6 +188,10 @@ struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
 		printk(KERN_ERR "can not alloc irq_desc\n");
 		BUG_ON(1);
 	}
+	if (!init_alloc_desc_masks(desc, node, false)) {
+		printk(KERN_ERR "can not alloc irq_desc cpumasks\n");
+		BUG_ON(1);
+	}
 	init_one_irq_desc(irq, desc, cpu);
 
 	irq_desc_ptrs[irq] = desc;
@@ -207,9 +211,6 @@ struct irq_desc irq_desc[NR_IRQS] __cacheline_aligned_in_smp = {
 		.handle_irq = handle_bad_irq,
 		.depth = 1,
 		.lock = __SPIN_LOCK_UNLOCKED(irq_desc->lock),
-#ifdef CONFIG_SMP
-		.affinity = CPU_MASK_ALL
-#endif
 	}
 };
 
@@ -222,9 +223,10 @@ int __init early_irq_init(void)
 	desc = irq_desc;
 	count = ARRAY_SIZE(irq_desc);
 
-	for (i = 0; i < count; i++)
+	for (i = 0; i < count; i++) {
 		desc[i].irq = i;
-
+		init_alloc_desc_masks(&desc[i], 0, true);
+	}
 	return arch_early_irq_init();
 }
 
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index cd0cd8dcb345..b98739af4558 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -98,14 +98,14 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
 
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 	if (desc->status & IRQ_MOVE_PCNTXT || desc->status & IRQ_DISABLED) {
-		cpumask_copy(&desc->affinity, cpumask);
+		cpumask_copy(desc->affinity, cpumask);
 		desc->chip->set_affinity(irq, cpumask);
 	} else {
 		desc->status |= IRQ_MOVE_PENDING;
-		cpumask_copy(&desc->pending_mask, cpumask);
+		cpumask_copy(desc->pending_mask, cpumask);
 	}
 #else
-	cpumask_copy(&desc->affinity, cpumask);
+	cpumask_copy(desc->affinity, cpumask);
 	desc->chip->set_affinity(irq, cpumask);
 #endif
 	desc->status |= IRQ_AFFINITY_SET;
@@ -127,16 +127,16 @@ int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc)
 	 * one of the targets is online.
 	 */
 	if (desc->status & (IRQ_AFFINITY_SET | IRQ_NO_BALANCING)) {
-		if (cpumask_any_and(&desc->affinity, cpu_online_mask)
+		if (cpumask_any_and(desc->affinity, cpu_online_mask)
 		    < nr_cpu_ids)
 			goto set_affinity;
 		else
 			desc->status &= ~IRQ_AFFINITY_SET;
 	}
 
-	cpumask_and(&desc->affinity, cpu_online_mask, irq_default_affinity);
+	cpumask_and(desc->affinity, cpu_online_mask, irq_default_affinity);
 set_affinity:
-	desc->chip->set_affinity(irq, &desc->affinity);
+	desc->chip->set_affinity(irq, desc->affinity);
 
 	return 0;
 }
diff --git a/kernel/irq/migration.c b/kernel/irq/migration.c
index bd72329e630c..e05ad9be43b7 100644
--- a/kernel/irq/migration.c
+++ b/kernel/irq/migration.c
@@ -18,7 +18,7 @@ void move_masked_irq(int irq)
 
 	desc->status &= ~IRQ_MOVE_PENDING;
 
-	if (unlikely(cpumask_empty(&desc->pending_mask)))
+	if (unlikely(cpumask_empty(desc->pending_mask)))
 		return;
 
 	if (!desc->chip->set_affinity)
@@ -38,13 +38,13 @@ void move_masked_irq(int irq)
 	 * For correct operation this depends on the caller
 	 * masking the irqs.
 	 */
-	if (likely(cpumask_any_and(&desc->pending_mask, cpu_online_mask)
+	if (likely(cpumask_any_and(desc->pending_mask, cpu_online_mask)
 		   < nr_cpu_ids)) {
-		cpumask_and(&desc->affinity,
-			    &desc->pending_mask, cpu_online_mask);
-		desc->chip->set_affinity(irq, &desc->affinity);
+		cpumask_and(desc->affinity,
+			    desc->pending_mask, cpu_online_mask);
+		desc->chip->set_affinity(irq, desc->affinity);
 	}
-	cpumask_clear(&desc->pending_mask);
+	cpumask_clear(desc->pending_mask);
 }
 
 void move_native_irq(int irq)
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index ecf765c6a77a..f001a4ea6414 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -46,6 +46,7 @@ static void init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
 	desc->cpu = cpu;
 	lockdep_set_class(&desc->lock, &irq_desc_lock_class);
 	init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids);
+	init_copy_desc_masks(old_desc, desc);
 	arch_init_copy_chip_data(old_desc, desc, cpu);
 }
 
@@ -76,11 +77,20 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
 	node = cpu_to_node(cpu);
 	desc = kzalloc_node(sizeof(*desc), GFP_ATOMIC, node);
 	if (!desc) {
-		printk(KERN_ERR "irq %d: can not get new irq_desc for migration.\n", irq);
+		printk(KERN_ERR "irq %d: can not get new irq_desc "
+				"for migration.\n", irq);
 		/* still use old one */
 		desc = old_desc;
 		goto out_unlock;
 	}
+	if (!init_alloc_desc_masks(desc, node, false)) {
+		printk(KERN_ERR "irq %d: can not get new irq_desc cpumask "
+				"for migration.\n", irq);
+		/* still use old one */
+		kfree(desc);
+		desc = old_desc;
+		goto out_unlock;
+	}
 	init_copy_one_irq_desc(irq, old_desc, desc, cpu);
 
 	irq_desc_ptrs[irq] = desc;
diff --git a/kernel/irq/proc.c b/kernel/irq/proc.c
index aae3f742bcec..692363dd591f 100644
--- a/kernel/irq/proc.c
+++ b/kernel/irq/proc.c
@@ -20,11 +20,11 @@ static struct proc_dir_entry *root_irq_dir;
 static int irq_affinity_proc_show(struct seq_file *m, void *v)
 {
 	struct irq_desc *desc = irq_to_desc((long)m->private);
-	const struct cpumask *mask = &desc->affinity;
+	const struct cpumask *mask = desc->affinity;
 
 #ifdef CONFIG_GENERIC_PENDING_IRQ
 	if (desc->status & IRQ_MOVE_PENDING)
-		mask = &desc->pending_mask;
+		mask = desc->pending_mask;
 #endif
 	seq_cpumask(m, mask);
 	seq_putc(m, '\n');
-- 
cgit v1.2.3


From 802bf931f2688ad125b73db597ce63cc842fb27a Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Sat, 10 Jan 2009 21:58:09 -0800
Subject: cpumask: fix bug in use cpumask_var_t in irq_desc

Impact: fix bug where new irq_desc uses old cpumask pointers which are freed.

As Yinghai pointed out, init_copy_one_irq_desc() copies the old desc to
the new desc overwriting the cpumask pointers.  Since the old_desc and
the cpumask pointers are freed, then memory corruption will occur if
these old pointers are used.

Move the allocation of these pointers to after the copy.

Signed-off-by: Mike Travis <travis@sgi.com>
Cc: Yinghai Lu <yinghai@kernel.org>
---
 include/linux/irq.h       |  9 +++++++--
 kernel/irq/handle.c       |  8 +-------
 kernel/irq/numa_migrate.c | 13 ++++++++-----
 3 files changed, 16 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index fa27210f1dfd..27a67536511e 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -426,15 +426,18 @@ extern int set_irq_msi(unsigned int irq, struct msi_desc *entry);
 /**
  * init_alloc_desc_masks - allocate cpumasks for irq_desc
  * @desc:	pointer to irq_desc struct
+ * @cpu:	cpu which will be handling the cpumasks
  * @boot:	true if need bootmem
  *
  * Allocates affinity and pending_mask cpumask if required.
  * Returns true if successful (or not required).
  * Side effect: affinity has all bits set, pending_mask has all bits clear.
  */
-static inline bool init_alloc_desc_masks(struct irq_desc *desc, int node,
+static inline bool init_alloc_desc_masks(struct irq_desc *desc, int cpu,
 								bool boot)
 {
+	int node;
+
 	if (boot) {
 		alloc_bootmem_cpumask_var(&desc->affinity);
 		cpumask_setall(desc->affinity);
@@ -446,6 +449,8 @@ static inline bool init_alloc_desc_masks(struct irq_desc *desc, int node,
 		return true;
 	}
 
+	node = cpu_to_node(cpu);
+
 	if (!alloc_cpumask_var_node(&desc->affinity, GFP_ATOMIC, node))
 		return false;
 	cpumask_setall(desc->affinity);
@@ -484,7 +489,7 @@ static inline void init_copy_desc_masks(struct irq_desc *old_desc,
 
 #else /* !CONFIG_SMP */
 
-static inline bool init_alloc_desc_masks(struct irq_desc *desc, int node,
+static inline bool init_alloc_desc_masks(struct irq_desc *desc, int cpu,
 								bool boot)
 {
 	return true;
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index b8fa1354f01c..f01c0a30cb42 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -85,8 +85,6 @@ void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr)
 
 static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
 {
-	int node = cpu_to_node(cpu);
-
 	memcpy(desc, &irq_desc_init, sizeof(struct irq_desc));
 
 	spin_lock_init(&desc->lock);
@@ -100,7 +98,7 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
 		printk(KERN_ERR "can not alloc kstat_irqs\n");
 		BUG_ON(1);
 	}
-	if (!init_alloc_desc_masks(desc, node, false)) {
+	if (!init_alloc_desc_masks(desc, cpu, false)) {
 		printk(KERN_ERR "can not alloc irq_desc cpumasks\n");
 		BUG_ON(1);
 	}
@@ -188,10 +186,6 @@ struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
 		printk(KERN_ERR "can not alloc irq_desc\n");
 		BUG_ON(1);
 	}
-	if (!init_alloc_desc_masks(desc, node, false)) {
-		printk(KERN_ERR "can not alloc irq_desc cpumasks\n");
-		BUG_ON(1);
-	}
 	init_one_irq_desc(irq, desc, cpu);
 
 	irq_desc_ptrs[irq] = desc;
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index f001a4ea6414..666260e4c065 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -38,16 +38,22 @@ static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc)
 	old_desc->kstat_irqs = NULL;
 }
 
-static void init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
+static bool init_copy_one_irq_desc(int irq, struct irq_desc *old_desc,
 		 struct irq_desc *desc, int cpu)
 {
 	memcpy(desc, old_desc, sizeof(struct irq_desc));
+	if (!init_alloc_desc_masks(desc, cpu, false)) {
+		printk(KERN_ERR "irq %d: can not get new irq_desc cpumask "
+				"for migration.\n", irq);
+		return false;
+	}
 	spin_lock_init(&desc->lock);
 	desc->cpu = cpu;
 	lockdep_set_class(&desc->lock, &irq_desc_lock_class);
 	init_copy_kstat_irqs(old_desc, desc, cpu, nr_cpu_ids);
 	init_copy_desc_masks(old_desc, desc);
 	arch_init_copy_chip_data(old_desc, desc, cpu);
+	return true;
 }
 
 static void free_one_irq_desc(struct irq_desc *old_desc, struct irq_desc *desc)
@@ -83,15 +89,12 @@ static struct irq_desc *__real_move_irq_desc(struct irq_desc *old_desc,
 		desc = old_desc;
 		goto out_unlock;
 	}
-	if (!init_alloc_desc_masks(desc, node, false)) {
-		printk(KERN_ERR "irq %d: can not get new irq_desc cpumask "
-				"for migration.\n", irq);
+	if (!init_copy_one_irq_desc(irq, old_desc, desc, cpu)) {
 		/* still use old one */
 		kfree(desc);
 		desc = old_desc;
 		goto out_unlock;
 	}
-	init_copy_one_irq_desc(irq, old_desc, desc, cpu);
 
 	irq_desc_ptrs[irq] = desc;
 
-- 
cgit v1.2.3


From d38b223c86db3162dc85b5a1997ac8a210e1660b Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Sat, 10 Jan 2009 21:58:11 -0800
Subject: cpumask: reduce stack usage in find_lowest_rq

Impact: reduce stack usage, cleanup

Use a cpumask_var_t in find_lowest_rq() and clean up other old
cpumask_t calls.

Signed-off-by: Mike Travis <travis@sgi.com>
---
 kernel/sched_rt.c | 36 ++++++++++++++++++++++--------------
 1 file changed, 22 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 954e1a81b796..da932f4c8524 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -960,16 +960,17 @@ static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu)
 
 static DEFINE_PER_CPU(cpumask_var_t, local_cpu_mask);
 
-static inline int pick_optimal_cpu(int this_cpu, cpumask_t *mask)
+static inline int pick_optimal_cpu(int this_cpu,
+				   const struct cpumask *mask)
 {
 	int first;
 
 	/* "this_cpu" is cheaper to preempt than a remote processor */
-	if ((this_cpu != -1) && cpu_isset(this_cpu, *mask))
+	if ((this_cpu != -1) && cpumask_test_cpu(this_cpu, mask))
 		return this_cpu;
 
-	first = first_cpu(*mask);
-	if (first != NR_CPUS)
+	first = cpumask_first(mask);
+	if (first < nr_cpu_ids)
 		return first;
 
 	return -1;
@@ -981,6 +982,7 @@ static int find_lowest_rq(struct task_struct *task)
 	struct cpumask *lowest_mask = __get_cpu_var(local_cpu_mask);
 	int this_cpu = smp_processor_id();
 	int cpu      = task_cpu(task);
+	cpumask_var_t domain_mask;
 
 	if (task->rt.nr_cpus_allowed == 1)
 		return -1; /* No other targets possible */
@@ -1013,19 +1015,25 @@ static int find_lowest_rq(struct task_struct *task)
 	if (this_cpu == cpu)
 		this_cpu = -1; /* Skip this_cpu opt if the same */
 
-	for_each_domain(cpu, sd) {
-		if (sd->flags & SD_WAKE_AFFINE) {
-			cpumask_t domain_mask;
-			int       best_cpu;
+	if (alloc_cpumask_var(&domain_mask, GFP_ATOMIC)) {
+		for_each_domain(cpu, sd) {
+			if (sd->flags & SD_WAKE_AFFINE) {
+				int best_cpu;
 
-			cpumask_and(&domain_mask, sched_domain_span(sd),
-				    lowest_mask);
+				cpumask_and(domain_mask,
+					    sched_domain_span(sd),
+					    lowest_mask);
 
-			best_cpu = pick_optimal_cpu(this_cpu,
-						    &domain_mask);
-			if (best_cpu != -1)
-				return best_cpu;
+				best_cpu = pick_optimal_cpu(this_cpu,
+							    domain_mask);
+
+				if (best_cpu != -1) {
+					free_cpumask_var(domain_mask);
+					return best_cpu;
+				}
+			}
 		}
+		free_cpumask_var(domain_mask);
 	}
 
 	/*
-- 
cgit v1.2.3


From 9594949b060efe86ecaa1a66839232a3b9800bc9 Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Sat, 10 Jan 2009 22:24:06 -0800
Subject: irq: change references from NR_IRQS to nr_irqs

Impact: preparation, cleanup, add KERN_INFO printk

Modify references from NR_IRQS to nr_irqs as the later will become
variable-sized based on nr_cpu_ids when CONFIG_SPARSE_IRQS=y.

Signed-off-by: Mike Travis <travis@sgi.com>
---
 arch/x86/kernel/io_apic.c |  2 +-
 kernel/irq/handle.c       | 14 +++++++++-----
 2 files changed, 10 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c
index 1337eab60ecc..ae80638012de 100644
--- a/arch/x86/kernel/io_apic.c
+++ b/arch/x86/kernel/io_apic.c
@@ -3183,7 +3183,7 @@ unsigned int create_irq_nr(unsigned int irq_want)
 
 	irq = 0;
 	spin_lock_irqsave(&vector_lock, flags);
-	for (new = irq_want; new < NR_IRQS; new++) {
+	for (new = irq_want; new < nr_irqs; new++) {
 		if (platform_legacy_irq(new))
 			continue;
 
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index f01c0a30cb42..790c5fa7ea39 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -132,6 +132,8 @@ int __init early_irq_init(void)
 	int legacy_count;
 	int i;
 
+	printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d\n", NR_IRQS, nr_irqs);
+
 	desc = irq_desc_legacy;
 	legacy_count = ARRAY_SIZE(irq_desc_legacy);
 
@@ -143,7 +145,7 @@ int __init early_irq_init(void)
 		irq_desc_ptrs[i] = desc + i;
 	}
 
-	for (i = legacy_count; i < NR_IRQS; i++)
+	for (i = legacy_count; i < nr_irqs; i++)
 		irq_desc_ptrs[i] = NULL;
 
 	return arch_early_irq_init();
@@ -151,7 +153,7 @@ int __init early_irq_init(void)
 
 struct irq_desc *irq_to_desc(unsigned int irq)
 {
-	return (irq < NR_IRQS) ? irq_desc_ptrs[irq] : NULL;
+	return (irq < nr_irqs) ? irq_desc_ptrs[irq] : NULL;
 }
 
 struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
@@ -160,9 +162,9 @@ struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
 	unsigned long flags;
 	int node;
 
-	if (irq >= NR_IRQS) {
-		printk(KERN_WARNING "irq >= NR_IRQS in irq_to_desc_alloc: %d %d\n",
-				irq, NR_IRQS);
+	if (irq >= nr_irqs) {
+		printk(KERN_WARNING "irq >= nr_irqs in irq_to_desc_alloc: %d %d\n",
+				irq, nr_irqs);
 		WARN_ON(1);
 		return NULL;
 	}
@@ -214,6 +216,8 @@ int __init early_irq_init(void)
 	int count;
 	int i;
 
+	printk(KERN_INFO "NR_IRQS:%d\n", NR_IRQS);
+
 	desc = irq_desc;
 	count = ARRAY_SIZE(irq_desc);
 
-- 
cgit v1.2.3


From e2f4d06545ec1f29b0e838ee34cbf3500ea5b9a4 Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Sat, 10 Jan 2009 22:24:06 -0800
Subject: irq: use WARN() instead of WARN_ON().

Impact: cleanup WARN msg.

Ingo requested:
> While at it, could you please also convert this to a WARN() construct
> instead? (in a separate commit)

... and it shall be done.  ;-)

Signed-off-by: Mike Travis <travis@sgi.com>
---
 kernel/irq/handle.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 790c5fa7ea39..fd1ef16252f4 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -163,9 +163,8 @@ struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
 	int node;
 
 	if (irq >= nr_irqs) {
-		printk(KERN_WARNING "irq >= nr_irqs in irq_to_desc_alloc: %d %d\n",
-				irq, nr_irqs);
-		WARN_ON(1);
+		WARN(1, "irq (%d) >= nr_irqs (%d) in irq_to_desc_alloc\n",
+			irq, nr_irqs);
 		return NULL;
 	}
 
-- 
cgit v1.2.3


From 0fa0ebbf15addc1be8f73325d809c8547a9de304 Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Sat, 10 Jan 2009 22:24:06 -0800
Subject: irq: allocate irq_desc_ptrs array based on nr_irqs

Impact: allocate irq_desc_ptrs in preparation for making it variable-sized.

This addresses this memory usage bump when NR_CPUS bumped from 128 to 4096:

    34816   +229376    264192  +658%  irq_desc_ptrs(.data.read_mostly)

The patch is split into two parts, the first simply allocates the
irq_desc_ptrs array.  Then next will deal with making it variable.
This is only when CONFIG_SPARSE_IRQS=y.

Signed-off-by: Mike Travis <travis@sgi.com>
---
 kernel/irq/handle.c    | 11 +++++++++--
 kernel/irq/internals.h |  7 +++++++
 2 files changed, 16 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index fd1ef16252f4..d0b8f7e72790 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -17,6 +17,7 @@
 #include <linux/kernel_stat.h>
 #include <linux/rculist.h>
 #include <linux/hash.h>
+#include <linux/bootmem.h>
 
 #include "internals.h"
 
@@ -110,7 +111,7 @@ static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
  */
 DEFINE_SPINLOCK(sparse_irq_lock);
 
-struct irq_desc *irq_desc_ptrs[NR_IRQS] __read_mostly;
+struct irq_desc **irq_desc_ptrs __read_mostly;
 
 static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_smp = {
 	[0 ... NR_IRQS_LEGACY-1] = {
@@ -137,6 +138,9 @@ int __init early_irq_init(void)
 	desc = irq_desc_legacy;
 	legacy_count = ARRAY_SIZE(irq_desc_legacy);
 
+	/* allocate irq_desc_ptrs array based on nr_irqs */
+	irq_desc_ptrs = alloc_bootmem(nr_irqs * sizeof(void *));
+
 	for (i = 0; i < legacy_count; i++) {
 		desc[i].irq = i;
 		desc[i].kstat_irqs = kstat_irqs_legacy[i];
@@ -153,7 +157,10 @@ int __init early_irq_init(void)
 
 struct irq_desc *irq_to_desc(unsigned int irq)
 {
-	return (irq < nr_irqs) ? irq_desc_ptrs[irq] : NULL;
+	if (irq_desc_ptrs && irq < nr_irqs)
+		return irq_desc_ptrs[irq];
+
+	return NULL;
 }
 
 struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index e6d0a43cc125..40416a81a0f5 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -16,7 +16,14 @@ extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
 extern struct lock_class_key irq_desc_lock_class;
 extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr);
 extern spinlock_t sparse_irq_lock;
+
+#ifdef CONFIG_SPARSE_IRQ
+/* irq_desc_ptrs allocated at boot time */
+extern struct irq_desc **irq_desc_ptrs;
+#else
+/* irq_desc_ptrs is a fixed size array */
 extern struct irq_desc *irq_desc_ptrs[NR_IRQS];
+#endif
 
 #ifdef CONFIG_PROC_FS
 extern void register_irq_proc(unsigned int irq, struct irq_desc *desc);
-- 
cgit v1.2.3


From 9332fccdedf8e09448f3b69b624211ae879f6c45 Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Sat, 10 Jan 2009 22:24:07 -0800
Subject: irq: initialize nr_irqs based on nr_cpu_ids

Impact: Reduce memory usage.

This is the second half of the changes to make the irq_desc_ptrs be
variable sized based on nr_cpu_ids.  This is done by adding a new
"max_nr_irqs" macro to irq_vectors.h (and a dummy in irqnr.h) to
return a max NR_IRQS value based on NR_CPUS or nr_cpu_ids.

This necessitated moving the define of MAX_IO_APICS to a separate
file (asm/apicnum.h) so it could be included without the baggage
of the other asm/apicdef.h declarations.

Signed-off-by: Mike Travis <travis@sgi.com>
---
 arch/x86/include/asm/apicdef.h     |  8 ++------
 arch/x86/include/asm/apicnum.h     | 12 ++++++++++++
 arch/x86/include/asm/irq_vectors.h | 16 +++++++++++-----
 include/linux/irqnr.h              |  7 +++++++
 kernel/irq/handle.c                |  3 +++
 5 files changed, 35 insertions(+), 11 deletions(-)
 create mode 100644 arch/x86/include/asm/apicnum.h

(limited to 'kernel')

diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index 63134e31e8b9..1a6454ef7f6c 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -132,12 +132,8 @@
 #define APIC_BASE_MSR	0x800
 #define X2APIC_ENABLE	(1UL << 10)
 
-#ifdef CONFIG_X86_32
-# define MAX_IO_APICS 64
-#else
-# define MAX_IO_APICS 128
-# define MAX_LOCAL_APIC 32768
-#endif
+/* get MAX_IO_APICS */
+#include <asm/apicnum.h>
 
 /*
  * All x86-64 systems are xAPIC compatible.
diff --git a/arch/x86/include/asm/apicnum.h b/arch/x86/include/asm/apicnum.h
new file mode 100644
index 000000000000..82f613c607ce
--- /dev/null
+++ b/arch/x86/include/asm/apicnum.h
@@ -0,0 +1,12 @@
+#ifndef _ASM_X86_APICNUM_H
+#define _ASM_X86_APICNUM_H
+
+/* define MAX_IO_APICS */
+#ifdef CONFIG_X86_32
+# define MAX_IO_APICS 64
+#else
+# define MAX_IO_APICS 128
+# define MAX_LOCAL_APIC 32768
+#endif
+
+#endif /* _ASM_X86_APICNUM_H */
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index f7ff65032b9d..602361ad0e74 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -105,6 +105,8 @@
 
 #if defined(CONFIG_X86_IO_APIC) && !defined(CONFIG_X86_VOYAGER)
 
+#include <asm/apicnum.h>	/* need MAX_IO_APICS */
+
 #ifndef CONFIG_SPARSE_IRQ
 # if NR_CPUS < MAX_IO_APICS
 #  define NR_IRQS (NR_VECTORS + (32 * NR_CPUS))
@@ -112,11 +114,15 @@
 #  define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS))
 # endif
 #else
-# if (8 * NR_CPUS) > (32 * MAX_IO_APICS)
-#  define NR_IRQS (NR_VECTORS + (8 * NR_CPUS))
-# else
-#  define NR_IRQS (NR_VECTORS + (32 * MAX_IO_APICS))
-# endif
+
+/* defined as a macro so nr_irqs = max_nr_irqs(nr_cpu_ids) can be used */
+# define max_nr_irqs(nr_cpus)				\
+	((8 * nr_cpus) > (32 * MAX_IO_APICS) ?		\
+		(NR_VECTORS + (8 * NR_CPUS)) :		\
+		(NR_VECTORS + (32 * MAX_IO_APICS)))	\
+
+# define NR_IRQS max_nr_irqs(NR_CPUS)
+
 #endif
 
 #elif defined(CONFIG_X86_VOYAGER)
diff --git a/include/linux/irqnr.h b/include/linux/irqnr.h
index 86af92e9e84c..de66e4e10406 100644
--- a/include/linux/irqnr.h
+++ b/include/linux/irqnr.h
@@ -20,11 +20,18 @@
 
 # define for_each_irq_desc_reverse(irq, desc)                          \
 	for (irq = nr_irqs - 1; irq >= 0; irq--)
+
 #else /* CONFIG_GENERIC_HARDIRQS */
 
+#include <asm/irq_vectors.h>	/* need possible max_nr_irqs() */
+
 extern int nr_irqs;
 extern struct irq_desc *irq_to_desc(unsigned int irq);
 
+# ifndef max_nr_irqs
+#  define max_nr_irqs(nr_cpus)	NR_IRQS
+# endif
+
 # define for_each_irq_desc(irq, desc)					\
 	for (irq = 0, desc = irq_to_desc(irq); irq < nr_irqs;		\
 	     irq++, desc = irq_to_desc(irq))				\
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index d0b8f7e72790..ebba7a116f14 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -133,6 +133,9 @@ int __init early_irq_init(void)
 	int legacy_count;
 	int i;
 
+	/* initialize nr_irqs based on nr_cpu_ids */
+	nr_irqs = max_nr_irqs(nr_cpu_ids);
+
 	printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d\n", NR_IRQS, nr_irqs);
 
 	desc = irq_desc_legacy;
-- 
cgit v1.2.3


From 542d865bbed4ce1f050f586e53cf1cfadda93766 Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Sat, 10 Jan 2009 22:24:07 -0800
Subject: kstat: modify kstat_irqs_legacy to be variable sized

Impact: reduce memory usage.

Allocate kstat_irqs_legacy based on nr_cpu_ids to deal with this
memory usage bump when NR_CPUS bumped from 128 to 4096:

     8192   +253952    262144 +3100%  kstat_irqs_legacy(.bss)

This is only when CONFIG_SPARSE_IRQS=y.

Signed-off-by: Mike Travis <travis@sgi.com>
---
 kernel/irq/handle.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index ebba7a116f14..b39f32ac8f80 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -124,8 +124,7 @@ static struct irq_desc irq_desc_legacy[NR_IRQS_LEGACY] __cacheline_aligned_in_sm
 	}
 };
 
-/* FIXME: use bootmem alloc ...*/
-static unsigned int kstat_irqs_legacy[NR_IRQS_LEGACY][NR_CPUS];
+static unsigned int *kstat_irqs_legacy;
 
 int __init early_irq_init(void)
 {
@@ -144,9 +143,14 @@ int __init early_irq_init(void)
 	/* allocate irq_desc_ptrs array based on nr_irqs */
 	irq_desc_ptrs = alloc_bootmem(nr_irqs * sizeof(void *));
 
+	/* allocate based on nr_cpu_ids */
+	/* FIXME: invert kstat_irgs, and it'd be a per_cpu_alloc'd thing */
+	kstat_irqs_legacy = alloc_bootmem(NR_IRQS_LEGACY * nr_cpu_ids *
+					  sizeof(int));
+
 	for (i = 0; i < legacy_count; i++) {
 		desc[i].irq = i;
-		desc[i].kstat_irqs = kstat_irqs_legacy[i];
+		desc[i].kstat_irqs = kstat_irqs_legacy + i * nr_cpu_ids;
 		lockdep_set_class(&desc[i].lock, &irq_desc_lock_class);
 		init_alloc_desc_masks(&desc[i], 0, true);
 		irq_desc_ptrs[i] = desc + i;
-- 
cgit v1.2.3


From 92296c6d6e908c35fca287a21af27be814af9c75 Mon Sep 17 00:00:00 2001
From: Mike Travis <travis@sgi.com>
Date: Sun, 11 Jan 2009 09:22:58 -0800
Subject: cpumask, irq: non-x86 build failures

Ingo Molnar wrote:

> All non-x86 architectures fail to build:
>
> In file included from /home/mingo/tip/include/linux/random.h:11,
>                  from /home/mingo/tip/include/linux/stackprotector.h:6,
>                  from /home/mingo/tip/init/main.c:17:
> /home/mingo/tip/include/linux/irqnr.h:26:63: error: asm/irq_vectors.h: No such file or directory

Do not include asm/irq_vectors.h in generic code - it's not available
on all architectures.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/include/asm/apicdef.h | 8 ++++++--
 include/linux/irqnr.h          | 6 ------
 kernel/irq/handle.c            | 5 +++++
 3 files changed, 11 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/include/asm/apicdef.h b/arch/x86/include/asm/apicdef.h
index 1a6454ef7f6c..63134e31e8b9 100644
--- a/arch/x86/include/asm/apicdef.h
+++ b/arch/x86/include/asm/apicdef.h
@@ -132,8 +132,12 @@
 #define APIC_BASE_MSR	0x800
 #define X2APIC_ENABLE	(1UL << 10)
 
-/* get MAX_IO_APICS */
-#include <asm/apicnum.h>
+#ifdef CONFIG_X86_32
+# define MAX_IO_APICS 64
+#else
+# define MAX_IO_APICS 128
+# define MAX_LOCAL_APIC 32768
+#endif
 
 /*
  * All x86-64 systems are xAPIC compatible.
diff --git a/include/linux/irqnr.h b/include/linux/irqnr.h
index de66e4e10406..887477bc2ab0 100644
--- a/include/linux/irqnr.h
+++ b/include/linux/irqnr.h
@@ -23,15 +23,9 @@
 
 #else /* CONFIG_GENERIC_HARDIRQS */
 
-#include <asm/irq_vectors.h>	/* need possible max_nr_irqs() */
-
 extern int nr_irqs;
 extern struct irq_desc *irq_to_desc(unsigned int irq);
 
-# ifndef max_nr_irqs
-#  define max_nr_irqs(nr_cpus)	NR_IRQS
-# endif
-
 # define for_each_irq_desc(irq, desc)					\
 	for (irq = 0, desc = irq_to_desc(irq); irq < nr_irqs;		\
 	     irq++, desc = irq_to_desc(irq))				\
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index b39f32ac8f80..04d3e46031e5 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -58,6 +58,11 @@ int nr_irqs = NR_IRQS;
 EXPORT_SYMBOL_GPL(nr_irqs);
 
 #ifdef CONFIG_SPARSE_IRQ
+
+#ifndef max_nr_irqs
+#define max_nr_irqs(nr_cpus)	NR_IRQS
+#endif
+
 static struct irq_desc irq_desc_init = {
 	.irq	    = -1,
 	.status	    = IRQ_DISABLED,
-- 
cgit v1.2.3


From 4a046d1754ee6ebb6f399696805ed61ea0444d4c Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Mon, 12 Jan 2009 17:39:24 -0800
Subject: x86: arch_probe_nr_irqs

Impact: save RAM with large NR_CPUS, get smaller nr_irqs

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Mike Travis <travis@sgi.com>
---
 arch/x86/include/asm/irq_vectors.h |  7 ++-----
 arch/x86/kernel/io_apic.c          | 16 ++++++++++++++++
 include/linux/interrupt.h          |  1 +
 kernel/irq/handle.c                |  9 ++-------
 kernel/softirq.c                   |  5 +++++
 5 files changed, 26 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 602361ad0e74..a16a2ab2b429 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -115,14 +115,11 @@
 # endif
 #else
 
-/* defined as a macro so nr_irqs = max_nr_irqs(nr_cpu_ids) can be used */
-# define max_nr_irqs(nr_cpus)				\
-	((8 * nr_cpus) > (32 * MAX_IO_APICS) ?		\
+# define NR_IRQS					\
+	((8 * NR_CPUS) > (32 * MAX_IO_APICS) ?		\
 		(NR_VECTORS + (8 * NR_CPUS)) :		\
 		(NR_VECTORS + (32 * MAX_IO_APICS)))	\
 
-# define NR_IRQS max_nr_irqs(NR_CPUS)
-
 #endif
 
 #elif defined(CONFIG_X86_VOYAGER)
diff --git a/arch/x86/kernel/io_apic.c b/arch/x86/kernel/io_apic.c
index ae80638012de..157986916cd1 100644
--- a/arch/x86/kernel/io_apic.c
+++ b/arch/x86/kernel/io_apic.c
@@ -3850,6 +3850,22 @@ void __init probe_nr_irqs_gsi(void)
 		nr_irqs_gsi = nr;
 }
 
+#ifdef CONFIG_SPARSE_IRQ
+int __init arch_probe_nr_irqs(void)
+{
+	int nr;
+
+	nr = ((8 * nr_cpu_ids) > (32 * nr_ioapics) ?
+		(NR_VECTORS + (8 * nr_cpu_ids)) :
+		(NR_VECTORS + (32 * nr_ioapics)));
+
+	if (nr < nr_irqs && nr > nr_irqs_gsi)
+		nr_irqs = nr;
+
+	return 0;
+}
+#endif
+
 /* --------------------------------------------------------------------------
                           ACPI-based IOAPIC Configuration
    -------------------------------------------------------------------------- */
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 9127f6b51a39..472f11765f60 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -467,6 +467,7 @@ int show_interrupts(struct seq_file *p, void *v);
 struct irq_desc;
 
 extern int early_irq_init(void);
+extern int arch_probe_nr_irqs(void);
 extern int arch_early_irq_init(void);
 extern int arch_init_chip_data(struct irq_desc *desc, int cpu);
 
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 04d3e46031e5..375d68cd5bf0 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -59,10 +59,6 @@ EXPORT_SYMBOL_GPL(nr_irqs);
 
 #ifdef CONFIG_SPARSE_IRQ
 
-#ifndef max_nr_irqs
-#define max_nr_irqs(nr_cpus)	NR_IRQS
-#endif
-
 static struct irq_desc irq_desc_init = {
 	.irq	    = -1,
 	.status	    = IRQ_DISABLED,
@@ -137,9 +133,8 @@ int __init early_irq_init(void)
 	int legacy_count;
 	int i;
 
-	/* initialize nr_irqs based on nr_cpu_ids */
-	nr_irqs = max_nr_irqs(nr_cpu_ids);
-
+	 /* initialize nr_irqs based on nr_cpu_ids */
+	arch_probe_nr_irqs();
 	printk(KERN_INFO "NR_IRQS:%d nr_irqs:%d\n", NR_IRQS, nr_irqs);
 
 	desc = irq_desc_legacy;
diff --git a/kernel/softirq.c b/kernel/softirq.c
index bdbe9de9cd8d..0365b4899a3d 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -795,6 +795,11 @@ int __init __weak early_irq_init(void)
 	return 0;
 }
 
+int __init __weak arch_probe_nr_irqs(void)
+{
+	return 0;
+}
+
 int __init __weak arch_early_irq_init(void)
 {
 	return 0;
-- 
cgit v1.2.3


From 25aac9dc7c8c73798c1be8aa36141f980d32579e Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Fri, 9 Jan 2009 11:29:40 +0800
Subject: ftrace, ia64: explictly ignore a file in recordmcount.pl

In IA64, a function pointer isn't a 'unsigned long' but a
'struct {unsigned long ip, unsigned long gp}'. MCOUNT_ADDR is determined
at link time not compile time, so explictly ignore kernel/trace/ftrace.o
in recordmcount.pl.

Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c   | 10 +---------
 scripts/recordmcount.pl |  5 +++++
 2 files changed, 6 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 9e54a6ccdb93..76bb884b6e16 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -263,14 +263,6 @@ static void ftrace_update_pid_func(void)
 # error Dynamic ftrace depends on MCOUNT_RECORD
 #endif
 
-/*
- * Since MCOUNT_ADDR may point to mcount itself, we do not want
- * to get it confused by reading a reference in the code as we
- * are parsing on objcopy output of text. Use a variable for
- * it instead.
- */
-static unsigned long mcount_addr = MCOUNT_ADDR;
-
 enum {
 	FTRACE_ENABLE_CALLS		= (1 << 0),
 	FTRACE_DISABLE_CALLS		= (1 << 1),
@@ -575,7 +567,7 @@ ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)
 
 	ip = rec->ip;
 
-	ret = ftrace_make_nop(mod, rec, mcount_addr);
+	ret = ftrace_make_nop(mod, rec, MCOUNT_ADDR);
 	if (ret) {
 		ftrace_bug(ret, ip);
 		rec->flags |= FTRACE_FL_FAILED;
diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl
index 282485a22bf8..070042b4ccd5 100755
--- a/scripts/recordmcount.pl
+++ b/scripts/recordmcount.pl
@@ -109,6 +109,11 @@ if ($#ARGV < 7) {
 my ($arch, $bits, $objdump, $objcopy, $cc,
     $ld, $nm, $rm, $mv, $is_module, $inputfile) = @ARGV;
 
+# This file refers to mcount and shouldn't be ftraced, so lets' ignore it
+if ($inputfile eq "kernel/trace/ftrace.o") {
+    exit(0);
+}
+
 # Acceptable sections to record.
 my %text_sections = (
      ".text" => 1,
-- 
cgit v1.2.3


From f00012074b1a1a67d9c8603617bbbab267347ca6 Mon Sep 17 00:00:00 2001
From: Shaohua Li <shaohua.li@intel.com>
Date: Fri, 9 Jan 2009 11:29:42 +0800
Subject: ftrace, ia64: Add macro for ftrace_caller

Define FTRACE_ADDR. In IA64, a function pointer isn't a 'unsigned long' but a
'struct {unsigned long ip, unsigned long gp}'.

Signed-off-by: Shaohua Li <shaohua.li@intel.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace.h | 4 ++++
 kernel/trace/ftrace.c  | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 677432b9cb7e..054721487574 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -126,6 +126,10 @@ extern int ftrace_update_ftrace_func(ftrace_func_t func);
 extern void ftrace_caller(void);
 extern void ftrace_call(void);
 extern void mcount_call(void);
+
+#ifndef FTRACE_ADDR
+#define FTRACE_ADDR ((unsigned long)ftrace_caller)
+#endif
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 extern void ftrace_graph_caller(void);
 extern int ftrace_enable_ftrace_graph_caller(void);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 76bb884b6e16..9f536108d3f3 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -455,7 +455,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
 	unsigned long ip, fl;
 	unsigned long ftrace_addr;
 
-	ftrace_addr = (unsigned long)ftrace_caller;
+	ftrace_addr = (unsigned long)FTRACE_ADDR;
 
 	ip = rec->ip;
 
-- 
cgit v1.2.3


From 002bb86d8d42f18937aef396c3ecd65c7e02e21a Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 10 Jan 2009 11:34:13 -0800
Subject: tracing/ftrace: separate events tracing and stats tracing engine

Impact: tracing's Api change

Currently, the stat tracing depends on the events tracing.
When you switch to a new tracer, the stats files of the previous tracer
will disappear. But it's more scalable to separate those two engines.
This way, we can keep the stat files of one or several tracers when we
want, without bothering of multiple tracer stat files or tracer switching.

To build/destroys its stats files, a tracer just have to call
register_stat_tracer/unregister_stat_tracer everytimes it wants to.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c        |   2 -
 kernel/trace/trace.h        |  20 -----
 kernel/trace/trace_branch.c | 108 ++++++++++++++-----------
 kernel/trace/trace_stat.c   | 191 +++++++++++++++++++-------------------------
 kernel/trace/trace_stat.h   |  31 +++++++
 5 files changed, 172 insertions(+), 180 deletions(-)
 create mode 100644 kernel/trace/trace_stat.h

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 0418fc338b5c..40217fb499ea 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2353,7 +2353,6 @@ static int tracing_set_tracer(char *buf)
 		if (ret)
 			goto out;
 	}
-	init_tracer_stat(t);
 
 	trace_branch_enable(tr);
  out:
@@ -3218,7 +3217,6 @@ __init static int tracer_alloc_buffers(void)
 #else
 	current_trace = &nop_trace;
 #endif
-	init_tracer_stat(current_trace);
 	/* All seems OK, enable tracing */
 	tracing_disabled = 0;
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index b3f9ad1b4d84..79c872100dd5 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -334,24 +334,6 @@ struct tracer_flags {
 /* Makes more easy to define a tracer opt */
 #define TRACER_OPT(s, b)	.name = #s, .bit = b
 
-/*
- * If you want to provide a stat file (one-shot statistics), fill
- * an iterator with stat_start/stat_next and a stat_show callbacks.
- * The others callbacks are optional.
- */
-struct tracer_stat {
-	/* The name of your stat file */
-	const char		*name;
-	/* Iteration over statistic entries */
-	void			*(*stat_start)(void);
-	void			*(*stat_next)(void *prev, int idx);
-	/* Compare two entries for sorting (optional) for stats */
-	int			(*stat_cmp)(void *p1, void *p2);
-	/* Print a stat entry */
-	int			(*stat_show)(struct seq_file *s, void *p);
-	/* Print the headers of your stat entries */
-	int			(*stat_headers)(struct seq_file *s);
-};
 
 /*
  * A specific tracer, represented by methods that operate on a trace array:
@@ -466,8 +448,6 @@ void tracing_start_sched_switch_record(void);
 int register_tracer(struct tracer *type);
 void unregister_tracer(struct tracer *type);
 
-void init_tracer_stat(struct tracer *trace);
-
 extern unsigned long nsecs_to_usecs(unsigned long nsecs);
 
 extern unsigned long tracing_max_latency;
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index da5cf3e5581b..ca017e0a9a27 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -16,12 +16,12 @@
 #include <asm/local.h>
 
 #include "trace.h"
+#include "trace_stat.h"
 #include "trace_output.h"
 
-static struct tracer branch_trace;
-
 #ifdef CONFIG_BRANCH_TRACER
 
+static struct tracer branch_trace;
 static int branch_tracing_enabled __read_mostly;
 static DEFINE_MUTEX(branch_tracing_mutex);
 
@@ -191,6 +191,30 @@ static struct trace_event trace_branch_event = {
 	.binary		= trace_nop_print,
 };
 
+static struct tracer branch_trace __read_mostly =
+{
+	.name		= "branch",
+	.init		= branch_trace_init,
+	.reset		= branch_trace_reset,
+#ifdef CONFIG_FTRACE_SELFTEST
+	.selftest	= trace_selftest_startup_branch,
+#endif /* CONFIG_FTRACE_SELFTEST */
+};
+
+__init static int init_branch_tracer(void)
+{
+	int ret;
+
+	ret = register_ftrace_event(&trace_branch_event);
+	if (!ret) {
+		printk(KERN_WARNING "Warning: could not register "
+				    "branch events\n");
+		return 1;
+	}
+	return register_tracer(&branch_trace);
+}
+device_initcall(init_branch_tracer);
+
 #else
 static inline
 void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect)
@@ -305,6 +329,29 @@ static int annotated_branch_stat_cmp(void *p1, void *p2)
 		return 0;
 }
 
+static struct tracer_stat annotated_branch_stats = {
+	.name = "branch_annotated",
+	.stat_start = annotated_branch_stat_start,
+	.stat_next = annotated_branch_stat_next,
+	.stat_cmp = annotated_branch_stat_cmp,
+	.stat_headers = annotated_branch_stat_headers,
+	.stat_show = branch_stat_show
+};
+
+__init static int init_annotated_branch_stats(void)
+{
+	int ret;
+
+	ret = register_stat_tracer(&annotated_branch_stats);
+	if (!ret) {
+		printk(KERN_WARNING "Warning: could not register "
+				    "annotated branches stats\n");
+		return 1;
+	}
+	return 0;
+}
+fs_initcall(init_annotated_branch_stats);
+
 #ifdef CONFIG_PROFILE_ALL_BRANCHES
 
 extern unsigned long __start_branch_profile[];
@@ -339,60 +386,25 @@ all_branch_stat_next(void *v, int idx)
 	return p;
 }
 
-static struct tracer_stat branch_stats[] = {
-	{.name = "annotated",
-	.stat_start = annotated_branch_stat_start,
-	.stat_next = annotated_branch_stat_next,
-	.stat_cmp = annotated_branch_stat_cmp,
-	.stat_headers = annotated_branch_stat_headers,
-	.stat_show = branch_stat_show},
-
-	{.name = "all",
+static struct tracer_stat all_branch_stats = {
+	.name = "branch_all",
 	.stat_start = all_branch_stat_start,
 	.stat_next = all_branch_stat_next,
 	.stat_headers = all_branch_stat_headers,
-	.stat_show = branch_stat_show},
-
-	{ }
-};
-#else
-static struct tracer_stat branch_stats[] = {
-	{.name = "annotated",
-	.stat_start = annotated_branch_stat_start,
-	.stat_next = annotated_branch_stat_next,
-	.stat_cmp = annotated_branch_stat_cmp,
-	.stat_headers = annotated_branch_stat_headers,
-	.stat_show = branch_stat_show},
-
-	{ }
+	.stat_show = branch_stat_show
 };
-#endif /* CONFIG_PROFILE_ALL_BRANCHES */
 
-
-static struct tracer branch_trace __read_mostly =
+__init static int all_annotated_branch_stats(void)
 {
-	.name		= "branch",
-#ifdef CONFIG_BRANCH_TRACER
-	.init		= branch_trace_init,
-	.reset		= branch_trace_reset,
-#ifdef CONFIG_FTRACE_SELFTEST
-	.selftest	= trace_selftest_startup_branch,
-#endif /* CONFIG_FTRACE_SELFTEST */
-#endif
-	.stats		= branch_stats
-};
-
-__init static int init_branch_trace(void)
-{
-#ifdef CONFIG_BRANCH_TRACER
 	int ret;
-	ret = register_ftrace_event(&trace_branch_event);
+
+	ret = register_stat_tracer(&all_branch_stats);
 	if (!ret) {
-		printk(KERN_WARNING "Warning: could not register branch events\n");
+		printk(KERN_WARNING "Warning: could not register "
+				    "all branches stats\n");
 		return 1;
 	}
-#endif
-
-	return register_tracer(&branch_trace);
+	return 0;
 }
-device_initcall(init_branch_trace);
+fs_initcall(all_annotated_branch_stats);
+#endif /* CONFIG_PROFILE_ALL_BRANCHES */
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index 1515f9e7adfc..cb29282b9485 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -10,28 +10,32 @@
 
 
 #include <linux/list.h>
-#include <linux/seq_file.h>
 #include <linux/debugfs.h>
+#include "trace_stat.h"
 #include "trace.h"
 
 
 /* List of stat entries from a tracer */
 struct trace_stat_list {
-	struct list_head list;
-	void *stat;
+	struct list_head 	list;
+	void 			*stat;
 };
 
 /* A stat session is the stats output in one file */
 struct tracer_stat_session {
-	struct tracer_stat *ts;
-	struct list_head stat_list;
-	struct mutex stat_mutex;
+	struct list_head	session_list;
+	struct tracer_stat 	*ts;
+	struct list_head 	stat_list;
+	struct mutex 		stat_mutex;
+	struct dentry		*file;
 };
 
 /* All of the sessions currently in use. Each stat file embeed one session */
-static struct tracer_stat_session **all_stat_sessions;
-static int nb_sessions;
-static struct dentry *stat_dir, **stat_files;
+static LIST_HEAD(all_stat_sessions);
+static DEFINE_MUTEX(all_stat_sessions_mutex);
+
+/* The root directory for all stat files */
+static struct dentry *stat_dir;
 
 
 static void reset_stat_session(struct tracer_stat_session *session)
@@ -44,66 +48,77 @@ static void reset_stat_session(struct tracer_stat_session *session)
 	INIT_LIST_HEAD(&session->stat_list);
 }
 
-/* Called when a tracer is initialized */
-static int init_all_sessions(int nb, struct tracer_stat *ts)
+static void destroy_session(struct tracer_stat_session *session)
 {
-	int i, j;
-	struct tracer_stat_session *session;
+	debugfs_remove(session->file);
+	reset_stat_session(session);
+	mutex_destroy(&session->stat_mutex);
+	kfree(session);
+}
 
-	nb_sessions = 0;
 
-	if (all_stat_sessions) {
-		for (i = 0; i < nb_sessions; i++) {
-			session = all_stat_sessions[i];
-			reset_stat_session(session);
-			mutex_destroy(&session->stat_mutex);
-			kfree(session);
-		}
-	}
-	all_stat_sessions = kmalloc(sizeof(struct tracer_stat_session *) * nb,
-				    GFP_KERNEL);
-	if (!all_stat_sessions)
-		return -ENOMEM;
+static int init_stat_file(struct tracer_stat_session *session);
 
-	for (i = 0; i < nb; i++) {
-		session = kmalloc(sizeof(struct tracer_stat_session) * nb,
-				  GFP_KERNEL);
-		if (!session)
-			goto free_sessions;
+int register_stat_tracer(struct tracer_stat *trace)
+{
+	struct tracer_stat_session *session, *node, *tmp;
+	int ret;
+
+	if (!trace)
+		return -EINVAL;
+
+	if (!trace->stat_start || !trace->stat_next || !trace->stat_show)
+		return -EINVAL;
 
-		INIT_LIST_HEAD(&session->stat_list);
-		mutex_init(&session->stat_mutex);
-		session->ts = &ts[i];
-		all_stat_sessions[i] = session;
+	/* Already registered? */
+	mutex_lock(&all_stat_sessions_mutex);
+	list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) {
+		if (node->ts == trace)
+			return -EINVAL;
 	}
-	nb_sessions = nb;
-	return 0;
+	mutex_unlock(&all_stat_sessions_mutex);
+
+	/* Init the session */
+	session = kmalloc(sizeof(struct tracer_stat_session), GFP_KERNEL);
+	if (!session)
+		return -ENOMEM;
 
-free_sessions:
+	session->ts = trace;
+	INIT_LIST_HEAD(&session->session_list);
+	INIT_LIST_HEAD(&session->stat_list);
+	mutex_init(&session->stat_mutex);
+	session->file = NULL;
 
-	for (j = 0; j < i; j++)
-		kfree(all_stat_sessions[i]);
+	ret = init_stat_file(session);
+	if (ret) {
+		destroy_session(session);
+		return ret;
+	}
 
-	kfree(all_stat_sessions);
-	all_stat_sessions = NULL;
+	/* Register */
+	mutex_lock(&all_stat_sessions_mutex);
+	list_add_tail(&session->session_list, &all_stat_sessions);
+	mutex_unlock(&all_stat_sessions_mutex);
 
-	return -ENOMEM;
+	return 0;
 }
 
-static int basic_tracer_stat_checks(struct tracer_stat *ts)
+void unregister_stat_tracer(struct tracer_stat *trace)
 {
-	int i;
+	struct tracer_stat_session *node, *tmp;
 
-	if (!ts)
-		return 0;
-
-	for (i = 0; ts[i].name; i++) {
-		if (!ts[i].stat_start || !ts[i].stat_next || !ts[i].stat_show)
-			return -EBUSY;
+	mutex_lock(&all_stat_sessions_mutex);
+	list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) {
+		if (node->ts == trace) {
+			list_del(&node->session_list);
+			destroy_session(node);
+			break;
+		}
 	}
-	return i;
+	mutex_unlock(&all_stat_sessions_mutex);
 }
 
+
 /*
  * For tracers that don't provide a stat_cmp callback.
  * This one will force an immediate insertion on tail of
@@ -280,63 +295,7 @@ static const struct file_operations tracing_stat_fops = {
 	.release	= tracing_stat_release
 };
 
-
-static void destroy_trace_stat_files(void)
-{
-	int i;
-
-	if (stat_files) {
-		for (i = 0; i < nb_sessions; i++)
-			debugfs_remove(stat_files[i]);
-		kfree(stat_files);
-		stat_files = NULL;
-	}
-}
-
-static void init_trace_stat_files(void)
-{
-	int i;
-
-	if (!stat_dir || !nb_sessions)
-		return;
-
-	stat_files = kmalloc(sizeof(struct dentry *) * nb_sessions, GFP_KERNEL);
-
-	if (!stat_files) {
-		pr_warning("trace stat: not enough memory\n");
-		return;
-	}
-
-	for (i = 0; i < nb_sessions; i++) {
-		struct tracer_stat_session *session = all_stat_sessions[i];
-		stat_files[i] = debugfs_create_file(session->ts->name, 0644,
-						stat_dir,
-						session, &tracing_stat_fops);
-		if (!stat_files[i])
-			pr_warning("cannot create %s entry\n",
-				   session->ts->name);
-	}
-}
-
-void init_tracer_stat(struct tracer *trace)
-{
-	int nb = basic_tracer_stat_checks(trace->stats);
-
-	destroy_trace_stat_files();
-
-	if (nb < 0) {
-		pr_warning("stat tracing: missing stat callback on %s\n",
-			   trace->name);
-		return;
-	}
-	if (!nb)
-		return;
-
-	init_all_sessions(nb, trace->stats);
-	init_trace_stat_files();
-}
-
-static int __init tracing_stat_init(void)
+static int tracing_stat_init(void)
 {
 	struct dentry *d_tracing;
 
@@ -348,4 +307,16 @@ static int __init tracing_stat_init(void)
 			   "'trace_stat' entry\n");
 	return 0;
 }
-fs_initcall(tracing_stat_init);
+
+static int init_stat_file(struct tracer_stat_session *session)
+{
+	if (!stat_dir && tracing_stat_init())
+		return -ENODEV;
+
+	session->file = debugfs_create_file(session->ts->name, 0644,
+					    stat_dir,
+					    session, &tracing_stat_fops);
+	if (!session->file)
+		return -ENOMEM;
+	return 0;
+}
diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h
new file mode 100644
index 000000000000..202274cf7f3d
--- /dev/null
+++ b/kernel/trace/trace_stat.h
@@ -0,0 +1,31 @@
+#ifndef __TRACE_STAT_H
+#define __TRACE_STAT_H
+
+#include <linux/seq_file.h>
+
+/*
+ * If you want to provide a stat file (one-shot statistics), fill
+ * an iterator with stat_start/stat_next and a stat_show callbacks.
+ * The others callbacks are optional.
+ */
+struct tracer_stat {
+	/* The name of your stat file */
+	const char		*name;
+	/* Iteration over statistic entries */
+	void			*(*stat_start)(void);
+	void			*(*stat_next)(void *prev, int idx);
+	/* Compare two entries for stats sorting */
+	int			(*stat_cmp)(void *p1, void *p2);
+	/* Print a stat entry */
+	int			(*stat_show)(struct seq_file *s, void *p);
+	/* Print the headers of your stat entries */
+	int			(*stat_headers)(struct seq_file *s);
+};
+
+/*
+ * Destroy or create a stat file
+ */
+extern int register_stat_tracer(struct tracer_stat *trace);
+extern void unregister_stat_tracer(struct tracer_stat *trace);
+
+#endif /* __TRACE_STAT_H */
-- 
cgit v1.2.3


From e1d8aa9f1dd655a3534b22fcfbecb70cdb125766 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Mon, 12 Jan 2009 23:15:46 +0100
Subject: tracing: add a new workqueue tracer

Impact: new tracer

The workqueue tracer provides some statistical informations
about each cpu workqueue thread such as the number of the
works inserted and executed since their creation. It can help
to evaluate the amount of work each of them have to perform.
For example it can help a developer to decide whether he should
choose a per cpu workqueue instead of a singlethreaded one.

It only traces statistical informations for now but it will probably later
provide event tracing too.

Such a tracer could help too, and be improved, to help rt priority sorted
workqueue development.

To have a snapshot of the workqueues state at any time, just do

cat /debugfs/tracing/trace_stat/workqueues

Ie:

  1    125        125       reiserfs/1
  1      0          0       scsi_tgtd/1
  1      0          0       aio/1
  1      0          0       ata/1
  1    114        114       kblockd/1
  1      0          0       kintegrityd/1
  1   2147       2147       events/1

  0      0          0       kpsmoused
  0    105        105       reiserfs/0
  0      0          0       scsi_tgtd/0
  0      0          0       aio/0
  0      0          0       ata_aux
  0      0          0       ata/0
  0      0          0       cqueue
  0      0          0       kacpi_notify
  0      0          0       kacpid
  0    149        149       kblockd/0
  0      0          0       kintegrityd/0
  0   1000       1000       khelper
  0   2270       2270       events/0

Changes in V2:

_ Drop the static array based on NR_CPU and dynamically allocate the stat array
  with num_possible_cpus() and other cpu mask facilities....
_ Trace workqueue insertion at a bit lower level (insert_work instead of queue_work) to handle
  even the workqueue barriers.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/trace/workqueue.h      |  25 ++++
 kernel/trace/Kconfig           |  11 ++
 kernel/trace/Makefile          |   1 +
 kernel/trace/trace_workqueue.c | 287 +++++++++++++++++++++++++++++++++++++++++
 kernel/workqueue.c             |  16 ++-
 5 files changed, 339 insertions(+), 1 deletion(-)
 create mode 100644 include/trace/workqueue.h
 create mode 100644 kernel/trace/trace_workqueue.c

(limited to 'kernel')

diff --git a/include/trace/workqueue.h b/include/trace/workqueue.h
new file mode 100644
index 000000000000..867829df4571
--- /dev/null
+++ b/include/trace/workqueue.h
@@ -0,0 +1,25 @@
+#ifndef __TRACE_WORKQUEUE_H
+#define __TRACE_WORKQUEUE_H
+
+#include <linux/tracepoint.h>
+#include <linux/workqueue.h>
+#include <linux/sched.h>
+
+DECLARE_TRACE(workqueue_insertion,
+	   TPPROTO(struct task_struct *wq_thread, struct work_struct *work),
+	   TPARGS(wq_thread, work));
+
+DECLARE_TRACE(workqueue_execution,
+	   TPPROTO(struct task_struct *wq_thread, struct work_struct *work),
+	   TPARGS(wq_thread, work));
+
+/* Trace the creation of one workqueue thread on a cpu */
+DECLARE_TRACE(workqueue_creation,
+	   TPPROTO(struct task_struct *wq_thread, int cpu),
+	   TPARGS(wq_thread, cpu));
+
+DECLARE_TRACE(workqueue_destruction,
+	   TPPROTO(struct task_struct *wq_thread),
+	   TPARGS(wq_thread));
+
+#endif /* __TRACE_WORKQUEUE_H */
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 944239296f13..dde1d46f77e5 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -284,6 +284,17 @@ config KMEMTRACE
 
 	  If unsure, say N.
 
+config WORKQUEUE_TRACER
+	bool "Trace workqueues"
+	select TRACING
+	help
+	  The workqueue tracer provides some statistical informations
+          about each cpu workqueue thread such as the number of the
+          works inserted and executed since their creation. It can help
+          to evaluate the amount of work each of them have to perform.
+          For example it can help a developer to decide whether he should
+          choose a per cpu workqueue instead of a singlethreaded one.
+
 
 config DYNAMIC_FTRACE
 	bool "enable/disable ftrace tracepoints dynamically"
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 05c9182061de..f76d48f3527d 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -36,5 +36,6 @@ obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
 obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
 obj-$(CONFIG_POWER_TRACER) += trace_power.o
 obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
+obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
 
 libftrace-y := ftrace.o
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
new file mode 100644
index 000000000000..f8118d39ca9b
--- /dev/null
+++ b/kernel/trace/trace_workqueue.c
@@ -0,0 +1,287 @@
+/*
+ * Workqueue statistical tracer.
+ *
+ * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
+ *
+ */
+
+
+#include <trace/workqueue.h>
+#include <linux/list.h>
+#include "trace_stat.h"
+#include "trace.h"
+
+
+/* A cpu workqueue thread */
+struct cpu_workqueue_stats {
+	struct list_head            list;
+/* Useful to know if we print the cpu headers */
+	bool		            first_entry;
+	int		            cpu;
+	pid_t 			    pid;
+/* Can be inserted from interrupt or user context, need to be atomic */
+	atomic_t 	            inserted;
+/*
+ *  Don't need to be atomic, works are serialized in a single workqueue thread
+ *  on a single CPU.
+ */
+	unsigned int 	 	    executed;
+};
+
+/* List of workqueue threads on one cpu */
+struct workqueue_global_stats {
+	struct list_head	list;
+	spinlock_t		lock;
+};
+
+/* Don't need a global lock because allocated before the workqueues, and
+ * never freed.
+ */
+static struct workqueue_global_stats *all_workqueue_stat;
+
+/* Insertion of a work */
+static void
+probe_workqueue_insertion(struct task_struct *wq_thread,
+			  struct work_struct *work)
+{
+	int cpu = cpumask_first(&wq_thread->cpus_allowed);
+	struct cpu_workqueue_stats *node, *next;
+	unsigned long flags;
+
+	spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags);
+	list_for_each_entry_safe(node, next, &all_workqueue_stat[cpu].list,
+							list) {
+		if (node->pid == wq_thread->pid) {
+			atomic_inc(&node->inserted);
+			goto found;
+		}
+	}
+	pr_debug("trace_workqueue: entry not found\n");
+found:
+	spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags);
+}
+
+/* Execution of a work */
+static void
+probe_workqueue_execution(struct task_struct *wq_thread,
+			  struct work_struct *work)
+{
+	int cpu = cpumask_first(&wq_thread->cpus_allowed);
+	struct cpu_workqueue_stats *node, *next;
+	unsigned long flags;
+
+	spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags);
+	list_for_each_entry_safe(node, next, &all_workqueue_stat[cpu].list,
+							list) {
+		if (node->pid == wq_thread->pid) {
+			node->executed++;
+			goto found;
+		}
+	}
+	pr_debug("trace_workqueue: entry not found\n");
+found:
+	spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags);
+}
+
+/* Creation of a cpu workqueue thread */
+static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu)
+{
+	struct cpu_workqueue_stats *cws;
+	unsigned long flags;
+
+	WARN_ON(cpu < 0 || cpu >= num_possible_cpus());
+
+	/* Workqueues are sometimes created in atomic context */
+	cws = kzalloc(sizeof(struct cpu_workqueue_stats), GFP_ATOMIC);
+	if (!cws) {
+		pr_warning("trace_workqueue: not enough memory\n");
+		return;
+	}
+	tracing_record_cmdline(wq_thread);
+
+	INIT_LIST_HEAD(&cws->list);
+	cws->cpu = cpu;
+
+	cws->pid = wq_thread->pid;
+
+	spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags);
+	if (list_empty(&all_workqueue_stat[cpu].list))
+		cws->first_entry = true;
+	list_add_tail(&cws->list, &all_workqueue_stat[cpu].list);
+	spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags);
+}
+
+/* Destruction of a cpu workqueue thread */
+static void probe_workqueue_destruction(struct task_struct *wq_thread)
+{
+	/* Workqueue only execute on one cpu */
+	int cpu = cpumask_first(&wq_thread->cpus_allowed);
+	struct cpu_workqueue_stats *node, *next;
+	unsigned long flags;
+
+	spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags);
+	list_for_each_entry_safe(node, next, &all_workqueue_stat[cpu].list,
+							list) {
+		if (node->pid == wq_thread->pid) {
+			list_del(&node->list);
+			kfree(node);
+			goto found;
+		}
+	}
+
+	pr_debug("trace_workqueue: don't find workqueue to destroy\n");
+found:
+	spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags);
+
+}
+
+static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu)
+{
+	unsigned long flags;
+	struct cpu_workqueue_stats *ret = NULL;
+
+
+	spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags);
+
+	if (!list_empty(&all_workqueue_stat[cpu].list))
+		ret = list_entry(all_workqueue_stat[cpu].list.next,
+				 struct cpu_workqueue_stats, list);
+
+	spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags);
+
+	return ret;
+}
+
+static void *workqueue_stat_start(void)
+{
+	int cpu;
+	void *ret = NULL;
+
+	for_each_possible_cpu(cpu) {
+		ret = workqueue_stat_start_cpu(cpu);
+		if (ret)
+			return ret;
+	}
+	return NULL;
+}
+
+static void *workqueue_stat_next(void *prev, int idx)
+{
+	struct cpu_workqueue_stats *prev_cws = prev;
+	int cpu = prev_cws->cpu;
+	unsigned long flags;
+	void *ret = NULL;
+
+	spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags);
+	if (list_is_last(&prev_cws->list, &all_workqueue_stat[cpu].list)) {
+		spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags);
+		for (++cpu ; cpu < num_possible_cpus(); cpu++) {
+			ret = workqueue_stat_start_cpu(cpu);
+			if (ret)
+				return ret;
+		}
+		return NULL;
+	}
+	spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags);
+
+	return list_entry(prev_cws->list.next, struct cpu_workqueue_stats,
+			  list);
+}
+
+static int workqueue_stat_show(struct seq_file *s, void *p)
+{
+	struct cpu_workqueue_stats *cws = p;
+	unsigned long flags;
+	int cpu = cws->cpu;
+
+	seq_printf(s, "%3d %6d     %6u       %s\n", cws->cpu,
+		   atomic_read(&cws->inserted),
+		   cws->executed,
+		   trace_find_cmdline(cws->pid));
+
+	spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags);
+	if (&cws->list == all_workqueue_stat[cpu].list.next)
+		seq_printf(s, "\n");
+	spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags);
+
+	return 0;
+}
+
+static int workqueue_stat_headers(struct seq_file *s)
+{
+	seq_printf(s, "# CPU  INSERTED  EXECUTED   NAME\n");
+	seq_printf(s, "# |      |         |          |\n\n");
+	return 0;
+}
+
+struct tracer_stat workqueue_stats __read_mostly = {
+	.name = "workqueues",
+	.stat_start = workqueue_stat_start,
+	.stat_next = workqueue_stat_next,
+	.stat_show = workqueue_stat_show,
+	.stat_headers = workqueue_stat_headers
+};
+
+
+int __init stat_workqueue_init(void)
+{
+	if (register_stat_tracer(&workqueue_stats)) {
+		pr_warning("Unable to register workqueue stat tracer\n");
+		return 1;
+	}
+
+	return 0;
+}
+fs_initcall(stat_workqueue_init);
+
+/*
+ * Workqueues are created very early, just after pre-smp initcalls.
+ * So we must register our tracepoints at this stage.
+ */
+int __init trace_workqueue_early_init(void)
+{
+	int ret, cpu;
+
+	ret = register_trace_workqueue_insertion(probe_workqueue_insertion);
+	if (ret)
+		goto out;
+
+	ret = register_trace_workqueue_execution(probe_workqueue_execution);
+	if (ret)
+		goto no_insertion;
+
+	ret = register_trace_workqueue_creation(probe_workqueue_creation);
+	if (ret)
+		goto no_execution;
+
+	ret = register_trace_workqueue_destruction(probe_workqueue_destruction);
+	if (ret)
+		goto no_creation;
+
+	all_workqueue_stat = kmalloc(sizeof(struct workqueue_global_stats)
+				     * num_possible_cpus(), GFP_KERNEL);
+
+	if (!all_workqueue_stat) {
+		pr_warning("trace_workqueue: not enough memory\n");
+		goto no_creation;
+	}
+
+	for_each_possible_cpu(cpu) {
+		spin_lock_init(&all_workqueue_stat[cpu].lock);
+		INIT_LIST_HEAD(&all_workqueue_stat[cpu].list);
+	}
+
+	return 0;
+
+no_creation:
+	unregister_trace_workqueue_creation(probe_workqueue_creation);
+no_execution:
+	unregister_trace_workqueue_execution(probe_workqueue_execution);
+no_insertion:
+	unregister_trace_workqueue_insertion(probe_workqueue_insertion);
+out:
+	pr_warning("trace_workqueue: unable to trace workqueues\n");
+
+	return 1;
+}
+early_initcall(trace_workqueue_early_init);
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 2f445833ae37..1fc2bc20603f 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -33,6 +33,7 @@
 #include <linux/kallsyms.h>
 #include <linux/debug_locks.h>
 #include <linux/lockdep.h>
+#include <trace/workqueue.h>
 
 /*
  * The per-CPU workqueue (if single thread, we always use the first
@@ -125,9 +126,13 @@ struct cpu_workqueue_struct *get_wq_data(struct work_struct *work)
 	return (void *) (atomic_long_read(&work->data) & WORK_STRUCT_WQ_DATA_MASK);
 }
 
+DEFINE_TRACE(workqueue_insertion);
+
 static void insert_work(struct cpu_workqueue_struct *cwq,
 			struct work_struct *work, struct list_head *head)
 {
+	trace_workqueue_insertion(cwq->thread, work);
+
 	set_wq_data(work, cwq);
 	/*
 	 * Ensure that we get the right work->data if we see the
@@ -259,6 +264,8 @@ int queue_delayed_work_on(int cpu, struct workqueue_struct *wq,
 }
 EXPORT_SYMBOL_GPL(queue_delayed_work_on);
 
+DEFINE_TRACE(workqueue_execution);
+
 static void run_workqueue(struct cpu_workqueue_struct *cwq)
 {
 	spin_lock_irq(&cwq->lock);
@@ -284,7 +291,7 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
 		 */
 		struct lockdep_map lockdep_map = work->lockdep_map;
 #endif
-
+		trace_workqueue_execution(cwq->thread, work);
 		cwq->current_work = work;
 		list_del_init(cwq->worklist.next);
 		spin_unlock_irq(&cwq->lock);
@@ -765,6 +772,8 @@ init_cpu_workqueue(struct workqueue_struct *wq, int cpu)
 	return cwq;
 }
 
+DEFINE_TRACE(workqueue_creation);
+
 static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
 {
 	struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
@@ -787,6 +796,8 @@ static int create_workqueue_thread(struct cpu_workqueue_struct *cwq, int cpu)
 		sched_setscheduler_nocheck(p, SCHED_FIFO, &param);
 	cwq->thread = p;
 
+	trace_workqueue_creation(cwq->thread, cpu);
+
 	return 0;
 }
 
@@ -868,6 +879,8 @@ struct workqueue_struct *__create_workqueue_key(const char *name,
 }
 EXPORT_SYMBOL_GPL(__create_workqueue_key);
 
+DEFINE_TRACE(workqueue_destruction);
+
 static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
 {
 	/*
@@ -891,6 +904,7 @@ static void cleanup_workqueue_thread(struct cpu_workqueue_struct *cwq)
 	 * checks list_empty(), and a "normal" queue_work() can't use
 	 * a dead CPU.
 	 */
+	trace_workqueue_destruction(cwq->thread);
 	kthread_stop(cwq->thread);
 	cwq->thread = NULL;
 }
-- 
cgit v1.2.3


From 32632920a788fb13da35b131b77cc4324c38c1c5 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-Koenig <u.kleine-koenig@pengutronix.de>
Date: Mon, 12 Jan 2009 23:35:50 +0100
Subject: ftrace, trivial: fix typo "resgister" -> "register"

Signed-off-by: Uwe Kleine-Koenig <ukleinek@strlen.de>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 9f536108d3f3..8c1c9c0f4775 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1894,7 +1894,7 @@ int register_ftrace_function(struct ftrace_ops *ops)
 }
 
 /**
- * unregister_ftrace_function - unresgister a function for profiling.
+ * unregister_ftrace_function - unregister a function for profiling.
  * @ops - ops structure that holds the function to unregister
  *
  * Unregister a function that was added to be called by ftrace profiling.
-- 
cgit v1.2.3


From b07430ac37103218b5c1e542490a1b98e6deb3d6 Mon Sep 17 00:00:00 2001
From: Gregory Haskins <ghaskins@novell.com>
Date: Wed, 14 Jan 2009 08:55:39 -0500
Subject: sched: de CPP-ify the scheduler code

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---
 kernel/sched_rt.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 18c7b5b3158a..4eda5f795f04 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -64,8 +64,10 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
 
 #else
 
-#define enqueue_pushable_task(rq, p) do { } while (0)
-#define dequeue_pushable_task(rq, p) do { } while (0)
+static inline
+void enqueue_pushable_task(struct rq *rq, struct task_struct *p) {}
+static inline
+void dequeue_pushable_task(struct rq *rq, struct task_struct *p) {}
 
 #endif /* CONFIG_SMP */
 
-- 
cgit v1.2.3


From 398a153b16b09a68739928d4502455db9725ac86 Mon Sep 17 00:00:00 2001
From: Gregory Haskins <ghaskins@novell.com>
Date: Wed, 14 Jan 2009 09:10:04 -0500
Subject: sched: fix build error in kernel/sched_rt.c when RT_GROUP_SCHED &&
 !SMP

Ingo found a build error in the scheduler when RT_GROUP_SCHED was
enabled, but SMP was not.  This patch rearranges the code such
that it is a little more streamlined and compiles under all permutations
of SMP, UP and RT_GROUP_SCHED.  It was boot tested on my 4-way x86_64
and it still passes preempt-test.

Signed-off-by: Gregory Haskins <ghaskins@novell.com>
---
 kernel/sched.c    |   4 +
 kernel/sched_rt.c | 264 +++++++++++++++++++++++++++++++++++-------------------
 2 files changed, 174 insertions(+), 94 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index dd1a1466c1e6..2b703f1fac3a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -466,7 +466,9 @@ struct rt_rq {
 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
 	struct {
 		int curr; /* highest queued rt task prio */
+#ifdef CONFIG_SMP
 		int next; /* next highest */
+#endif
 	} highest_prio;
 #endif
 #ifdef CONFIG_SMP
@@ -8267,8 +8269,10 @@ static void init_rt_rq(struct rt_rq *rt_rq, struct rq *rq)
 
 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
 	rt_rq->highest_prio.curr = MAX_RT_PRIO;
+#ifdef CONFIG_SMP
 	rt_rq->highest_prio.next = MAX_RT_PRIO;
 #endif
+#endif
 #ifdef CONFIG_SMP
 	rt_rq->rt_nr_migratory = 0;
 	rt_rq->overloaded = 0;
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 4eda5f795f04..4230b15fe90e 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -3,6 +3,40 @@
  * policies)
  */
 
+static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
+{
+	return container_of(rt_se, struct task_struct, rt);
+}
+
+#ifdef CONFIG_RT_GROUP_SCHED
+
+static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
+{
+	return rt_rq->rq;
+}
+
+static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
+{
+	return rt_se->rt_rq;
+}
+
+#else /* CONFIG_RT_GROUP_SCHED */
+
+static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
+{
+	return container_of(rt_rq, struct rq, rt);
+}
+
+static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
+{
+	struct task_struct *p = rt_task_of(rt_se);
+	struct rq *rq = task_rq(p);
+
+	return &rq->rt;
+}
+
+#endif /* CONFIG_RT_GROUP_SCHED */
+
 #ifdef CONFIG_SMP
 
 static inline int rt_overloaded(struct rq *rq)
@@ -37,19 +71,35 @@ static inline void rt_clear_overload(struct rq *rq)
 	cpumask_clear_cpu(rq->cpu, rq->rd->rto_mask);
 }
 
-static void update_rt_migration(struct rq *rq)
+static void update_rt_migration(struct rt_rq *rt_rq)
 {
-	if (rq->rt.rt_nr_migratory && (rq->rt.rt_nr_running > 1)) {
-		if (!rq->rt.overloaded) {
-			rt_set_overload(rq);
-			rq->rt.overloaded = 1;
+	if (rt_rq->rt_nr_migratory && (rt_rq->rt_nr_running > 1)) {
+		if (!rt_rq->overloaded) {
+			rt_set_overload(rq_of_rt_rq(rt_rq));
+			rt_rq->overloaded = 1;
 		}
-	} else if (rq->rt.overloaded) {
-		rt_clear_overload(rq);
-		rq->rt.overloaded = 0;
+	} else if (rt_rq->overloaded) {
+		rt_clear_overload(rq_of_rt_rq(rt_rq));
+		rt_rq->overloaded = 0;
 	}
 }
 
+static void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+	if (rt_se->nr_cpus_allowed > 1)
+		rt_rq->rt_nr_migratory++;
+
+	update_rt_migration(rt_rq);
+}
+
+static void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+	if (rt_se->nr_cpus_allowed > 1)
+		rt_rq->rt_nr_migratory--;
+
+	update_rt_migration(rt_rq);
+}
+
 static void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
 {
 	plist_del(&p->pushable_tasks, &rq->rt.pushable_tasks);
@@ -68,14 +118,13 @@ static inline
 void enqueue_pushable_task(struct rq *rq, struct task_struct *p) {}
 static inline
 void dequeue_pushable_task(struct rq *rq, struct task_struct *p) {}
+static inline
+void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
+static inline
+void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
 
 #endif /* CONFIG_SMP */
 
-static inline struct task_struct *rt_task_of(struct sched_rt_entity *rt_se)
-{
-	return container_of(rt_se, struct task_struct, rt);
-}
-
 static inline int on_rt_rq(struct sched_rt_entity *rt_se)
 {
 	return !list_empty(&rt_se->run_list);
@@ -99,16 +148,6 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
 #define for_each_leaf_rt_rq(rt_rq, rq) \
 	list_for_each_entry_rcu(rt_rq, &rq->leaf_rt_rq_list, leaf_rt_rq_list)
 
-static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
-{
-	return rt_rq->rq;
-}
-
-static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
-{
-	return rt_se->rt_rq;
-}
-
 #define for_each_sched_rt_entity(rt_se) \
 	for (; rt_se; rt_se = rt_se->parent)
 
@@ -196,19 +235,6 @@ static inline u64 sched_rt_period(struct rt_rq *rt_rq)
 #define for_each_leaf_rt_rq(rt_rq, rq) \
 	for (rt_rq = &rq->rt; rt_rq; rt_rq = NULL)
 
-static inline struct rq *rq_of_rt_rq(struct rt_rq *rt_rq)
-{
-	return container_of(rt_rq, struct rq, rt);
-}
-
-static inline struct rt_rq *rt_rq_of_se(struct sched_rt_entity *rt_se)
-{
-	struct task_struct *p = rt_task_of(rt_se);
-	struct rq *rq = task_rq(p);
-
-	return &rq->rt;
-}
-
 #define for_each_sched_rt_entity(rt_se) \
 	for (; rt_se; rt_se = NULL)
 
@@ -567,7 +593,7 @@ static void update_curr_rt(struct rq *rq)
 	}
 }
 
-#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
+#if defined CONFIG_SMP
 
 static struct task_struct *pick_next_highest_task_rt(struct rq *rq, int cpu);
 
@@ -580,33 +606,24 @@ static inline int next_prio(struct rq *rq)
 	else
 		return MAX_RT_PRIO;
 }
-#endif
 
-static inline
-void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+static void
+inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
 {
-	int prio = rt_se_prio(rt_se);
-#ifdef CONFIG_SMP
 	struct rq *rq = rq_of_rt_rq(rt_rq);
-#endif
 
-	WARN_ON(!rt_prio(prio));
-	rt_rq->rt_nr_running++;
-#if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
-	if (prio < rt_rq->highest_prio.curr) {
+	if (prio < prev_prio) {
 
 		/*
 		 * If the new task is higher in priority than anything on the
-		 * run-queue, we have a new high that must be published to
-		 * the world.  We also know that the previous high becomes
-		 * our next-highest.
+		 * run-queue, we know that the previous high becomes our
+		 * next-highest.
 		 */
-		rt_rq->highest_prio.next = rt_rq->highest_prio.curr;
-		rt_rq->highest_prio.curr = prio;
-#ifdef CONFIG_SMP
+		rt_rq->highest_prio.next = prev_prio;
+
 		if (rq->online)
 			cpupri_set(&rq->rd->cpupri, rq->cpu, prio);
-#endif
+
 	} else if (prio == rt_rq->highest_prio.curr)
 		/*
 		 * If the next task is equal in priority to the highest on
@@ -619,72 +636,131 @@ void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
 		 * Otherwise, we need to recompute next-highest
 		 */
 		rt_rq->highest_prio.next = next_prio(rq);
-#endif
-#ifdef CONFIG_SMP
-	if (rt_se->nr_cpus_allowed > 1)
-		rq->rt.rt_nr_migratory++;
+}
 
-	update_rt_migration(rq);
-#endif
-#ifdef CONFIG_RT_GROUP_SCHED
-	if (rt_se_boosted(rt_se))
-		rt_rq->rt_nr_boosted++;
+static void
+dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio)
+{
+	struct rq *rq = rq_of_rt_rq(rt_rq);
 
-	if (rt_rq->tg)
-		start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
-#else
-	start_rt_bandwidth(&def_rt_bandwidth);
-#endif
+	if (rt_rq->rt_nr_running && (prio <= rt_rq->highest_prio.next))
+		rt_rq->highest_prio.next = next_prio(rq);
+
+	if (rq->online && rt_rq->highest_prio.curr != prev_prio)
+		cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
 }
 
+#else /* CONFIG_SMP */
+
 static inline
-void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
-{
-#ifdef CONFIG_SMP
-	struct rq *rq = rq_of_rt_rq(rt_rq);
-	int highest_prio = rt_rq->highest_prio.curr;
-#endif
+void inc_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
+static inline
+void dec_rt_prio_smp(struct rt_rq *rt_rq, int prio, int prev_prio) {}
+
+#endif /* CONFIG_SMP */
 
-	WARN_ON(!rt_prio(rt_se_prio(rt_se)));
-	WARN_ON(!rt_rq->rt_nr_running);
-	rt_rq->rt_nr_running--;
 #if defined CONFIG_SMP || defined CONFIG_RT_GROUP_SCHED
+static void
+inc_rt_prio(struct rt_rq *rt_rq, int prio)
+{
+	int prev_prio = rt_rq->highest_prio.curr;
+
+	if (prio < prev_prio)
+		rt_rq->highest_prio.curr = prio;
+
+	inc_rt_prio_smp(rt_rq, prio, prev_prio);
+}
+
+static void
+dec_rt_prio(struct rt_rq *rt_rq, int prio)
+{
+	int prev_prio = rt_rq->highest_prio.curr;
+
 	if (rt_rq->rt_nr_running) {
-		int prio = rt_se_prio(rt_se);
 
-		WARN_ON(prio < rt_rq->highest_prio.curr);
+		WARN_ON(prio < prev_prio);
 
 		/*
-		 * This may have been our highest or next-highest priority
-		 * task and therefore we may have some recomputation to do
+		 * This may have been our highest task, and therefore
+		 * we may have some recomputation to do
 		 */
-		if (prio == rt_rq->highest_prio.curr) {
+		if (prio == prev_prio) {
 			struct rt_prio_array *array = &rt_rq->active;
 
 			rt_rq->highest_prio.curr =
 				sched_find_first_bit(array->bitmap);
 		}
 
-		if (prio <= rt_rq->highest_prio.next)
-			rt_rq->highest_prio.next = next_prio(rq);
 	} else
 		rt_rq->highest_prio.curr = MAX_RT_PRIO;
-#endif
-#ifdef CONFIG_SMP
-	if (rt_se->nr_cpus_allowed > 1)
-		rq->rt.rt_nr_migratory--;
 
-	if (rq->online && rt_rq->highest_prio.curr != highest_prio)
-		cpupri_set(&rq->rd->cpupri, rq->cpu, rt_rq->highest_prio.curr);
+	dec_rt_prio_smp(rt_rq, prio, prev_prio);
+}
+
+#else
+
+static inline void inc_rt_prio(struct rt_rq *rt_rq, int prio) {}
+static inline void dec_rt_prio(struct rt_rq *rt_rq, int prio) {}
+
+#endif /* CONFIG_SMP || CONFIG_RT_GROUP_SCHED */
 
-	update_rt_migration(rq);
-#endif /* CONFIG_SMP */
 #ifdef CONFIG_RT_GROUP_SCHED
+
+static void
+inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+	if (rt_se_boosted(rt_se))
+		rt_rq->rt_nr_boosted++;
+
+	if (rt_rq->tg)
+		start_rt_bandwidth(&rt_rq->tg->rt_bandwidth);
+}
+
+static void
+dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
 	if (rt_se_boosted(rt_se))
 		rt_rq->rt_nr_boosted--;
 
 	WARN_ON(!rt_rq->rt_nr_running && rt_rq->rt_nr_boosted);
-#endif
+}
+
+#else /* CONFIG_RT_GROUP_SCHED */
+
+static void
+inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+	start_rt_bandwidth(&def_rt_bandwidth);
+}
+
+static inline
+void dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
+
+#endif /* CONFIG_RT_GROUP_SCHED */
+
+static inline
+void inc_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+	int prio = rt_se_prio(rt_se);
+
+	WARN_ON(!rt_prio(prio));
+	rt_rq->rt_nr_running++;
+
+	inc_rt_prio(rt_rq, prio);
+	inc_rt_migration(rt_se, rt_rq);
+	inc_rt_group(rt_se, rt_rq);
+}
+
+static inline
+void dec_rt_tasks(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+	WARN_ON(!rt_prio(rt_se_prio(rt_se)));
+	WARN_ON(!rt_rq->rt_nr_running);
+	rt_rq->rt_nr_running--;
+
+	dec_rt_prio(rt_rq, rt_se_prio(rt_se));
+	dec_rt_migration(rt_se, rt_rq);
+	dec_rt_group(rt_se, rt_rq);
 }
 
 static void __enqueue_rt_entity(struct sched_rt_entity *rt_se)
@@ -1453,7 +1529,7 @@ static void set_cpus_allowed_rt(struct task_struct *p,
 			rq->rt.rt_nr_migratory--;
 		}
 
-		update_rt_migration(rq);
+		update_rt_migration(&rq->rt);
 	}
 
 	cpumask_copy(&p->cpus_allowed, new_mask);
-- 
cgit v1.2.3


From 428aee1460a75197f0190534b4d610450ee59af1 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 14 Jan 2009 12:24:42 -0500
Subject: trace: print ftrace_dump at KERN_EMERG log level

Impact: fix to print out ftrace_dump when expected

I was debugging a hard race condition to only find out that
after I hit the race, my log level was not at level to show
KERN_INFO. The time it took to trigger the race was wasted because
I did not capture the trace.

Since ftrace_dump is only called from kernel oops (and only when
it is set in the kernel command line to do so), or when a
developer adds it to their own local tree, the log level of
the print should be at KERN_EMERG to make sure the print appears.

ftrace_dump is not called by a normal user setup, and will not
add extra unwanted print out to the console. There is no reason
it should be at KERN_INFO.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 40217fb499ea..408c03f2b8a9 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3076,7 +3076,7 @@ static struct notifier_block trace_die_notifier = {
  * it if we decide to change what log level the ftrace dump
  * should be at.
  */
-#define KERN_TRACE		KERN_INFO
+#define KERN_TRACE		KERN_EMERG
 
 static void
 trace_printk_seq(struct trace_seq *s)
-- 
cgit v1.2.3


From 6f3b34402e7282cde49dff395d7ea462bf33bf50 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Mon, 12 Jan 2009 11:06:18 +0800
Subject: ring_buffer: reset write when reserve buffer fail

Impact: reset struct buffer_page.write when interrupt storm

if struct buffer_page.write is not reset, any succedent committing
will corrupted ring_buffer:

static inline void
rb_set_commit_to_write(struct ring_buffer_per_cpu *cpu_buffer)
{
	......
		cpu_buffer->commit_page->commit =
			cpu_buffer->commit_page->write;
	......
}

when "if (RB_WARN_ON(cpu_buffer, next_page == reader_page))", ring_buffer
is disabled, but some reserved buffers may haven't been committed.
we need reset struct buffer_page.write.

when "if (unlikely(next_page == cpu_buffer->commit_page))", ring_buffer
is still available, we should not corrupt it.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ring_buffer.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 4832ffa5d937..0b9de5a3d699 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1017,12 +1017,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 		}
 
 		if (next_page == head_page) {
-			if (!(buffer->flags & RB_FL_OVERWRITE)) {
-				/* reset write */
-				if (tail <= BUF_PAGE_SIZE)
-					local_set(&tail_page->write, tail);
+			if (!(buffer->flags & RB_FL_OVERWRITE))
 				goto out_unlock;
-			}
 
 			/* tail_page has not moved yet? */
 			if (tail_page == cpu_buffer->tail_page) {
@@ -1097,6 +1093,10 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 	return event;
 
  out_unlock:
+	/* reset write */
+	if (tail <= BUF_PAGE_SIZE)
+		local_set(&tail_page->write, tail);
+
 	__raw_spin_unlock(&cpu_buffer->lock);
 	local_irq_restore(flags);
 	return NULL;
-- 
cgit v1.2.3


From 0ee6b6cf5bdb793b4c68507dd65adf16341aa4ca Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 14 Jan 2009 14:50:19 -0500
Subject: trace: stop all recording to ring buffer on ftrace_dump

Impact: limit ftrace dump output

Currently ftrace_dump only calls ftrace_kill that is a fast way
to prevent the function tracer functions from being called (just sets
a flag and clears the function to call, nothing else). It is better
to also turn off any recording to the ring buffers as well.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 408c03f2b8a9..dcb757f70d21 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3110,6 +3110,7 @@ void ftrace_dump(void)
 	dump_ran = 1;
 
 	/* No turning back! */
+	tracing_off();
 	ftrace_kill();
 
 	for_each_tracing_cpu(cpu) {
-- 
cgit v1.2.3


From 4a2b8dda3f8705880ec7408135645602d5590f51 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 14 Jan 2009 13:33:27 -0800
Subject: tracing/function-graph-tracer: fix a regression while suspend to disk

Impact: fix a crash while kernel image restore

When the function graph tracer is running and while suspend to disk, some racy
and dangerous things happen against this tracer.

The current task will save its registers including the stack pointer which
contains the return address hooked by the tracer. But the current task will
continue to enter other functions after that to save the memory, and then
it will store other return addresses, and finally loose the old depth which
matches the return address saved in the old stack (during the registers saving).

So on image restore, the code will return to wrong addresses.
And there are other things: on restore, the task will have it's "current"
pointer overwritten during registers restoring....switching from one task to
another... That would be insane to try to trace function graphs at these
stages.

This patch makes the function graph tracer listening on power events, making
it's tracing disabled for the current task (the one that performs the
hibernation work) while suspend/resume to disk, making the tracing safe
during hibernation.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 8c1c9c0f4775..7e9a20b69939 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -17,6 +17,7 @@
 #include <linux/clocksource.h>
 #include <linux/kallsyms.h>
 #include <linux/seq_file.h>
+#include <linux/suspend.h>
 #include <linux/debugfs.h>
 #include <linux/hardirq.h>
 #include <linux/kthread.h>
@@ -1957,6 +1958,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 
 static atomic_t ftrace_graph_active;
+static struct notifier_block ftrace_suspend_notifier;
 
 int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace)
 {
@@ -2035,6 +2037,27 @@ static int start_graph_tracing(void)
 	return ret;
 }
 
+/*
+ * Hibernation protection.
+ * The state of the current task is too much unstable during
+ * suspend/restore to disk. We want to protect against that.
+ */
+static int
+ftrace_suspend_notifier_call(struct notifier_block *bl, unsigned long state,
+							void *unused)
+{
+	switch (state) {
+	case PM_HIBERNATION_PREPARE:
+		pause_graph_tracing();
+		break;
+
+	case PM_POST_HIBERNATION:
+		unpause_graph_tracing();
+		break;
+	}
+	return NOTIFY_DONE;
+}
+
 int register_ftrace_graph(trace_func_graph_ret_t retfunc,
 			trace_func_graph_ent_t entryfunc)
 {
@@ -2042,6 +2065,9 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
 
 	mutex_lock(&ftrace_sysctl_lock);
 
+	ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call;
+	register_pm_notifier(&ftrace_suspend_notifier);
+
 	atomic_inc(&ftrace_graph_active);
 	ret = start_graph_tracing();
 	if (ret) {
@@ -2067,6 +2093,7 @@ void unregister_ftrace_graph(void)
 	ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
 	ftrace_graph_entry = ftrace_graph_entry_stub;
 	ftrace_shutdown(FTRACE_STOP_FUNC_RET);
+	unregister_pm_notifier(&ftrace_suspend_notifier);
 
 	mutex_unlock(&ftrace_sysctl_lock);
 }
-- 
cgit v1.2.3


From 42fab4b2cdc02cf28e2474ccfd75bc9225076590 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 15 Jan 2009 09:30:52 +0800
Subject: tracing/ftrace: add missing unlock in register_stat_tracer()

We should unlock all_stat_sessions_mutex before returning failure.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_stat.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index cb29282b9485..2110cea2ece3 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -73,8 +73,10 @@ int register_stat_tracer(struct tracer_stat *trace)
 	/* Already registered? */
 	mutex_lock(&all_stat_sessions_mutex);
 	list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) {
-		if (node->ts == trace)
+		if (node->ts == trace) {
+			mutex_unlock(&all_stat_sessions_mutex);
 			return -EINVAL;
+		}
 	}
 	mutex_unlock(&all_stat_sessions_mutex);
 
-- 
cgit v1.2.3


From 55922173f1f63903b6de03711ab8ff980cbe58d2 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 15 Jan 2009 11:31:21 +0100
Subject: tracing: trace_stat.c cleanup

Impact: cleanup

- whitespace / code alignment cleanups
- avoid unnecessary forward prototype by reordering functions

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_stat.c | 147 ++++++++++++++++++++++------------------------
 1 file changed, 71 insertions(+), 76 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index 2110cea2ece3..eae9cef39291 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -17,16 +17,16 @@
 
 /* List of stat entries from a tracer */
 struct trace_stat_list {
-	struct list_head 	list;
-	void 			*stat;
+	struct list_head	list;
+	void			*stat;
 };
 
 /* A stat session is the stats output in one file */
 struct tracer_stat_session {
 	struct list_head	session_list;
-	struct tracer_stat 	*ts;
-	struct list_head 	stat_list;
-	struct mutex 		stat_mutex;
+	struct tracer_stat	*ts;
+	struct list_head	stat_list;
+	struct mutex		stat_mutex;
 	struct dentry		*file;
 };
 
@@ -35,7 +35,7 @@ static LIST_HEAD(all_stat_sessions);
 static DEFINE_MUTEX(all_stat_sessions_mutex);
 
 /* The root directory for all stat files */
-static struct dentry *stat_dir;
+static struct dentry		*stat_dir;
 
 
 static void reset_stat_session(struct tracer_stat_session *session)
@@ -56,71 +56,6 @@ static void destroy_session(struct tracer_stat_session *session)
 	kfree(session);
 }
 
-
-static int init_stat_file(struct tracer_stat_session *session);
-
-int register_stat_tracer(struct tracer_stat *trace)
-{
-	struct tracer_stat_session *session, *node, *tmp;
-	int ret;
-
-	if (!trace)
-		return -EINVAL;
-
-	if (!trace->stat_start || !trace->stat_next || !trace->stat_show)
-		return -EINVAL;
-
-	/* Already registered? */
-	mutex_lock(&all_stat_sessions_mutex);
-	list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) {
-		if (node->ts == trace) {
-			mutex_unlock(&all_stat_sessions_mutex);
-			return -EINVAL;
-		}
-	}
-	mutex_unlock(&all_stat_sessions_mutex);
-
-	/* Init the session */
-	session = kmalloc(sizeof(struct tracer_stat_session), GFP_KERNEL);
-	if (!session)
-		return -ENOMEM;
-
-	session->ts = trace;
-	INIT_LIST_HEAD(&session->session_list);
-	INIT_LIST_HEAD(&session->stat_list);
-	mutex_init(&session->stat_mutex);
-	session->file = NULL;
-
-	ret = init_stat_file(session);
-	if (ret) {
-		destroy_session(session);
-		return ret;
-	}
-
-	/* Register */
-	mutex_lock(&all_stat_sessions_mutex);
-	list_add_tail(&session->session_list, &all_stat_sessions);
-	mutex_unlock(&all_stat_sessions_mutex);
-
-	return 0;
-}
-
-void unregister_stat_tracer(struct tracer_stat *trace)
-{
-	struct tracer_stat_session *node, *tmp;
-
-	mutex_lock(&all_stat_sessions_mutex);
-	list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) {
-		if (node->ts == trace) {
-			list_del(&node->session_list);
-			destroy_session(node);
-			break;
-		}
-	}
-	mutex_unlock(&all_stat_sessions_mutex);
-}
-
-
 /*
  * For tracers that don't provide a stat_cmp callback.
  * This one will force an immediate insertion on tail of
@@ -252,10 +187,10 @@ static int stat_seq_show(struct seq_file *s, void *v)
 }
 
 static const struct seq_operations trace_stat_seq_ops = {
-	.start = stat_seq_start,
-	.next = stat_seq_next,
-	.stop = stat_seq_stop,
-	.show = stat_seq_show
+	.start		= stat_seq_start,
+	.next		= stat_seq_next,
+	.stop		= stat_seq_stop,
+	.show		= stat_seq_show
 };
 
 /* The session stat is refilled and resorted at each stat file opening */
@@ -275,7 +210,6 @@ static int tracing_stat_open(struct inode *inode, struct file *file)
 	return ret;
 }
 
-
 /*
  * Avoid consuming memory with our now useless list.
  */
@@ -322,3 +256,64 @@ static int init_stat_file(struct tracer_stat_session *session)
 		return -ENOMEM;
 	return 0;
 }
+
+int register_stat_tracer(struct tracer_stat *trace)
+{
+	struct tracer_stat_session *session, *node, *tmp;
+	int ret;
+
+	if (!trace)
+		return -EINVAL;
+
+	if (!trace->stat_start || !trace->stat_next || !trace->stat_show)
+		return -EINVAL;
+
+	/* Already registered? */
+	mutex_lock(&all_stat_sessions_mutex);
+	list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) {
+		if (node->ts == trace) {
+			mutex_unlock(&all_stat_sessions_mutex);
+			return -EINVAL;
+		}
+	}
+	mutex_unlock(&all_stat_sessions_mutex);
+
+	/* Init the session */
+	session = kmalloc(sizeof(struct tracer_stat_session), GFP_KERNEL);
+	if (!session)
+		return -ENOMEM;
+
+	session->ts = trace;
+	INIT_LIST_HEAD(&session->session_list);
+	INIT_LIST_HEAD(&session->stat_list);
+	mutex_init(&session->stat_mutex);
+	session->file = NULL;
+
+	ret = init_stat_file(session);
+	if (ret) {
+		destroy_session(session);
+		return ret;
+	}
+
+	/* Register */
+	mutex_lock(&all_stat_sessions_mutex);
+	list_add_tail(&session->session_list, &all_stat_sessions);
+	mutex_unlock(&all_stat_sessions_mutex);
+
+	return 0;
+}
+
+void unregister_stat_tracer(struct tracer_stat *trace)
+{
+	struct tracer_stat_session *node, *tmp;
+
+	mutex_lock(&all_stat_sessions_mutex);
+	list_for_each_entry_safe(node, tmp, &all_stat_sessions, session_list) {
+		if (node->ts == trace) {
+			list_del(&node->session_list);
+			destroy_session(node);
+			break;
+		}
+	}
+	mutex_unlock(&all_stat_sessions_mutex);
+}
-- 
cgit v1.2.3


From 831451ac4e44d3a20b581ce726ef1d1144373f7d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 14 Jan 2009 12:39:18 +0100
Subject: sched: introduce avg_wakeup

Introduce a new avg_wakeup statistic.

avg_wakeup is a measure of how frequently a task wakes up other tasks, it
represents the average time between wakeups, with a limit of avg_runtime
for when it doesn't wake up anybody.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h |  3 +++
 kernel/sched.c        | 36 ++++++++++++++++++++++++++++++------
 kernel/sched_debug.c  |  1 +
 3 files changed, 34 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4cae9b81a1f8..daf4e07bc978 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1046,6 +1046,9 @@ struct sched_entity {
 	u64			exec_max;
 	u64			slice_max;
 
+	u64			start_runtime;
+	u64			avg_wakeup;
+
 	u64			nr_migrations;
 	u64			nr_migrations_cold;
 	u64			nr_failed_migrations_affine;
diff --git a/kernel/sched.c b/kernel/sched.c
index 8be2c13b50d0..86f5a063f0b9 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1705,6 +1705,9 @@ static void update_avg(u64 *avg, u64 sample)
 
 static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
 {
+	if (wakeup)
+		p->se.start_runtime = p->se.sum_exec_runtime;
+
 	sched_info_queued(p);
 	p->sched_class->enqueue_task(rq, p, wakeup);
 	p->se.on_rq = 1;
@@ -1712,10 +1715,15 @@ static void enqueue_task(struct rq *rq, struct task_struct *p, int wakeup)
 
 static void dequeue_task(struct rq *rq, struct task_struct *p, int sleep)
 {
-	if (sleep && p->se.last_wakeup) {
-		update_avg(&p->se.avg_overlap,
-			   p->se.sum_exec_runtime - p->se.last_wakeup);
-		p->se.last_wakeup = 0;
+	if (sleep) {
+		if (p->se.last_wakeup) {
+			update_avg(&p->se.avg_overlap,
+				p->se.sum_exec_runtime - p->se.last_wakeup);
+			p->se.last_wakeup = 0;
+		} else {
+			update_avg(&p->se.avg_wakeup,
+				sysctl_sched_wakeup_granularity);
+		}
 	}
 
 	sched_info_dequeued(p);
@@ -2345,6 +2353,22 @@ out_activate:
 	activate_task(rq, p, 1);
 	success = 1;
 
+	/*
+	 * Only attribute actual wakeups done by this task.
+	 */
+	if (!in_interrupt()) {
+		struct sched_entity *se = &current->se;
+		u64 sample = se->sum_exec_runtime;
+
+		if (se->last_wakeup)
+			sample -= se->last_wakeup;
+		else
+			sample -= se->start_runtime;
+		update_avg(&se->avg_wakeup, sample);
+
+		se->last_wakeup = se->sum_exec_runtime;
+	}
+
 out_running:
 	trace_sched_wakeup(rq, p, success);
 	check_preempt_curr(rq, p, sync);
@@ -2355,8 +2379,6 @@ out_running:
 		p->sched_class->task_wake_up(rq, p);
 #endif
 out:
-	current->se.last_wakeup = current->se.sum_exec_runtime;
-
 	task_rq_unlock(rq, &flags);
 
 	return success;
@@ -2386,6 +2408,8 @@ static void __sched_fork(struct task_struct *p)
 	p->se.prev_sum_exec_runtime	= 0;
 	p->se.last_wakeup		= 0;
 	p->se.avg_overlap		= 0;
+	p->se.start_runtime		= 0;
+	p->se.avg_wakeup		= sysctl_sched_wakeup_granularity;
 
 #ifdef CONFIG_SCHEDSTATS
 	p->se.wait_start		= 0;
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 16eeba4e4169..2b1260f0e800 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -397,6 +397,7 @@ void proc_sched_show_task(struct task_struct *p, struct seq_file *m)
 	PN(se.vruntime);
 	PN(se.sum_exec_runtime);
 	PN(se.avg_overlap);
+	PN(se.avg_wakeup);
 
 	nr_switches = p->nvcsw + p->nivcsw;
 
-- 
cgit v1.2.3


From e52fb7c097238d34f4d8e2a596f8a3f85b0c0565 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 14 Jan 2009 12:39:19 +0100
Subject: sched: prefer wakers

Prefer tasks that wake other tasks to preempt quickly. This improves
performance because more work is available sooner.

The workload that prompted this patch was a kernel build over NFS4 (for some
curious and not understood reason we had to revert commit:
18de9735300756e3ca9c361ef58409d8561dfe0d to make any progress at all)

Without this patch a make -j8 bzImage (of x86-64 defconfig) would take
3m30-ish, with this patch we're down to 2m50-ish.

psql-sysbench/mysql-sysbench show a slight improvement in peak performance as
well, tbench and vmark seemed to not care.

It is possible to improve upon the build time (to 2m20-ish) but that seriously
destroys other benchmarks (just shows that there's more room for tinkering).

Much thanks to Mike who put in a lot of effort to benchmark things and proved
a worthy opponent with a competing patch.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_fair.c     | 59 ++++++++++++++++++++++++++++++++++++++++++++-----
 kernel/sched_features.h |  3 ++-
 2 files changed, 55 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index 8e1352c75557..bdf64346b4d1 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1295,16 +1295,63 @@ out:
 }
 #endif /* CONFIG_SMP */
 
-static unsigned long wakeup_gran(struct sched_entity *se)
+/*
+ * Adaptive granularity
+ *
+ * se->avg_wakeup gives the average time a task runs until it does a wakeup,
+ * with the limit of wakeup_gran -- when it never does a wakeup.
+ *
+ * So the smaller avg_wakeup is the faster we want this task to preempt,
+ * but we don't want to treat the preemptee unfairly and therefore allow it
+ * to run for at least the amount of time we'd like to run.
+ *
+ * NOTE: we use 2*avg_wakeup to increase the probability of actually doing one
+ *
+ * NOTE: we use *nr_running to scale with load, this nicely matches the
+ *       degrading latency on load.
+ */
+static unsigned long
+adaptive_gran(struct sched_entity *curr, struct sched_entity *se)
+{
+	u64 this_run = curr->sum_exec_runtime - curr->prev_sum_exec_runtime;
+	u64 expected_wakeup = 2*se->avg_wakeup * cfs_rq_of(se)->nr_running;
+	u64 gran = 0;
+
+	if (this_run < expected_wakeup)
+		gran = expected_wakeup - this_run;
+
+	return min_t(s64, gran, sysctl_sched_wakeup_granularity);
+}
+
+static unsigned long
+wakeup_gran(struct sched_entity *curr, struct sched_entity *se)
 {
 	unsigned long gran = sysctl_sched_wakeup_granularity;
 
+	if (cfs_rq_of(curr)->curr && sched_feat(ADAPTIVE_GRAN))
+		gran = adaptive_gran(curr, se);
+
 	/*
-	 * More easily preempt - nice tasks, while not making it harder for
-	 * + nice tasks.
+	 * Since its curr running now, convert the gran from real-time
+	 * to virtual-time in his units.
 	 */
-	if (!sched_feat(ASYM_GRAN) || se->load.weight > NICE_0_LOAD)
-		gran = calc_delta_fair(sysctl_sched_wakeup_granularity, se);
+	if (sched_feat(ASYM_GRAN)) {
+		/*
+		 * By using 'se' instead of 'curr' we penalize light tasks, so
+		 * they get preempted easier. That is, if 'se' < 'curr' then
+		 * the resulting gran will be larger, therefore penalizing the
+		 * lighter, if otoh 'se' > 'curr' then the resulting gran will
+		 * be smaller, again penalizing the lighter task.
+		 *
+		 * This is especially important for buddies when the leftmost
+		 * task is higher priority than the buddy.
+		 */
+		if (unlikely(se->load.weight != NICE_0_LOAD))
+			gran = calc_delta_fair(gran, se);
+	} else {
+		if (unlikely(curr->load.weight != NICE_0_LOAD))
+			gran = calc_delta_fair(gran, curr);
+	}
 
 	return gran;
 }
@@ -1331,7 +1378,7 @@ wakeup_preempt_entity(struct sched_entity *curr, struct sched_entity *se)
 	if (vdiff <= 0)
 		return -1;
 
-	gran = wakeup_gran(curr);
+	gran = wakeup_gran(curr, se);
 	if (vdiff > gran)
 		return 1;
 
diff --git a/kernel/sched_features.h b/kernel/sched_features.h
index da5d93b5d2c6..76f61756e677 100644
--- a/kernel/sched_features.h
+++ b/kernel/sched_features.h
@@ -1,5 +1,6 @@
 SCHED_FEAT(NEW_FAIR_SLEEPERS, 1)
-SCHED_FEAT(NORMALIZED_SLEEPER, 1)
+SCHED_FEAT(NORMALIZED_SLEEPER, 0)
+SCHED_FEAT(ADAPTIVE_GRAN, 1)
 SCHED_FEAT(WAKEUP_PREEMPT, 1)
 SCHED_FEAT(START_DEBIT, 1)
 SCHED_FEAT(AFFINE_WAKEUPS, 1)
-- 
cgit v1.2.3


From 6c1a99afbda99cd8d8c69d756387041567a13d87 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Thu, 15 Jan 2009 18:05:40 +0800
Subject: ftrace: fix trace_output

Impact: fix bug for handling partial line

trace_seq_printf(), seq_print_userip_objs(), ... return
0          -- partial line was written
other(>0)  -- success

duplicate output is also removed in trace_print_raw().

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_output.c | 65 +++++++++++++++++++++------------------------
 kernel/trace/trace_output.h |  4 +--
 2 files changed, 33 insertions(+), 36 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index df0c25cbed30..4e3ad36b117c 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -440,9 +440,9 @@ trace_fn_raw(struct trace_seq *s, struct trace_entry *entry, int flags)
 
 	trace_assign_type(field, entry);
 
-	if (trace_seq_printf(s, "%x %x\n",
-			     field->ip,
-			     field->parent_ip))
+	if (!trace_seq_printf(s, "%x %x\n",
+			      field->ip,
+			      field->parent_ip))
 		return TRACE_TYPE_PARTIAL_LINE;
 
 	return 0;
@@ -497,14 +497,14 @@ trace_ctxwake_print(struct trace_seq *s, struct trace_entry *entry, int flags,
 	T = task_state_char(field->next_state);
 	S = task_state_char(field->prev_state);
 	comm = trace_find_cmdline(field->next_pid);
-	if (trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
-			     field->prev_pid,
-			     field->prev_prio,
-			     S, delim,
-			     field->next_cpu,
-			     field->next_pid,
-			     field->next_prio,
-			     T, comm))
+	if (!trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
+			      field->prev_pid,
+			      field->prev_prio,
+			      S, delim,
+			      field->next_cpu,
+			      field->next_pid,
+			      field->next_prio,
+			      T, comm))
 		return TRACE_TYPE_PARTIAL_LINE;
 
 	return 0;
@@ -534,14 +534,14 @@ trace_ctxwake_raw(struct trace_seq *s, struct trace_entry *entry, int flags,
 	if (!S)
 		task_state_char(field->prev_state);
 	T = task_state_char(field->next_state);
-	if (trace_seq_printf(s, "%d %d %c %d %d %d %c\n",
-			     field->prev_pid,
-			     field->prev_prio,
-			     S,
-			     field->next_cpu,
-			     field->next_pid,
-			     field->next_prio,
-			     T))
+	if (!trace_seq_printf(s, "%d %d %c %d %d %d %c\n",
+			      field->prev_pid,
+			      field->prev_prio,
+			      S,
+			      field->next_cpu,
+			      field->next_pid,
+			      field->next_prio,
+			      T))
 		return TRACE_TYPE_PARTIAL_LINE;
 
 	return 0;
@@ -639,10 +639,10 @@ trace_special_print(struct trace_seq *s, struct trace_entry *entry, int flags)
 
 	trace_assign_type(field, entry);
 
-	if (trace_seq_printf(s, "# %ld %ld %ld\n",
-			     field->arg1,
-			     field->arg2,
-			     field->arg3))
+	if (!trace_seq_printf(s, "# %ld %ld %ld\n",
+			      field->arg1,
+			      field->arg2,
+			      field->arg3))
 		return TRACE_TYPE_PARTIAL_LINE;
 
 	return 0;
@@ -697,13 +697,13 @@ trace_stack_print(struct trace_seq *s, struct trace_entry *entry, int flags)
 
 	for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
 		if (i) {
-			if (trace_seq_puts(s, " <= "))
+			if (!trace_seq_puts(s, " <= "))
 				goto partial;
 
-			if (seq_print_ip_sym(s, field->caller[i], flags))
+			if (!seq_print_ip_sym(s, field->caller[i], flags))
 				goto partial;
 		}
-		if (trace_seq_puts(s, "\n"))
+		if (!trace_seq_puts(s, "\n"))
 			goto partial;
 	}
 
@@ -731,10 +731,10 @@ trace_user_stack_print(struct trace_seq *s, struct trace_entry *entry,
 
 	trace_assign_type(field, entry);
 
-	if (seq_print_userip_objs(field, s, flags))
+	if (!seq_print_userip_objs(field, s, flags))
 		goto partial;
 
-	if (trace_seq_putc(s, '\n'))
+	if (!trace_seq_putc(s, '\n'))
 		goto partial;
 
 	return 0;
@@ -760,10 +760,10 @@ trace_print_print(struct trace_seq *s, struct trace_entry *entry, int flags)
 
 	trace_assign_type(field, entry);
 
-	if (seq_print_ip_sym(s, field->ip, flags))
+	if (!seq_print_ip_sym(s, field->ip, flags))
 		goto partial;
 
-	if (trace_seq_printf(s, ": %s", field->buf))
+	if (!trace_seq_printf(s, ": %s", field->buf))
 		goto partial;
 
 	return 0;
@@ -779,10 +779,7 @@ trace_print_raw(struct trace_seq *s, struct trace_entry *entry, int flags)
 
 	trace_assign_type(field, entry);
 
-	if (seq_print_ip_sym(s, field->ip, flags))
-		goto partial;
-
-	if (trace_seq_printf(s, "# %lx %s", field->ip, field->buf))
+	if (!trace_seq_printf(s, "# %lx %s", field->ip, field->buf))
 		goto partial;
 
 	return 0;
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index ecab4ea4a4fd..b2c14615e0cd 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -45,14 +45,14 @@ trace_nop_print(struct trace_seq *s, struct trace_entry *entry, int flags);
 #define SEQ_PUT_FIELD_RET(s, x)				\
 do {							\
 	if (!trace_seq_putmem(s, &(x), sizeof(x)))	\
-		return 0;				\
+		return TRACE_TYPE_PARTIAL_LINE;		\
 } while (0)
 
 #define SEQ_PUT_HEX_FIELD_RET(s, x)			\
 do {							\
 	BUILD_BUG_ON(sizeof(x) > MAX_MEMHEX_BYTES);	\
 	if (!trace_seq_putmem_hex(s, &(x), sizeof(x)))	\
-		return 0;				\
+		return TRACE_TYPE_PARTIAL_LINE;		\
 } while (0)
 
 #endif
-- 
cgit v1.2.3


From 5361499101306cfb776c3cfa0f69d0479bc63868 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 15 Jan 2009 19:12:40 -0500
Subject: ftrace: add stack trace to function tracer

Impact: new feature to stack trace any function

Chris Mason asked about being able to pick and choose a function
and get a stack trace from it. This feature enables his request.

 # echo io_schedule > /debug/tracing/set_ftrace_filter
 # echo function > /debug/tracing/current_tracer
 # echo func_stack_trace > /debug/tracing/trace_options

Produces the following in /debug/tracing/trace:

       kjournald-702   [001]   135.673060: io_schedule <-sync_buffer
       kjournald-702   [002]   135.673671:
 <= sync_buffer
 <= __wait_on_bit
 <= out_of_line_wait_on_bit
 <= __wait_on_buffer
 <= sync_dirty_buffer
 <= journal_commit_transaction
 <= kjournald

Note, be careful about turning this on without filtering the functions.
You may find that you have a 10 second lag between typing and seeing
what you typed. This is why the stack trace for the function tracer
does not use the same stack_trace flag as the other tracers use.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c           | 26 ++++++++-----
 kernel/trace/trace.h           |  7 ++++
 kernel/trace/trace_functions.c | 84 ++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 108 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index dcb757f70d21..3c54cb125228 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -835,10 +835,10 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data,
 		trace_function(tr, data, ip, parent_ip, flags, pc);
 }
 
-static void ftrace_trace_stack(struct trace_array *tr,
-			       struct trace_array_cpu *data,
-			       unsigned long flags,
-			       int skip, int pc)
+static void __ftrace_trace_stack(struct trace_array *tr,
+				 struct trace_array_cpu *data,
+				 unsigned long flags,
+				 int skip, int pc)
 {
 #ifdef CONFIG_STACKTRACE
 	struct ring_buffer_event *event;
@@ -846,9 +846,6 @@ static void ftrace_trace_stack(struct trace_array *tr,
 	struct stack_trace trace;
 	unsigned long irq_flags;
 
-	if (!(trace_flags & TRACE_ITER_STACKTRACE))
-		return;
-
 	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
 					 &irq_flags);
 	if (!event)
@@ -869,12 +866,23 @@ static void ftrace_trace_stack(struct trace_array *tr,
 #endif
 }
 
+static void ftrace_trace_stack(struct trace_array *tr,
+			       struct trace_array_cpu *data,
+			       unsigned long flags,
+			       int skip, int pc)
+{
+	if (!(trace_flags & TRACE_ITER_STACKTRACE))
+		return;
+
+	__ftrace_trace_stack(tr, data, flags, skip, pc);
+}
+
 void __trace_stack(struct trace_array *tr,
 		   struct trace_array_cpu *data,
 		   unsigned long flags,
-		   int skip)
+		   int skip, int pc)
 {
-	ftrace_trace_stack(tr, data, flags, skip, preempt_count());
+	__ftrace_trace_stack(tr, data, flags, skip, pc);
 }
 
 static void ftrace_trace_userstack(struct trace_array *tr,
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 79c872100dd5..bf39a369e4b3 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -457,6 +457,11 @@ void update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu);
 void update_max_tr_single(struct trace_array *tr,
 			  struct task_struct *tsk, int cpu);
 
+void __trace_stack(struct trace_array *tr,
+		   struct trace_array_cpu *data,
+		   unsigned long flags,
+		   int skip, int pc);
+
 extern cycle_t ftrace_now(int cpu);
 
 #ifdef CONFIG_FUNCTION_TRACER
@@ -467,6 +472,8 @@ void tracing_stop_function_trace(void);
 # define tracing_stop_function_trace()		do { } while (0)
 #endif
 
+extern int ftrace_function_enabled;
+
 #ifdef CONFIG_CONTEXT_SWITCH_TRACER
 typedef void
 (*tracer_switch_func_t)(void *private,
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 9236d7e25a16..3a5fa08cedb0 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -16,6 +16,8 @@
 
 #include "trace.h"
 
+static struct trace_array	*func_trace;
+
 static void start_function_trace(struct trace_array *tr)
 {
 	tr->cpu = get_cpu();
@@ -34,6 +36,7 @@ static void stop_function_trace(struct trace_array *tr)
 
 static int function_trace_init(struct trace_array *tr)
 {
+	func_trace = tr;
 	start_function_trace(tr);
 	return 0;
 }
@@ -48,12 +51,93 @@ static void function_trace_start(struct trace_array *tr)
 	tracing_reset_online_cpus(tr);
 }
 
+static void
+function_stack_trace_call(unsigned long ip, unsigned long parent_ip)
+{
+	struct trace_array *tr = func_trace;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+	long disabled;
+	int cpu;
+	int pc;
+
+	if (unlikely(!ftrace_function_enabled))
+		return;
+
+	/*
+	 * Need to use raw, since this must be called before the
+	 * recursive protection is performed.
+	 */
+	local_irq_save(flags);
+	cpu = raw_smp_processor_id();
+	data = tr->data[cpu];
+	disabled = atomic_inc_return(&data->disabled);
+
+	if (likely(disabled == 1)) {
+		pc = preempt_count();
+		/*
+		 * skip over 5 funcs:
+		 *    __ftrace_trace_stack,
+		 *    __trace_stack,
+		 *    function_stack_trace_call
+		 *    ftrace_list_func
+		 *    ftrace_call
+		 */
+		__trace_stack(tr, data, flags, 5, pc);
+	}
+
+	atomic_dec(&data->disabled);
+	local_irq_restore(flags);
+}
+
+static struct ftrace_ops trace_stack_ops __read_mostly =
+{
+	.func = function_stack_trace_call,
+};
+
+/* Our two options */
+enum {
+	TRACE_FUNC_OPT_STACK = 0x1,
+};
+
+static struct tracer_opt func_opts[] = {
+#ifdef CONFIG_STACKTRACE
+	{ TRACER_OPT(func_stack_trace, TRACE_FUNC_OPT_STACK) },
+#endif
+	{ } /* Always set a last empty entry */
+};
+
+static struct tracer_flags func_flags = {
+	.val = 0, /* By default: all flags disabled */
+	.opts = func_opts
+};
+
+static int func_set_flag(u32 old_flags, u32 bit, int set)
+{
+	if (bit == TRACE_FUNC_OPT_STACK) {
+		/* do nothing if already set */
+		if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK))
+			return 0;
+
+		if (set)
+			register_ftrace_function(&trace_stack_ops);
+		else
+			unregister_ftrace_function(&trace_stack_ops);
+
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
 static struct tracer function_trace __read_mostly =
 {
 	.name	     = "function",
 	.init	     = function_trace_init,
 	.reset	     = function_trace_reset,
 	.start	     = function_trace_start,
+	.flags		= &func_flags,
+	.set_flag	= func_set_flag,
 #ifdef CONFIG_FTRACE_SELFTEST
 	.selftest    = trace_selftest_startup_function,
 #endif
-- 
cgit v1.2.3


From bb3c3c95f330f7bf16e33b002e48882616089db1 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 15 Jan 2009 20:40:23 -0500
Subject: ftrace: move function tracer functions out of trace.c

Impact: clean up of trace.c

The function tracer functions were put in trace.c because it needed
to share static variables that were in trace.c.  Since then, those
variables have become global for various reasons. This patch moves
the function tracer functions into trace_function.c where they belong.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c           | 84 ------------------------------------------
 kernel/trace/trace_functions.c | 84 +++++++++++++++++++++++++++++++++++++++++-
 2 files changed, 83 insertions(+), 85 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 3c54cb125228..2585ffb6c6b5 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1046,65 +1046,6 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
 	local_irq_restore(flags);
 }
 
-#ifdef CONFIG_FUNCTION_TRACER
-static void
-function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
-{
-	struct trace_array *tr = &global_trace;
-	struct trace_array_cpu *data;
-	unsigned long flags;
-	long disabled;
-	int cpu, resched;
-	int pc;
-
-	if (unlikely(!ftrace_function_enabled))
-		return;
-
-	pc = preempt_count();
-	resched = ftrace_preempt_disable();
-	local_save_flags(flags);
-	cpu = raw_smp_processor_id();
-	data = tr->data[cpu];
-	disabled = atomic_inc_return(&data->disabled);
-
-	if (likely(disabled == 1))
-		trace_function(tr, data, ip, parent_ip, flags, pc);
-
-	atomic_dec(&data->disabled);
-	ftrace_preempt_enable(resched);
-}
-
-static void
-function_trace_call(unsigned long ip, unsigned long parent_ip)
-{
-	struct trace_array *tr = &global_trace;
-	struct trace_array_cpu *data;
-	unsigned long flags;
-	long disabled;
-	int cpu;
-	int pc;
-
-	if (unlikely(!ftrace_function_enabled))
-		return;
-
-	/*
-	 * Need to use raw, since this must be called before the
-	 * recursive protection is performed.
-	 */
-	local_irq_save(flags);
-	cpu = raw_smp_processor_id();
-	data = tr->data[cpu];
-	disabled = atomic_inc_return(&data->disabled);
-
-	if (likely(disabled == 1)) {
-		pc = preempt_count();
-		trace_function(tr, data, ip, parent_ip, flags, pc);
-	}
-
-	atomic_dec(&data->disabled);
-	local_irq_restore(flags);
-}
-
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 int trace_graph_entry(struct ftrace_graph_ent *trace)
 {
@@ -1162,31 +1103,6 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
 }
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
 
-static struct ftrace_ops trace_ops __read_mostly =
-{
-	.func = function_trace_call,
-};
-
-void tracing_start_function_trace(void)
-{
-	ftrace_function_enabled = 0;
-
-	if (trace_flags & TRACE_ITER_PREEMPTONLY)
-		trace_ops.func = function_trace_call_preempt_only;
-	else
-		trace_ops.func = function_trace_call;
-
-	register_ftrace_function(&trace_ops);
-	ftrace_function_enabled = 1;
-}
-
-void tracing_stop_function_trace(void)
-{
-	ftrace_function_enabled = 0;
-	unregister_ftrace_function(&trace_ops);
-}
-#endif
-
 enum trace_file_type {
 	TRACE_FILE_LAT_FMT	= 1,
 	TRACE_FILE_ANNOTATE	= 2,
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 3a5fa08cedb0..2dce3c7370d1 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -20,6 +20,7 @@ static struct trace_array	*func_trace;
 
 static void start_function_trace(struct trace_array *tr)
 {
+	func_trace = tr;
 	tr->cpu = get_cpu();
 	tracing_reset_online_cpus(tr);
 	put_cpu();
@@ -36,7 +37,6 @@ static void stop_function_trace(struct trace_array *tr)
 
 static int function_trace_init(struct trace_array *tr)
 {
-	func_trace = tr;
 	start_function_trace(tr);
 	return 0;
 }
@@ -51,6 +51,64 @@ static void function_trace_start(struct trace_array *tr)
 	tracing_reset_online_cpus(tr);
 }
 
+static void
+function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
+{
+	struct trace_array *tr = func_trace;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+	long disabled;
+	int cpu, resched;
+	int pc;
+
+	if (unlikely(!ftrace_function_enabled))
+		return;
+
+	pc = preempt_count();
+	resched = ftrace_preempt_disable();
+	local_save_flags(flags);
+	cpu = raw_smp_processor_id();
+	data = tr->data[cpu];
+	disabled = atomic_inc_return(&data->disabled);
+
+	if (likely(disabled == 1))
+		trace_function(tr, data, ip, parent_ip, flags, pc);
+
+	atomic_dec(&data->disabled);
+	ftrace_preempt_enable(resched);
+}
+
+static void
+function_trace_call(unsigned long ip, unsigned long parent_ip)
+{
+	struct trace_array *tr = func_trace;
+	struct trace_array_cpu *data;
+	unsigned long flags;
+	long disabled;
+	int cpu;
+	int pc;
+
+	if (unlikely(!ftrace_function_enabled))
+		return;
+
+	/*
+	 * Need to use raw, since this must be called before the
+	 * recursive protection is performed.
+	 */
+	local_irq_save(flags);
+	cpu = raw_smp_processor_id();
+	data = tr->data[cpu];
+	disabled = atomic_inc_return(&data->disabled);
+
+	if (likely(disabled == 1)) {
+		pc = preempt_count();
+		trace_function(tr, data, ip, parent_ip, flags, pc);
+	}
+
+	atomic_dec(&data->disabled);
+	local_irq_restore(flags);
+}
+
 static void
 function_stack_trace_call(unsigned long ip, unsigned long parent_ip)
 {
@@ -90,6 +148,30 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip)
 	local_irq_restore(flags);
 }
 
+
+static struct ftrace_ops trace_ops __read_mostly =
+{
+	.func = function_trace_call,
+};
+
+void tracing_start_function_trace(void)
+{
+	ftrace_function_enabled = 0;
+
+	if (trace_flags & TRACE_ITER_PREEMPTONLY)
+		trace_ops.func = function_trace_call_preempt_only;
+	else
+		trace_ops.func = function_trace_call;
+
+	register_ftrace_function(&trace_ops);
+	ftrace_function_enabled = 1;
+}
+
+void tracing_stop_function_trace(void)
+{
+	ftrace_function_enabled = 0;
+	unregister_ftrace_function(&trace_ops);
+}
 static struct ftrace_ops trace_stack_ops __read_mostly =
 {
 	.func = function_stack_trace_call,
-- 
cgit v1.2.3


From c37abc5515b5ed5b1d2134d2deaead492d9f92a2 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 15 Jan 2009 20:50:54 -0500
Subject: trace: add gcc printf check to trace_seq_printf

Andrew Morton suggested adding a printf checker to trace_seq_printf
since there are a number of users that have improper format arguments.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_output.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index b2c14615e0cd..1cbab5e3dc99 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -16,7 +16,8 @@ struct trace_event {
 	trace_print_func	binary;
 };
 
-extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...);
+extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
+	__attribute__ ((format (printf, 2, 3)));
 extern int
 seq_print_ip_sym(struct trace_seq *s, unsigned long ip,
 		unsigned long sym_flags);
-- 
cgit v1.2.3


From 5e4abc9839191e213965e0f1dbf36e2e44356c3a Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 15 Jan 2009 21:00:50 -0500
Subject: trace: clean up format errors in calls to trace_seq_printf

After adding the printf format checking for trace_seq_printf, several
warnings now show up. This patch cleans them up.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/kmemtrace.c       |  4 ++--
 kernel/trace/trace_mmiotrace.c | 13 +++++++------
 kernel/trace/trace_output.c    |  2 +-
 3 files changed, 10 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index faaa5ae7e75a..7ebc58cee3bd 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -139,12 +139,12 @@ kmemtrace_print_alloc_compress(struct trace_iterator *iter,
 		return TRACE_TYPE_PARTIAL_LINE;
 
 	/* Requested */
-	ret = trace_seq_printf(s, "%4d   ", entry->bytes_req);
+	ret = trace_seq_printf(s, "%4ld   ", entry->bytes_req);
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
 	/* Allocated */
-	ret = trace_seq_printf(s, "%4d   ", entry->bytes_alloc);
+	ret = trace_seq_printf(s, "%4ld   ", entry->bytes_alloc);
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 621c8c3f3139..ec78e244242e 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -184,21 +184,22 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter)
 	switch (rw->opcode) {
 	case MMIO_READ:
 		ret = trace_seq_printf(s,
-			"R %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
+			"R %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
 			rw->width, secs, usec_rem, rw->map_id,
 			(unsigned long long)rw->phys,
 			rw->value, rw->pc, 0);
 		break;
 	case MMIO_WRITE:
 		ret = trace_seq_printf(s,
-			"W %d %lu.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
+			"W %d %u.%06lu %d 0x%llx 0x%lx 0x%lx %d\n",
 			rw->width, secs, usec_rem, rw->map_id,
 			(unsigned long long)rw->phys,
 			rw->value, rw->pc, 0);
 		break;
 	case MMIO_UNKNOWN_OP:
 		ret = trace_seq_printf(s,
-			"UNKNOWN %lu.%06lu %d 0x%llx %02x,%02x,%02x 0x%lx %d\n",
+			"UNKNOWN %u.%06lu %d 0x%llx %02lx,%02lx,"
+			"%02lx 0x%lx %d\n",
 			secs, usec_rem, rw->map_id,
 			(unsigned long long)rw->phys,
 			(rw->value >> 16) & 0xff, (rw->value >> 8) & 0xff,
@@ -230,14 +231,14 @@ static enum print_line_t mmio_print_map(struct trace_iterator *iter)
 	switch (m->opcode) {
 	case MMIO_PROBE:
 		ret = trace_seq_printf(s,
-			"MAP %lu.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n",
+			"MAP %u.%06lu %d 0x%llx 0x%lx 0x%lx 0x%lx %d\n",
 			secs, usec_rem, m->map_id,
 			(unsigned long long)m->phys, m->virt, m->len,
 			0UL, 0);
 		break;
 	case MMIO_UNPROBE:
 		ret = trace_seq_printf(s,
-			"UNMAP %lu.%06lu %d 0x%lx %d\n",
+			"UNMAP %u.%06lu %d 0x%lx %d\n",
 			secs, usec_rem, m->map_id, 0UL, 0);
 		break;
 	default:
@@ -261,7 +262,7 @@ static enum print_line_t mmio_print_mark(struct trace_iterator *iter)
 	int ret;
 
 	/* The trailing newline must be in the message. */
-	ret = trace_seq_printf(s, "MARK %lu.%06lu %s", secs, usec_rem, msg);
+	ret = trace_seq_printf(s, "MARK %u.%06lu %s", secs, usec_rem, msg);
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 4e3ad36b117c..1a4e144a9f8f 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -440,7 +440,7 @@ trace_fn_raw(struct trace_seq *s, struct trace_entry *entry, int flags)
 
 	trace_assign_type(field, entry);
 
-	if (!trace_seq_printf(s, "%x %x\n",
+	if (!trace_seq_printf(s, "%lx %lx\n",
 			      field->ip,
 			      field->parent_ip))
 		return TRACE_TYPE_PARTIAL_LINE;
-- 
cgit v1.2.3


From 3eb36aa05329a47cbe201c151fd0024a4a3649cd Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 15 Jan 2009 22:21:43 -0500
Subject: ftrace: combine stack trace in function call

Impact: less likely to interleave function and stack traces

This patch does replaces the separate stack trace on function with
a record function and stack trace together. This will switch between
the function only recording to a function and stack recording.

Also some whitespace fix ups as well.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_functions.c | 61 +++++++++++++++++++++++++-----------------
 1 file changed, 36 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 2dce3c7370d1..61d0b73dabf5 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -133,6 +133,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip)
 
 	if (likely(disabled == 1)) {
 		pc = preempt_count();
+		trace_function(tr, data, ip, parent_ip, flags, pc);
 		/*
 		 * skip over 5 funcs:
 		 *    __ftrace_trace_stack,
@@ -154,24 +155,6 @@ static struct ftrace_ops trace_ops __read_mostly =
 	.func = function_trace_call,
 };
 
-void tracing_start_function_trace(void)
-{
-	ftrace_function_enabled = 0;
-
-	if (trace_flags & TRACE_ITER_PREEMPTONLY)
-		trace_ops.func = function_trace_call_preempt_only;
-	else
-		trace_ops.func = function_trace_call;
-
-	register_ftrace_function(&trace_ops);
-	ftrace_function_enabled = 1;
-}
-
-void tracing_stop_function_trace(void)
-{
-	ftrace_function_enabled = 0;
-	unregister_ftrace_function(&trace_ops);
-}
 static struct ftrace_ops trace_stack_ops __read_mostly =
 {
 	.func = function_stack_trace_call,
@@ -194,6 +177,31 @@ static struct tracer_flags func_flags = {
 	.opts = func_opts
 };
 
+void tracing_start_function_trace(void)
+{
+	ftrace_function_enabled = 0;
+
+	if (trace_flags & TRACE_ITER_PREEMPTONLY)
+		trace_ops.func = function_trace_call_preempt_only;
+	else
+		trace_ops.func = function_trace_call;
+
+	if (func_flags.val & TRACE_FUNC_OPT_STACK)
+		register_ftrace_function(&trace_stack_ops);
+	else
+		register_ftrace_function(&trace_ops);
+
+	ftrace_function_enabled = 1;
+}
+
+void tracing_stop_function_trace(void)
+{
+	ftrace_function_enabled = 0;
+	/* OK if they are not registered */
+	unregister_ftrace_function(&trace_stack_ops);
+	unregister_ftrace_function(&trace_ops);
+}
+
 static int func_set_flag(u32 old_flags, u32 bit, int set)
 {
 	if (bit == TRACE_FUNC_OPT_STACK) {
@@ -201,10 +209,13 @@ static int func_set_flag(u32 old_flags, u32 bit, int set)
 		if (!!set == !!(func_flags.val & TRACE_FUNC_OPT_STACK))
 			return 0;
 
-		if (set)
+		if (set) {
+			unregister_ftrace_function(&trace_ops);
 			register_ftrace_function(&trace_stack_ops);
-		else
+		} else {
 			unregister_ftrace_function(&trace_stack_ops);
+			register_ftrace_function(&trace_ops);
+		}
 
 		return 0;
 	}
@@ -214,14 +225,14 @@ static int func_set_flag(u32 old_flags, u32 bit, int set)
 
 static struct tracer function_trace __read_mostly =
 {
-	.name	     = "function",
-	.init	     = function_trace_init,
-	.reset	     = function_trace_reset,
-	.start	     = function_trace_start,
+	.name		= "function",
+	.init		= function_trace_init,
+	.reset		= function_trace_reset,
+	.start		= function_trace_start,
 	.flags		= &func_flags,
 	.set_flag	= func_set_flag,
 #ifdef CONFIG_FTRACE_SELFTEST
-	.selftest    = trace_selftest_startup_function,
+	.selftest	= trace_selftest_startup_function,
 #endif
 };
 
-- 
cgit v1.2.3


From a225cdd263f340c864febb1992802fb5b08bc328 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 15 Jan 2009 23:06:03 -0500
Subject: ftrace: remove static from function tracer functions

Impact: clean up

After reorganizing the functions in trace.c and trace_function.c,
they no longer need to be in global context. This patch makes the
functions and one variable into static.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c           |  3 ---
 kernel/trace/trace.h           | 10 ----------
 kernel/trace/trace_functions.c | 10 ++++++++--
 3 files changed, 8 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 2585ffb6c6b5..7de6a94063dd 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -187,9 +187,6 @@ int tracing_is_enabled(void)
 	return tracer_enabled;
 }
 
-/* function tracing enabled */
-int				ftrace_function_enabled;
-
 /*
  * trace_buf_size is the size in bytes that is allocated
  * for a buffer. Note, the number of bytes is always rounded
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index bf39a369e4b3..54b72781e920 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -464,16 +464,6 @@ void __trace_stack(struct trace_array *tr,
 
 extern cycle_t ftrace_now(int cpu);
 
-#ifdef CONFIG_FUNCTION_TRACER
-void tracing_start_function_trace(void);
-void tracing_stop_function_trace(void);
-#else
-# define tracing_start_function_trace()		do { } while (0)
-# define tracing_stop_function_trace()		do { } while (0)
-#endif
-
-extern int ftrace_function_enabled;
-
 #ifdef CONFIG_CONTEXT_SWITCH_TRACER
 typedef void
 (*tracer_switch_func_t)(void *private,
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 61d0b73dabf5..b3a320f8aba7 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -16,8 +16,14 @@
 
 #include "trace.h"
 
+/* function tracing enabled */
+static int			ftrace_function_enabled;
+
 static struct trace_array	*func_trace;
 
+static void tracing_start_function_trace(void);
+static void tracing_stop_function_trace(void);
+
 static void start_function_trace(struct trace_array *tr)
 {
 	func_trace = tr;
@@ -177,7 +183,7 @@ static struct tracer_flags func_flags = {
 	.opts = func_opts
 };
 
-void tracing_start_function_trace(void)
+static void tracing_start_function_trace(void)
 {
 	ftrace_function_enabled = 0;
 
@@ -194,7 +200,7 @@ void tracing_start_function_trace(void)
 	ftrace_function_enabled = 1;
 }
 
-void tracing_stop_function_trace(void)
+static void tracing_stop_function_trace(void)
 {
 	ftrace_function_enabled = 0;
 	/* OK if they are not registered */
-- 
cgit v1.2.3


From 745b1626dd71ce9661a05ea4db57859ed5c773d2 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 15 Jan 2009 23:40:11 -0500
Subject: trace: set max latency variable to zero on default

Impact: trace max latencies on start of latency tracing

This patch sets the max latency to zero whenever one of the
irq variant tracers or the wakeup tracer is set to current tracer.

Most developers expect to see output when starting up a latency
tracer. But since the max_latency is already set to max, and
it takes a latency greater than max_latency to be recorded, there
is no trace. This is not the expected behavior and has even confused
myself.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c              | 2 +-
 kernel/trace/trace_irqsoff.c      | 1 +
 kernel/trace/trace_sched_wakeup.c | 1 +
 3 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 7de6a94063dd..220c264e3111 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -41,7 +41,7 @@
 
 #define TRACE_BUFFER_FLAGS	(RB_FL_OVERWRITE)
 
-unsigned long __read_mostly	tracing_max_latency = (cycle_t)ULONG_MAX;
+unsigned long __read_mostly	tracing_max_latency;
 unsigned long __read_mostly	tracing_thresh;
 
 /*
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 7c2e326bbc8b..62a78d943534 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -380,6 +380,7 @@ static void stop_irqsoff_tracer(struct trace_array *tr)
 
 static void __irqsoff_tracer_init(struct trace_array *tr)
 {
+	tracing_max_latency = 0;
 	irqsoff_trace = tr;
 	/* make sure that the tracer is visible */
 	smp_wmb();
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 43586b689e31..42ae1e77b6b3 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -333,6 +333,7 @@ static void stop_wakeup_tracer(struct trace_array *tr)
 
 static int wakeup_tracer_init(struct trace_array *tr)
 {
+	tracing_max_latency = 0;
 	wakeup_trace = tr;
 	start_wakeup_tracer(tr);
 	return 0;
-- 
cgit v1.2.3


From 2d68259db26ad57fd9643f1c69b5181ec9836ca9 Mon Sep 17 00:00:00 2001
From: Magnus Damm <magnus.damm@gmail.com>
Date: Fri, 16 Jan 2009 17:14:38 +0900
Subject: clockevents: let set_mode() setup delta information

Allow the set_mode() clockevent callback to decide and fill in delta
details such as shift, mult, max_delta_ns and min_delta_ns.

With this change the clockevent can be registered without delta details
which allows us to keep the parent clock disabled until the clockevent
gets setup using set_mode().

Letting set_mode() fill in or update delta details allows us to save
power by disabling the parent clock while the clockevent is unused.
This may however make the parent clock rate change, so next time the
clockevent gets enabled we need let set_mode() to update the detla
details accordingly. Doing it at registration time is not enough.

Furthermore, the delta details seem unused in the case of periodic-only
clockevent drivers, so this change also allows registration of such
drivers without the delta details filled in.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/time/clockevents.c | 20 +++++++++++---------
 1 file changed, 11 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c
index ea2f48af83cf..d13be216a790 100644
--- a/kernel/time/clockevents.c
+++ b/kernel/time/clockevents.c
@@ -68,6 +68,17 @@ void clockevents_set_mode(struct clock_event_device *dev,
 	if (dev->mode != mode) {
 		dev->set_mode(mode, dev);
 		dev->mode = mode;
+
+		/*
+		 * A nsec2cyc multiplicator of 0 is invalid and we'd crash
+		 * on it, so fix it up and emit a warning:
+		 */
+		if (mode == CLOCK_EVT_MODE_ONESHOT) {
+			if (unlikely(!dev->mult)) {
+				dev->mult = 1;
+				WARN_ON(1);
+			}
+		}
 	}
 }
 
@@ -168,15 +179,6 @@ void clockevents_register_device(struct clock_event_device *dev)
 	BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED);
 	BUG_ON(!dev->cpumask);
 
-	/*
-	 * A nsec2cyc multiplicator of 0 is invalid and we'd crash
-	 * on it, so fix it up and emit a warning:
-	 */
-	if (unlikely(!dev->mult)) {
-		dev->mult = 1;
-		WARN_ON(1);
-	}
-
 	spin_lock(&clockevents_lock);
 
 	list_add(&dev->list, &clockevent_devices);
-- 
cgit v1.2.3


From e162b39a368f0401e41b558f430c354d12a85b37 Mon Sep 17 00:00:00 2001
From: Mandeep Singh Baines <msb@google.com>
Date: Thu, 15 Jan 2009 11:08:40 -0800
Subject: softlockup: decouple hung tasks check from softlockup detection

Decoupling allows:

* hung tasks check to happen at very low priority

* hung tasks check and softlockup to be enabled/disabled independently
  at compile and/or run-time

* individual panic settings to be enabled disabled independently
  at compile and/or run-time

* softlockup threshold to be reduced without increasing hung tasks
  poll frequency (hung task check is expensive relative to softlock watchdog)

* hung task check to be zero over-head when disabled at run-time

Signed-off-by: Mandeep Singh Baines <msb@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h |  14 +++-
 kernel/Makefile       |   1 +
 kernel/hung_task.c    | 198 ++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/softlockup.c   | 100 -------------------------
 kernel/sysctl.c       |  15 +++-
 lib/Kconfig.debug     |  38 ++++++++++
 6 files changed, 261 insertions(+), 105 deletions(-)
 create mode 100644 kernel/hung_task.c

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 54cbabf3b871..f2f94d532302 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -297,9 +297,6 @@ extern int proc_dosoftlockup_thresh(struct ctl_table *table, int write,
 				    struct file *filp, void __user *buffer,
 				    size_t *lenp, loff_t *ppos);
 extern unsigned int  softlockup_panic;
-extern unsigned long sysctl_hung_task_check_count;
-extern unsigned long sysctl_hung_task_timeout_secs;
-extern unsigned long sysctl_hung_task_warnings;
 extern int softlockup_thresh;
 #else
 static inline void softlockup_tick(void)
@@ -316,6 +313,15 @@ static inline void touch_all_softlockup_watchdogs(void)
 }
 #endif
 
+#ifdef CONFIG_DETECT_HUNG_TASK
+extern unsigned int  sysctl_hung_task_panic;
+extern unsigned long sysctl_hung_task_check_count;
+extern unsigned long sysctl_hung_task_timeout_secs;
+extern unsigned long sysctl_hung_task_warnings;
+extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
+					 struct file *filp, void __user *buffer,
+					 size_t *lenp, loff_t *ppos);
+#endif
 
 /* Attach to any functions which should be ignored in wchan output. */
 #define __sched		__attribute__((__section__(".sched.text")))
@@ -1236,7 +1242,7 @@ struct task_struct {
 /* ipc stuff */
 	struct sysv_sem sysvsem;
 #endif
-#ifdef CONFIG_DETECT_SOFTLOCKUP
+#ifdef CONFIG_DETECT_HUNG_TASK
 /* hung task detection */
 	unsigned long last_switch_timestamp;
 	unsigned long last_switch_count;
diff --git a/kernel/Makefile b/kernel/Makefile
index 2aebc4cd7878..979745f1b4bc 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -74,6 +74,7 @@ obj-$(CONFIG_AUDIT_TREE) += audit_tree.o
 obj-$(CONFIG_KPROBES) += kprobes.o
 obj-$(CONFIG_KGDB) += kgdb.o
 obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o
+obj-$(CONFIG_DETECT_HUNG_TASK) += hung_task.o
 obj-$(CONFIG_GENERIC_HARDIRQS) += irq/
 obj-$(CONFIG_SECCOMP) += seccomp.o
 obj-$(CONFIG_RCU_TORTURE_TEST) += rcutorture.o
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
new file mode 100644
index 000000000000..ba5a77cad3bb
--- /dev/null
+++ b/kernel/hung_task.c
@@ -0,0 +1,198 @@
+/*
+ * Detect Hung Task
+ *
+ * kernel/hung_task.c - kernel thread for detecting tasks stuck in D state
+ *
+ */
+
+#include <linux/mm.h>
+#include <linux/cpu.h>
+#include <linux/nmi.h>
+#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/freezer.h>
+#include <linux/kthread.h>
+#include <linux/lockdep.h>
+#include <linux/module.h>
+#include <linux/sysctl.h>
+
+/*
+ * Have a reasonable limit on the number of tasks checked:
+ */
+unsigned long __read_mostly sysctl_hung_task_check_count = 1024;
+
+/*
+ * Zero means infinite timeout - no checking done:
+ */
+unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120;
+static unsigned long __read_mostly hung_task_poll_jiffies;
+
+unsigned long __read_mostly sysctl_hung_task_warnings = 10;
+
+static int __read_mostly did_panic;
+
+static struct task_struct *watchdog_task;
+
+/*
+ * Should we panic (and reboot, if panic_timeout= is set) when a
+ * hung task is detected:
+ */
+unsigned int __read_mostly sysctl_hung_task_panic =
+				CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE;
+
+static int __init hung_task_panic_setup(char *str)
+{
+	sysctl_hung_task_panic = simple_strtoul(str, NULL, 0);
+
+	return 1;
+}
+__setup("hung_task_panic=", hung_task_panic_setup);
+
+static int
+hung_task_panic(struct notifier_block *this, unsigned long event, void *ptr)
+{
+	did_panic = 1;
+
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block panic_block = {
+	.notifier_call = hung_task_panic,
+};
+
+/*
+ * Returns seconds, approximately.  We don't need nanosecond
+ * resolution, and we don't need to waste time with a big divide when
+ * 2^30ns == 1.074s.
+ */
+static unsigned long get_timestamp(void)
+{
+	int this_cpu = raw_smp_processor_id();
+
+	return cpu_clock(this_cpu) >> 30LL;  /* 2^30 ~= 10^9 */
+}
+
+static void check_hung_task(struct task_struct *t, unsigned long now)
+{
+	unsigned long switch_count = t->nvcsw + t->nivcsw;
+
+	if (t->flags & PF_FROZEN)
+		return;
+
+	if (switch_count != t->last_switch_count || !t->last_switch_timestamp) {
+		t->last_switch_count = switch_count;
+		t->last_switch_timestamp = now;
+		return;
+	}
+	if ((long)(now - t->last_switch_timestamp) <
+					sysctl_hung_task_timeout_secs)
+		return;
+	if (!sysctl_hung_task_warnings)
+		return;
+	sysctl_hung_task_warnings--;
+
+	/*
+	 * Ok, the task did not get scheduled for more than 2 minutes,
+	 * complain:
+	 */
+	printk(KERN_ERR "INFO: task %s:%d blocked for more than "
+			"%ld seconds.\n", t->comm, t->pid,
+			sysctl_hung_task_timeout_secs);
+	printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
+			" disables this message.\n");
+	sched_show_task(t);
+	__debug_show_held_locks(t);
+
+	t->last_switch_timestamp = now;
+	touch_nmi_watchdog();
+
+	if (sysctl_hung_task_panic)
+		panic("hung_task: blocked tasks");
+}
+
+/*
+ * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for
+ * a really long time (120 seconds). If that happens, print out
+ * a warning.
+ */
+static void check_hung_uninterruptible_tasks(void)
+{
+	int max_count = sysctl_hung_task_check_count;
+	unsigned long now = get_timestamp();
+	struct task_struct *g, *t;
+
+	/*
+	 * If the system crashed already then all bets are off,
+	 * do not report extra hung tasks:
+	 */
+	if (test_taint(TAINT_DIE) || did_panic)
+		return;
+
+	read_lock(&tasklist_lock);
+	do_each_thread(g, t) {
+		if (!--max_count)
+			goto unlock;
+		/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
+		if (t->state == TASK_UNINTERRUPTIBLE)
+			check_hung_task(t, now);
+	} while_each_thread(g, t);
+ unlock:
+	read_unlock(&tasklist_lock);
+}
+
+static void update_poll_jiffies(void)
+{
+	/* timeout of 0 will disable the watchdog */
+	if (sysctl_hung_task_timeout_secs == 0)
+		hung_task_poll_jiffies = MAX_SCHEDULE_TIMEOUT;
+	else
+		hung_task_poll_jiffies = sysctl_hung_task_timeout_secs * HZ / 2;
+}
+
+/*
+ * Process updating of timeout sysctl
+ */
+int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
+				  struct file *filp, void __user *buffer,
+				  size_t *lenp, loff_t *ppos)
+{
+	int ret;
+
+	ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos);
+
+	if (ret || !write)
+		goto out;
+
+	update_poll_jiffies();
+
+	wake_up_process(watchdog_task);
+
+ out:
+	return ret;
+}
+
+/*
+ * kthread which checks for tasks stuck in D state
+ */
+static int watchdog(void *dummy)
+{
+	set_user_nice(current, 0);
+	update_poll_jiffies();
+
+	for ( ; ; ) {
+		while (schedule_timeout_interruptible(hung_task_poll_jiffies));
+		check_hung_uninterruptible_tasks();
+	}
+
+	return 0;
+}
+
+static int __init hung_task_init(void)
+{
+	atomic_notifier_chain_register(&panic_notifier_list, &panic_block);
+	watchdog_task = kthread_run(watchdog, NULL, "khungtaskd");
+
+	return 0;
+}
+
+module_init(hung_task_init);
diff --git a/kernel/softlockup.c b/kernel/softlockup.c
index 85d5a2455103..88796c330838 100644
--- a/kernel/softlockup.c
+++ b/kernel/softlockup.c
@@ -165,98 +165,12 @@ void softlockup_tick(void)
 		panic("softlockup: hung tasks");
 }
 
-/*
- * Have a reasonable limit on the number of tasks checked:
- */
-unsigned long __read_mostly sysctl_hung_task_check_count = 1024;
-
-/*
- * Zero means infinite timeout - no checking done:
- */
-unsigned long __read_mostly sysctl_hung_task_timeout_secs = 480;
-
-unsigned long __read_mostly sysctl_hung_task_warnings = 10;
-
-/*
- * Only do the hung-tasks check on one CPU:
- */
-static int check_cpu __read_mostly = -1;
-
-static void check_hung_task(struct task_struct *t, unsigned long now)
-{
-	unsigned long switch_count = t->nvcsw + t->nivcsw;
-
-	if (t->flags & PF_FROZEN)
-		return;
-
-	if (switch_count != t->last_switch_count || !t->last_switch_timestamp) {
-		t->last_switch_count = switch_count;
-		t->last_switch_timestamp = now;
-		return;
-	}
-	if ((long)(now - t->last_switch_timestamp) <
-					sysctl_hung_task_timeout_secs)
-		return;
-	if (!sysctl_hung_task_warnings)
-		return;
-	sysctl_hung_task_warnings--;
-
-	/*
-	 * Ok, the task did not get scheduled for more than 2 minutes,
-	 * complain:
-	 */
-	printk(KERN_ERR "INFO: task %s:%d blocked for more than "
-			"%ld seconds.\n", t->comm, t->pid,
-			sysctl_hung_task_timeout_secs);
-	printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
-			" disables this message.\n");
-	sched_show_task(t);
-	__debug_show_held_locks(t);
-
-	t->last_switch_timestamp = now;
-	touch_nmi_watchdog();
-
-	if (softlockup_panic)
-		panic("softlockup: blocked tasks");
-}
-
-/*
- * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for
- * a really long time (120 seconds). If that happens, print out
- * a warning.
- */
-static void check_hung_uninterruptible_tasks(int this_cpu)
-{
-	int max_count = sysctl_hung_task_check_count;
-	unsigned long now = get_timestamp(this_cpu);
-	struct task_struct *g, *t;
-
-	/*
-	 * If the system crashed already then all bets are off,
-	 * do not report extra hung tasks:
-	 */
-	if (test_taint(TAINT_DIE) || did_panic)
-		return;
-
-	read_lock(&tasklist_lock);
-	do_each_thread(g, t) {
-		if (!--max_count)
-			goto unlock;
-		/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
-		if (t->state == TASK_UNINTERRUPTIBLE)
-			check_hung_task(t, now);
-	} while_each_thread(g, t);
- unlock:
-	read_unlock(&tasklist_lock);
-}
-
 /*
  * The watchdog thread - runs every second and touches the timestamp.
  */
 static int watchdog(void *__bind_cpu)
 {
 	struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 };
-	int this_cpu = (long)__bind_cpu;
 
 	sched_setscheduler(current, SCHED_FIFO, &param);
 
@@ -276,11 +190,6 @@ static int watchdog(void *__bind_cpu)
 		if (kthread_should_stop())
 			break;
 
-		if (this_cpu == check_cpu) {
-			if (sysctl_hung_task_timeout_secs)
-				check_hung_uninterruptible_tasks(this_cpu);
-		}
-
 		set_current_state(TASK_INTERRUPTIBLE);
 	}
 	__set_current_state(TASK_RUNNING);
@@ -312,18 +221,9 @@ cpu_callback(struct notifier_block *nfb, unsigned long action, void *hcpu)
 		break;
 	case CPU_ONLINE:
 	case CPU_ONLINE_FROZEN:
-		check_cpu = cpumask_any(cpu_online_mask);
 		wake_up_process(per_cpu(watchdog_task, hotcpu));
 		break;
 #ifdef CONFIG_HOTPLUG_CPU
-	case CPU_DOWN_PREPARE:
-	case CPU_DOWN_PREPARE_FROZEN:
-		if (hotcpu == check_cpu) {
-			/* Pick any other online cpu. */
-			check_cpu = cpumask_any_but(cpu_online_mask, hotcpu);
-		}
-		break;
-
 	case CPU_UP_CANCELED:
 	case CPU_UP_CANCELED_FROZEN:
 		if (!per_cpu(watchdog_task, hotcpu))
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 596dc31a7116..2481ed30d2b5 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -805,6 +805,19 @@ static struct ctl_table kern_table[] = {
 		.extra1		= &neg_one,
 		.extra2		= &sixty,
 	},
+#endif
+#ifdef CONFIG_DETECT_HUNG_TASK
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "hung_task_panic",
+		.data		= &sysctl_hung_task_panic,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+		.extra2		= &one,
+	},
 	{
 		.ctl_name	= CTL_UNNUMBERED,
 		.procname	= "hung_task_check_count",
@@ -820,7 +833,7 @@ static struct ctl_table kern_table[] = {
 		.data		= &sysctl_hung_task_timeout_secs,
 		.maxlen		= sizeof(unsigned long),
 		.mode		= 0644,
-		.proc_handler	= &proc_doulongvec_minmax,
+		.proc_handler	= &proc_dohung_task_timeout_secs,
 		.strategy	= &sysctl_intvec,
 	},
 	{
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 4c9ae6085c75..883ecea22f37 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -186,6 +186,44 @@ config BOOTPARAM_SOFTLOCKUP_PANIC_VALUE
 	default 0 if !BOOTPARAM_SOFTLOCKUP_PANIC
 	default 1 if BOOTPARAM_SOFTLOCKUP_PANIC
 
+config DETECT_HUNG_TASK
+	bool "Detect Hung Tasks"
+	depends on DEBUG_KERNEL
+	default y
+	help
+	  Say Y here to enable the kernel to detect "hung tasks",
+	  which are bugs that cause the task to be stuck in
+	  uninterruptible "D" state indefinitiley.
+
+	  When a hung task is detected, the kernel will print the
+	  current stack trace (which you should report), but the
+	  task will stay in uninterruptible state. If lockdep is
+	  enabled then all held locks will also be reported. This
+	  feature has negligible overhead.
+
+config BOOTPARAM_HUNG_TASK_PANIC
+	bool "Panic (Reboot) On Hung Tasks"
+	depends on DETECT_HUNG_TASK
+	help
+	  Say Y here to enable the kernel to panic on "hung tasks",
+	  which are bugs that cause the kernel to leave a task stuck
+	  in uninterruptible "D" state.
+
+	  The panic can be used in combination with panic_timeout,
+	  to cause the system to reboot automatically after a
+	  hung task has been detected. This feature is useful for
+	  high-availability systems that have uptime guarantees and
+	  where a hung tasks must be resolved ASAP.
+
+	  Say N if unsure.
+
+config BOOTPARAM_HUNG_TASK_PANIC_VALUE
+	int
+	depends on DETECT_HUNG_TASK
+	range 0 1
+	default 0 if !BOOTPARAM_HUNG_TASK_PANIC
+	default 1 if BOOTPARAM_HUNG_TASK_PANIC
+
 config SCHED_DEBUG
 	bool "Collect scheduler debugging info"
 	depends on DEBUG_KERNEL && PROC_FS
-- 
cgit v1.2.3


From ceacc2c1c85ac498ca4cf297bdfe5b4aaa9fd0e0 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Fri, 16 Jan 2009 14:46:40 +0100
Subject: sched: make plist a library facility

Ingo Molnar wrote:

> here's a new build failure with tip/sched/rt:
>
>   LD      .tmp_vmlinux1
> kernel/built-in.o: In function `set_curr_task_rt':
> sched.c:(.text+0x3675): undefined reference to `plist_del'
> kernel/built-in.o: In function `pick_next_task_rt':
> sched.c:(.text+0x37ce): undefined reference to `plist_del'
> kernel/built-in.o: In function `enqueue_pushable_task':
> sched.c:(.text+0x381c): undefined reference to `plist_del'

Eliminate the plist library kconfig and make it available
unconditionally.

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 init/Kconfig      |  1 -
 kernel/sched_rt.c | 21 +++++++++++++++------
 lib/Kconfig       |  6 ------
 lib/Makefile      |  4 ++--
 4 files changed, 17 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/init/Kconfig b/init/Kconfig
index a724a149bf3f..19b78aa010e3 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -874,7 +874,6 @@ config SLABINFO
 
 config RT_MUTEXES
 	boolean
-	select PLIST
 
 config BASE_SMALL
 	int
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 4230b15fe90e..48d1f6e8497a 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -114,14 +114,23 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
 
 #else
 
+static inline void enqueue_pushable_task(struct rq *rq, struct task_struct *p)
+{
+}
+
+static inline void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
+{
+}
+
 static inline
-void enqueue_pushable_task(struct rq *rq, struct task_struct *p) {}
-static inline
-void dequeue_pushable_task(struct rq *rq, struct task_struct *p) {}
-static inline
-void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
+void inc_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+}
+
 static inline
-void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) {}
+void dec_rt_migration(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq)
+{
+}
 
 #endif /* CONFIG_SMP */
 
diff --git a/lib/Kconfig b/lib/Kconfig
index 03c2c24b9083..fc8ea1ca59d8 100644
--- a/lib/Kconfig
+++ b/lib/Kconfig
@@ -136,12 +136,6 @@ config TEXTSEARCH_BM
 config TEXTSEARCH_FSM
 	tristate
 
-#
-# plist support is select#ed if needed
-#
-config PLIST
-	boolean
-
 config HAS_IOMEM
 	boolean
 	depends on !NO_IOMEM
diff --git a/lib/Makefile b/lib/Makefile
index 32b0e64ded27..902d73851044 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -11,7 +11,8 @@ lib-y := ctype.o string.o vsprintf.o cmdline.o \
 	 rbtree.o radix-tree.o dump_stack.o \
 	 idr.o int_sqrt.o extable.o prio_tree.o \
 	 sha1.o irq_regs.o reciprocal_div.o argv_split.o \
-	 proportions.o prio_heap.o ratelimit.o show_mem.o is_single_threaded.o
+	 proportions.o prio_heap.o ratelimit.o show_mem.o \
+	 is_single_threaded.o plist.o
 
 lib-$(CONFIG_MMU) += ioremap.o
 lib-$(CONFIG_SMP) += cpumask.o
@@ -40,7 +41,6 @@ lib-$(CONFIG_GENERIC_FIND_NEXT_BIT) += find_next_bit.o
 lib-$(CONFIG_GENERIC_FIND_LAST_BIT) += find_last_bit.o
 obj-$(CONFIG_GENERIC_HWEIGHT) += hweight.o
 obj-$(CONFIG_LOCK_KERNEL) += kernel_lock.o
-obj-$(CONFIG_PLIST) += plist.o
 obj-$(CONFIG_DEBUG_PREEMPT) += smp_processor_id.o
 obj-$(CONFIG_DEBUG_LIST) += list_debug.o
 obj-$(CONFIG_DEBUG_OBJECTS) += debugobjects.o
-- 
cgit v1.2.3


From 74296a8ed6aa3c5bf672808ada690de7ba323ecc Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 16 Jan 2009 17:43:50 +0100
Subject: irq: provide debug_poll_all_shared_irqs() method under
 CONFIG_DEBUG_SHIRQ

Provide a shared interrupt debug facility under CONFIG_DEBUG_SHIRQ:
it uses the existing irqpoll facilities to iterate through all
registered interrupt handlers and call those which can handle shared
IRQ lines.

This can be handy for suspend/resume debugging: if we call this function
early during resume we can trigger crashes in those drivers which have
incorrect assumptions about when exactly their ISRs will be called
during suspend/resume.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/interrupt.h |  6 ++++++
 kernel/irq/spurious.c     | 14 +++++++++++++-
 2 files changed, 19 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 9127f6b51a39..468e3a25a4a1 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -462,6 +462,12 @@ static inline void init_irq_proc(void)
 }
 #endif
 
+#if defined(CONFIG_GENERIC_HARDIRQS) && defined(CONFIG_DEBUG_SHIRQ)
+extern void debug_poll_all_shared_irqs(void);
+#else
+static inline void debug_poll_all_shared_irqs(void) { }
+#endif
+
 int show_interrupts(struct seq_file *p, void *v);
 
 struct irq_desc;
diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c
index dd364c11e56e..4d568294de3e 100644
--- a/kernel/irq/spurious.c
+++ b/kernel/irq/spurious.c
@@ -104,7 +104,7 @@ static int misrouted_irq(int irq)
 	return ok;
 }
 
-static void poll_spurious_irqs(unsigned long dummy)
+static void poll_all_shared_irqs(void)
 {
 	struct irq_desc *desc;
 	int i;
@@ -123,11 +123,23 @@ static void poll_spurious_irqs(unsigned long dummy)
 
 		try_one_irq(i, desc);
 	}
+}
+
+static void poll_spurious_irqs(unsigned long dummy)
+{
+	poll_all_shared_irqs();
 
 	mod_timer(&poll_spurious_irq_timer,
 		  jiffies + POLL_SPURIOUS_IRQ_INTERVAL);
 }
 
+#ifdef CONFIG_DEBUG_SHIRQ
+void debug_poll_all_shared_irqs(void)
+{
+	poll_all_shared_irqs();
+}
+#endif
+
 /*
  * If 99,900 of the previous 100,000 interrupts have not been handled
  * then assume that the IRQ is stuck in some manner. Drop a diagnostic
-- 
cgit v1.2.3


From af432eb1cc3178ec7109aca2283aafb1c12ccac1 Mon Sep 17 00:00:00 2001
From: Mandeep Singh Baines <msb@google.com>
Date: Fri, 16 Jan 2009 09:09:38 -0800
Subject: softlockup: fix to allow compiling with !DETECT_HUNG_TASK

Fixes the following compile error:

 kernel/fork.c:1049: error: 'struct task_struct' has no member named 'last_switch_count'
 kernel/fork.c:1050: error: 'struct task_struct' has no member named 'last_switch_timestamp'

Signed-off-by: Mandeep Singh Baines <msb@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/fork.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 1d68f1255dd8..fb9444282836 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1041,7 +1041,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
 	p->default_timer_slack_ns = current->timer_slack_ns;
 
-#ifdef CONFIG_DETECT_SOFTLOCKUP
+#ifdef CONFIG_DETECT_HUNG_TASK
 	p->last_switch_count = 0;
 	p->last_switch_timestamp = 0;
 #endif
-- 
cgit v1.2.3


From 091d71e023557136e96f0e54f301497a3fc95dc3 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Sat, 17 Jan 2009 00:10:45 +0100
Subject: PM: Fix compilation warning in kernel/power/main.c

Reorder the code in kernel/power/main.c to fix compilation warning
triggered by unsetting CONFIG_SUSPEND.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Len Brown <len.brown@intel.com>
---
 kernel/power/main.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/main.c b/kernel/power/main.c
index 239988873971..b4d219016b6c 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -57,16 +57,6 @@ int pm_notifier_call_chain(unsigned long val)
 #ifdef CONFIG_PM_DEBUG
 int pm_test_level = TEST_NONE;
 
-static int suspend_test(int level)
-{
-	if (pm_test_level == level) {
-		printk(KERN_INFO "suspend debug: Waiting for 5 seconds.\n");
-		mdelay(5000);
-		return 1;
-	}
-	return 0;
-}
-
 static const char * const pm_tests[__TEST_AFTER_LAST] = {
 	[TEST_NONE] = "none",
 	[TEST_CORE] = "core",
@@ -125,14 +115,24 @@ static ssize_t pm_test_store(struct kobject *kobj, struct kobj_attribute *attr,
 }
 
 power_attr(pm_test);
-#else /* !CONFIG_PM_DEBUG */
-static inline int suspend_test(int level) { return 0; }
-#endif /* !CONFIG_PM_DEBUG */
+#endif /* CONFIG_PM_DEBUG */
 
 #endif /* CONFIG_PM_SLEEP */
 
 #ifdef CONFIG_SUSPEND
 
+static int suspend_test(int level)
+{
+#ifdef CONFIG_PM_DEBUG
+	if (pm_test_level == level) {
+		printk(KERN_INFO "suspend debug: Waiting for 5 seconds.\n");
+		mdelay(5000);
+		return 1;
+	}
+#endif /* !CONFIG_PM_DEBUG */
+	return 0;
+}
+
 #ifdef CONFIG_PM_TEST_SUSPEND
 
 /*
-- 
cgit v1.2.3


From 68564a46976017496c2227660930d81240f82355 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 16 Jan 2009 15:31:15 -0800
Subject: work_on_cpu: don't try to get_online_cpus() in work_on_cpu.

Impact: remove potential circular lock dependency with cpu hotplug lock

This has caused more problems than it solved, with a pile of cpu
hotplug locking issues.

Followup patches will get_online_cpus() in callers that need it, but
if they don't do it they're no worse than before when they were using
set_cpus_allowed without locking.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Mike Travis <travis@sgi.com>
---
 kernel/workqueue.c | 14 ++++----------
 1 file changed, 4 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 2f445833ae37..a35afdbc0161 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -991,8 +991,8 @@ static void do_work_for_cpu(struct work_struct *w)
  * @fn: the function to run
  * @arg: the function arg
  *
- * This will return -EINVAL in the cpu is not online, or the return value
- * of @fn otherwise.
+ * This will return the value @fn returns.
+ * It is up to the caller to ensure that the cpu doesn't go offline.
  */
 long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
 {
@@ -1001,14 +1001,8 @@ long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
 	INIT_WORK(&wfc.work, do_work_for_cpu);
 	wfc.fn = fn;
 	wfc.arg = arg;
-	get_online_cpus();
-	if (unlikely(!cpu_online(cpu)))
-		wfc.ret = -EINVAL;
-	else {
-		schedule_work_on(cpu, &wfc.work);
-		flush_work(&wfc.work);
-	}
-	put_online_cpus();
+	schedule_work_on(cpu, &wfc.work);
+	flush_work(&wfc.work);
 
 	return wfc.ret;
 }
-- 
cgit v1.2.3


From e1d9ec6246a2668a5d037f529877efb7cf176af8 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 16 Jan 2009 15:31:15 -0800
Subject: work_on_cpu: Use our own workqueue.

Impact: remove potential clashes with generic kevent workqueue

Annoyingly, some places we want to use work_on_cpu are already in
workqueues.  As per Ingo's suggestion, we create a different workqueue
for work_on_cpu.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Mike Travis <travis@sgi.com>
---
 kernel/workqueue.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index a35afdbc0161..1f0c509b40d3 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -971,6 +971,8 @@ undo:
 }
 
 #ifdef CONFIG_SMP
+static struct workqueue_struct *work_on_cpu_wq __read_mostly;
+
 struct work_for_cpu {
 	struct work_struct work;
 	long (*fn)(void *);
@@ -1001,7 +1003,7 @@ long work_on_cpu(unsigned int cpu, long (*fn)(void *), void *arg)
 	INIT_WORK(&wfc.work, do_work_for_cpu);
 	wfc.fn = fn;
 	wfc.arg = arg;
-	schedule_work_on(cpu, &wfc.work);
+	queue_work_on(cpu, work_on_cpu_wq, &wfc.work);
 	flush_work(&wfc.work);
 
 	return wfc.ret;
@@ -1019,4 +1021,8 @@ void __init init_workqueues(void)
 	hotcpu_notifier(workqueue_cpu_callback, 0);
 	keventd_wq = create_workqueue("events");
 	BUG_ON(!keventd_wq);
+#ifdef CONFIG_SMP
+	work_on_cpu_wq = create_workqueue("work_on_cpu");
+	BUG_ON(!work_on_cpu_wq);
+#endif
 }
-- 
cgit v1.2.3


From 603a148f434742fff08273207ffa44176cad13a1 Mon Sep 17 00:00:00 2001
From: Mandeep Singh Baines <msb@google.com>
Date: Sat, 17 Jan 2009 10:31:48 -0800
Subject: softlockup: fix potential race in hung_task when resetting timeout

Impact: fix potential false panic

A potential race exists if sysctl_hung_task_timeout_secs is reset to 0
while inside check_hung_uniterruptible_tasks(). If check_task() is
entered, a comparison with 0 will result in a false hung_task being
detected.

If sysctl_hung_task_panic is set, the system will panic.

Signed-off-by: Mandeep Singh Baines <msb@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/hung_task.c | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index ba5a77cad3bb..ba8ccd432963 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -72,7 +72,8 @@ static unsigned long get_timestamp(void)
 	return cpu_clock(this_cpu) >> 30LL;  /* 2^30 ~= 10^9 */
 }
 
-static void check_hung_task(struct task_struct *t, unsigned long now)
+static void check_hung_task(struct task_struct *t, unsigned long now,
+			    unsigned long timeout)
 {
 	unsigned long switch_count = t->nvcsw + t->nivcsw;
 
@@ -84,8 +85,7 @@ static void check_hung_task(struct task_struct *t, unsigned long now)
 		t->last_switch_timestamp = now;
 		return;
 	}
-	if ((long)(now - t->last_switch_timestamp) <
-					sysctl_hung_task_timeout_secs)
+	if ((long)(now - t->last_switch_timestamp) < timeout)
 		return;
 	if (!sysctl_hung_task_warnings)
 		return;
@@ -96,8 +96,7 @@ static void check_hung_task(struct task_struct *t, unsigned long now)
 	 * complain:
 	 */
 	printk(KERN_ERR "INFO: task %s:%d blocked for more than "
-			"%ld seconds.\n", t->comm, t->pid,
-			sysctl_hung_task_timeout_secs);
+			"%ld seconds.\n", t->comm, t->pid, timeout);
 	printk(KERN_ERR "\"echo 0 > /proc/sys/kernel/hung_task_timeout_secs\""
 			" disables this message.\n");
 	sched_show_task(t);
@@ -115,7 +114,7 @@ static void check_hung_task(struct task_struct *t, unsigned long now)
  * a really long time (120 seconds). If that happens, print out
  * a warning.
  */
-static void check_hung_uninterruptible_tasks(void)
+static void check_hung_uninterruptible_tasks(unsigned long timeout)
 {
 	int max_count = sysctl_hung_task_check_count;
 	unsigned long now = get_timestamp();
@@ -134,7 +133,7 @@ static void check_hung_uninterruptible_tasks(void)
 			goto unlock;
 		/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
 		if (t->state == TASK_UNINTERRUPTIBLE)
-			check_hung_task(t, now);
+			check_hung_task(t, now, timeout);
 	} while_each_thread(g, t);
  unlock:
 	read_unlock(&tasklist_lock);
@@ -180,8 +179,17 @@ static int watchdog(void *dummy)
 	update_poll_jiffies();
 
 	for ( ; ; ) {
+		unsigned long timeout;
+
 		while (schedule_timeout_interruptible(hung_task_poll_jiffies));
-		check_hung_uninterruptible_tasks();
+
+		/*
+		 * Need to cache timeout here to avoid timeout being set
+		 * to 0 via sysctl while inside check_hung_*_tasks().
+		 */
+		timeout = sysctl_hung_task_timeout_secs;
+		if (timeout)
+			check_hung_uninterruptible_tasks(timeout);
 	}
 
 	return 0;
-- 
cgit v1.2.3


From 5c5317de147e9b38ea9c4cbdc2d15bed7648d036 Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Mon, 19 Jan 2009 10:26:53 +0100
Subject: x86, ftrace, hw-branch-tracer: support hotplug cpus

Support hotplug cpus.

Reported-by: Andi Kleen <ak@linux.intel.com>
Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_hw_branches.c | 123 ++++++++++++++++++++++++++++++++++-----
 1 file changed, 107 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index df21c1e72b95..398195397c75 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -1,7 +1,8 @@
 /*
  * h/w branch tracer for x86 based on bts
  *
- * Copyright (C) 2008 Markus Metzger <markus.t.metzger@gmail.com>
+ * Copyright (C) 2008-2009 Intel Corporation.
+ * Markus Metzger <markus.t.metzger@gmail.com>, 2008-2009
  *
  */
 
@@ -10,6 +11,9 @@
 #include <linux/debugfs.h>
 #include <linux/ftrace.h>
 #include <linux/kallsyms.h>
+#include <linux/mutex.h>
+#include <linux/cpu.h>
+#include <linux/smp.h>
 
 #include <asm/ds.h>
 
@@ -19,13 +23,31 @@
 
 #define SIZEOF_BTS (1 << 13)
 
+/* The tracer mutex protects the below per-cpu tracer array.
+   It needs to be held to:
+   - start tracing on all cpus
+   - stop tracing on all cpus
+   - start tracing on a single hotplug cpu
+   - stop tracing on a single hotplug cpu
+   - read the trace from all cpus
+   - read the trace from a single cpu
+*/
+static DEFINE_MUTEX(bts_tracer_mutex);
 static DEFINE_PER_CPU(struct bts_tracer *, tracer);
 static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer);
 
 #define this_tracer per_cpu(tracer, smp_processor_id())
 #define this_buffer per_cpu(buffer, smp_processor_id())
 
+static int __read_mostly trace_hw_branches_enabled;
 
+
+/*
+ * Start tracing on the current cpu.
+ * The argument is ignored.
+ *
+ * pre: bts_tracer_mutex must be locked.
+ */
 static void bts_trace_start_cpu(void *arg)
 {
 	if (this_tracer)
@@ -43,14 +65,20 @@ static void bts_trace_start_cpu(void *arg)
 
 static void bts_trace_start(struct trace_array *tr)
 {
-	int cpu;
+	mutex_lock(&bts_tracer_mutex);
 
-	tracing_reset_online_cpus(tr);
+	on_each_cpu(bts_trace_start_cpu, NULL, 1);
+	trace_hw_branches_enabled = 1;
 
-	for_each_cpu(cpu, cpu_possible_mask)
-		smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1);
+	mutex_unlock(&bts_tracer_mutex);
 }
 
+/*
+ * Start tracing on the current cpu.
+ * The argument is ignored.
+ *
+ * pre: bts_tracer_mutex must be locked.
+ */
 static void bts_trace_stop_cpu(void *arg)
 {
 	if (this_tracer) {
@@ -61,20 +89,58 @@ static void bts_trace_stop_cpu(void *arg)
 
 static void bts_trace_stop(struct trace_array *tr)
 {
-	int cpu;
+	mutex_lock(&bts_tracer_mutex);
+
+	trace_hw_branches_enabled = 0;
+	on_each_cpu(bts_trace_stop_cpu, NULL, 1);
 
-	for_each_cpu(cpu, cpu_possible_mask)
+	mutex_unlock(&bts_tracer_mutex);
+}
+
+static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb,
+				     unsigned long action, void *hcpu)
+{
+	unsigned int cpu = (unsigned long)hcpu;
+
+	mutex_lock(&bts_tracer_mutex);
+
+	if (!trace_hw_branches_enabled)
+		goto out;
+
+	switch (action) {
+	case CPU_ONLINE:
+	case CPU_DOWN_FAILED:
+		smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1);
+		break;
+	case CPU_DOWN_PREPARE:
 		smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1);
+		break;
+	}
+
+ out:
+	mutex_unlock(&bts_tracer_mutex);
+	return NOTIFY_DONE;
 }
 
+static struct notifier_block bts_hotcpu_notifier __cpuinitdata = {
+	.notifier_call = bts_hotcpu_handler
+};
+
 static int bts_trace_init(struct trace_array *tr)
 {
+	register_hotcpu_notifier(&bts_hotcpu_notifier);
 	tracing_reset_online_cpus(tr);
 	bts_trace_start(tr);
 
 	return 0;
 }
 
+static void bts_trace_reset(struct trace_array *tr)
+{
+	bts_trace_stop(tr);
+	unregister_hotcpu_notifier(&bts_hotcpu_notifier);
+}
+
 static void bts_trace_print_header(struct seq_file *m)
 {
 	seq_puts(m,
@@ -108,18 +174,34 @@ void trace_hw_branch(struct trace_array *tr, u64 from, u64 to)
 {
 	struct ring_buffer_event *event;
 	struct hw_branch_entry *entry;
-	unsigned long irq;
+	unsigned long irq1, irq2;
+	int cpu;
 
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq);
-	if (!event)
+	if (unlikely(!tr))
+		return;
+
+	if (unlikely(!trace_hw_branches_enabled))
 		return;
+
+	local_irq_save(irq1);
+	cpu = raw_smp_processor_id();
+	if (atomic_inc_return(&tr->data[cpu]->disabled) != 1)
+		goto out;
+
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq2);
+	if (!event)
+		goto out;
 	entry	= ring_buffer_event_data(event);
 	tracing_generic_entry_update(&entry->ent, 0, from);
 	entry->ent.type = TRACE_HW_BRANCHES;
-	entry->ent.cpu = smp_processor_id();
+	entry->ent.cpu = cpu;
 	entry->from = from;
 	entry->to   = to;
-	ring_buffer_unlock_commit(tr->buffer, event, irq);
+	ring_buffer_unlock_commit(tr->buffer, event, irq2);
+
+ out:
+	atomic_dec(&tr->data[cpu]->disabled);
+	local_irq_restore(irq1);
 }
 
 static void trace_bts_at(struct trace_array *tr,
@@ -143,6 +225,11 @@ static void trace_bts_at(struct trace_array *tr,
 	}
 }
 
+/*
+ * Collect the trace on the current cpu and write it into the ftrace buffer.
+ *
+ * pre: bts_tracer_mutex must be locked
+ */
 static void trace_bts_cpu(void *arg)
 {
 	struct trace_array *tr = (struct trace_array *) arg;
@@ -152,6 +239,9 @@ static void trace_bts_cpu(void *arg)
 	if (!this_tracer)
 		return;
 
+	if (unlikely(atomic_read(&tr->data[raw_smp_processor_id()]->disabled)))
+		return;
+
 	ds_suspend_bts(this_tracer);
 	trace = ds_read_bts(this_tracer);
 	if (!trace)
@@ -171,17 +261,18 @@ out:
 
 static void trace_bts_prepare(struct trace_iterator *iter)
 {
-	int cpu;
+	mutex_lock(&bts_tracer_mutex);
+
+	on_each_cpu(trace_bts_cpu, iter->tr, 1);
 
-	for_each_cpu(cpu, cpu_possible_mask)
-		smp_call_function_single(cpu, trace_bts_cpu, iter->tr, 1);
+	mutex_unlock(&bts_tracer_mutex);
 }
 
 struct tracer bts_tracer __read_mostly =
 {
 	.name		= "hw-branch-tracer",
 	.init		= bts_trace_init,
-	.reset		= bts_trace_stop,
+	.reset		= bts_trace_reset,
 	.print_header	= bts_trace_print_header,
 	.print_line	= bts_trace_print_line,
 	.start		= bts_trace_start,
-- 
cgit v1.2.3


From b1818748b0cf9427e48acf9713295e829a0d715f Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Mon, 19 Jan 2009 10:31:01 +0100
Subject: x86, ftrace, hw-branch-tracer: dump trace on oops

Dump the branch trace on an oops (based on ftrace_dump_on_oops).

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/dumpstack.c      |  6 ++++++
 include/linux/ftrace.h           | 13 +++++++++++++
 kernel/trace/trace.h             |  1 -
 kernel/trace/trace_hw_branches.c | 29 ++++++++++++++++++++++-------
 4 files changed, 41 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 6b1f6f6f8661..077c9ea655fc 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -14,6 +14,7 @@
 #include <linux/bug.h>
 #include <linux/nmi.h>
 #include <linux/sysfs.h>
+#include <linux/ftrace.h>
 
 #include <asm/stacktrace.h>
 
@@ -195,6 +196,11 @@ unsigned __kprobes long oops_begin(void)
 	int cpu;
 	unsigned long flags;
 
+	/* notify the hw-branch tracer so it may disable tracing and
+	   add the last trace to the trace buffer -
+	   the earlier this happens, the more useful the trace. */
+	trace_hw_branch_oops();
+
 	oops_enter();
 
 	/* racy, but better than risking deadlock. */
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 054721487574..9f7880d87c39 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -496,4 +496,17 @@ static inline int test_tsk_trace_graph(struct task_struct *tsk)
 
 #endif /* CONFIG_TRACING */
 
+
+#ifdef CONFIG_HW_BRANCH_TRACER
+
+void trace_hw_branch(u64 from, u64 to);
+void trace_hw_branch_oops(void);
+
+#else /* CONFIG_HW_BRANCH_TRACER */
+
+static inline void trace_hw_branch(u64 from, u64 to) {}
+static inline void trace_hw_branch_oops(void) {}
+
+#endif /* CONFIG_HW_BRANCH_TRACER */
+
 #endif /* _LINUX_FTRACE_H */
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 54b72781e920..b96037d970df 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -438,7 +438,6 @@ void trace_function(struct trace_array *tr,
 
 void trace_graph_return(struct ftrace_graph_ret *trace);
 int trace_graph_entry(struct ftrace_graph_ent *trace);
-void trace_hw_branch(struct trace_array *tr, u64 from, u64 to);
 
 void tracing_start_cmdline_record(void);
 void tracing_stop_cmdline_record(void);
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index 398195397c75..e56df2c7d679 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -40,6 +40,7 @@ static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer);
 #define this_buffer per_cpu(buffer, smp_processor_id())
 
 static int __read_mostly trace_hw_branches_enabled;
+static struct trace_array *hw_branch_trace __read_mostly;
 
 
 /*
@@ -128,6 +129,8 @@ static struct notifier_block bts_hotcpu_notifier __cpuinitdata = {
 
 static int bts_trace_init(struct trace_array *tr)
 {
+	hw_branch_trace = tr;
+
 	register_hotcpu_notifier(&bts_hotcpu_notifier);
 	tracing_reset_online_cpus(tr);
 	bts_trace_start(tr);
@@ -170,8 +173,9 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
 	return TRACE_TYPE_UNHANDLED;
 }
 
-void trace_hw_branch(struct trace_array *tr, u64 from, u64 to)
+void trace_hw_branch(u64 from, u64 to)
 {
+	struct trace_array *tr = hw_branch_trace;
 	struct ring_buffer_event *event;
 	struct hw_branch_entry *entry;
 	unsigned long irq1, irq2;
@@ -204,8 +208,7 @@ void trace_hw_branch(struct trace_array *tr, u64 from, u64 to)
 	local_irq_restore(irq1);
 }
 
-static void trace_bts_at(struct trace_array *tr,
-			 const struct bts_trace *trace, void *at)
+static void trace_bts_at(const struct bts_trace *trace, void *at)
 {
 	struct bts_struct bts;
 	int err = 0;
@@ -220,7 +223,7 @@ static void trace_bts_at(struct trace_array *tr,
 
 	switch (bts.qualifier) {
 	case BTS_BRANCH:
-		trace_hw_branch(tr, bts.variant.lbr.from, bts.variant.lbr.to);
+		trace_hw_branch(bts.variant.lbr.from, bts.variant.lbr.to);
 		break;
 	}
 }
@@ -236,12 +239,15 @@ static void trace_bts_cpu(void *arg)
 	const struct bts_trace *trace;
 	unsigned char *at;
 
-	if (!this_tracer)
+	if (unlikely(!tr))
 		return;
 
 	if (unlikely(atomic_read(&tr->data[raw_smp_processor_id()]->disabled)))
 		return;
 
+	if (unlikely(!this_tracer))
+		return;
+
 	ds_suspend_bts(this_tracer);
 	trace = ds_read_bts(this_tracer);
 	if (!trace)
@@ -249,11 +255,11 @@ static void trace_bts_cpu(void *arg)
 
 	for (at = trace->ds.top; (void *)at < trace->ds.end;
 	     at += trace->ds.size)
-		trace_bts_at(tr, trace, at);
+		trace_bts_at(trace, at);
 
 	for (at = trace->ds.begin; (void *)at < trace->ds.top;
 	     at += trace->ds.size)
-		trace_bts_at(tr, trace, at);
+		trace_bts_at(trace, at);
 
 out:
 	ds_resume_bts(this_tracer);
@@ -268,6 +274,15 @@ static void trace_bts_prepare(struct trace_iterator *iter)
 	mutex_unlock(&bts_tracer_mutex);
 }
 
+void trace_hw_branch_oops(void)
+{
+	mutex_lock(&bts_tracer_mutex);
+
+	trace_bts_cpu(hw_branch_trace);
+
+	mutex_unlock(&bts_tracer_mutex);
+}
+
 struct tracer bts_tracer __read_mostly =
 {
 	.name		= "hw-branch-tracer",
-- 
cgit v1.2.3


From e23b8ad83430a6fdfbdbfac365f5b0312dd57f10 Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Mon, 19 Jan 2009 10:33:31 +0100
Subject: x86, ftrace, hw-branch-tracer: reset trace buffer on close

Reset the ftrace buffer on close. Since we use cyclic buffers, the
trace is not contiguous, anyway.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_hw_branches.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index e56df2c7d679..372b47ac3154 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -274,6 +274,11 @@ static void trace_bts_prepare(struct trace_iterator *iter)
 	mutex_unlock(&bts_tracer_mutex);
 }
 
+static void trace_bts_close(struct trace_iterator *iter)
+{
+	tracing_reset_online_cpus(iter->tr);
+}
+
 void trace_hw_branch_oops(void)
 {
 	mutex_lock(&bts_tracer_mutex);
@@ -292,7 +297,8 @@ struct tracer bts_tracer __read_mostly =
 	.print_line	= bts_trace_print_line,
 	.start		= bts_trace_start,
 	.stop		= bts_trace_stop,
-	.open		= trace_bts_prepare
+	.open		= trace_bts_prepare,
+	.close		= trace_bts_close
 };
 
 __init static int init_bts_trace(void)
-- 
cgit v1.2.3


From 11edda06289d412d13ff7c672bd72e043f637e74 Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Mon, 19 Jan 2009 10:29:16 +0100
Subject: x86, ftrace, hw-branch-tracer: change trace format

Change the hw-branch-tracer format to be more readable.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_hw_branches.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index 372b47ac3154..fff3545fc866 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -146,10 +146,7 @@ static void bts_trace_reset(struct trace_array *tr)
 
 static void bts_trace_print_header(struct seq_file *m)
 {
-	seq_puts(m,
-		 "# CPU#        FROM                   TO         FUNCTION\n");
-	seq_puts(m,
-		 "#  |           |                     |             |\n");
+	seq_puts(m, "# CPU#        TO  <-  FROM\n");
 }
 
 static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
@@ -157,15 +154,15 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
 	struct trace_entry *entry = iter->ent;
 	struct trace_seq *seq = &iter->seq;
 	struct hw_branch_entry *it;
+	unsigned long symflags = TRACE_ITER_SYM_OFFSET;
 
 	trace_assign_type(it, entry);
 
 	if (entry->type == TRACE_HW_BRANCHES) {
 		if (trace_seq_printf(seq, "%4d  ", entry->cpu) &&
-		    trace_seq_printf(seq, "0x%016llx -> 0x%016llx ",
-				     it->from, it->to) &&
-		    (!it->from ||
-		     seq_print_ip_sym(seq, it->from, /* sym_flags = */ 0)) &&
+		    seq_print_ip_sym(seq, it->to, symflags) &&
+		    trace_seq_printf(seq, "\t  <-  ") &&
+		    seq_print_ip_sym(seq, it->from, symflags) &&
 		    trace_seq_printf(seq, "\n"))
 			return TRACE_TYPE_HANDLED;
 		return TRACE_TYPE_PARTIAL_LINE;;
-- 
cgit v1.2.3


From 3690b5e6fd9daa030039ae9bda69044228bd476d Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Fri, 16 Jan 2009 16:32:25 +0800
Subject: trace_workqueue: use percpu data for workqueue stat

Impact: use percpu data instead of a global structure

Use:

   static DEFINE_PER_CPU(struct workqueue_global_stats, all_workqueue_stat);

instead of allocating a global structure.

percpu data also works well on NUMA.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_workqueue.c | 64 +++++++++++++++++++-----------------------
 1 file changed, 29 insertions(+), 35 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index f8118d39ca9b..4664990fe9c5 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -8,6 +8,7 @@
 
 #include <trace/workqueue.h>
 #include <linux/list.h>
+#include <linux/percpu.h>
 #include "trace_stat.h"
 #include "trace.h"
 
@@ -37,7 +38,8 @@ struct workqueue_global_stats {
 /* Don't need a global lock because allocated before the workqueues, and
  * never freed.
  */
-static struct workqueue_global_stats *all_workqueue_stat;
+static DEFINE_PER_CPU(struct workqueue_global_stats, all_workqueue_stat);
+#define workqueue_cpu_stat(cpu) (&per_cpu(all_workqueue_stat, cpu))
 
 /* Insertion of a work */
 static void
@@ -48,8 +50,8 @@ probe_workqueue_insertion(struct task_struct *wq_thread,
 	struct cpu_workqueue_stats *node, *next;
 	unsigned long flags;
 
-	spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags);
-	list_for_each_entry_safe(node, next, &all_workqueue_stat[cpu].list,
+	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
+	list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list,
 							list) {
 		if (node->pid == wq_thread->pid) {
 			atomic_inc(&node->inserted);
@@ -58,7 +60,7 @@ probe_workqueue_insertion(struct task_struct *wq_thread,
 	}
 	pr_debug("trace_workqueue: entry not found\n");
 found:
-	spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags);
+	spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
 }
 
 /* Execution of a work */
@@ -70,8 +72,8 @@ probe_workqueue_execution(struct task_struct *wq_thread,
 	struct cpu_workqueue_stats *node, *next;
 	unsigned long flags;
 
-	spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags);
-	list_for_each_entry_safe(node, next, &all_workqueue_stat[cpu].list,
+	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
+	list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list,
 							list) {
 		if (node->pid == wq_thread->pid) {
 			node->executed++;
@@ -80,7 +82,7 @@ probe_workqueue_execution(struct task_struct *wq_thread,
 	}
 	pr_debug("trace_workqueue: entry not found\n");
 found:
-	spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags);
+	spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
 }
 
 /* Creation of a cpu workqueue thread */
@@ -104,11 +106,11 @@ static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu)
 
 	cws->pid = wq_thread->pid;
 
-	spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags);
-	if (list_empty(&all_workqueue_stat[cpu].list))
+	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
+	if (list_empty(&workqueue_cpu_stat(cpu)->list))
 		cws->first_entry = true;
-	list_add_tail(&cws->list, &all_workqueue_stat[cpu].list);
-	spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags);
+	list_add_tail(&cws->list, &workqueue_cpu_stat(cpu)->list);
+	spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
 }
 
 /* Destruction of a cpu workqueue thread */
@@ -119,8 +121,8 @@ static void probe_workqueue_destruction(struct task_struct *wq_thread)
 	struct cpu_workqueue_stats *node, *next;
 	unsigned long flags;
 
-	spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags);
-	list_for_each_entry_safe(node, next, &all_workqueue_stat[cpu].list,
+	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
+	list_for_each_entry_safe(node, next, &workqueue_cpu_stat(cpu)->list,
 							list) {
 		if (node->pid == wq_thread->pid) {
 			list_del(&node->list);
@@ -131,7 +133,7 @@ static void probe_workqueue_destruction(struct task_struct *wq_thread)
 
 	pr_debug("trace_workqueue: don't find workqueue to destroy\n");
 found:
-	spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags);
+	spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
 
 }
 
@@ -141,13 +143,13 @@ static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu)
 	struct cpu_workqueue_stats *ret = NULL;
 
 
-	spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags);
+	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
 
-	if (!list_empty(&all_workqueue_stat[cpu].list))
-		ret = list_entry(all_workqueue_stat[cpu].list.next,
+	if (!list_empty(&workqueue_cpu_stat(cpu)->list))
+		ret = list_entry(workqueue_cpu_stat(cpu)->list.next,
 				 struct cpu_workqueue_stats, list);
 
-	spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags);
+	spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
 
 	return ret;
 }
@@ -172,9 +174,9 @@ static void *workqueue_stat_next(void *prev, int idx)
 	unsigned long flags;
 	void *ret = NULL;
 
-	spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags);
-	if (list_is_last(&prev_cws->list, &all_workqueue_stat[cpu].list)) {
-		spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags);
+	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
+	if (list_is_last(&prev_cws->list, &workqueue_cpu_stat(cpu)->list)) {
+		spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
 		for (++cpu ; cpu < num_possible_cpus(); cpu++) {
 			ret = workqueue_stat_start_cpu(cpu);
 			if (ret)
@@ -182,7 +184,7 @@ static void *workqueue_stat_next(void *prev, int idx)
 		}
 		return NULL;
 	}
-	spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags);
+	spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
 
 	return list_entry(prev_cws->list.next, struct cpu_workqueue_stats,
 			  list);
@@ -199,10 +201,10 @@ static int workqueue_stat_show(struct seq_file *s, void *p)
 		   cws->executed,
 		   trace_find_cmdline(cws->pid));
 
-	spin_lock_irqsave(&all_workqueue_stat[cpu].lock, flags);
-	if (&cws->list == all_workqueue_stat[cpu].list.next)
+	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
+	if (&cws->list == workqueue_cpu_stat(cpu)->list.next)
 		seq_printf(s, "\n");
-	spin_unlock_irqrestore(&all_workqueue_stat[cpu].lock, flags);
+	spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
 
 	return 0;
 }
@@ -258,17 +260,9 @@ int __init trace_workqueue_early_init(void)
 	if (ret)
 		goto no_creation;
 
-	all_workqueue_stat = kmalloc(sizeof(struct workqueue_global_stats)
-				     * num_possible_cpus(), GFP_KERNEL);
-
-	if (!all_workqueue_stat) {
-		pr_warning("trace_workqueue: not enough memory\n");
-		goto no_creation;
-	}
-
 	for_each_possible_cpu(cpu) {
-		spin_lock_init(&all_workqueue_stat[cpu].lock);
-		INIT_LIST_HEAD(&all_workqueue_stat[cpu].list);
+		spin_lock_init(&workqueue_cpu_stat(cpu)->lock);
+		INIT_LIST_HEAD(&workqueue_cpu_stat(cpu)->list);
 	}
 
 	return 0;
-- 
cgit v1.2.3


From 5bc4564b224c3d9fe6dddafa25f56059bd978231 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 21 Jan 2009 14:36:52 -0500
Subject: trace: do not disable wake up tracer on output of trace

Impact: fix to erased trace output

To try not to have the outputing of a trace interfere with the wakeup
tracer, it would disable tracing while the output was printing. But
if a trace had started when it was disabled, it can show a partial
trace. To try to solve this, on closing of the tracer, it would
clear the trace buffer.

The latency tracers (wakeup and irqsoff) have two buffers. One for
recording and one for holding the max trace that is printed. The
clearing of the trace above should only affect the recording buffer.
But for some reason it would move the erased trace to the print
buffer. Probably due to a race with the closing of the trace and
the saving ofhe max race.

The above is all pretty useless, and if the user does not want the
printing of the trace to be traced itself, then the user can manual
disable tracing. This patch removes all the code that tries to keep
the output of the tracer from modifying the trace.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_sched_wakeup.c | 33 ++-------------------------------
 1 file changed, 2 insertions(+), 31 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 42ae1e77b6b3..e27adef0171a 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -262,12 +262,6 @@ out:
 	atomic_dec(&wakeup_trace->data[cpu]->disabled);
 }
 
-/*
- * save_tracer_enabled is used to save the state of the tracer_enabled
- * variable when we disable it when we open a trace output file.
- */
-static int save_tracer_enabled;
-
 static void start_wakeup_tracer(struct trace_array *tr)
 {
 	int ret;
@@ -306,13 +300,10 @@ static void start_wakeup_tracer(struct trace_array *tr)
 
 	register_ftrace_function(&trace_ops);
 
-	if (tracing_is_enabled()) {
+	if (tracing_is_enabled())
 		tracer_enabled = 1;
-		save_tracer_enabled = 1;
-	} else {
+	else
 		tracer_enabled = 0;
-		save_tracer_enabled = 0;
-	}
 
 	return;
 fail_deprobe_wake_new:
@@ -324,7 +315,6 @@ fail_deprobe:
 static void stop_wakeup_tracer(struct trace_array *tr)
 {
 	tracer_enabled = 0;
-	save_tracer_enabled = 0;
 	unregister_ftrace_function(&trace_ops);
 	unregister_trace_sched_switch(probe_wakeup_sched_switch);
 	unregister_trace_sched_wakeup_new(probe_wakeup);
@@ -350,28 +340,11 @@ static void wakeup_tracer_start(struct trace_array *tr)
 {
 	wakeup_reset(tr);
 	tracer_enabled = 1;
-	save_tracer_enabled = 1;
 }
 
 static void wakeup_tracer_stop(struct trace_array *tr)
 {
 	tracer_enabled = 0;
-	save_tracer_enabled = 0;
-}
-
-static void wakeup_tracer_open(struct trace_iterator *iter)
-{
-	/* stop the trace while dumping */
-	tracer_enabled = 0;
-}
-
-static void wakeup_tracer_close(struct trace_iterator *iter)
-{
-	/* forget about any processes we were recording */
-	if (save_tracer_enabled) {
-		wakeup_reset(iter->tr);
-		tracer_enabled = 1;
-	}
 }
 
 static struct tracer wakeup_tracer __read_mostly =
@@ -381,8 +354,6 @@ static struct tracer wakeup_tracer __read_mostly =
 	.reset		= wakeup_tracer_reset,
 	.start		= wakeup_tracer_start,
 	.stop		= wakeup_tracer_stop,
-	.open		= wakeup_tracer_open,
-	.close		= wakeup_tracer_close,
 	.print_max	= 1,
 #ifdef CONFIG_FTRACE_SELFTEST
 	.selftest    = trace_selftest_startup_wakeup,
-- 
cgit v1.2.3


From 97b17efe4537e11bf6669106cfe4ee2c5331b267 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 21 Jan 2009 15:24:56 -0500
Subject: ring-buffer: do not swap if recording is disabled

If the ring buffer recording has been disabled. Do not let
swapping of ring buffers occur. Simply return -EAGAIN.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ring_buffer.c | 15 +++++++++++++++
 kernel/trace/trace.c       |  2 +-
 2 files changed, 16 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 0b9de5a3d699..890020e28a35 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2266,9 +2266,24 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
 	if (buffer_a->pages != buffer_b->pages)
 		return -EINVAL;
 
+	if (ring_buffer_flags != RB_BUFFERS_ON)
+		return -EAGAIN;
+
+	if (atomic_read(&buffer_a->record_disabled))
+		return -EAGAIN;
+
+	if (atomic_read(&buffer_b->record_disabled))
+		return -EAGAIN;
+
 	cpu_buffer_a = buffer_a->buffers[cpu];
 	cpu_buffer_b = buffer_b->buffers[cpu];
 
+	if (atomic_read(&cpu_buffer_a->record_disabled))
+		return -EAGAIN;
+
+	if (atomic_read(&cpu_buffer_b->record_disabled))
+		return -EAGAIN;
+
 	/*
 	 * We can't do a synchronize_sched here because this
 	 * function can be called in atomic context.
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 220c264e3111..757ae6f7e648 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -415,7 +415,7 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
 
 	ftrace_enable_cpu();
 
-	WARN_ON_ONCE(ret);
+	WARN_ON_ONCE(ret && ret != -EAGAIN);
 
 	__update_max_tr(tr, tsk, cpu);
 	__raw_spin_unlock(&ftrace_max_lock);
-- 
cgit v1.2.3


From 3244351c31211a8b1ba8b4b34c3de04d5dfa03e4 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 21 Jan 2009 16:24:46 -0500
Subject: trace: separate out rt tasks from wakeup tracer

Impact: add option to trace all tasks or just RT tasks

The current wakeup tracer only traces RT task wakeups. This is
fine for those interested in wake up timings of RT tasks, but
it is useless for those that are interested in the causes
of long wakeups for non RT tasks.

This patch creates a "wakeup_rt" to implement the tracing of just
RT tasks (as the current "wakeup" does). And makes "wakeup" now
trace all tasks as an average developer would expect.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_sched_wakeup.c | 34 ++++++++++++++++++++++++++++++++--
 1 file changed, 32 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index e27adef0171a..f48957886102 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -25,6 +25,7 @@ static int __read_mostly	tracer_enabled;
 static struct task_struct	*wakeup_task;
 static int			wakeup_cpu;
 static unsigned			wakeup_prio = -1;
+static int			wakeup_rt;
 
 static raw_spinlock_t wakeup_lock =
 	(raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
@@ -224,7 +225,7 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success)
 	tracing_record_cmdline(p);
 	tracing_record_cmdline(current);
 
-	if (likely(!rt_task(p)) ||
+	if ((wakeup_rt && !rt_task(p)) ||
 			p->prio >= wakeup_prio ||
 			p->prio >= current->prio)
 		return;
@@ -321,7 +322,7 @@ static void stop_wakeup_tracer(struct trace_array *tr)
 	unregister_trace_sched_wakeup(probe_wakeup);
 }
 
-static int wakeup_tracer_init(struct trace_array *tr)
+static int __wakeup_tracer_init(struct trace_array *tr)
 {
 	tracing_max_latency = 0;
 	wakeup_trace = tr;
@@ -329,6 +330,18 @@ static int wakeup_tracer_init(struct trace_array *tr)
 	return 0;
 }
 
+static int wakeup_tracer_init(struct trace_array *tr)
+{
+	wakeup_rt = 0;
+	return __wakeup_tracer_init(tr);
+}
+
+static int wakeup_rt_tracer_init(struct trace_array *tr)
+{
+	wakeup_rt = 1;
+	return __wakeup_tracer_init(tr);
+}
+
 static void wakeup_tracer_reset(struct trace_array *tr)
 {
 	stop_wakeup_tracer(tr);
@@ -360,6 +373,19 @@ static struct tracer wakeup_tracer __read_mostly =
 #endif
 };
 
+static struct tracer wakeup_rt_tracer __read_mostly =
+{
+	.name		= "wakeup_rt",
+	.init		= wakeup_rt_tracer_init,
+	.reset		= wakeup_tracer_reset,
+	.start		= wakeup_tracer_start,
+	.stop		= wakeup_tracer_stop,
+	.print_max	= 1,
+#ifdef CONFIG_FTRACE_SELFTEST
+	.selftest    = trace_selftest_startup_wakeup,
+#endif
+};
+
 __init static int init_wakeup_tracer(void)
 {
 	int ret;
@@ -368,6 +394,10 @@ __init static int init_wakeup_tracer(void)
 	if (ret)
 		return ret;
 
+	ret = register_tracer(&wakeup_rt_tracer);
+	if (ret)
+		return ret;
+
 	return 0;
 }
 device_initcall(init_wakeup_tracer);
-- 
cgit v1.2.3


From f8ec1062f589cdb1cffcffab1376124a1bc08500 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 21 Jan 2009 17:17:04 -0500
Subject: wakeup-tracer: show scheduling data in output

Impact: better data for wakeup tracer

This patch adds the wakeup and schedule calls that are used by
the scheduler tracer to make the wakeup tracer more readable.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_sched_wakeup.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index f48957886102..93cecda650b2 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -153,6 +153,7 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
 		goto out_unlock;
 
 	trace_function(wakeup_trace, data, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
+	tracing_sched_switch_trace(wakeup_trace, data, prev, next, flags, pc);
 
 	/*
 	 * usecs conversion is slow so we try to delay the conversion
@@ -214,6 +215,7 @@ static void wakeup_reset(struct trace_array *tr)
 static void
 probe_wakeup(struct rq *rq, struct task_struct *p, int success)
 {
+	struct trace_array_cpu *data;
 	int cpu = smp_processor_id();
 	unsigned long flags;
 	long disabled;
@@ -253,9 +255,12 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success)
 
 	local_save_flags(flags);
 
-	wakeup_trace->data[wakeup_cpu]->preempt_timestamp = ftrace_now(cpu);
-	trace_function(wakeup_trace, wakeup_trace->data[wakeup_cpu],
-		       CALLER_ADDR1, CALLER_ADDR2, flags, pc);
+	data = wakeup_trace->data[wakeup_cpu];
+	data->preempt_timestamp = ftrace_now(cpu);
+	tracing_sched_wakeup_trace(wakeup_trace, data, p, current,
+				   flags, pc);
+	trace_function(wakeup_trace, data, CALLER_ADDR1, CALLER_ADDR2,
+		       flags, pc);
 
 out_locked:
 	__raw_spin_unlock(&wakeup_lock);
-- 
cgit v1.2.3


From 69507c06539332e6e49f83aa478844130233bece Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 21 Jan 2009 18:45:57 -0500
Subject: ring-buffer: reset timestamps when ring buffer is reset

Impact: fix bad times of recent resets

The ring buffer needs to reset its timestamps when reseting of the
buffer, otherwise the timestamps are stale and might be used to
calculate times in the buffer causing funny timestamps to appear.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ring_buffer.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 890020e28a35..7839280ffcd8 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2166,6 +2166,9 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer)
 
 	cpu_buffer->overrun = 0;
 	cpu_buffer->entries = 0;
+
+	cpu_buffer->write_stamp = 0;
+	cpu_buffer->read_stamp = 0;
 }
 
 /**
-- 
cgit v1.2.3


From 94523e818f283d3c69f621406f633afff46dbf82 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 22 Jan 2009 11:18:06 -0500
Subject: trace: remove internal irqsoff disabling for trace output

Impact: cleanup of duplicate features

The trace output disables the ring buffer and prevents tracing to
occur. The code in irqsoff to do the same thing is no longer needed.
This patch removes it.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_irqsoff.c | 34 ++--------------------------------
 1 file changed, 2 insertions(+), 32 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 62a78d943534..ed344b022a14 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -353,28 +353,18 @@ void trace_preempt_off(unsigned long a0, unsigned long a1)
 }
 #endif /* CONFIG_PREEMPT_TRACER */
 
-/*
- * save_tracer_enabled is used to save the state of the tracer_enabled
- * variable when we disable it when we open a trace output file.
- */
-static int save_tracer_enabled;
-
 static void start_irqsoff_tracer(struct trace_array *tr)
 {
 	register_ftrace_function(&trace_ops);
-	if (tracing_is_enabled()) {
+	if (tracing_is_enabled())
 		tracer_enabled = 1;
-		save_tracer_enabled = 1;
-	} else {
+	else
 		tracer_enabled = 0;
-		save_tracer_enabled = 0;
-	}
 }
 
 static void stop_irqsoff_tracer(struct trace_array *tr)
 {
 	tracer_enabled = 0;
-	save_tracer_enabled = 0;
 	unregister_ftrace_function(&trace_ops);
 }
 
@@ -395,25 +385,11 @@ static void irqsoff_tracer_reset(struct trace_array *tr)
 static void irqsoff_tracer_start(struct trace_array *tr)
 {
 	tracer_enabled = 1;
-	save_tracer_enabled = 1;
 }
 
 static void irqsoff_tracer_stop(struct trace_array *tr)
 {
 	tracer_enabled = 0;
-	save_tracer_enabled = 0;
-}
-
-static void irqsoff_tracer_open(struct trace_iterator *iter)
-{
-	/* stop the trace while dumping */
-	tracer_enabled = 0;
-}
-
-static void irqsoff_tracer_close(struct trace_iterator *iter)
-{
-	/* restart tracing */
-	tracer_enabled = save_tracer_enabled;
 }
 
 #ifdef CONFIG_IRQSOFF_TRACER
@@ -431,8 +407,6 @@ static struct tracer irqsoff_tracer __read_mostly =
 	.reset		= irqsoff_tracer_reset,
 	.start		= irqsoff_tracer_start,
 	.stop		= irqsoff_tracer_stop,
-	.open		= irqsoff_tracer_open,
-	.close		= irqsoff_tracer_close,
 	.print_max	= 1,
 #ifdef CONFIG_FTRACE_SELFTEST
 	.selftest    = trace_selftest_startup_irqsoff,
@@ -459,8 +433,6 @@ static struct tracer preemptoff_tracer __read_mostly =
 	.reset		= irqsoff_tracer_reset,
 	.start		= irqsoff_tracer_start,
 	.stop		= irqsoff_tracer_stop,
-	.open		= irqsoff_tracer_open,
-	.close		= irqsoff_tracer_close,
 	.print_max	= 1,
 #ifdef CONFIG_FTRACE_SELFTEST
 	.selftest    = trace_selftest_startup_preemptoff,
@@ -489,8 +461,6 @@ static struct tracer preemptirqsoff_tracer __read_mostly =
 	.reset		= irqsoff_tracer_reset,
 	.start		= irqsoff_tracer_start,
 	.stop		= irqsoff_tracer_stop,
-	.open		= irqsoff_tracer_open,
-	.close		= irqsoff_tracer_close,
 	.print_max	= 1,
 #ifdef CONFIG_FTRACE_SELFTEST
 	.selftest    = trace_selftest_startup_preemptirqsoff,
-- 
cgit v1.2.3


From b06a830183b610c0a88c29a92feb7991a867ab46 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 22 Jan 2009 14:26:15 -0500
Subject: trace: fix logic to start/stop counting

The logic in the tracing_start/stop code prevents the WARN_ON
from ever detecting if a start/stop pair was mismatched.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 757ae6f7e648..2129ab9d2a48 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -610,13 +610,12 @@ void tracing_start(void)
 		return;
 
 	spin_lock_irqsave(&tracing_start_lock, flags);
-	if (--trace_stop_count)
-		goto out;
-
-	if (trace_stop_count < 0) {
-		/* Someone screwed up their debugging */
-		WARN_ON_ONCE(1);
-		trace_stop_count = 0;
+	if (--trace_stop_count) {
+		if (trace_stop_count < 0) {
+			/* Someone screwed up their debugging */
+			WARN_ON_ONCE(1);
+			trace_stop_count = 0;
+		}
 		goto out;
 	}
 
-- 
cgit v1.2.3


From 7e49fcce1bdadd723ae6a0b3b324c4daced61563 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 22 Jan 2009 19:01:40 -0500
Subject: trace, lockdep: manual preempt count adding for local_bh_disable

Impact: fix to preempt trace triggering lockdep check_flag failure

In local_bh_disable, the use of add_preempt_count causes the
preempt tracer to start recording the time preemption is off.
But because it already modified the preempt_count to show
softirqs disabled, and before it called the lockdep code to
handle this, it causes a state that lockdep can not handle.

The preempt tracer will reset the ring buffer on start of a trace,
and the ring buffer reset code does a spin_lock_irqsave. This
calls into lockdep and lockdep will fail when it detects the
invalid state of having softirqs disabled but the internal
current->softirqs_enabled is still set.

The fix is to manually add the SOFTIRQ_OFFSET to preempt count
and call the preempt tracer code outside the lockdep critical
area.

Thanks to Peter Zijlstra for suggesting this solution.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h |  2 ++
 kernel/sched.c        |  8 ++++----
 kernel/softirq.c      | 13 ++++++++++++-
 3 files changed, 18 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4cae9b81a1f8..33085b88f87b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -137,6 +137,8 @@ extern unsigned long nr_uninterruptible(void);
 extern unsigned long nr_active(void);
 extern unsigned long nr_iowait(void);
 
+extern unsigned long get_parent_ip(unsigned long addr);
+
 struct seq_file;
 struct cfs_rq;
 struct task_group;
diff --git a/kernel/sched.c b/kernel/sched.c
index 52bbf1c842a8..c154825ae753 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4399,10 +4399,7 @@ void scheduler_tick(void)
 #endif
 }
 
-#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
-				defined(CONFIG_PREEMPT_TRACER))
-
-static inline unsigned long get_parent_ip(unsigned long addr)
+unsigned long get_parent_ip(unsigned long addr)
 {
 	if (in_lock_functions(addr)) {
 		addr = CALLER_ADDR2;
@@ -4412,6 +4409,9 @@ static inline unsigned long get_parent_ip(unsigned long addr)
 	return addr;
 }
 
+#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \
+				defined(CONFIG_PREEMPT_TRACER))
+
 void __kprobes add_preempt_count(int val)
 {
 #ifdef CONFIG_DEBUG_PREEMPT
diff --git a/kernel/softirq.c b/kernel/softirq.c
index bdbe9de9cd8d..6edfc2c11d99 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -21,6 +21,7 @@
 #include <linux/freezer.h>
 #include <linux/kthread.h>
 #include <linux/rcupdate.h>
+#include <linux/ftrace.h>
 #include <linux/smp.h>
 #include <linux/tick.h>
 
@@ -79,13 +80,23 @@ static void __local_bh_disable(unsigned long ip)
 	WARN_ON_ONCE(in_irq());
 
 	raw_local_irq_save(flags);
-	add_preempt_count(SOFTIRQ_OFFSET);
+	/*
+	 * The preempt tracer hooks into add_preempt_count and will break
+	 * lockdep because it calls back into lockdep after SOFTIRQ_OFFSET
+	 * is set and before current->softirq_enabled is cleared.
+	 * We must manually increment preempt_count here and manually
+	 * call the trace_preempt_off later.
+	 */
+	preempt_count() += SOFTIRQ_OFFSET;
 	/*
 	 * Were softirqs turned off above:
 	 */
 	if (softirq_count() == SOFTIRQ_OFFSET)
 		trace_softirqs_off(ip);
 	raw_local_irq_restore(flags);
+
+	if (preempt_count() == SOFTIRQ_OFFSET)
+		trace_preempt_off(CALLER_ADDR0, get_parent_ip(CALLER_ADDR1));
 }
 #else /* !CONFIG_TRACE_IRQFLAGS */
 static inline void __local_bh_disable(unsigned long ip)
-- 
cgit v1.2.3


From 9005f3ebebfcfe9ccd731d16c468907a35ac1f9a Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 22 Jan 2009 17:04:53 -0800
Subject: tracing/function-graph-tracer: various fixes and features

This patch brings various bugfixes:

- Drop the first irrelevant task switch on the very beginning of a trace.

- Drop the OVERHEAD word from the headers, the DURATION word is sufficient
  and will not overlap other columns.

- Make the headers fit well their respective columns whatever the
  selected options.

Ie, default options:

 # tracer: function_graph
 #
 # CPU  DURATION                  FUNCTION CALLS
 # |     |   |                     |   |   |   |

  1)   0.646 us    |                    }
  1)               |                    mem_cgroup_del_lru_list() {
  1)   0.624 us    |                      lookup_page_cgroup();
  1)   1.970 us    |                    }

 echo funcgraph-proc > trace_options

 # tracer: function_graph
 #
 # CPU  TASK/PID        DURATION                  FUNCTION CALLS
 # |    |    |           |   |                     |   |   |   |

  0)   bash-2937    |   0.895 us    |                }
  0)   bash-2937    |   0.888 us    |                __rcu_read_unlock();
  0)   bash-2937    |   0.864 us    |                conv_uni_to_pc();
  0)   bash-2937    |   1.015 us    |                __rcu_read_lock();

 echo nofuncgraph-cpu > trace_options
 echo nofuncgraph-proc > trace_options

 # tracer: function_graph
 #
 #   DURATION                  FUNCTION CALLS
 #    |   |                     |   |   |   |

   3.752 us    |                  native_pud_val();
   0.616 us    |                  native_pud_val();
   0.624 us    |                  native_pmd_val();

About features, one can now disable the duration (this will hide the
overhead too for convenient reasons and because on  doesn't need
overhead if it hasn't the duration):

 echo nofuncgraph-duration > trace_options

 # tracer: function_graph
 #
 #                FUNCTION CALLS
 #                |   |   |   |

           cap_vm_enough_memory() {
             __vm_enough_memory() {
               vm_acct_memory();
             }
           }
         }

And at last, an option to print the absolute time:

 //Restart from default options
 echo funcgraph-abstime > trace_options

 # tracer: function_graph
 #
 #      TIME       CPU  DURATION                  FUNCTION CALLS
 #       |         |     |   |                     |   |   |   |

   261.339774 |   1) + 42.823 us   |    }
   261.339775 |   1)   1.045 us    |    _spin_lock_irq();
   261.339777 |   1)   0.940 us    |    _spin_lock_irqsave();
   261.339778 |   1)   0.752 us    |    _spin_unlock_irqrestore();
   261.339780 |   1)   0.857 us    |    _spin_unlock_irq();
   261.339782 |   1)               |    flush_to_ldisc() {
   261.339783 |   1)               |      tty_ldisc_ref() {
   261.339783 |   1)               |        tty_ldisc_try() {
   261.339784 |   1)   1.075 us    |          _spin_lock_irqsave();
   261.339786 |   1)   0.842 us    |          _spin_unlock_irqrestore();
   261.339788 |   1)   4.211 us    |        }
   261.339788 |   1)   5.662 us    |      }

The format is seconds.usecs.

I guess no one needs the nanosec precision here, the main goal is to have
an overview about the general timings of events, and to see the place when
the trace switches from one cpu to another.

ie:

   274.874760 |   1)   0.676 us    |      _spin_unlock();
   274.874762 |   1)   0.609 us    |      native_load_sp0();
   274.874763 |   1)   0.602 us    |      native_load_tls();
   274.878739 |   0)   0.722 us    |                  }
   274.878740 |   0)   0.714 us    |                  native_pmd_val();
   274.878741 |   0)   0.730 us    |                  native_pmd_val();

Here there is a 4000 usecs difference when we switch the cpu.

Changes in V2:

- Completely fix the first pointless task switch.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_functions_graph.c | 298 +++++++++++++++++++++++------------
 1 file changed, 195 insertions(+), 103 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 3c545984816f..66fc7b806a38 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -1,7 +1,7 @@
 /*
  *
  * Function graph tracer.
- * Copyright (c) 2008 Frederic Weisbecker <fweisbec@gmail.com>
+ * Copyright (c) 2008-2009 Frederic Weisbecker <fweisbec@gmail.com>
  * Mostly borrowed from function tracer which
  * is Copyright (c) Steven Rostedt <srostedt@redhat.com>
  *
@@ -21,9 +21,11 @@
 #define TRACE_GRAPH_PRINT_CPU		0x2
 #define TRACE_GRAPH_PRINT_OVERHEAD	0x4
 #define TRACE_GRAPH_PRINT_PROC		0x8
+#define TRACE_GRAPH_PRINT_DURATION	0x10
+#define TRACE_GRAPH_PRINT_ABS_TIME	0X20
 
 static struct tracer_opt trace_opts[] = {
-	/* Display overruns ? */
+	/* Display overruns? (for self-debug purpose) */
 	{ TRACER_OPT(funcgraph-overrun, TRACE_GRAPH_PRINT_OVERRUN) },
 	/* Display CPU ? */
 	{ TRACER_OPT(funcgraph-cpu, TRACE_GRAPH_PRINT_CPU) },
@@ -31,17 +33,22 @@ static struct tracer_opt trace_opts[] = {
 	{ TRACER_OPT(funcgraph-overhead, TRACE_GRAPH_PRINT_OVERHEAD) },
 	/* Display proc name/pid */
 	{ TRACER_OPT(funcgraph-proc, TRACE_GRAPH_PRINT_PROC) },
+	/* Display duration of execution */
+	{ TRACER_OPT(funcgraph-duration, TRACE_GRAPH_PRINT_DURATION) },
+	/* Display absolute time of an entry */
+	{ TRACER_OPT(funcgraph-abstime, TRACE_GRAPH_PRINT_ABS_TIME) },
 	{ } /* Empty entry */
 };
 
 static struct tracer_flags tracer_flags = {
 	/* Don't display overruns and proc by default */
-	.val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD,
+	.val = TRACE_GRAPH_PRINT_CPU | TRACE_GRAPH_PRINT_OVERHEAD |
+	       TRACE_GRAPH_PRINT_DURATION,
 	.opts = trace_opts
 };
 
 /* pid on the last trace processed */
-static pid_t last_pid[NR_CPUS] = { [0 ... NR_CPUS-1] = -1 };
+
 
 static int graph_trace_init(struct trace_array *tr)
 {
@@ -154,17 +161,25 @@ print_graph_proc(struct trace_seq *s, pid_t pid)
 
 /* If the pid changed since the last trace, output this event */
 static enum print_line_t
-verif_pid(struct trace_seq *s, pid_t pid, int cpu)
+verif_pid(struct trace_seq *s, pid_t pid, int cpu, pid_t *last_pids_cpu)
 {
 	pid_t prev_pid;
+	pid_t *last_pid;
 	int ret;
 
-	if (last_pid[cpu] != -1 && last_pid[cpu] == pid)
+	if (!last_pids_cpu)
+		return TRACE_TYPE_HANDLED;
+
+	last_pid = per_cpu_ptr(last_pids_cpu, cpu);
+
+	if (*last_pid == pid)
 		return TRACE_TYPE_HANDLED;
 
-	prev_pid = last_pid[cpu];
-	last_pid[cpu] = pid;
+	prev_pid = *last_pid;
+	*last_pid = pid;
 
+	if (prev_pid == -1)
+		return TRACE_TYPE_HANDLED;
 /*
  * Context-switch trace line:
 
@@ -232,9 +247,34 @@ trace_branch_is_leaf(struct trace_iterator *iter,
 	return true;
 }
 
+/* Signal a overhead of time execution to the output */
+static int
+print_graph_overhead(unsigned long long duration, struct trace_seq *s)
+{
+	/* If duration disappear, we don't need anything */
+	if (!(tracer_flags.val & TRACE_GRAPH_PRINT_DURATION))
+		return 1;
+
+	/* Non nested entry or return */
+	if (duration == -1)
+		return trace_seq_printf(s, "  ");
+
+	if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
+		/* Duration exceeded 100 msecs */
+		if (duration > 100000ULL)
+			return trace_seq_printf(s, "! ");
+
+		/* Duration exceeded 10 msecs */
+		if (duration > 10000ULL)
+			return trace_seq_printf(s, "+ ");
+	}
+
+	return trace_seq_printf(s, "  ");
+}
+
 static enum print_line_t
 print_graph_irq(struct trace_seq *s, unsigned long addr,
-				enum trace_type type, int cpu, pid_t pid)
+		enum trace_type type, int cpu, pid_t pid)
 {
 	int ret;
 
@@ -242,35 +282,40 @@ print_graph_irq(struct trace_seq *s, unsigned long addr,
 		addr >= (unsigned long)__irqentry_text_end)
 		return TRACE_TYPE_UNHANDLED;
 
-	if (type == TRACE_GRAPH_ENT) {
-		ret = trace_seq_printf(s, "==========> |  ");
-	} else {
-		/* Cpu */
-		if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
-			ret = print_graph_cpu(s, cpu);
-			if (ret == TRACE_TYPE_PARTIAL_LINE)
-				return TRACE_TYPE_PARTIAL_LINE;
-		}
-		/* Proc */
-		if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
-			ret = print_graph_proc(s, pid);
-			if (ret == TRACE_TYPE_PARTIAL_LINE)
-				return TRACE_TYPE_PARTIAL_LINE;
+	/* Cpu */
+	if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
+		ret = print_graph_cpu(s, cpu);
+		if (ret == TRACE_TYPE_PARTIAL_LINE)
+			return TRACE_TYPE_PARTIAL_LINE;
+	}
+	/* Proc */
+	if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
+		ret = print_graph_proc(s, pid);
+		if (ret == TRACE_TYPE_PARTIAL_LINE)
+			return TRACE_TYPE_PARTIAL_LINE;
+		ret = trace_seq_printf(s, " | ");
+		if (!ret)
+			return TRACE_TYPE_PARTIAL_LINE;
+	}
 
-			ret = trace_seq_printf(s, " | ");
-			if (!ret)
-				return TRACE_TYPE_PARTIAL_LINE;
-		}
+	/* No overhead */
+	ret = print_graph_overhead(-1, s);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
 
-		/* No overhead */
-		if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
-			ret = trace_seq_printf(s, "  ");
-			if (!ret)
-				return TRACE_TYPE_PARTIAL_LINE;
-		}
+	if (type == TRACE_GRAPH_ENT)
+		ret = trace_seq_printf(s, "==========>");
+	else
+		ret = trace_seq_printf(s, "<==========");
+
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	/* Don't close the duration column if haven't one */
+	if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)
+		trace_seq_printf(s, " |");
+	ret = trace_seq_printf(s, "\n");
 
-		ret = trace_seq_printf(s, "<========== |\n");
-	}
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 	return TRACE_TYPE_HANDLED;
@@ -289,7 +334,7 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s)
 	sprintf(msecs_str, "%lu", (unsigned long) duration);
 
 	/* Print msecs */
-	ret = trace_seq_printf(s, msecs_str);
+	ret = trace_seq_printf(s, "%s", msecs_str);
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
@@ -322,19 +367,15 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s)
 
 }
 
-/* Signal a overhead of time execution to the output */
-static int
-print_graph_overhead(unsigned long long duration, struct trace_seq *s)
+static int print_graph_abs_time(u64 t, struct trace_seq *s)
 {
-	/* Duration exceeded 100 msecs */
-	if (duration > 100000ULL)
-		return trace_seq_printf(s, "! ");
+	unsigned long usecs_rem;
 
-	/* Duration exceeded 10 msecs */
-	if (duration > 10000ULL)
-		return trace_seq_printf(s, "+ ");
+	usecs_rem = do_div(t, 1000000000);
+	usecs_rem /= 1000;
 
-	return trace_seq_printf(s, "  ");
+	return trace_seq_printf(s, "%5lu.%06lu |  ",
+			(unsigned long)t, usecs_rem);
 }
 
 /* Case of a leaf function on its call entry */
@@ -357,16 +398,16 @@ print_graph_entry_leaf(struct trace_iterator *iter,
 	duration = graph_ret->rettime - graph_ret->calltime;
 
 	/* Overhead */
-	if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
-		ret = print_graph_overhead(duration, s);
-		if (!ret)
-			return TRACE_TYPE_PARTIAL_LINE;
-	}
+	ret = print_graph_overhead(duration, s);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
 
 	/* Duration */
-	ret = print_graph_duration(duration, s);
-	if (ret == TRACE_TYPE_PARTIAL_LINE)
-		return TRACE_TYPE_PARTIAL_LINE;
+	if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) {
+		ret = print_graph_duration(duration, s);
+		if (ret == TRACE_TYPE_PARTIAL_LINE)
+			return TRACE_TYPE_PARTIAL_LINE;
+	}
 
 	/* Function */
 	for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
@@ -395,25 +436,17 @@ print_graph_entry_nested(struct ftrace_graph_ent_entry *entry,
 	struct ftrace_graph_ent *call = &entry->graph_ent;
 
 	/* No overhead */
-	if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
-		ret = trace_seq_printf(s, "  ");
-		if (!ret)
-			return TRACE_TYPE_PARTIAL_LINE;
-	}
+	ret = print_graph_overhead(-1, s);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
 
-	/* Interrupt */
-	ret = print_graph_irq(s, call->func, TRACE_GRAPH_ENT, cpu, pid);
-	if (ret == TRACE_TYPE_UNHANDLED) {
-		/* No time */
+	/* No time */
+	if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) {
 		ret = trace_seq_printf(s, "            |  ");
 		if (!ret)
 			return TRACE_TYPE_PARTIAL_LINE;
-	} else {
-		if (ret == TRACE_TYPE_PARTIAL_LINE)
-			return TRACE_TYPE_PARTIAL_LINE;
 	}
 
-
 	/* Function */
 	for (i = 0; i < call->depth * TRACE_GRAPH_INDENT; i++) {
 		ret = trace_seq_printf(s, " ");
@@ -434,15 +467,30 @@ print_graph_entry_nested(struct ftrace_graph_ent_entry *entry,
 
 static enum print_line_t
 print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
-			struct trace_iterator *iter, int cpu)
+			struct trace_iterator *iter)
 {
 	int ret;
+	int cpu = iter->cpu;
+	pid_t *last_entry = iter->private;
 	struct trace_entry *ent = iter->ent;
+	struct ftrace_graph_ent *call = &field->graph_ent;
 
 	/* Pid */
-	if (verif_pid(s, ent->pid, cpu) == TRACE_TYPE_PARTIAL_LINE)
+	if (verif_pid(s, ent->pid, cpu, last_entry) == TRACE_TYPE_PARTIAL_LINE)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	/* Interrupt */
+	ret = print_graph_irq(s, call->func, TRACE_GRAPH_ENT, cpu, ent->pid);
+	if (ret == TRACE_TYPE_PARTIAL_LINE)
 		return TRACE_TYPE_PARTIAL_LINE;
 
+	/* Absolute time */
+	if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) {
+		ret = print_graph_abs_time(iter->ts, s);
+		if (!ret)
+			return TRACE_TYPE_PARTIAL_LINE;
+	}
+
 	/* Cpu */
 	if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
 		ret = print_graph_cpu(s, cpu);
@@ -470,16 +518,25 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
 
 static enum print_line_t
 print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
-		   struct trace_entry *ent, int cpu)
+		   struct trace_entry *ent, struct trace_iterator *iter)
 {
 	int i;
 	int ret;
+	int cpu = iter->cpu;
+	pid_t *last_pid = iter->private;
 	unsigned long long duration = trace->rettime - trace->calltime;
 
 	/* Pid */
-	if (verif_pid(s, ent->pid, cpu) == TRACE_TYPE_PARTIAL_LINE)
+	if (verif_pid(s, ent->pid, cpu, last_pid) == TRACE_TYPE_PARTIAL_LINE)
 		return TRACE_TYPE_PARTIAL_LINE;
 
+	/* Absolute time */
+	if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) {
+		ret = print_graph_abs_time(iter->ts, s);
+		if (!ret)
+			return TRACE_TYPE_PARTIAL_LINE;
+	}
+
 	/* Cpu */
 	if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
 		ret = print_graph_cpu(s, cpu);
@@ -499,16 +556,16 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
 	}
 
 	/* Overhead */
-	if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
-		ret = print_graph_overhead(duration, s);
-		if (!ret)
-			return TRACE_TYPE_PARTIAL_LINE;
-	}
+	ret = print_graph_overhead(duration, s);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
 
 	/* Duration */
-	ret = print_graph_duration(duration, s);
-	if (ret == TRACE_TYPE_PARTIAL_LINE)
-		return TRACE_TYPE_PARTIAL_LINE;
+	if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) {
+		ret = print_graph_duration(duration, s);
+		if (ret == TRACE_TYPE_PARTIAL_LINE)
+			return TRACE_TYPE_PARTIAL_LINE;
+	}
 
 	/* Closing brace */
 	for (i = 0; i < trace->depth * TRACE_GRAPH_INDENT; i++) {
@@ -542,14 +599,23 @@ print_graph_comment(struct print_entry *trace, struct trace_seq *s,
 {
 	int i;
 	int ret;
+	int cpu = iter->cpu;
+	pid_t *last_pid = iter->private;
+
+	/* Absolute time */
+	if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) {
+		ret = print_graph_abs_time(iter->ts, s);
+		if (!ret)
+			return TRACE_TYPE_PARTIAL_LINE;
+	}
 
 	/* Pid */
-	if (verif_pid(s, ent->pid, iter->cpu) == TRACE_TYPE_PARTIAL_LINE)
+	if (verif_pid(s, ent->pid, cpu, last_pid) == TRACE_TYPE_PARTIAL_LINE)
 		return TRACE_TYPE_PARTIAL_LINE;
 
 	/* Cpu */
 	if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
-		ret = print_graph_cpu(s, iter->cpu);
+		ret = print_graph_cpu(s, cpu);
 		if (ret == TRACE_TYPE_PARTIAL_LINE)
 			return TRACE_TYPE_PARTIAL_LINE;
 	}
@@ -566,17 +632,17 @@ print_graph_comment(struct print_entry *trace, struct trace_seq *s,
 	}
 
 	/* No overhead */
-	if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
-		ret = trace_seq_printf(s, "  ");
+	ret = print_graph_overhead(-1, s);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	/* No time */
+	if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION) {
+		ret = trace_seq_printf(s, "            |  ");
 		if (!ret)
 			return TRACE_TYPE_PARTIAL_LINE;
 	}
 
-	/* No time */
-	ret = trace_seq_printf(s, "            |  ");
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-
 	/* Indentation */
 	if (trace->depth > 0)
 		for (i = 0; i < (trace->depth + 1) * TRACE_GRAPH_INDENT; i++) {
@@ -614,13 +680,12 @@ print_graph_function(struct trace_iterator *iter)
 	case TRACE_GRAPH_ENT: {
 		struct ftrace_graph_ent_entry *field;
 		trace_assign_type(field, entry);
-		return print_graph_entry(field, s, iter,
-					 iter->cpu);
+		return print_graph_entry(field, s, iter);
 	}
 	case TRACE_GRAPH_RET: {
 		struct ftrace_graph_ret_entry *field;
 		trace_assign_type(field, entry);
-		return print_graph_return(&field->ret, s, entry, iter->cpu);
+		return print_graph_return(&field->ret, s, entry, iter);
 	}
 	case TRACE_PRINT: {
 		struct print_entry *field;
@@ -636,28 +701,55 @@ static void print_graph_headers(struct seq_file *s)
 {
 	/* 1st line */
 	seq_printf(s, "# ");
+	if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME)
+		seq_printf(s, "     TIME       ");
 	if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
-		seq_printf(s, "CPU ");
+		seq_printf(s, "CPU");
 	if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
-		seq_printf(s, "TASK/PID     ");
-	if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD)
-		seq_printf(s, "OVERHEAD/");
-	seq_printf(s, "DURATION            FUNCTION CALLS\n");
+		seq_printf(s, "  TASK/PID      ");
+	if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)
+		seq_printf(s, "  DURATION   ");
+	seq_printf(s, "               FUNCTION CALLS\n");
 
 	/* 2nd line */
 	seq_printf(s, "# ");
+	if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME)
+		seq_printf(s, "      |         ");
 	if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU)
-		seq_printf(s, "|   ");
+		seq_printf(s, "|  ");
 	if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC)
-		seq_printf(s, "|      |     ");
-	if (tracer_flags.val & TRACE_GRAPH_PRINT_OVERHEAD) {
-		seq_printf(s, "|        ");
-		seq_printf(s, "|                   |   |   |   |\n");
-	} else
-		seq_printf(s, "    |               |   |   |   |\n");
+		seq_printf(s, "  |    |        ");
+	if (tracer_flags.val & TRACE_GRAPH_PRINT_DURATION)
+		seq_printf(s, "   |   |      ");
+	seq_printf(s, "               |   |   |   |\n");
 }
+
+static void graph_trace_open(struct trace_iterator *iter)
+{
+	/* pid on the last trace processed */
+	pid_t *last_pid = alloc_percpu(pid_t);
+	int cpu;
+
+	if (!last_pid)
+		pr_warning("function graph tracer: not enough memory\n");
+	else
+		for_each_possible_cpu(cpu) {
+			pid_t *pid = per_cpu_ptr(last_pid, cpu);
+			*pid = -1;
+		}
+
+	iter->private = last_pid;
+}
+
+static void graph_trace_close(struct trace_iterator *iter)
+{
+	percpu_free(iter->private);
+}
+
 static struct tracer graph_trace __read_mostly = {
 	.name	     	= "function_graph",
+	.open		= graph_trace_open,
+	.close		= graph_trace_close,
 	.init	     	= graph_trace_init,
 	.reset	     	= graph_trace_reset,
 	.print_line	= print_graph_function,
-- 
cgit v1.2.3


From 6626bff24578753808c8b5bd4f1619e14e980f0f Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 25 Jan 2009 11:31:36 +0100
Subject: hrtimer: prevent negative expiry value after clock_was_set()

Impact: prevent false positive WARN_ON() in clockevents_program_event()

clock_was_set() changes the base->offset of CLOCK_REALTIME and
enforces the reprogramming of the clockevent device to expire timers
which are based on CLOCK_REALTIME. If the clock change is large enough
then the subtraction of the timer expiry value and base->offset can
become negative which triggers the warning in
clockevents_program_event().

Check the subtraction result and set a negative value to 0.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/hrtimer.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index 2c40ee8f44bd..d71cef25954b 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -501,6 +501,13 @@ static void hrtimer_force_reprogram(struct hrtimer_cpu_base *cpu_base)
 			continue;
 		timer = rb_entry(base->first, struct hrtimer, node);
 		expires = ktime_sub(hrtimer_get_expires(timer), base->offset);
+		/*
+		 * clock_was_set() has changed base->offset so the
+		 * result might be negative. Fix it up to prevent a
+		 * false positive in clockevents_program_event()
+		 */
+		if (expires.tv64 < 0)
+			expires.tv64 = 0;
 		if (expires.tv64 < cpu_base->expires_next.tv64)
 			cpu_base->expires_next = expires;
 	}
-- 
cgit v1.2.3


From cc2f6d90e950b69ad31d483c19cc1d121bb25c16 Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Fri, 23 Jan 2009 13:03:37 -0800
Subject: kmemtrace: fix printk format warnings

Fix kmemtrace printk warnings:

  kernel/trace/kmemtrace.c:142: warning: format '%4ld' expects type 'long int', but argument 3 has type 'size_t'
  kernel/trace/kmemtrace.c:147: warning: format '%4ld' expects type 'long int', but argument 3 has type 'size_t'

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Acked-by: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/kmemtrace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index 7ebc58cee3bd..72b326bba005 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -139,12 +139,12 @@ kmemtrace_print_alloc_compress(struct trace_iterator *iter,
 		return TRACE_TYPE_PARTIAL_LINE;
 
 	/* Requested */
-	ret = trace_seq_printf(s, "%4ld   ", entry->bytes_req);
+	ret = trace_seq_printf(s, "%4zd   ", entry->bytes_req);
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
 	/* Allocated */
-	ret = trace_seq_printf(s, "%4ld   ", entry->bytes_alloc);
+	ret = trace_seq_printf(s, "%4zd   ", entry->bytes_alloc);
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
-- 
cgit v1.2.3


From 9011262a37cb438f0fa9394b5e83840db8f9680a Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 23 Jan 2009 12:06:23 -0200
Subject: ftrace: add ftrace_vprintk

Impact: new helper function

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace.h | 8 ++++++++
 kernel/trace/trace.c   | 9 +++++++++
 2 files changed, 17 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 9f7880d87c39..7840e718c6c7 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -302,6 +302,9 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3);
 extern int
 __ftrace_printk(unsigned long ip, const char *fmt, ...)
 	__attribute__ ((format (printf, 2, 3)));
+# define ftrace_vprintk(fmt, ap) __ftrace_printk(_THIS_IP_, fmt, ap)
+extern int
+__ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap);
 extern void ftrace_dump(void);
 #else
 static inline void
@@ -317,6 +320,11 @@ ftrace_printk(const char *fmt, ...)
 {
 	return 0;
 }
+static inline int
+ftrace_vprintk(const char *fmt, va_list ap)
+{
+	return 0;
+}
 static inline void ftrace_dump(void) { }
 #endif
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 2129ab9d2a48..2f8ac1f008f5 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2951,6 +2951,15 @@ int __ftrace_printk(unsigned long ip, const char *fmt, ...)
 }
 EXPORT_SYMBOL_GPL(__ftrace_printk);
 
+int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap)
+{
+	if (!(trace_flags & TRACE_ITER_PRINTK))
+		return 0;
+
+	return trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap);
+}
+EXPORT_SYMBOL_GPL(__ftrace_vprintk);
+
 static int trace_panic_handler(struct notifier_block *this,
 			       unsigned long event, void *unused)
 {
-- 
cgit v1.2.3


From c71a896154119f4ca9e89d6078f5f63ad60ef199 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 23 Jan 2009 12:06:27 -0200
Subject: blktrace: add ftrace plugin

Impact: New way of using the blktrace infrastructure

This drops the requirement of userspace utilities to use the blktrace
facility.

Configuration is done thru sysfs, adding a "trace" directory to the
partition directory where blktrace can be enabled for the associated
request_queue.

The same filters present in the IOCTL interface are present as sysfs
device attributes.

The /sys/block/sdX/sdXN/trace/enable file allows tracing without any
filters.

The other files in this directory: pid, act_mask, start_lba and end_lba
can be used with the same meaning as with the IOCTL interface.

Using the sysfs interface will only setup the request_queue->blk_trace
fields, tracing will only take place when the "blk" tracer is selected
via the ftrace interface, as in the following example:

To see the trace, one can use the /d/tracing/trace file or the
/d/tracign/trace_pipe file, with semantics defined in the ftrace
documentation in Documentation/ftrace.txt.

[root@f10-1 ~]# cat /t/trace
       kjournald-305   [000]  3046.491224:   8,1    A WBS 6367 + 8 <- (8,1) 6304
       kjournald-305   [000]  3046.491227:   8,1    Q   R 6367 + 8 [kjournald]
       kjournald-305   [000]  3046.491236:   8,1    G  RB 6367 + 8 [kjournald]
       kjournald-305   [000]  3046.491239:   8,1    P  NS [kjournald]
       kjournald-305   [000]  3046.491242:   8,1    I RBS 6367 + 8 [kjournald]
       kjournald-305   [000]  3046.491251:   8,1    D  WB 6367 + 8 [kjournald]
       kjournald-305   [000]  3046.491610:   8,1    U  WS [kjournald] 1
          <idle>-0     [000]  3046.511914:   8,1    C  RS 6367 + 8 [6367]
[root@f10-1 ~]#

The default line context (prefix) format is the one described in the ftrace
documentation, with the blktrace specific bits using its existing format,
described in blkparse(8).

If one wants to have the classic blktrace formatting, this is possible by
using:

[root@f10-1 ~]# echo blk_classic > /t/trace_options
[root@f10-1 ~]# cat /t/trace
  8,1    0  3046.491224   305  A WBS 6367 + 8 <- (8,1) 6304
  8,1    0  3046.491227   305  Q   R 6367 + 8 [kjournald]
  8,1    0  3046.491236   305  G  RB 6367 + 8 [kjournald]
  8,1    0  3046.491239   305  P  NS [kjournald]
  8,1    0  3046.491242   305  I RBS 6367 + 8 [kjournald]
  8,1    0  3046.491251   305  D  WB 6367 + 8 [kjournald]
  8,1    0  3046.491610   305  U  WS [kjournald] 1
  8,1    0  3046.511914     0  C  RS 6367 + 8 [6367]
[root@f10-1 ~]#

Using the ftrace standard format allows more flexibility, such
as the ability of asking for backtraces via trace_options:

[root@f10-1 ~]# echo noblk_classic > /t/trace_options
[root@f10-1 ~]# echo stacktrace > /t/trace_options

[root@f10-1 ~]# cat /t/trace
       kjournald-305   [000]  3318.826779:   8,1    A WBS 6375 + 8 <- (8,1) 6312
       kjournald-305   [000]  3318.826782:
 <= submit_bio
 <= submit_bh
 <= sync_dirty_buffer
 <= journal_commit_transaction
 <= kjournald
 <= kthread
 <= child_rip
       kjournald-305   [000]  3318.826836:   8,1    Q   R 6375 + 8 [kjournald]
       kjournald-305   [000]  3318.826837:
 <= generic_make_request
 <= submit_bio
 <= submit_bh
 <= sync_dirty_buffer
 <= journal_commit_transaction
 <= kjournald
 <= kthread

Please read the ftrace documentation to use aditional, standardized
tracing filters such as /d/tracing/trace_cpumask, etc.

See also /d/tracing/trace_mark to add comments in the trace stream,
that is equivalent to the /d/block/sdaN/msg interface.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 block/blktrace.c      | 651 +++++++++++++++++++++++++++++++++++++++++++++++++-
 fs/partitions/check.c |   7 +
 kernel/trace/trace.h  |   1 +
 3 files changed, 654 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/block/blktrace.c b/block/blktrace.c
index b0a2cae886db..630f167f8240 100644
--- a/block/blktrace.c
+++ b/block/blktrace.c
@@ -25,9 +25,27 @@
 #include <linux/time.h>
 #include <trace/block.h>
 #include <asm/uaccess.h>
+#include <../kernel/trace/trace_output.h>
 
 static unsigned int blktrace_seq __read_mostly = 1;
 
+static struct trace_array *blk_tr;
+static int __read_mostly  blk_tracer_enabled;
+
+/* Select an alternative, minimalistic output than the original one */
+#define TRACE_BLK_OPT_CLASSIC 	0x1
+
+static struct tracer_opt blk_tracer_opts[] = {
+	/* Default disable the minimalistic output */
+	{ TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC ) },
+	{ }
+};
+
+static struct tracer_flags blk_tracer_flags = {
+	.val  = 0,
+	.opts = blk_tracer_opts,
+};
+
 /* Global reference count of probes */
 static DEFINE_MUTEX(blk_probe_mutex);
 static atomic_t blk_probes_ref = ATOMIC_INIT(0);
@@ -43,6 +61,9 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action,
 {
 	struct blk_io_trace *t;
 
+	if (!bt->rchan)
+		return;
+
 	t = relay_reserve(bt->rchan, sizeof(*t) + len);
 	if (t) {
 		const int cpu = smp_processor_id();
@@ -90,6 +111,16 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)
 	unsigned long flags;
 	char *buf;
 
+	if (blk_tr) {
+		va_start(args, fmt);
+		ftrace_vprintk(fmt, args);
+		va_end(args);
+		return;
+	}
+
+	if (!bt->msg_data)
+		return;
+
 	local_irq_save(flags);
 	buf = per_cpu_ptr(bt->msg_data, smp_processor_id());
 	va_start(args, fmt);
@@ -131,13 +162,14 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 		     int rw, u32 what, int error, int pdu_len, void *pdu_data)
 {
 	struct task_struct *tsk = current;
+	struct ring_buffer_event *event = NULL;
 	struct blk_io_trace *t;
 	unsigned long flags;
 	unsigned long *sequence;
 	pid_t pid;
-	int cpu;
+	int cpu, pc = 0;
 
-	if (unlikely(bt->trace_state != Blktrace_running))
+	if (unlikely(bt->trace_state != Blktrace_running || !blk_tracer_enabled))
 		return;
 
 	what |= ddir_act[rw & WRITE];
@@ -150,6 +182,24 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 	pid = tsk->pid;
 	if (unlikely(act_log_check(bt, what, sector, pid)))
 		return;
+	cpu = raw_smp_processor_id();
+
+	if (blk_tr) {
+		struct trace_entry *ent;
+		tracing_record_cmdline(current);
+
+		event = ring_buffer_lock_reserve(blk_tr->buffer,
+						 sizeof(*t) + pdu_len, &flags);
+		if (!event)
+			return;
+		
+		ent = ring_buffer_event_data(event);
+		t = (struct blk_io_trace *)ent;
+		pc = preempt_count();
+		tracing_generic_entry_update(ent, 0, pc);
+		ent->type = TRACE_BLK;
+		goto record_it;
+	}
 
 	/*
 	 * A word about the locking here - we disable interrupts to reserve
@@ -163,23 +213,33 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 
 	t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len);
 	if (t) {
-		cpu = smp_processor_id();
 		sequence = per_cpu_ptr(bt->sequence, cpu);
 
 		t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
 		t->sequence = ++(*sequence);
 		t->time = ktime_to_ns(ktime_get());
+		t->cpu = cpu;
+		t->pid = pid;
+record_it:
 		t->sector = sector;
 		t->bytes = bytes;
 		t->action = what;
-		t->pid = pid;
 		t->device = bt->dev;
-		t->cpu = cpu;
 		t->error = error;
 		t->pdu_len = pdu_len;
 
 		if (pdu_len)
 			memcpy((void *) t + sizeof(*t), pdu_data, pdu_len);
+
+		if (blk_tr) {
+			ring_buffer_unlock_commit(blk_tr->buffer, event, flags);
+			if (pid != 0 &&
+			    (blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC) == 0 &&
+			    (trace_flags & TRACE_ITER_STACKTRACE) != 0)
+				__trace_stack(blk_tr, NULL, flags, 5, pc);
+			trace_wake_up();
+			return;
+		}
 	}
 
 	local_irq_restore(flags);
@@ -888,3 +948,584 @@ static void blk_unregister_tracepoints(void)
 
 	tracepoint_synchronize_unregister();
 }
+
+/*
+ * struct blk_io_tracer formatting routines
+ */
+
+static void fill_rwbs(char *rwbs, const struct blk_io_trace *t)
+{
+        int i = 0;
+
+        if (t->action & BLK_TC_DISCARD)	   rwbs[i++] = 'D';
+        else if (t->action & BLK_TC_WRITE) rwbs[i++] = 'W';
+        else if (t->bytes)		   rwbs[i++] = 'R';
+        else				   rwbs[i++] = 'N';
+
+        if (t->action & BLK_TC_AHEAD)	   rwbs[i++] = 'A';
+        if (t->action & BLK_TC_BARRIER)	   rwbs[i++] = 'B';
+        if (t->action & BLK_TC_SYNC)	   rwbs[i++] = 'S';
+        if (t->action & BLK_TC_META)	   rwbs[i++] = 'M';
+
+        rwbs[i] = '\0';
+}
+
+static inline
+const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent)
+{
+	return (const struct blk_io_trace *)ent;
+}
+
+static inline const void *pdu_start(const struct trace_entry *ent)
+{
+	return te_blk_io_trace(ent) + 1;
+}
+
+static inline u32 t_sec(const struct trace_entry *ent)
+{
+	return te_blk_io_trace(ent)->bytes >> 9;
+}
+
+static inline unsigned long long t_sector(const struct trace_entry *ent)
+{
+	return te_blk_io_trace(ent)->sector;
+}
+
+static inline __u16 t_error(const struct trace_entry *ent)
+{
+	return te_blk_io_trace(ent)->sector;
+}
+
+static __u64 get_pdu_int(const struct trace_entry *ent)
+{
+	const __u64 *val = pdu_start(ent);
+	return be64_to_cpu(*val);
+}
+
+static void get_pdu_remap(const struct trace_entry *ent,
+			  struct blk_io_trace_remap *r)
+{
+	const struct blk_io_trace_remap *__r = pdu_start(ent);
+	__u64 sector = __r->sector;
+
+	r->device = be32_to_cpu(__r->device);
+	r->device_from = be32_to_cpu(__r->device_from);
+	r->sector = be64_to_cpu(sector);
+}
+
+static int blk_log_action_iter(struct trace_iterator *iter, const char *act)
+{
+	char rwbs[6];
+	unsigned long long ts  = ns2usecs(iter->ts);
+	unsigned long usec_rem = do_div(ts, USEC_PER_SEC);
+	unsigned secs	       = (unsigned long)ts;
+	const struct trace_entry *ent = iter->ent;
+	const struct blk_io_trace *t = (const struct blk_io_trace *)ent;
+
+	fill_rwbs(rwbs, t);
+
+	return trace_seq_printf(&iter->seq,
+				"%3d,%-3d %2d %5d.%06lu %5u %2s %3s ",
+				MAJOR(t->device), MINOR(t->device), iter->cpu,
+				secs, usec_rem, ent->pid, act, rwbs);
+}
+
+static int blk_log_action_seq(struct trace_seq *s, const struct blk_io_trace *t,
+			      const char *act)
+{
+	char rwbs[6];
+	fill_rwbs(rwbs, t);
+	return trace_seq_printf(s, "%3d,%-3d %2s %3s ",
+				MAJOR(t->device), MINOR(t->device), act, rwbs);
+}
+
+static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
+{
+	const char *cmd = trace_find_cmdline(ent->pid);
+
+	if (t_sec(ent))
+		return trace_seq_printf(s, "%llu + %u [%s]\n",
+					t_sector(ent), t_sec(ent), cmd);
+	return trace_seq_printf(s, "[%s]\n", cmd);
+}
+
+static int blk_log_with_error(struct trace_seq *s, const struct trace_entry *ent)
+{
+	if (t_sec(ent))
+		return trace_seq_printf(s, "%llu + %u [%d]\n", t_sector(ent),
+					t_sec(ent), t_error(ent));
+	return trace_seq_printf(s, "%llu [%d]\n", t_sector(ent), t_error(ent));
+}
+
+static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent)
+{
+	struct blk_io_trace_remap r = { .device = 0, };
+
+	get_pdu_remap(ent, &r);
+	return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n",
+			       t_sector(ent),
+			       t_sec(ent), MAJOR(r.device), MINOR(r.device),
+			       (unsigned long long)r.sector);
+}
+
+static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent)
+{
+	return trace_seq_printf(s, "[%s]\n", trace_find_cmdline(ent->pid));
+}
+
+static int blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent)
+{
+	return trace_seq_printf(s, "[%s] %llu\n", trace_find_cmdline(ent->pid),
+				get_pdu_int(ent));
+}
+
+static int blk_log_split(struct trace_seq *s, const struct trace_entry *ent)
+{
+	return trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent),
+				get_pdu_int(ent), trace_find_cmdline(ent->pid));
+}
+
+/*
+ * struct tracer operations
+ */
+
+static void blk_tracer_print_header(struct seq_file *m)
+{
+	if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC))
+		return;
+	seq_puts(m, "# DEV   CPU TIMESTAMP     PID ACT FLG\n"
+		    "#  |     |     |           |   |   |\n");
+}
+
+static void blk_tracer_start(struct trace_array *tr)
+{
+	int cpu;
+
+	tr->time_start = ftrace_now(tr->cpu);
+
+	for_each_online_cpu(cpu)
+		tracing_reset(tr, cpu);
+
+	mutex_lock(&blk_probe_mutex);
+	if (atomic_add_return(1, &blk_probes_ref) == 1)
+		if (blk_register_tracepoints())
+			atomic_dec(&blk_probes_ref);
+	mutex_unlock(&blk_probe_mutex);
+}
+
+static int blk_tracer_init(struct trace_array *tr)
+{
+	blk_tr = tr;
+	blk_tracer_start(tr);
+	mutex_lock(&blk_probe_mutex);
+	blk_tracer_enabled++;
+	mutex_unlock(&blk_probe_mutex);
+	return 0;
+}
+
+static void blk_tracer_stop(struct trace_array *tr)
+{
+	mutex_lock(&blk_probe_mutex);
+	if (atomic_dec_and_test(&blk_probes_ref))
+		blk_unregister_tracepoints();
+	mutex_unlock(&blk_probe_mutex);
+}
+
+static void blk_tracer_reset(struct trace_array *tr)
+{
+	if (!atomic_read(&blk_probes_ref))
+		return;
+
+	mutex_lock(&blk_probe_mutex);
+	blk_tracer_enabled--;
+	WARN_ON(blk_tracer_enabled < 0);
+	mutex_unlock(&blk_probe_mutex);
+
+	blk_tracer_stop(tr);
+}
+
+static struct {
+	const char *act[2];
+	int 	   (*print)(struct trace_seq *s, const struct trace_entry *ent);
+} what2act[] __read_mostly = {
+	[__BLK_TA_QUEUE]	= {{  "Q", "queue" }, 	   blk_log_generic },
+	[__BLK_TA_BACKMERGE]	= {{  "M", "backmerge" },  blk_log_generic },
+	[__BLK_TA_FRONTMERGE]	= {{  "F", "frontmerge" }, blk_log_generic },
+	[__BLK_TA_GETRQ]	= {{  "G", "getrq" },	   blk_log_generic },
+	[__BLK_TA_SLEEPRQ]	= {{  "S", "sleeprq" },	   blk_log_generic },
+	[__BLK_TA_REQUEUE]	= {{  "R", "requeue" },	   blk_log_with_error },
+	[__BLK_TA_ISSUE]	= {{  "D", "issue" },	   blk_log_generic },
+	[__BLK_TA_COMPLETE]	= {{  "C", "complete" },   blk_log_with_error },
+	[__BLK_TA_PLUG]		= {{  "P", "plug" },	   blk_log_plug },
+	[__BLK_TA_UNPLUG_IO]	= {{  "U", "unplug_io" },  blk_log_unplug },
+	[__BLK_TA_UNPLUG_TIMER]	= {{ "UT", "unplug_timer" }, blk_log_unplug },
+	[__BLK_TA_INSERT]	= {{  "I", "insert" },	   blk_log_generic },
+	[__BLK_TA_SPLIT]	= {{  "X", "split" },	   blk_log_split },
+	[__BLK_TA_BOUNCE]	= {{  "B", "bounce" },	   blk_log_generic },
+	[__BLK_TA_REMAP]	= {{  "A", "remap" },	   blk_log_remap },
+};
+
+static int blk_trace_event_print(struct trace_seq *s, struct trace_entry *ent,
+				 int flags)
+{
+	const struct blk_io_trace *t = (struct blk_io_trace *)ent;
+	const u16 what = t->action & ((1 << BLK_TC_SHIFT) - 1);
+	int ret;
+
+	if (unlikely(what == 0 || what > ARRAY_SIZE(what2act)))
+		ret = trace_seq_printf(s, "Bad pc action %x\n", what);
+	else {
+		const bool long_act = !!(trace_flags & TRACE_ITER_VERBOSE);
+		ret = blk_log_action_seq(s, t, what2act[what].act[long_act]);
+		if (ret)
+			ret = what2act[what].print(s, ent);
+	}
+
+	return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
+}
+
+static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter)
+{
+	const struct blk_io_trace *t;
+	u16 what;
+	int ret;
+
+	if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC))
+		return TRACE_TYPE_UNHANDLED;
+
+	t = (const struct blk_io_trace *)iter->ent;
+	what = t->action & ((1 << BLK_TC_SHIFT) - 1);
+
+	if (unlikely(what == 0 || what > ARRAY_SIZE(what2act)))
+		ret = trace_seq_printf(&iter->seq, "Bad pc action %x\n", what);
+	else {
+		const bool long_act = !!(trace_flags & TRACE_ITER_VERBOSE);
+		ret = blk_log_action_iter(iter, what2act[what].act[long_act]);
+		if (ret)
+			ret = what2act[what].print(&iter->seq, iter->ent);
+	}
+
+	return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
+}
+
+static struct tracer blk_tracer __read_mostly = {
+	.name		= "blk",
+	.init		= blk_tracer_init,
+	.reset		= blk_tracer_reset,
+	.start		= blk_tracer_start,
+	.stop		= blk_tracer_stop,
+	.print_header	= blk_tracer_print_header,
+	.print_line	= blk_tracer_print_line,
+	.flags		= &blk_tracer_flags,
+};
+
+static struct trace_event trace_blk_event = {
+	.type	 	= TRACE_BLK,
+	.trace		= blk_trace_event_print,
+	.latency_trace	= blk_trace_event_print,
+	.raw		= trace_nop_print,
+	.hex		= trace_nop_print,
+	.binary		= trace_nop_print,
+};
+
+static int __init init_blk_tracer(void)
+{
+	if (!register_ftrace_event(&trace_blk_event)) {
+		pr_warning("Warning: could not register block events\n");
+		return 1;
+	}
+
+	if (register_tracer(&blk_tracer) != 0) {
+		pr_warning("Warning: could not register the block tracer\n");
+		unregister_ftrace_event(&trace_blk_event);
+		return 1;
+	}
+
+	return 0;
+}
+
+device_initcall(init_blk_tracer);
+
+static int blk_trace_remove_queue(struct request_queue *q)
+{
+	struct blk_trace *bt;
+
+	bt = xchg(&q->blk_trace, NULL);
+	if (bt == NULL)
+		return -EINVAL;
+
+	kfree(bt);
+	return 0;
+}
+
+/*
+ * Setup everything required to start tracing
+ */
+static int blk_trace_setup_queue(struct request_queue *q, dev_t dev)
+{
+	struct blk_trace *old_bt, *bt = NULL;
+	int ret;
+
+	ret = -ENOMEM;
+	bt = kzalloc(sizeof(*bt), GFP_KERNEL);
+	if (!bt)
+		goto err;
+
+	bt->dev = dev;
+	bt->act_mask = (u16)-1;
+	bt->end_lba = -1ULL;
+	bt->trace_state = Blktrace_running;
+
+	old_bt = xchg(&q->blk_trace, bt);
+	if (old_bt != NULL) {
+		(void)xchg(&q->blk_trace, old_bt);
+		kfree(bt);
+		ret = -EBUSY;
+	}
+	return 0;
+err:
+	return ret;
+}
+
+/*
+ * sysfs interface to enable and configure tracing
+ */
+
+static ssize_t sysfs_blk_trace_enable_show(struct device *dev,
+					   struct device_attribute *attr,
+					   char *buf)
+{
+	struct hd_struct *p = dev_to_part(dev);
+	struct block_device *bdev;
+	ssize_t ret = -ENXIO;
+
+	lock_kernel();
+	bdev = bdget(part_devt(p));
+	if (bdev != NULL) {
+		struct request_queue *q = bdev_get_queue(bdev);
+
+		if (q != NULL) {
+			mutex_lock(&bdev->bd_mutex);
+			ret = sprintf(buf, "%u\n", !!q->blk_trace);
+			mutex_unlock(&bdev->bd_mutex);
+		}
+
+		bdput(bdev);
+	}
+
+	unlock_kernel();
+	return ret;
+}
+
+static ssize_t sysfs_blk_trace_enable_store(struct device *dev,
+					    struct device_attribute *attr,
+					    const char *buf, size_t count)
+{
+	struct block_device *bdev;
+	struct request_queue *q;
+	struct hd_struct *p;
+	int value;
+	ssize_t ret = -ENXIO;
+
+	if (count == 0 || sscanf(buf, "%d", &value) != 1)
+		goto out;
+
+	lock_kernel();
+	p = dev_to_part(dev);
+	bdev = bdget(part_devt(p));
+	if (bdev == NULL)
+		goto out_unlock_kernel;
+
+	q = bdev_get_queue(bdev);
+	if (q == NULL)
+		goto out_bdput;
+
+	mutex_lock(&bdev->bd_mutex);
+	if (value)
+		ret = blk_trace_setup_queue(q, bdev->bd_dev);
+	else
+		ret = blk_trace_remove_queue(q);
+	mutex_unlock(&bdev->bd_mutex);
+
+	if (ret == 0)
+		ret = count;
+out_bdput:
+	bdput(bdev);
+out_unlock_kernel:
+	unlock_kernel();
+out:
+	return ret;
+}
+
+static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
+					 struct device_attribute *attr,
+					 char *buf);
+static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
+					  struct device_attribute *attr,
+					  const char *buf, size_t count);
+#define BLK_TRACE_DEVICE_ATTR(_name) \
+	DEVICE_ATTR(_name, S_IRUGO | S_IWUSR, \
+		    sysfs_blk_trace_attr_show, \
+		    sysfs_blk_trace_attr_store)
+
+static DEVICE_ATTR(enable, S_IRUGO | S_IWUSR,
+		   sysfs_blk_trace_enable_show, sysfs_blk_trace_enable_store);
+static BLK_TRACE_DEVICE_ATTR(act_mask);
+static BLK_TRACE_DEVICE_ATTR(pid);
+static BLK_TRACE_DEVICE_ATTR(start_lba);
+static BLK_TRACE_DEVICE_ATTR(end_lba);
+
+static struct attribute *blk_trace_attrs[] = {
+	&dev_attr_enable.attr,
+	&dev_attr_act_mask.attr,
+	&dev_attr_pid.attr,
+	&dev_attr_start_lba.attr,
+	&dev_attr_end_lba.attr,
+	NULL
+};
+
+struct attribute_group blk_trace_attr_group = {
+	.name  = "trace",
+	.attrs = blk_trace_attrs,
+};
+
+static int blk_str2act_mask(const char *str)
+{
+	int mask = 0;
+	char *copy = kstrdup(str, GFP_KERNEL), *s;
+
+	if (copy == NULL)
+		return -ENOMEM;
+
+	s = strstrip(copy);
+
+	while (1) {
+		char *sep = strchr(s, ',');
+
+		if (sep != NULL)
+			*sep = '\0';
+
+		if (strcasecmp(s, "barrier") == 0)
+			mask |= BLK_TC_BARRIER;
+		else if (strcasecmp(s, "complete") == 0)
+			mask |= BLK_TC_COMPLETE;
+		else if (strcasecmp(s, "fs") == 0)
+			mask |= BLK_TC_FS;
+		else if (strcasecmp(s, "issue") == 0)
+			mask |= BLK_TC_ISSUE;
+		else if (strcasecmp(s, "pc") == 0)
+			mask |= BLK_TC_PC;
+		else if (strcasecmp(s, "queue") == 0)
+			mask |= BLK_TC_QUEUE;
+		else if (strcasecmp(s, "read") == 0)
+			mask |= BLK_TC_READ;
+		else if (strcasecmp(s, "requeue") == 0)
+			mask |= BLK_TC_REQUEUE;
+		else if (strcasecmp(s, "sync") == 0)
+			mask |= BLK_TC_SYNC;
+		else if (strcasecmp(s, "write") == 0)
+			mask |= BLK_TC_WRITE;
+
+		if (sep == NULL)
+			break;
+
+		s = sep + 1;
+	}
+	kfree(copy);
+
+	return mask;
+}
+
+static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
+					 struct device_attribute *attr,
+					 char *buf)
+{
+	struct hd_struct *p = dev_to_part(dev);
+	struct request_queue *q;
+	struct block_device *bdev;
+	ssize_t ret = -ENXIO;
+
+	lock_kernel();
+	bdev = bdget(part_devt(p));
+	if (bdev == NULL)
+		goto out_unlock_kernel;
+
+	q = bdev_get_queue(bdev);
+	if (q == NULL)
+		goto out_bdput;
+	mutex_lock(&bdev->bd_mutex);
+	if (q->blk_trace == NULL)
+		ret = sprintf(buf, "disabled\n");
+	else if (attr == &dev_attr_act_mask)
+		ret = sprintf(buf, "%#x\n", q->blk_trace->act_mask);
+	else if (attr == &dev_attr_pid)
+		ret = sprintf(buf, "%u\n", q->blk_trace->pid);
+	else if (attr == &dev_attr_start_lba)
+		ret = sprintf(buf, "%llu\n", q->blk_trace->start_lba);
+	else if (attr == &dev_attr_end_lba)
+		ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba);
+	mutex_unlock(&bdev->bd_mutex);
+out_bdput:
+	bdput(bdev);
+out_unlock_kernel:
+	unlock_kernel();
+	return ret;
+}
+
+static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
+					  struct device_attribute *attr,
+					  const char *buf, size_t count)
+{
+	struct block_device *bdev;
+	struct request_queue *q;
+	struct hd_struct *p;
+	u64 value;
+	ssize_t ret = -ENXIO;
+
+	if (count == 0)
+		goto out;
+
+	if (attr == &dev_attr_act_mask) {
+		if (sscanf(buf, "%llx", &value) != 1) {
+			/* Assume it is a list of trace category names */
+			value = blk_str2act_mask(buf);
+			if (value < 0)
+				goto out;
+		}
+	} else if (sscanf(buf, "%llu", &value) != 1)
+		goto out;
+
+	lock_kernel();
+	p = dev_to_part(dev);
+	bdev = bdget(part_devt(p));
+	if (bdev == NULL)
+		goto out_unlock_kernel;
+
+	q = bdev_get_queue(bdev);
+	if (q == NULL)
+		goto out_bdput;
+
+	mutex_lock(&bdev->bd_mutex);
+	ret = 0;
+	if (q->blk_trace == NULL)
+		ret = blk_trace_setup_queue(q, bdev->bd_dev);
+
+	if (ret == 0) {
+		if (attr == &dev_attr_act_mask)
+			q->blk_trace->act_mask = value;
+		else if (attr == &dev_attr_pid)
+			q->blk_trace->pid = value;
+		else if (attr == &dev_attr_start_lba)
+			q->blk_trace->start_lba = value;
+		else if (attr == &dev_attr_end_lba)
+			q->blk_trace->end_lba = value;
+		ret = count;
+	}
+	mutex_unlock(&bdev->bd_mutex);
+out_bdput:
+	bdput(bdev);
+out_unlock_kernel:
+	unlock_kernel();
+out:
+	return ret;
+}
diff --git a/fs/partitions/check.c b/fs/partitions/check.c
index 6d720243f5f4..01714efdc65a 100644
--- a/fs/partitions/check.c
+++ b/fs/partitions/check.c
@@ -268,6 +268,10 @@ ssize_t part_fail_store(struct device *dev,
 }
 #endif
 
+#ifdef CONFIG_BLK_DEV_IO_TRACE
+extern struct attribute_group blk_trace_attr_group;
+#endif
+
 static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL);
 static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL);
 static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL);
@@ -294,6 +298,9 @@ static struct attribute_group part_attr_group = {
 
 static struct attribute_group *part_attr_groups[] = {
 	&part_attr_group,
+#ifdef CONFIG_BLK_DEV_IO_TRACE
+	&blk_trace_attr_group,
+#endif
 	NULL
 };
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index b96037d970df..e603a291134b 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -32,6 +32,7 @@ enum trace_type {
 	TRACE_KMEM_ALLOC,
 	TRACE_KMEM_FREE,
 	TRACE_POWER,
+	TRACE_BLK,
 
 	__TRACE_LAST_TYPE,
 };
-- 
cgit v1.2.3


From f04109bf1be7449e27d38ae1bb8465013374bd49 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 28 Jan 2009 13:02:12 -0200
Subject: trace: Use tracing_reset_online_cpus in more places
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Impact: cleanup

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Acked-by: Frédéric Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_branch.c          | 6 +-----
 kernel/trace/trace_functions_graph.c | 8 ++------
 kernel/trace/trace_nop.c             | 6 +-----
 3 files changed, 4 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index ca017e0a9a27..1284145c8898 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -133,11 +133,7 @@ static void stop_branch_trace(struct trace_array *tr)
 
 static int branch_trace_init(struct trace_array *tr)
 {
-	int cpu;
-
-	for_each_online_cpu(cpu)
-		tracing_reset(tr, cpu);
-
+	tracing_reset_online_cpus(tr);
 	start_branch_trace(tr);
 	return 0;
 }
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 66fc7b806a38..c97594d826bc 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -52,15 +52,11 @@ static struct tracer_flags tracer_flags = {
 
 static int graph_trace_init(struct trace_array *tr)
 {
-	int cpu, ret;
-
-	for_each_online_cpu(cpu)
-		tracing_reset(tr, cpu);
-
-	ret = register_ftrace_graph(&trace_graph_return,
+	int ret = register_ftrace_graph(&trace_graph_return,
 					&trace_graph_entry);
 	if (ret)
 		return ret;
+	tracing_reset_online_cpus(tr);
 	tracing_start_cmdline_record();
 
 	return 0;
diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c
index b9767acd30ac..087b6cbf4ea5 100644
--- a/kernel/trace/trace_nop.c
+++ b/kernel/trace/trace_nop.c
@@ -47,12 +47,8 @@ static void stop_nop_trace(struct trace_array *tr)
 
 static int nop_trace_init(struct trace_array *tr)
 {
-	int cpu;
 	ctx_trace = tr;
-
-	for_each_online_cpu(cpu)
-		tracing_reset(tr, cpu);
-
+	tracing_reset_online_cpus(tr);
 	start_nop_trace(tr);
 	return 0;
 }
-- 
cgit v1.2.3


From b3a8c34886d0e3dd3a24a5b614ee025181da2f41 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 28 Jan 2009 13:08:37 -0200
Subject: trace_sched_wakeup: Remove unused variable

Impact: cleanup

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_sched_wakeup.c | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 93cecda650b2..a48c9b4b0c85 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -184,13 +184,10 @@ out:
 
 static void __wakeup_reset(struct trace_array *tr)
 {
-	struct trace_array_cpu *data;
 	int cpu;
 
-	for_each_possible_cpu(cpu) {
-		data = tr->data[cpu];
+	for_each_possible_cpu(cpu)
 		tracing_reset(tr, cpu);
-	}
 
 	wakeup_cpu = -1;
 	wakeup_prio = -1;
-- 
cgit v1.2.3


From ecf441b593ac41cb8cd8cd3695110167c42e098c Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Thu, 29 Jan 2009 13:49:45 -0800
Subject: kmemtrace: fix printk formats, fix

Geert Uytterhoeven wrote:

> %4zu?

Reported-by: Geert Uytterhoeven <geert@linux-m68k.org>
Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
Acked-by: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/kmemtrace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index 72b326bba005..f04c0625f1cd 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -139,12 +139,12 @@ kmemtrace_print_alloc_compress(struct trace_iterator *iter,
 		return TRACE_TYPE_PARTIAL_LINE;
 
 	/* Requested */
-	ret = trace_seq_printf(s, "%4zd   ", entry->bytes_req);
+	ret = trace_seq_printf(s, "%4zu   ", entry->bytes_req);
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
 	/* Allocated */
-	ret = trace_seq_printf(s, "%4zd   ", entry->bytes_alloc);
+	ret = trace_seq_printf(s, "%4zu   ", entry->bytes_alloc);
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
-- 
cgit v1.2.3


From eefef1cf7653cd4e0aaf743c00ae8345086cdc01 Mon Sep 17 00:00:00 2001
From: Stephen Hemminger <shemminger@linux-foundation.org>
Date: Sun, 1 Feb 2009 01:04:33 -0800
Subject: net: add ARP notify option for devices

This adds another inet device option to enable gratuitous ARP
when device is brought up or address change. This is handy for
clusters or virtualization.

Signed-off-by: Stephen Hemminger <shemminger@linux-foundation.org>
Signed-off-by: Jeremy Fitzhardinge <jeremy.fitzhardinge@citrix.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 Documentation/networking/ip-sysctl.txt | 6 ++++++
 include/linux/inetdevice.h             | 1 +
 include/linux/sysctl.h                 | 1 +
 kernel/sysctl_check.c                  | 1 +
 net/ipv4/devinet.c                     | 9 +++++++++
 5 files changed, 18 insertions(+)

(limited to 'kernel')

diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index c7712787933c..ff3f219ee4d7 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -782,6 +782,12 @@ arp_ignore - INTEGER
 	The max value from conf/{all,interface}/arp_ignore is used
 	when ARP request is received on the {interface}
 
+arp_notify - BOOLEAN
+	Define mode for notification of address and device changes.
+	0 - (default): do nothing
+	1 - Generate gratuitous arp replies when device is brought up
+	    or hardware address changes.
+
 arp_accept - BOOLEAN
 	Define behavior when gratuitous arp replies are received:
 	0 - drop gratuitous arp frames
diff --git a/include/linux/inetdevice.h b/include/linux/inetdevice.h
index 06fcdb45106b..acef2a770b6b 100644
--- a/include/linux/inetdevice.h
+++ b/include/linux/inetdevice.h
@@ -108,6 +108,7 @@ static inline void ipv4_devconf_setall(struct in_device *in_dev)
 #define IN_DEV_ARPFILTER(in_dev)	IN_DEV_ORCONF((in_dev), ARPFILTER)
 #define IN_DEV_ARP_ANNOUNCE(in_dev)	IN_DEV_MAXCONF((in_dev), ARP_ANNOUNCE)
 #define IN_DEV_ARP_IGNORE(in_dev)	IN_DEV_MAXCONF((in_dev), ARP_IGNORE)
+#define IN_DEV_ARP_NOTIFY(in_dev)	IN_DEV_MAXCONF((in_dev), ARP_NOTIFY)
 
 struct in_ifaddr
 {
diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h
index 39d471d1163b..e76d3b22a466 100644
--- a/include/linux/sysctl.h
+++ b/include/linux/sysctl.h
@@ -490,6 +490,7 @@ enum
 	NET_IPV4_CONF_ARP_IGNORE=19,
 	NET_IPV4_CONF_PROMOTE_SECONDARIES=20,
 	NET_IPV4_CONF_ARP_ACCEPT=21,
+	NET_IPV4_CONF_ARP_NOTIFY=22,
 	__NET_IPV4_CONF_MAX
 };
 
diff --git a/kernel/sysctl_check.c b/kernel/sysctl_check.c
index fafeb48f27c0..b38423ca711a 100644
--- a/kernel/sysctl_check.c
+++ b/kernel/sysctl_check.c
@@ -219,6 +219,7 @@ static const struct trans_ctl_table trans_net_ipv4_conf_vars_table[] = {
 	{ NET_IPV4_CONF_ARP_IGNORE,		"arp_ignore" },
 	{ NET_IPV4_CONF_PROMOTE_SECONDARIES,	"promote_secondaries" },
 	{ NET_IPV4_CONF_ARP_ACCEPT,		"arp_accept" },
+	{ NET_IPV4_CONF_ARP_NOTIFY,		"arp_notify" },
 	{}
 };
 
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index 309997edc8a5..d519a6a66726 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -1075,6 +1075,14 @@ static int inetdev_event(struct notifier_block *this, unsigned long event,
 			}
 		}
 		ip_mc_up(in_dev);
+		/* fall through */
+	case NETDEV_CHANGEADDR:
+		if (IN_DEV_ARP_NOTIFY(in_dev))
+			arp_send(ARPOP_REQUEST, ETH_P_ARP,
+				 in_dev->ifa_list->ifa_address,
+				 dev,
+				 in_dev->ifa_list->ifa_address,
+				 NULL, dev->dev_addr, NULL);
 		break;
 	case NETDEV_DOWN:
 		ip_mc_down(in_dev);
@@ -1439,6 +1447,7 @@ static struct devinet_sysctl_table {
 		DEVINET_SYSCTL_RW_ENTRY(ARP_ANNOUNCE, "arp_announce"),
 		DEVINET_SYSCTL_RW_ENTRY(ARP_IGNORE, "arp_ignore"),
 		DEVINET_SYSCTL_RW_ENTRY(ARP_ACCEPT, "arp_accept"),
+		DEVINET_SYSCTL_RW_ENTRY(ARP_NOTIFY, "arp_notify"),
 
 		DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"),
 		DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"),
-- 
cgit v1.2.3


From b2821ae68b14480bfc85ea1629537163310bc5cd Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 2 Feb 2009 21:38:32 -0500
Subject: trace: fix default boot up tracer

Peter Zijlstra started the functionality to start up a default
tracing at bootup. This patch finishes the work.

Now if you add 'ftrace=<tracer>' to the command line, when that tracer
is registered on bootup, that tracer is selected and starts tracing.

Note, all selftests for tracers that are registered after this tracer
is disabled. This prevents the selftests from disturbing the running
tracer, or the running tracer from disturbing the selftest.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++------
 1 file changed, 54 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 2f8ac1f008f5..2c720c79bc60 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -53,6 +53,11 @@ unsigned long __read_mostly	tracing_thresh;
  */
 static bool __read_mostly tracing_selftest_running;
 
+/*
+ * If a tracer is running, we do not want to run SELFTEST.
+ */
+static bool __read_mostly tracing_selftest_disabled;
+
 /* For tracers that don't implement custom flags */
 static struct tracer_opt dummy_tracer_opt[] = {
 	{ }
@@ -110,14 +115,19 @@ static cpumask_var_t __read_mostly	tracing_buffer_mask;
  */
 int ftrace_dump_on_oops;
 
-static int tracing_set_tracer(char *buf);
+static int tracing_set_tracer(const char *buf);
+
+#define BOOTUP_TRACER_SIZE		100
+static char bootup_tracer_buf[BOOTUP_TRACER_SIZE] __initdata;
+static char *default_bootup_tracer;
 
 static int __init set_ftrace(char *str)
 {
-	tracing_set_tracer(str);
+	strncpy(bootup_tracer_buf, str, BOOTUP_TRACER_SIZE);
+	default_bootup_tracer = bootup_tracer_buf;
 	return 1;
 }
-__setup("ftrace", set_ftrace);
+__setup("ftrace=", set_ftrace);
 
 static int __init set_ftrace_dump_on_oops(char *str)
 {
@@ -468,7 +478,7 @@ int register_tracer(struct tracer *type)
 			type->flags->opts = dummy_tracer_opt;
 
 #ifdef CONFIG_FTRACE_STARTUP_TEST
-	if (type->selftest) {
+	if (type->selftest && !tracing_selftest_disabled) {
 		struct tracer *saved_tracer = current_trace;
 		struct trace_array *tr = &global_trace;
 		int i;
@@ -510,8 +520,25 @@ int register_tracer(struct tracer *type)
  out:
 	tracing_selftest_running = false;
 	mutex_unlock(&trace_types_lock);
-	lock_kernel();
 
+	if (!ret && default_bootup_tracer) {
+		if (!strncmp(default_bootup_tracer, type->name,
+			     BOOTUP_TRACER_SIZE)) {
+			printk(KERN_INFO "Starting tracer '%s'\n",
+			       type->name);
+			/* Do we want this tracer to start on bootup? */
+			tracing_set_tracer(type->name);
+			default_bootup_tracer = NULL;
+			/* disable other selftests, since this will break it. */
+			tracing_selftest_disabled = 1;
+#ifdef CONFIG_FTRACE_STARTUP_TEST
+			printk(KERN_INFO "Disabling FTRACE selftests due"
+			       " to running tracer '%s'\n", type->name);
+#endif
+		}
+	}
+
+	lock_kernel();
 	return ret;
 }
 
@@ -2245,7 +2272,7 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf,
 	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
 }
 
-static int tracing_set_tracer(char *buf)
+static int tracing_set_tracer(const char *buf)
 {
 	struct trace_array *tr = &global_trace;
 	struct tracer *t;
@@ -3163,5 +3190,26 @@ out_free_buffer_mask:
 out:
 	return ret;
 }
+
+__init static int clear_boot_tracer(void)
+{
+	/*
+	 * The default tracer at boot buffer is an init section.
+	 * This function is called in lateinit. If we did not
+	 * find the boot tracer, then clear it out, to prevent
+	 * later registration from accessing the buffer that is
+	 * about to be freed.
+	 */
+	if (!default_bootup_tracer)
+		return 0;
+
+	printk(KERN_INFO "ftrace bootup tracer '%s' not registered.\n",
+	       default_bootup_tracer);
+	default_bootup_tracer = NULL;
+
+	return 0;
+}
+
 early_initcall(tracer_alloc_buffers);
 fs_initcall(tracer_init_debugfs);
+late_initcall(clear_boot_tracer);
-- 
cgit v1.2.3


From 79fb0768fbd371f3b94d909f51f587b3a24ab272 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 2 Feb 2009 21:38:33 -0500
Subject: trace: let boot trace be chosen by command line

Now that we have a working ftrace=<tracer> function, make the boot
tracer get activated by it. This way we can turn it on or off without
recompiling the kernel, as well as keeping the selftests on. The
selftests are disabled whenever a default tracer starts running.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/Kconfig      |  7 +++----
 kernel/trace/trace.c      |  5 +----
 kernel/trace/trace_boot.c | 11 +++++++----
 3 files changed, 11 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index dde1d46f77e5..28f2644484d9 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -164,9 +164,8 @@ config BOOT_TRACER
 	  representation of the delays during initcalls - but the raw
 	  /debug/tracing/trace text output is readable too.
 
-	  ( Note that tracing self tests can't be enabled if this tracer is
-	    selected, because the self-tests are an initcall as well and that
-	    would invalidate the boot trace. )
+	  You must pass in ftrace=initcall to the kernel command line
+	  to enable this on bootup.
 
 config TRACE_BRANCH_PROFILING
 	bool "Trace likely/unlikely profiler"
@@ -326,7 +325,7 @@ config FTRACE_SELFTEST
 
 config FTRACE_STARTUP_TEST
 	bool "Perform a startup test on ftrace"
-	depends on TRACING && DEBUG_KERNEL && !BOOT_TRACER
+	depends on TRACING && DEBUG_KERNEL
 	select FTRACE_SELFTEST
 	help
 	  This option performs a series of startup tests on ftrace. On bootup
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 2c720c79bc60..40edef4255c5 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3167,12 +3167,9 @@ __init static int tracer_alloc_buffers(void)
 	trace_init_cmdlines();
 
 	register_tracer(&nop_trace);
+	current_trace = &nop_trace;
 #ifdef CONFIG_BOOT_TRACER
 	register_tracer(&boot_tracer);
-	current_trace = &boot_tracer;
-	current_trace->init(&global_trace);
-#else
-	current_trace = &nop_trace;
 #endif
 	/* All seems OK, enable tracing */
 	tracing_disabled = 0;
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index 0e94b3d091f7..1f07895977a0 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -28,13 +28,13 @@ void start_boot_trace(void)
 
 void enable_boot_trace(void)
 {
-	if (pre_initcalls_finished)
+	if (boot_trace && pre_initcalls_finished)
 		tracing_start_sched_switch_record();
 }
 
 void disable_boot_trace(void)
 {
-	if (pre_initcalls_finished)
+	if (boot_trace && pre_initcalls_finished)
 		tracing_stop_sched_switch_record();
 }
 
@@ -43,6 +43,9 @@ static int boot_trace_init(struct trace_array *tr)
 	int cpu;
 	boot_trace = tr;
 
+	if (!tr)
+		return 0;
+
 	for_each_cpu(cpu, cpu_possible_mask)
 		tracing_reset(tr, cpu);
 
@@ -132,7 +135,7 @@ void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
 	unsigned long irq_flags;
 	struct trace_array *tr = boot_trace;
 
-	if (!pre_initcalls_finished)
+	if (!tr || !pre_initcalls_finished)
 		return;
 
 	/* Get its name now since this function could
@@ -164,7 +167,7 @@ void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)
 	unsigned long irq_flags;
 	struct trace_array *tr = boot_trace;
 
-	if (!pre_initcalls_finished)
+	if (!tr || !pre_initcalls_finished)
 		return;
 
 	sprint_symbol(bt->func, (unsigned long)fn);
-- 
cgit v1.2.3


From c4a8e8be2d43cc22b371e8e9c05c253409759d94 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Mon, 2 Feb 2009 20:29:21 -0200
Subject: trace: better manage the context info for events

Impact: make trace_event more convenient for tracers

All tracers (for the moment) that use the struct trace_event want to
have the context info printed before their own output: the pid/cmdline,
cpu, and timestamp.

But some other tracers that want to implement their trace_event
callbacks will not necessary need these information or they may want to
format them as they want.

This patch adds a new default-enabled trace option:
TRACE_ITER_CONTEXT_INFO When disabled through:

echo nocontext-info > /debugfs/tracing/trace_options

The pid, cpu and timestamps headers will not be printed.

IE with the sched_switch tracer with context-info (default):

     bash-2935 [001] 100.356561: 2935:120:S ==> [001]  0:140:R <idle>
   <idle>-0    [000] 100.412804:    0:140:R   + [000] 11:115:S events/0
   <idle>-0    [000] 100.412816:    0:140:R ==> [000] 11:115:R events/0
 events/0-11   [000] 100.412829:   11:115:S ==> [000]  0:140:R <idle>

Without context-info:

 2935:120:S ==> [001]  0:140:R <idle>
    0:140:R   + [000] 11:115:S events/0
    0:140:R ==> [000] 11:115:R events/0
   11:115:S ==> [000]  0:140:R <idle>

A tracer can disable it at runtime by clearing the bit
TRACE_ITER_CONTEXT_INFO in trace_flags.

The print routines were renamed to trace_print_context and
trace_print_lat_context, so that they can be used by tracers if they
want to use them for one of the trace_event callbacks.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c        | 149 +++++++++++---------------------------------
 kernel/trace/trace.h        |   7 ++-
 kernel/trace/trace_output.c | 107 +++++++++++++++++++++++++++++++
 kernel/trace/trace_output.h |   3 +
 4 files changed, 151 insertions(+), 115 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 2f8ac1f008f5..5ec49c3c1597 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -227,7 +227,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
 
 /* trace_flags holds trace_options default values */
 unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
-	TRACE_ITER_ANNOTATE;
+	TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO;
 
 /**
  * trace_wake_up - wake up tasks waiting for trace input
@@ -285,6 +285,7 @@ static const char *trace_options[] = {
 	"userstacktrace",
 	"sym-userobj",
 	"printk-msg-only",
+	"context-info",
 	NULL
 };
 
@@ -1171,8 +1172,8 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
 }
 
 /* Find the next real entry, without updating the iterator itself */
-static struct trace_entry *
-find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
+struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
+					  int *ent_cpu, u64 *ent_ts)
 {
 	return __find_next_entry(iter, ent_cpu, ent_ts);
 }
@@ -1351,57 +1352,6 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
 	seq_puts(m, "\n");
 }
 
-static void
-lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
-{
-	int hardirq, softirq;
-	char *comm;
-
-	comm = trace_find_cmdline(entry->pid);
-
-	trace_seq_printf(s, "%8.8s-%-5d ", comm, entry->pid);
-	trace_seq_printf(s, "%3d", cpu);
-	trace_seq_printf(s, "%c%c",
-			(entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
-			 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : '.',
-			((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'));
-
-	hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
-	softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
-	if (hardirq && softirq) {
-		trace_seq_putc(s, 'H');
-	} else {
-		if (hardirq) {
-			trace_seq_putc(s, 'h');
-		} else {
-			if (softirq)
-				trace_seq_putc(s, 's');
-			else
-				trace_seq_putc(s, '.');
-		}
-	}
-
-	if (entry->preempt_count)
-		trace_seq_printf(s, "%x", entry->preempt_count);
-	else
-		trace_seq_puts(s, ".");
-}
-
-unsigned long preempt_mark_thresh = 100;
-
-static void
-lat_print_timestamp(struct trace_seq *s, u64 abs_usecs,
-		    unsigned long rel_usecs)
-{
-	trace_seq_printf(s, " %4lldus", abs_usecs);
-	if (rel_usecs > preempt_mark_thresh)
-		trace_seq_puts(s, "!: ");
-	else if (rel_usecs > 1)
-		trace_seq_puts(s, "+: ");
-	else
-		trace_seq_puts(s, " : ");
-}
-
 static void test_cpu_buff_start(struct trace_iterator *iter)
 {
 	struct trace_seq *s = &iter->seq;
@@ -1419,46 +1369,24 @@ static void test_cpu_buff_start(struct trace_iterator *iter)
 	trace_seq_printf(s, "##### CPU %u buffer started ####\n", iter->cpu);
 }
 
-static enum print_line_t
-print_lat_fmt(struct trace_iterator *iter, unsigned int trace_idx, int cpu)
+static enum print_line_t print_lat_fmt(struct trace_iterator *iter)
 {
 	struct trace_seq *s = &iter->seq;
 	unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
-	struct trace_entry *next_entry;
 	struct trace_event *event;
-	unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE);
 	struct trace_entry *entry = iter->ent;
-	unsigned long abs_usecs;
-	unsigned long rel_usecs;
-	u64 next_ts;
-	char *comm;
 	int ret;
 
 	test_cpu_buff_start(iter);
 
-	next_entry = find_next_entry(iter, NULL, &next_ts);
-	if (!next_entry)
-		next_ts = iter->ts;
-	rel_usecs = ns2usecs(next_ts - iter->ts);
-	abs_usecs = ns2usecs(iter->ts - iter->tr->time_start);
-
-	if (verbose) {
-		comm = trace_find_cmdline(entry->pid);
-		trace_seq_printf(s, "%16s %5d %3d %d %08x %08x [%08lx]"
-				 " %ld.%03ldms (+%ld.%03ldms): ",
-				 comm,
-				 entry->pid, cpu, entry->flags,
-				 entry->preempt_count, trace_idx,
-				 ns2usecs(iter->ts),
-				 abs_usecs/1000,
-				 abs_usecs % 1000, rel_usecs/1000,
-				 rel_usecs % 1000);
-	} else {
-		lat_print_generic(s, entry, cpu);
-		lat_print_timestamp(s, abs_usecs, rel_usecs);
+	event = ftrace_find_event(entry->type);
+
+	if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
+		ret = trace_print_lat_context(iter);
+		if (ret)
+			return ret;
 	}
 
-	event = ftrace_find_event(entry->type);
 	if (event && event->latency_trace) {
 		ret = event->latency_trace(s, entry, sym_flags);
 		if (ret)
@@ -1476,33 +1404,20 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
 	unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
 	struct trace_entry *entry;
 	struct trace_event *event;
-	unsigned long usec_rem;
-	unsigned long long t;
-	unsigned long secs;
-	char *comm;
 	int ret;
 
 	entry = iter->ent;
 
 	test_cpu_buff_start(iter);
 
-	comm = trace_find_cmdline(iter->ent->pid);
-
-	t = ns2usecs(iter->ts);
-	usec_rem = do_div(t, 1000000ULL);
-	secs = (unsigned long)t;
+	event = ftrace_find_event(entry->type);
 
-	ret = trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid);
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-	ret = trace_seq_printf(s, "[%03d] ", iter->cpu);
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-	ret = trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem);
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
+	if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
+		ret = trace_print_context(iter);
+		if (ret)
+			return ret;
+	}
 
-	event = ftrace_find_event(entry->type);
 	if (event && event->trace) {
 		ret = event->trace(s, entry, sym_flags);
 		if (ret)
@@ -1525,10 +1440,12 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
 
 	entry = iter->ent;
 
-	ret = trace_seq_printf(s, "%d %d %llu ",
-		entry->pid, iter->cpu, iter->ts);
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
+	if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
+		ret = trace_seq_printf(s, "%d %d %llu ",
+			entry->pid, iter->cpu, iter->ts);
+		if (!ret)
+			return TRACE_TYPE_PARTIAL_LINE;
+	}
 
 	event = ftrace_find_event(entry->type);
 	if (event && event->raw) {
@@ -1553,9 +1470,11 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
 
 	entry = iter->ent;
 
-	SEQ_PUT_HEX_FIELD_RET(s, entry->pid);
-	SEQ_PUT_HEX_FIELD_RET(s, iter->cpu);
-	SEQ_PUT_HEX_FIELD_RET(s, iter->ts);
+	if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
+		SEQ_PUT_HEX_FIELD_RET(s, entry->pid);
+		SEQ_PUT_HEX_FIELD_RET(s, iter->cpu);
+		SEQ_PUT_HEX_FIELD_RET(s, iter->ts);
+	}
 
 	event = ftrace_find_event(entry->type);
 	if (event && event->hex)
@@ -1575,7 +1494,7 @@ static enum print_line_t print_printk_msg_only(struct trace_iterator *iter)
 
 	trace_assign_type(field, entry);
 
-	ret = trace_seq_printf(s, field->buf);
+	ret = trace_seq_printf(s, "%s", field->buf);
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
@@ -1590,9 +1509,11 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
 
 	entry = iter->ent;
 
-	SEQ_PUT_FIELD_RET(s, entry->pid);
-	SEQ_PUT_FIELD_RET(s, entry->cpu);
-	SEQ_PUT_FIELD_RET(s, iter->ts);
+	if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
+		SEQ_PUT_FIELD_RET(s, entry->pid);
+		SEQ_PUT_FIELD_RET(s, entry->cpu);
+		SEQ_PUT_FIELD_RET(s, iter->ts);
+	}
 
 	event = ftrace_find_event(entry->type);
 	if (event && event->binary)
@@ -1643,7 +1564,7 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter)
 		return print_raw_fmt(iter);
 
 	if (iter->iter_flags & TRACE_FILE_LAT_FMT)
-		return print_lat_fmt(iter, iter->idx, iter->cpu);
+		return print_lat_fmt(iter);
 
 	return print_trace_fmt(iter);
 }
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index e603a291134b..f0c7a0f08cac 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -405,6 +405,10 @@ void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
 
 struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
 						struct trace_array_cpu *data);
+
+struct trace_entry *trace_find_next_entry(struct trace_iterator *iter,
+					  int *ent_cpu, u64 *ent_ts);
+
 void tracing_generic_entry_update(struct trace_entry *entry,
 				  unsigned long flags,
 				  int pc);
@@ -591,7 +595,8 @@ enum trace_iterator_flags {
 	TRACE_ITER_ANNOTATE		= 0x2000,
 	TRACE_ITER_USERSTACKTRACE       = 0x4000,
 	TRACE_ITER_SYM_USEROBJ          = 0x8000,
-	TRACE_ITER_PRINTK_MSGONLY	= 0x10000
+	TRACE_ITER_PRINTK_MSGONLY	= 0x10000,
+	TRACE_ITER_CONTEXT_INFO		= 0x20000 /* Print pid/cpu/time */
 };
 
 /*
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 1a4e144a9f8f..a5752d4d3c33 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -286,6 +286,113 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
 	return ret;
 }
 
+static void
+lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
+{
+	int hardirq, softirq;
+	char *comm;
+
+	comm = trace_find_cmdline(entry->pid);
+
+	trace_seq_printf(s, "%8.8s-%-5d ", comm, entry->pid);
+	trace_seq_printf(s, "%3d", cpu);
+	trace_seq_printf(s, "%c%c",
+			(entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
+			 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : '.',
+			((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'));
+
+	hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
+	softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
+	if (hardirq && softirq) {
+		trace_seq_putc(s, 'H');
+	} else {
+		if (hardirq) {
+			trace_seq_putc(s, 'h');
+		} else {
+			if (softirq)
+				trace_seq_putc(s, 's');
+			else
+				trace_seq_putc(s, '.');
+		}
+	}
+
+	if (entry->preempt_count)
+		trace_seq_printf(s, "%x", entry->preempt_count);
+	else
+		trace_seq_puts(s, ".");
+}
+
+static unsigned long preempt_mark_thresh = 100;
+
+static void
+lat_print_timestamp(struct trace_seq *s, u64 abs_usecs,
+		    unsigned long rel_usecs)
+{
+	trace_seq_printf(s, " %4lldus", abs_usecs);
+	if (rel_usecs > preempt_mark_thresh)
+		trace_seq_puts(s, "!: ");
+	else if (rel_usecs > 1)
+		trace_seq_puts(s, "+: ");
+	else
+		trace_seq_puts(s, " : ");
+}
+
+int trace_print_context(struct trace_iterator *iter)
+{
+	struct trace_seq *s = &iter->seq;
+	struct trace_entry *entry = iter->ent;
+	char *comm = trace_find_cmdline(entry->pid);
+	unsigned long long t = ns2usecs(iter->ts);
+	unsigned long usec_rem = do_div(t, USEC_PER_SEC);
+	unsigned long secs = (unsigned long)t;
+
+	if (!trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid))
+		goto partial;
+	if (!trace_seq_printf(s, "[%03d] ", entry->cpu))
+		goto partial;
+	if (!trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem))
+		goto partial;
+
+	return 0;
+
+partial:
+	return TRACE_TYPE_PARTIAL_LINE;
+}
+
+int trace_print_lat_context(struct trace_iterator *iter)
+{
+	u64 next_ts;
+	struct trace_seq *s = &iter->seq;
+	struct trace_entry *entry = iter->ent,
+			   *next_entry = trace_find_next_entry(iter, NULL,
+							       &next_ts);
+	unsigned long verbose = (trace_flags & TRACE_ITER_VERBOSE);
+	unsigned long abs_usecs = ns2usecs(iter->ts - iter->tr->time_start);
+	unsigned long rel_usecs;
+
+	if (!next_entry)
+		next_ts = iter->ts;
+	rel_usecs = ns2usecs(next_ts - iter->ts);
+
+	if (verbose) {
+		char *comm = trace_find_cmdline(entry->pid);
+		trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx [%08lx]"
+				 " %ld.%03ldms (+%ld.%03ldms): ",
+				 comm,
+				 entry->pid, entry->cpu, entry->flags,
+				 entry->preempt_count, iter->idx,
+				 ns2usecs(iter->ts),
+				 abs_usecs/1000,
+				 abs_usecs % 1000, rel_usecs/1000,
+				 rel_usecs % 1000);
+	} else {
+		lat_print_generic(s, entry, entry->cpu);
+		lat_print_timestamp(s, abs_usecs, rel_usecs);
+	}
+
+	return 0;
+}
+
 static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
 
 static int task_state_char(unsigned long state)
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 1cbab5e3dc99..ec2ed90f10f0 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -33,6 +33,9 @@ int seq_print_userip_objs(const struct userstack_entry *entry,
 int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
 		      unsigned long ip, unsigned long sym_flags);
 
+int trace_print_context(struct trace_iterator *iter);
+int trace_print_lat_context(struct trace_iterator *iter);
+
 struct trace_event *ftrace_find_event(int type);
 int register_ftrace_event(struct trace_event *event);
 int unregister_ftrace_event(struct trace_event *event);
-- 
cgit v1.2.3


From 2c9b238eb325895d3312dad64e2685783575e474 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 2 Feb 2009 20:30:12 -0200
Subject: trace: Change struct trace_event callbacks parameter list

Impact: API change

The trace_seq and trace_entry are in trace_iterator, where there are
more fields that may be needed by tracers, so just pass the
tracer_iterator as is already the case for struct tracer->print_line.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 block/blktrace.c            |   8 +--
 kernel/trace/trace.c        |  10 ++--
 kernel/trace/trace_branch.c |   7 +--
 kernel/trace/trace_output.c | 139 ++++++++++++++++++++------------------------
 kernel/trace/trace_output.h |   6 +-
 5 files changed, 76 insertions(+), 94 deletions(-)

(limited to 'kernel')

diff --git a/block/blktrace.c b/block/blktrace.c
index 3f25425ade12..570cd3c40bd1 100644
--- a/block/blktrace.c
+++ b/block/blktrace.c
@@ -1140,10 +1140,10 @@ static struct {
 	[__BLK_TA_REMAP]	= {{  "A", "remap" },	   blk_log_remap },
 };
 
-static int blk_trace_event_print(struct trace_seq *s, struct trace_entry *ent,
-				 int flags)
+static int blk_trace_event_print(struct trace_iterator *iter, int flags)
 {
-	const struct blk_io_trace *t = (struct blk_io_trace *)ent;
+	struct trace_seq *s = &iter->seq;
+	const struct blk_io_trace *t = (struct blk_io_trace *)iter->ent;
 	const u16 what = t->action & ((1 << BLK_TC_SHIFT) - 1);
 	int ret;
 
@@ -1153,7 +1153,7 @@ static int blk_trace_event_print(struct trace_seq *s, struct trace_entry *ent,
 		const bool long_act = !!(trace_flags & TRACE_ITER_VERBOSE);
 		ret = blk_log_action_seq(s, t, what2act[what].act[long_act]);
 		if (ret)
-			ret = what2act[what].print(s, ent);
+			ret = what2act[what].print(s, iter->ent);
 	}
 
 	return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 5ec49c3c1597..152d0969adf8 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1388,7 +1388,7 @@ static enum print_line_t print_lat_fmt(struct trace_iterator *iter)
 	}
 
 	if (event && event->latency_trace) {
-		ret = event->latency_trace(s, entry, sym_flags);
+		ret = event->latency_trace(iter, sym_flags);
 		if (ret)
 			return ret;
 		return TRACE_TYPE_HANDLED;
@@ -1419,7 +1419,7 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
 	}
 
 	if (event && event->trace) {
-		ret = event->trace(s, entry, sym_flags);
+		ret = event->trace(iter, sym_flags);
 		if (ret)
 			return ret;
 		return TRACE_TYPE_HANDLED;
@@ -1449,7 +1449,7 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
 
 	event = ftrace_find_event(entry->type);
 	if (event && event->raw) {
-		ret = event->raw(s, entry, 0);
+		ret = event->raw(iter, 0);
 		if (ret)
 			return ret;
 		return TRACE_TYPE_HANDLED;
@@ -1478,7 +1478,7 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
 
 	event = ftrace_find_event(entry->type);
 	if (event && event->hex)
-		event->hex(s, entry, 0);
+		event->hex(iter, 0);
 
 	SEQ_PUT_FIELD_RET(s, newline);
 
@@ -1517,7 +1517,7 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
 
 	event = ftrace_find_event(entry->type);
 	if (event && event->binary)
-		event->binary(s, entry, 0);
+		event->binary(iter, 0);
 
 	return TRACE_TYPE_HANDLED;
 }
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 1284145c8898..ea62f101e615 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -160,14 +160,13 @@ trace_print_print(struct trace_seq *s, struct trace_entry *entry, int flags)
 	return TRACE_TYPE_PARTIAL_LINE;
 }
 
-static int
-trace_branch_print(struct trace_seq *s, struct trace_entry *entry, int flags)
+static int trace_branch_print(struct trace_iterator *iter, int flags)
 {
 	struct trace_branch *field;
 
-	trace_assign_type(field, entry);
+	trace_assign_type(field, iter->ent);
 
-	if (trace_seq_printf(s, "[%s] %s:%s:%d\n",
+	if (trace_seq_printf(&iter->seq, "[%s] %s:%s:%d\n",
 			     field->correct ? "  ok  " : " MISS ",
 			     field->func,
 			     field->file,
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index a5752d4d3c33..c24503b281a0 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -484,19 +484,18 @@ int unregister_ftrace_event(struct trace_event *event)
  * Standard events
  */
 
-int
-trace_nop_print(struct trace_seq *s, struct trace_entry *entry, int flags)
+int trace_nop_print(struct trace_iterator *iter, int flags)
 {
 	return 0;
 }
 
 /* TRACE_FN */
-static int
-trace_fn_latency(struct trace_seq *s, struct trace_entry *entry, int flags)
+static int trace_fn_latency(struct trace_iterator *iter, int flags)
 {
 	struct ftrace_entry *field;
+	struct trace_seq *s = &iter->seq;
 
-	trace_assign_type(field, entry);
+	trace_assign_type(field, iter->ent);
 
 	if (!seq_print_ip_sym(s, field->ip, flags))
 		goto partial;
@@ -513,12 +512,12 @@ trace_fn_latency(struct trace_seq *s, struct trace_entry *entry, int flags)
 	return TRACE_TYPE_PARTIAL_LINE;
 }
 
-static int
-trace_fn_trace(struct trace_seq *s, struct trace_entry *entry, int flags)
+static int trace_fn_trace(struct trace_iterator *iter, int flags)
 {
 	struct ftrace_entry *field;
+	struct trace_seq *s = &iter->seq;
 
-	trace_assign_type(field, entry);
+	trace_assign_type(field, iter->ent);
 
 	if (!seq_print_ip_sym(s, field->ip, flags))
 		goto partial;
@@ -540,14 +539,13 @@ trace_fn_trace(struct trace_seq *s, struct trace_entry *entry, int flags)
 	return TRACE_TYPE_PARTIAL_LINE;
 }
 
-static int
-trace_fn_raw(struct trace_seq *s, struct trace_entry *entry, int flags)
+static int trace_fn_raw(struct trace_iterator *iter, int flags)
 {
 	struct ftrace_entry *field;
 
-	trace_assign_type(field, entry);
+	trace_assign_type(field, iter->ent);
 
-	if (!trace_seq_printf(s, "%lx %lx\n",
+	if (!trace_seq_printf(&iter->seq, "%lx %lx\n",
 			      field->ip,
 			      field->parent_ip))
 		return TRACE_TYPE_PARTIAL_LINE;
@@ -555,12 +553,12 @@ trace_fn_raw(struct trace_seq *s, struct trace_entry *entry, int flags)
 	return 0;
 }
 
-static int
-trace_fn_hex(struct trace_seq *s, struct trace_entry *entry, int flags)
+static int trace_fn_hex(struct trace_iterator *iter, int flags)
 {
 	struct ftrace_entry *field;
+	struct trace_seq *s = &iter->seq;
 
-	trace_assign_type(field, entry);
+	trace_assign_type(field, iter->ent);
 
 	SEQ_PUT_HEX_FIELD_RET(s, field->ip);
 	SEQ_PUT_HEX_FIELD_RET(s, field->parent_ip);
@@ -568,12 +566,12 @@ trace_fn_hex(struct trace_seq *s, struct trace_entry *entry, int flags)
 	return 0;
 }
 
-static int
-trace_fn_bin(struct trace_seq *s, struct trace_entry *entry, int flags)
+static int trace_fn_bin(struct trace_iterator *iter, int flags)
 {
 	struct ftrace_entry *field;
+	struct trace_seq *s = &iter->seq;
 
-	trace_assign_type(field, entry);
+	trace_assign_type(field, iter->ent);
 
 	SEQ_PUT_FIELD_RET(s, field->ip);
 	SEQ_PUT_FIELD_RET(s, field->parent_ip);
@@ -591,20 +589,19 @@ static struct trace_event trace_fn_event = {
 };
 
 /* TRACE_CTX an TRACE_WAKE */
-static int
-trace_ctxwake_print(struct trace_seq *s, struct trace_entry *entry, int flags,
-		    char *delim)
+static int trace_ctxwake_print(struct trace_iterator *iter, char *delim)
 {
 	struct ctx_switch_entry *field;
 	char *comm;
 	int S, T;
 
-	trace_assign_type(field, entry);
+	trace_assign_type(field, iter->ent);
 
 	T = task_state_char(field->next_state);
 	S = task_state_char(field->prev_state);
 	comm = trace_find_cmdline(field->next_pid);
-	if (!trace_seq_printf(s, " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
+	if (!trace_seq_printf(&iter->seq,
+			      " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
 			      field->prev_pid,
 			      field->prev_prio,
 			      S, delim,
@@ -617,31 +614,27 @@ trace_ctxwake_print(struct trace_seq *s, struct trace_entry *entry, int flags,
 	return 0;
 }
 
-static int
-trace_ctx_print(struct trace_seq *s, struct trace_entry *entry, int flags)
+static int trace_ctx_print(struct trace_iterator *iter, int flags)
 {
-	return trace_ctxwake_print(s, entry, flags, "==>");
+	return trace_ctxwake_print(iter, "==>");
 }
 
-static int
-trace_wake_print(struct trace_seq *s, struct trace_entry *entry, int flags)
+static int trace_wake_print(struct trace_iterator *iter, int flags)
 {
-	return trace_ctxwake_print(s, entry, flags, "  +");
+	return trace_ctxwake_print(iter, "  +");
 }
 
-static int
-trace_ctxwake_raw(struct trace_seq *s, struct trace_entry *entry, int flags,
-		  char S)
+static int trace_ctxwake_raw(struct trace_iterator *iter, char S)
 {
 	struct ctx_switch_entry *field;
 	int T;
 
-	trace_assign_type(field, entry);
+	trace_assign_type(field, iter->ent);
 
 	if (!S)
 		task_state_char(field->prev_state);
 	T = task_state_char(field->next_state);
-	if (!trace_seq_printf(s, "%d %d %c %d %d %d %c\n",
+	if (!trace_seq_printf(&iter->seq, "%d %d %c %d %d %d %c\n",
 			      field->prev_pid,
 			      field->prev_prio,
 			      S,
@@ -654,27 +647,24 @@ trace_ctxwake_raw(struct trace_seq *s, struct trace_entry *entry, int flags,
 	return 0;
 }
 
-static int
-trace_ctx_raw(struct trace_seq *s, struct trace_entry *entry, int flags)
+static int trace_ctx_raw(struct trace_iterator *iter, int flags)
 {
-	return trace_ctxwake_raw(s, entry, flags, 0);
+	return trace_ctxwake_raw(iter, 0);
 }
 
-static int
-trace_wake_raw(struct trace_seq *s, struct trace_entry *entry, int flags)
+static int trace_wake_raw(struct trace_iterator *iter, int flags)
 {
-	return trace_ctxwake_raw(s, entry, flags, '+');
+	return trace_ctxwake_raw(iter, '+');
 }
 
 
-static int
-trace_ctxwake_hex(struct trace_seq *s, struct trace_entry *entry, int flags,
-		  char S)
+static int trace_ctxwake_hex(struct trace_iterator *iter, char S)
 {
 	struct ctx_switch_entry *field;
+	struct trace_seq *s = &iter->seq;
 	int T;
 
-	trace_assign_type(field, entry);
+	trace_assign_type(field, iter->ent);
 
 	if (!S)
 		task_state_char(field->prev_state);
@@ -691,24 +681,22 @@ trace_ctxwake_hex(struct trace_seq *s, struct trace_entry *entry, int flags,
 	return 0;
 }
 
-static int
-trace_ctx_hex(struct trace_seq *s, struct trace_entry *entry, int flags)
+static int trace_ctx_hex(struct trace_iterator *iter, int flags)
 {
-	return trace_ctxwake_hex(s, entry, flags, 0);
+	return trace_ctxwake_hex(iter, 0);
 }
 
-static int
-trace_wake_hex(struct trace_seq *s, struct trace_entry *entry, int flags)
+static int trace_wake_hex(struct trace_iterator *iter, int flags)
 {
-	return trace_ctxwake_hex(s, entry, flags, '+');
+	return trace_ctxwake_hex(iter, '+');
 }
 
-static int
-trace_ctxwake_bin(struct trace_seq *s, struct trace_entry *entry, int flags)
+static int trace_ctxwake_bin(struct trace_iterator *iter, int flags)
 {
 	struct ctx_switch_entry *field;
+	struct trace_seq *s = &iter->seq;
 
-	trace_assign_type(field, entry);
+	trace_assign_type(field, iter->ent);
 
 	SEQ_PUT_FIELD_RET(s, field->prev_pid);
 	SEQ_PUT_FIELD_RET(s, field->prev_prio);
@@ -739,14 +727,13 @@ static struct trace_event trace_wake_event = {
 };
 
 /* TRACE_SPECIAL */
-static int
-trace_special_print(struct trace_seq *s, struct trace_entry *entry, int flags)
+static int trace_special_print(struct trace_iterator *iter, int flags)
 {
 	struct special_entry *field;
 
-	trace_assign_type(field, entry);
+	trace_assign_type(field, iter->ent);
 
-	if (!trace_seq_printf(s, "# %ld %ld %ld\n",
+	if (!trace_seq_printf(&iter->seq, "# %ld %ld %ld\n",
 			      field->arg1,
 			      field->arg2,
 			      field->arg3))
@@ -755,12 +742,12 @@ trace_special_print(struct trace_seq *s, struct trace_entry *entry, int flags)
 	return 0;
 }
 
-static int
-trace_special_hex(struct trace_seq *s, struct trace_entry *entry, int flags)
+static int trace_special_hex(struct trace_iterator *iter, int flags)
 {
 	struct special_entry *field;
+	struct trace_seq *s = &iter->seq;
 
-	trace_assign_type(field, entry);
+	trace_assign_type(field, iter->ent);
 
 	SEQ_PUT_HEX_FIELD_RET(s, field->arg1);
 	SEQ_PUT_HEX_FIELD_RET(s, field->arg2);
@@ -769,12 +756,12 @@ trace_special_hex(struct trace_seq *s, struct trace_entry *entry, int flags)
 	return 0;
 }
 
-static int
-trace_special_bin(struct trace_seq *s, struct trace_entry *entry, int flags)
+static int trace_special_bin(struct trace_iterator *iter, int flags)
 {
 	struct special_entry *field;
+	struct trace_seq *s = &iter->seq;
 
-	trace_assign_type(field, entry);
+	trace_assign_type(field, iter->ent);
 
 	SEQ_PUT_FIELD_RET(s, field->arg1);
 	SEQ_PUT_FIELD_RET(s, field->arg2);
@@ -794,13 +781,13 @@ static struct trace_event trace_special_event = {
 
 /* TRACE_STACK */
 
-static int
-trace_stack_print(struct trace_seq *s, struct trace_entry *entry, int flags)
+static int trace_stack_print(struct trace_iterator *iter, int flags)
 {
 	struct stack_entry *field;
+	struct trace_seq *s = &iter->seq;
 	int i;
 
-	trace_assign_type(field, entry);
+	trace_assign_type(field, iter->ent);
 
 	for (i = 0; i < FTRACE_STACK_ENTRIES; i++) {
 		if (i) {
@@ -830,13 +817,12 @@ static struct trace_event trace_stack_event = {
 };
 
 /* TRACE_USER_STACK */
-static int
-trace_user_stack_print(struct trace_seq *s, struct trace_entry *entry,
-		       int flags)
+static int trace_user_stack_print(struct trace_iterator *iter, int flags)
 {
 	struct userstack_entry *field;
+	struct trace_seq *s = &iter->seq;
 
-	trace_assign_type(field, entry);
+	trace_assign_type(field, iter->ent);
 
 	if (!seq_print_userip_objs(field, s, flags))
 		goto partial;
@@ -860,12 +846,12 @@ static struct trace_event trace_user_stack_event = {
 };
 
 /* TRACE_PRINT */
-static int
-trace_print_print(struct trace_seq *s, struct trace_entry *entry, int flags)
+static int trace_print_print(struct trace_iterator *iter, int flags)
 {
 	struct print_entry *field;
+	struct trace_seq *s = &iter->seq;
 
-	trace_assign_type(field, entry);
+	trace_assign_type(field, iter->ent);
 
 	if (!seq_print_ip_sym(s, field->ip, flags))
 		goto partial;
@@ -879,14 +865,13 @@ trace_print_print(struct trace_seq *s, struct trace_entry *entry, int flags)
 	return TRACE_TYPE_PARTIAL_LINE;
 }
 
-static int
-trace_print_raw(struct trace_seq *s, struct trace_entry *entry, int flags)
+static int trace_print_raw(struct trace_iterator *iter, int flags)
 {
 	struct print_entry *field;
 
-	trace_assign_type(field, entry);
+	trace_assign_type(field, iter->ent);
 
-	if (!trace_seq_printf(s, "# %lx %s", field->ip, field->buf))
+	if (!trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf))
 		goto partial;
 
 	return 0;
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index ec2ed90f10f0..3aeb31f6506b 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -3,8 +3,7 @@
 
 #include "trace.h"
 
-typedef int (*trace_print_func)(struct trace_seq *s, struct trace_entry *entry,
-				int flags);
+typedef int (*trace_print_func)(struct trace_iterator *iter, int flags);
 
 struct trace_event {
 	struct hlist_node	node;
@@ -40,8 +39,7 @@ struct trace_event *ftrace_find_event(int type);
 int register_ftrace_event(struct trace_event *event);
 int unregister_ftrace_event(struct trace_event *event);
 
-int
-trace_nop_print(struct trace_seq *s, struct trace_entry *entry, int flags);
+int trace_nop_print(struct trace_iterator *iter, int flags);
 
 #define MAX_MEMHEX_BYTES	8
 #define HEX_CHARS		(MAX_MEMHEX_BYTES*2 + 1)
-- 
cgit v1.2.3


From d9793bd8018f835c64b10f44e278c86cecb8e932 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 3 Feb 2009 20:20:41 -0200
Subject: trace: judicious error checking of trace_seq results

Impact: bugfix and cleanup

Some callsites were returning either TRACE_ITER_PARTIAL_LINE if the
trace_seq routines (trace_seq_printf, etc) returned 0 meaning its buffer
was full, or zero otherwise.

But...

/* Return values for print_line callback */
enum print_line_t {
        TRACE_TYPE_PARTIAL_LINE = 0,    /* Retry after flushing the seq */
        TRACE_TYPE_HANDLED      = 1,
        TRACE_TYPE_UNHANDLED    = 2     /* Relay to other output functions */
};

In other cases the return value was not being relayed at all.

Most of the time it didn't hurt because the page wasn't get filled, but
for correctness sake, handle the return values everywhere.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 block/blktrace.c            |   2 +-
 kernel/trace/trace.c        |  75 ++++++++++++---------------
 kernel/trace/trace_branch.c |   2 +-
 kernel/trace/trace_output.c | 123 ++++++++++++++++++--------------------------
 4 files changed, 87 insertions(+), 115 deletions(-)

(limited to 'kernel')

diff --git a/block/blktrace.c b/block/blktrace.c
index 8f5c37b0f80f..12df27693972 100644
--- a/block/blktrace.c
+++ b/block/blktrace.c
@@ -1165,7 +1165,7 @@ static int blk_trace_event_print(struct trace_iterator *iter, int flags)
 	const u16 what = t->action & ((1 << BLK_TC_SHIFT) - 1);
 	int ret;
 
-	if (trace_print_context(iter))
+	if (!trace_print_context(iter))
 		return TRACE_TYPE_PARTIAL_LINE;
 
 	if (unlikely(what == 0 || what > ARRAY_SIZE(what2act)))
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index bbdfaa2cbdb9..5822ff4e5a3e 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1402,27 +1402,25 @@ static enum print_line_t print_lat_fmt(struct trace_iterator *iter)
 	unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
 	struct trace_event *event;
 	struct trace_entry *entry = iter->ent;
-	int ret;
 
 	test_cpu_buff_start(iter);
 
 	event = ftrace_find_event(entry->type);
 
 	if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
-		ret = trace_print_lat_context(iter);
-		if (ret)
-			return ret;
+		if (!trace_print_lat_context(iter))
+			goto partial;
 	}
 
-	if (event && event->latency_trace) {
-		ret = event->latency_trace(iter, sym_flags);
-		if (ret)
-			return ret;
-		return TRACE_TYPE_HANDLED;
-	}
+	if (event && event->latency_trace)
+		return event->latency_trace(iter, sym_flags);
+
+	if (!trace_seq_printf(s, "Unknown type %d\n", entry->type))
+		goto partial;
 
-	trace_seq_printf(s, "Unknown type %d\n", entry->type);
 	return TRACE_TYPE_HANDLED;
+partial:
+	return TRACE_TYPE_PARTIAL_LINE;
 }
 
 static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
@@ -1431,7 +1429,6 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
 	unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
 	struct trace_entry *entry;
 	struct trace_event *event;
-	int ret;
 
 	entry = iter->ent;
 
@@ -1440,22 +1437,19 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
 	event = ftrace_find_event(entry->type);
 
 	if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
-		ret = trace_print_context(iter);
-		if (ret)
-			return ret;
+		if (!trace_print_context(iter))
+			goto partial;
 	}
 
-	if (event && event->trace) {
-		ret = event->trace(iter, sym_flags);
-		if (ret)
-			return ret;
-		return TRACE_TYPE_HANDLED;
-	}
-	ret = trace_seq_printf(s, "Unknown type %d\n", entry->type);
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
+	if (event && event->trace)
+		return event->trace(iter, sym_flags);
+
+	if (!trace_seq_printf(s, "Unknown type %d\n", entry->type))
+		goto partial;
 
 	return TRACE_TYPE_HANDLED;
+partial:
+	return TRACE_TYPE_PARTIAL_LINE;
 }
 
 static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
@@ -1463,29 +1457,25 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
 	struct trace_seq *s = &iter->seq;
 	struct trace_entry *entry;
 	struct trace_event *event;
-	int ret;
 
 	entry = iter->ent;
 
 	if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
-		ret = trace_seq_printf(s, "%d %d %llu ",
-			entry->pid, iter->cpu, iter->ts);
-		if (!ret)
-			return TRACE_TYPE_PARTIAL_LINE;
+		if (!trace_seq_printf(s, "%d %d %llu ",
+				      entry->pid, iter->cpu, iter->ts))
+			goto partial;
 	}
 
 	event = ftrace_find_event(entry->type);
-	if (event && event->raw) {
-		ret = event->raw(iter, 0);
-		if (ret)
-			return ret;
-		return TRACE_TYPE_HANDLED;
-	}
-	ret = trace_seq_printf(s, "%d ?\n", entry->type);
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
+	if (event && event->raw)
+		return event->raw(iter, 0);
+
+	if (!trace_seq_printf(s, "%d ?\n", entry->type))
+		goto partial;
 
 	return TRACE_TYPE_HANDLED;
+partial:
+	return TRACE_TYPE_PARTIAL_LINE;
 }
 
 static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
@@ -1504,8 +1494,11 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
 	}
 
 	event = ftrace_find_event(entry->type);
-	if (event && event->hex)
-		event->hex(iter, 0);
+	if (event && event->hex) {
+		int ret = event->hex(iter, 0);
+		if (ret != TRACE_TYPE_HANDLED)
+			return ret;
+	}
 
 	SEQ_PUT_FIELD_RET(s, newline);
 
@@ -1544,7 +1537,7 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
 
 	event = ftrace_find_event(entry->type);
 	if (event && event->binary)
-		event->binary(iter, 0);
+		return event->binary(iter, 0);
 
 	return TRACE_TYPE_HANDLED;
 }
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index ea62f101e615..f6b35e162dfa 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -173,7 +173,7 @@ static int trace_branch_print(struct trace_iterator *iter, int flags)
 			     field->line))
 		return TRACE_TYPE_PARTIAL_LINE;
 
-	return 0;
+	return TRACE_TYPE_HANDLED;
 }
 
 
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index c24503b281a0..5b3c914053f2 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -286,55 +286,41 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags)
 	return ret;
 }
 
-static void
+static int
 lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
 {
 	int hardirq, softirq;
 	char *comm;
 
 	comm = trace_find_cmdline(entry->pid);
-
-	trace_seq_printf(s, "%8.8s-%-5d ", comm, entry->pid);
-	trace_seq_printf(s, "%3d", cpu);
-	trace_seq_printf(s, "%c%c",
-			(entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
-			 (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : '.',
-			((entry->flags & TRACE_FLAG_NEED_RESCHED) ? 'N' : '.'));
-
 	hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
 	softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
-	if (hardirq && softirq) {
-		trace_seq_putc(s, 'H');
-	} else {
-		if (hardirq) {
-			trace_seq_putc(s, 'h');
-		} else {
-			if (softirq)
-				trace_seq_putc(s, 's');
-			else
-				trace_seq_putc(s, '.');
-		}
-	}
+
+	if (!trace_seq_printf(s, "%8.8s-%-5d %3d%c%c%c",
+			      comm, entry->pid, cpu,
+			      (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' :
+				(entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ?
+				  'X' : '.',
+			      (entry->flags & TRACE_FLAG_NEED_RESCHED) ?
+				'N' : '.',
+			      (hardirq && softirq) ? 'H' :
+				hardirq ? 'h' : softirq ? 's' : '.'))
+		return 0;
 
 	if (entry->preempt_count)
-		trace_seq_printf(s, "%x", entry->preempt_count);
-	else
-		trace_seq_puts(s, ".");
+		return trace_seq_printf(s, "%x", entry->preempt_count);
+	return trace_seq_puts(s, ".");
 }
 
 static unsigned long preempt_mark_thresh = 100;
 
-static void
+static int
 lat_print_timestamp(struct trace_seq *s, u64 abs_usecs,
 		    unsigned long rel_usecs)
 {
-	trace_seq_printf(s, " %4lldus", abs_usecs);
-	if (rel_usecs > preempt_mark_thresh)
-		trace_seq_puts(s, "!: ");
-	else if (rel_usecs > 1)
-		trace_seq_puts(s, "+: ");
-	else
-		trace_seq_puts(s, " : ");
+	return trace_seq_printf(s, " %4lldus%c: ", abs_usecs,
+				rel_usecs > preempt_mark_thresh ? '!' :
+				  rel_usecs > 1 ? '+' : ' ');
 }
 
 int trace_print_context(struct trace_iterator *iter)
@@ -346,22 +332,14 @@ int trace_print_context(struct trace_iterator *iter)
 	unsigned long usec_rem = do_div(t, USEC_PER_SEC);
 	unsigned long secs = (unsigned long)t;
 
-	if (!trace_seq_printf(s, "%16s-%-5d ", comm, entry->pid))
-		goto partial;
-	if (!trace_seq_printf(s, "[%03d] ", entry->cpu))
-		goto partial;
-	if (!trace_seq_printf(s, "%5lu.%06lu: ", secs, usec_rem))
-		goto partial;
-
-	return 0;
-
-partial:
-	return TRACE_TYPE_PARTIAL_LINE;
+	return trace_seq_printf(s, "%16s-%-5d [%03d] %5lu.%06lu: ",
+				comm, entry->pid, entry->cpu, secs, usec_rem);
 }
 
 int trace_print_lat_context(struct trace_iterator *iter)
 {
 	u64 next_ts;
+	int ret;
 	struct trace_seq *s = &iter->seq;
 	struct trace_entry *entry = iter->ent,
 			   *next_entry = trace_find_next_entry(iter, NULL,
@@ -376,21 +354,22 @@ int trace_print_lat_context(struct trace_iterator *iter)
 
 	if (verbose) {
 		char *comm = trace_find_cmdline(entry->pid);
-		trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx [%08lx]"
-				 " %ld.%03ldms (+%ld.%03ldms): ",
-				 comm,
-				 entry->pid, entry->cpu, entry->flags,
-				 entry->preempt_count, iter->idx,
-				 ns2usecs(iter->ts),
-				 abs_usecs/1000,
-				 abs_usecs % 1000, rel_usecs/1000,
-				 rel_usecs % 1000);
+		ret = trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx [%08lx]"
+				       " %ld.%03ldms (+%ld.%03ldms): ", comm,
+				       entry->pid, entry->cpu, entry->flags,
+				       entry->preempt_count, iter->idx,
+				       ns2usecs(iter->ts),
+				       abs_usecs / USEC_PER_MSEC,
+				       abs_usecs % USEC_PER_MSEC,
+				       rel_usecs / USEC_PER_MSEC,
+				       rel_usecs % USEC_PER_MSEC);
 	} else {
-		lat_print_generic(s, entry, entry->cpu);
-		lat_print_timestamp(s, abs_usecs, rel_usecs);
+		ret = lat_print_generic(s, entry, entry->cpu);
+		if (ret)
+			ret = lat_print_timestamp(s, abs_usecs, rel_usecs);
 	}
 
-	return 0;
+	return ret;
 }
 
 static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
@@ -486,7 +465,7 @@ int unregister_ftrace_event(struct trace_event *event)
 
 int trace_nop_print(struct trace_iterator *iter, int flags)
 {
-	return 0;
+	return TRACE_TYPE_HANDLED;
 }
 
 /* TRACE_FN */
@@ -506,7 +485,7 @@ static int trace_fn_latency(struct trace_iterator *iter, int flags)
 	if (!trace_seq_puts(s, ")\n"))
 		goto partial;
 
-	return 0;
+	return TRACE_TYPE_HANDLED;
 
  partial:
 	return TRACE_TYPE_PARTIAL_LINE;
@@ -533,7 +512,7 @@ static int trace_fn_trace(struct trace_iterator *iter, int flags)
 	if (!trace_seq_printf(s, "\n"))
 		goto partial;
 
-	return 0;
+	return TRACE_TYPE_HANDLED;
 
  partial:
 	return TRACE_TYPE_PARTIAL_LINE;
@@ -550,7 +529,7 @@ static int trace_fn_raw(struct trace_iterator *iter, int flags)
 			      field->parent_ip))
 		return TRACE_TYPE_PARTIAL_LINE;
 
-	return 0;
+	return TRACE_TYPE_HANDLED;
 }
 
 static int trace_fn_hex(struct trace_iterator *iter, int flags)
@@ -563,7 +542,7 @@ static int trace_fn_hex(struct trace_iterator *iter, int flags)
 	SEQ_PUT_HEX_FIELD_RET(s, field->ip);
 	SEQ_PUT_HEX_FIELD_RET(s, field->parent_ip);
 
-	return 0;
+	return TRACE_TYPE_HANDLED;
 }
 
 static int trace_fn_bin(struct trace_iterator *iter, int flags)
@@ -576,7 +555,7 @@ static int trace_fn_bin(struct trace_iterator *iter, int flags)
 	SEQ_PUT_FIELD_RET(s, field->ip);
 	SEQ_PUT_FIELD_RET(s, field->parent_ip);
 
-	return 0;
+	return TRACE_TYPE_HANDLED;
 }
 
 static struct trace_event trace_fn_event = {
@@ -611,7 +590,7 @@ static int trace_ctxwake_print(struct trace_iterator *iter, char *delim)
 			      T, comm))
 		return TRACE_TYPE_PARTIAL_LINE;
 
-	return 0;
+	return TRACE_TYPE_HANDLED;
 }
 
 static int trace_ctx_print(struct trace_iterator *iter, int flags)
@@ -644,7 +623,7 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S)
 			      T))
 		return TRACE_TYPE_PARTIAL_LINE;
 
-	return 0;
+	return TRACE_TYPE_HANDLED;
 }
 
 static int trace_ctx_raw(struct trace_iterator *iter, int flags)
@@ -678,7 +657,7 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S)
 	SEQ_PUT_HEX_FIELD_RET(s, field->next_prio);
 	SEQ_PUT_HEX_FIELD_RET(s, T);
 
-	return 0;
+	return TRACE_TYPE_HANDLED;
 }
 
 static int trace_ctx_hex(struct trace_iterator *iter, int flags)
@@ -705,7 +684,7 @@ static int trace_ctxwake_bin(struct trace_iterator *iter, int flags)
 	SEQ_PUT_FIELD_RET(s, field->next_prio);
 	SEQ_PUT_FIELD_RET(s, field->next_state);
 
-	return 0;
+	return TRACE_TYPE_HANDLED;
 }
 
 static struct trace_event trace_ctx_event = {
@@ -739,7 +718,7 @@ static int trace_special_print(struct trace_iterator *iter, int flags)
 			      field->arg3))
 		return TRACE_TYPE_PARTIAL_LINE;
 
-	return 0;
+	return TRACE_TYPE_HANDLED;
 }
 
 static int trace_special_hex(struct trace_iterator *iter, int flags)
@@ -753,7 +732,7 @@ static int trace_special_hex(struct trace_iterator *iter, int flags)
 	SEQ_PUT_HEX_FIELD_RET(s, field->arg2);
 	SEQ_PUT_HEX_FIELD_RET(s, field->arg3);
 
-	return 0;
+	return TRACE_TYPE_HANDLED;
 }
 
 static int trace_special_bin(struct trace_iterator *iter, int flags)
@@ -767,7 +746,7 @@ static int trace_special_bin(struct trace_iterator *iter, int flags)
 	SEQ_PUT_FIELD_RET(s, field->arg2);
 	SEQ_PUT_FIELD_RET(s, field->arg3);
 
-	return 0;
+	return TRACE_TYPE_HANDLED;
 }
 
 static struct trace_event trace_special_event = {
@@ -801,7 +780,7 @@ static int trace_stack_print(struct trace_iterator *iter, int flags)
 			goto partial;
 	}
 
-	return 0;
+	return TRACE_TYPE_HANDLED;
 
  partial:
 	return TRACE_TYPE_PARTIAL_LINE;
@@ -830,7 +809,7 @@ static int trace_user_stack_print(struct trace_iterator *iter, int flags)
 	if (!trace_seq_putc(s, '\n'))
 		goto partial;
 
-	return 0;
+	return TRACE_TYPE_HANDLED;
 
  partial:
 	return TRACE_TYPE_PARTIAL_LINE;
@@ -859,7 +838,7 @@ static int trace_print_print(struct trace_iterator *iter, int flags)
 	if (!trace_seq_printf(s, ": %s", field->buf))
 		goto partial;
 
-	return 0;
+	return TRACE_TYPE_HANDLED;
 
  partial:
 	return TRACE_TYPE_PARTIAL_LINE;
@@ -874,7 +853,7 @@ static int trace_print_raw(struct trace_iterator *iter, int flags)
 	if (!trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf))
 		goto partial;
 
-	return 0;
+	return TRACE_TYPE_HANDLED;
 
  partial:
 	return TRACE_TYPE_PARTIAL_LINE;
-- 
cgit v1.2.3


From ae7462b4f1fe1f36b5d562dbd5202a2eba01f072 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Tue, 3 Feb 2009 22:05:50 -0200
Subject: trace: make the trace_event callbacks return enum print_line_t

As they actually all return these enumerators.

Reported-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 block/blktrace.c            |  6 ++++--
 kernel/trace/trace.c        |  2 +-
 kernel/trace/trace_branch.c |  3 ++-
 kernel/trace/trace_output.c | 52 +++++++++++++++++++++++++++------------------
 kernel/trace/trace_output.h |  5 +++--
 5 files changed, 41 insertions(+), 27 deletions(-)

(limited to 'kernel')

diff --git a/block/blktrace.c b/block/blktrace.c
index 12df27693972..c7698d1617a1 100644
--- a/block/blktrace.c
+++ b/block/blktrace.c
@@ -1158,7 +1158,8 @@ static struct {
 	[__BLK_TA_REMAP]	= {{  "A", "remap" },	   blk_log_remap },
 };
 
-static int blk_trace_event_print(struct trace_iterator *iter, int flags)
+static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
+					       int flags)
 {
 	struct trace_seq *s = &iter->seq;
 	const struct blk_io_trace *t = (struct blk_io_trace *)iter->ent;
@@ -1196,7 +1197,8 @@ static int blk_trace_synthesize_old_trace(struct trace_iterator *iter)
 				sizeof(old) - offset + t->pdu_len);
 }
 
-static int blk_trace_event_print_binary(struct trace_iterator *iter, int flags)
+static enum print_line_t
+blk_trace_event_print_binary(struct trace_iterator *iter, int flags)
 {
 	return blk_trace_synthesize_old_trace(iter) ?
 			TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 5822ff4e5a3e..fd51cf0b94c7 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1495,7 +1495,7 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
 
 	event = ftrace_find_event(entry->type);
 	if (event && event->hex) {
-		int ret = event->hex(iter, 0);
+		enum print_line_t ret = event->hex(iter, 0);
 		if (ret != TRACE_TYPE_HANDLED)
 			return ret;
 	}
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index f6b35e162dfa..7ac72a44b2d3 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -160,7 +160,8 @@ trace_print_print(struct trace_seq *s, struct trace_entry *entry, int flags)
 	return TRACE_TYPE_PARTIAL_LINE;
 }
 
-static int trace_branch_print(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_branch_print(struct trace_iterator *iter,
+					    int flags)
 {
 	struct trace_branch *field;
 
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 5b3c914053f2..b7380eee9fa1 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -463,13 +463,14 @@ int unregister_ftrace_event(struct trace_event *event)
  * Standard events
  */
 
-int trace_nop_print(struct trace_iterator *iter, int flags)
+enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags)
 {
 	return TRACE_TYPE_HANDLED;
 }
 
 /* TRACE_FN */
-static int trace_fn_latency(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_fn_latency(struct trace_iterator *iter,
+					  int flags)
 {
 	struct ftrace_entry *field;
 	struct trace_seq *s = &iter->seq;
@@ -491,7 +492,7 @@ static int trace_fn_latency(struct trace_iterator *iter, int flags)
 	return TRACE_TYPE_PARTIAL_LINE;
 }
 
-static int trace_fn_trace(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags)
 {
 	struct ftrace_entry *field;
 	struct trace_seq *s = &iter->seq;
@@ -518,7 +519,7 @@ static int trace_fn_trace(struct trace_iterator *iter, int flags)
 	return TRACE_TYPE_PARTIAL_LINE;
 }
 
-static int trace_fn_raw(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_fn_raw(struct trace_iterator *iter, int flags)
 {
 	struct ftrace_entry *field;
 
@@ -532,7 +533,7 @@ static int trace_fn_raw(struct trace_iterator *iter, int flags)
 	return TRACE_TYPE_HANDLED;
 }
 
-static int trace_fn_hex(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_fn_hex(struct trace_iterator *iter, int flags)
 {
 	struct ftrace_entry *field;
 	struct trace_seq *s = &iter->seq;
@@ -545,7 +546,7 @@ static int trace_fn_hex(struct trace_iterator *iter, int flags)
 	return TRACE_TYPE_HANDLED;
 }
 
-static int trace_fn_bin(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags)
 {
 	struct ftrace_entry *field;
 	struct trace_seq *s = &iter->seq;
@@ -568,7 +569,8 @@ static struct trace_event trace_fn_event = {
 };
 
 /* TRACE_CTX an TRACE_WAKE */
-static int trace_ctxwake_print(struct trace_iterator *iter, char *delim)
+static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
+					     char *delim)
 {
 	struct ctx_switch_entry *field;
 	char *comm;
@@ -593,12 +595,13 @@ static int trace_ctxwake_print(struct trace_iterator *iter, char *delim)
 	return TRACE_TYPE_HANDLED;
 }
 
-static int trace_ctx_print(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_ctx_print(struct trace_iterator *iter, int flags)
 {
 	return trace_ctxwake_print(iter, "==>");
 }
 
-static int trace_wake_print(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_wake_print(struct trace_iterator *iter,
+					  int flags)
 {
 	return trace_ctxwake_print(iter, "  +");
 }
@@ -626,12 +629,12 @@ static int trace_ctxwake_raw(struct trace_iterator *iter, char S)
 	return TRACE_TYPE_HANDLED;
 }
 
-static int trace_ctx_raw(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_ctx_raw(struct trace_iterator *iter, int flags)
 {
 	return trace_ctxwake_raw(iter, 0);
 }
 
-static int trace_wake_raw(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_wake_raw(struct trace_iterator *iter, int flags)
 {
 	return trace_ctxwake_raw(iter, '+');
 }
@@ -660,17 +663,18 @@ static int trace_ctxwake_hex(struct trace_iterator *iter, char S)
 	return TRACE_TYPE_HANDLED;
 }
 
-static int trace_ctx_hex(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_ctx_hex(struct trace_iterator *iter, int flags)
 {
 	return trace_ctxwake_hex(iter, 0);
 }
 
-static int trace_wake_hex(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_wake_hex(struct trace_iterator *iter, int flags)
 {
 	return trace_ctxwake_hex(iter, '+');
 }
 
-static int trace_ctxwake_bin(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter,
+					   int flags)
 {
 	struct ctx_switch_entry *field;
 	struct trace_seq *s = &iter->seq;
@@ -706,7 +710,8 @@ static struct trace_event trace_wake_event = {
 };
 
 /* TRACE_SPECIAL */
-static int trace_special_print(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_special_print(struct trace_iterator *iter,
+					     int flags)
 {
 	struct special_entry *field;
 
@@ -721,7 +726,8 @@ static int trace_special_print(struct trace_iterator *iter, int flags)
 	return TRACE_TYPE_HANDLED;
 }
 
-static int trace_special_hex(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_special_hex(struct trace_iterator *iter,
+					   int flags)
 {
 	struct special_entry *field;
 	struct trace_seq *s = &iter->seq;
@@ -735,7 +741,8 @@ static int trace_special_hex(struct trace_iterator *iter, int flags)
 	return TRACE_TYPE_HANDLED;
 }
 
-static int trace_special_bin(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_special_bin(struct trace_iterator *iter,
+					   int flags)
 {
 	struct special_entry *field;
 	struct trace_seq *s = &iter->seq;
@@ -760,7 +767,8 @@ static struct trace_event trace_special_event = {
 
 /* TRACE_STACK */
 
-static int trace_stack_print(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_stack_print(struct trace_iterator *iter,
+					   int flags)
 {
 	struct stack_entry *field;
 	struct trace_seq *s = &iter->seq;
@@ -796,7 +804,8 @@ static struct trace_event trace_stack_event = {
 };
 
 /* TRACE_USER_STACK */
-static int trace_user_stack_print(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
+						int flags)
 {
 	struct userstack_entry *field;
 	struct trace_seq *s = &iter->seq;
@@ -825,7 +834,8 @@ static struct trace_event trace_user_stack_event = {
 };
 
 /* TRACE_PRINT */
-static int trace_print_print(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_print_print(struct trace_iterator *iter,
+					   int flags)
 {
 	struct print_entry *field;
 	struct trace_seq *s = &iter->seq;
@@ -844,7 +854,7 @@ static int trace_print_print(struct trace_iterator *iter, int flags)
 	return TRACE_TYPE_PARTIAL_LINE;
 }
 
-static int trace_print_raw(struct trace_iterator *iter, int flags)
+static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags)
 {
 	struct print_entry *field;
 
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 3aeb31f6506b..551a25a72217 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -3,7 +3,8 @@
 
 #include "trace.h"
 
-typedef int (*trace_print_func)(struct trace_iterator *iter, int flags);
+typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter,
+					      int flags);
 
 struct trace_event {
 	struct hlist_node	node;
@@ -39,7 +40,7 @@ struct trace_event *ftrace_find_event(int type);
 int register_ftrace_event(struct trace_event *event);
 int unregister_ftrace_event(struct trace_event *event);
 
-int trace_nop_print(struct trace_iterator *iter, int flags);
+enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags);
 
 #define MAX_MEMHEX_BYTES	8
 #define HEX_CHARS		(MAX_MEMHEX_BYTES*2 + 1)
-- 
cgit v1.2.3


From 483b4ee60edbefdfbff0dd538fb81f368d9e7c0d Mon Sep 17 00:00:00 2001
From: Suresh Siddha <suresh.b.siddha@intel.com>
Date: Wed, 4 Feb 2009 11:59:44 -0800
Subject: sched: fix nohz load balancer on cpu offline

Christian Borntraeger reports:

> After a logical cpu offline, even on a complete idle system, there
> is one cpu with full ticks. It turns out that nohz.cpu_mask has the
> the offlined cpu still set.
>
> In select_nohz_load_balancer() we check if the system is completely
> idle to turn of load balancing. We compare cpu_online_map with
> nohz.cpu_mask.  Since cpu_online_map is updated on cpu unplug,
> but nohz.cpu_mask is not, the check fails and the scheduler believes
> that we need an "idle load balancer" even on a fully idle system.
> Since the ilb cpu does not deactivate the timer tick this breaks NOHZ.

Fix the select_nohz_load_balancer() to not set the nohz.cpu_mask
while a cpu is going offline.

Reported-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com>
Tested-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 17 +++++++++++------
 1 file changed, 11 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 242d0d47a70d..e1fc67d0674c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3890,19 +3890,24 @@ int select_nohz_load_balancer(int stop_tick)
 	int cpu = smp_processor_id();
 
 	if (stop_tick) {
-		cpumask_set_cpu(cpu, nohz.cpu_mask);
 		cpu_rq(cpu)->in_nohz_recently = 1;
 
-		/*
-		 * If we are going offline and still the leader, give up!
-		 */
-		if (!cpu_active(cpu) &&
-		    atomic_read(&nohz.load_balancer) == cpu) {
+		if (!cpu_active(cpu)) {
+			if (atomic_read(&nohz.load_balancer) != cpu)
+				return 0;
+
+			/*
+			 * If we are going offline and still the leader,
+			 * give up!
+			 */
 			if (atomic_cmpxchg(&nohz.load_balancer, cpu, -1) != cpu)
 				BUG();
+
 			return 0;
 		}
 
+		cpumask_set_cpu(cpu, nohz.cpu_mask);
+
 		/* time for ilb owner also to sleep */
 		if (cpumask_weight(nohz.cpu_mask) == num_online_cpus()) {
 			if (atomic_read(&nohz.load_balancer) == cpu)
-- 
cgit v1.2.3


From 32bd671d6cbeda60dc73be77fa2b9037d9a9bfa0 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 5 Feb 2009 12:24:15 +0100
Subject: signal: re-add dead task accumulation stats.

We're going to split the process wide cpu accounting into two parts:

 - clocks; which can take all the time they want since they run
           from user context.

 - timers; which need constant time tracing but can affort the overhead
           because they're default off -- and rare.

The clock readout will go back to a full sum of the thread group, for this
we need to re-add the exit stats that were removed in the initial itimer
rework (f06febc9: timers: fix itimer/many thread hang).

Furthermore, since that full sum can be rather slow for large thread groups
and we have the complete dead task stats, revert the do_notify_parent time
computation.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Reviewed-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h | 10 +++++++++-
 kernel/exit.c         |  3 +++
 kernel/fork.c         |  3 ++-
 kernel/signal.c       |  8 ++++----
 4 files changed, 18 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2127e959e0f4..2e0646a30314 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -559,7 +559,7 @@ struct signal_struct {
 	 * Live threads maintain their own counters and add to these
 	 * in __exit_signal, except for the group leader.
 	 */
-	cputime_t cutime, cstime;
+	cputime_t utime, stime, cutime, cstime;
 	cputime_t gtime;
 	cputime_t cgtime;
 	unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
@@ -567,6 +567,14 @@ struct signal_struct {
 	unsigned long inblock, oublock, cinblock, coublock;
 	struct task_io_accounting ioac;
 
+	/*
+	 * Cumulative ns of schedule CPU time fo dead threads in the
+	 * group, not including a zombie group leader, (This only differs
+	 * from jiffies_to_ns(utime + stime) if sched_clock uses something
+	 * other than jiffies.)
+	 */
+	unsigned long long sum_sched_runtime;
+
 	/*
 	 * We don't bother to synchronize most readers of this at all,
 	 * because there is no reader checking a limit that actually needs
diff --git a/kernel/exit.c b/kernel/exit.c
index f80dec3f1875..efd30ccf3858 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -118,6 +118,8 @@ static void __exit_signal(struct task_struct *tsk)
 		 * We won't ever get here for the group leader, since it
 		 * will have been the last reference on the signal_struct.
 		 */
+		sig->utime = cputime_add(sig->utime, task_utime(tsk));
+		sig->stime = cputime_add(sig->stime, task_stime(tsk));
 		sig->gtime = cputime_add(sig->gtime, task_gtime(tsk));
 		sig->min_flt += tsk->min_flt;
 		sig->maj_flt += tsk->maj_flt;
@@ -126,6 +128,7 @@ static void __exit_signal(struct task_struct *tsk)
 		sig->inblock += task_io_get_inblock(tsk);
 		sig->oublock += task_io_get_oublock(tsk);
 		task_io_accounting_add(&sig->ioac, &tsk->ioac);
+		sig->sum_sched_runtime += tsk->se.sum_exec_runtime;
 		sig = NULL; /* Marker for below. */
 	}
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 242a706e7721..e8e854a04ad2 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -851,13 +851,14 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	sig->tty_old_pgrp = NULL;
 	sig->tty = NULL;
 
-	sig->cutime = sig->cstime = cputime_zero;
+	sig->utime = sig->stime = sig->cutime = sig->cstime = cputime_zero;
 	sig->gtime = cputime_zero;
 	sig->cgtime = cputime_zero;
 	sig->nvcsw = sig->nivcsw = sig->cnvcsw = sig->cnivcsw = 0;
 	sig->min_flt = sig->maj_flt = sig->cmin_flt = sig->cmaj_flt = 0;
 	sig->inblock = sig->oublock = sig->cinblock = sig->coublock = 0;
 	task_io_accounting_init(&sig->ioac);
+	sig->sum_sched_runtime = 0;
 	taskstats_tgid_init(sig);
 
 	task_lock(current->group_leader);
diff --git a/kernel/signal.c b/kernel/signal.c
index b6b36768b758..2a74fe87c0dd 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1367,7 +1367,6 @@ int do_notify_parent(struct task_struct *tsk, int sig)
 	struct siginfo info;
 	unsigned long flags;
 	struct sighand_struct *psig;
-	struct task_cputime cputime;
 	int ret = sig;
 
 	BUG_ON(sig == -1);
@@ -1397,9 +1396,10 @@ int do_notify_parent(struct task_struct *tsk, int sig)
 	info.si_uid = __task_cred(tsk)->uid;
 	rcu_read_unlock();
 
-	thread_group_cputime(tsk, &cputime);
-	info.si_utime = cputime_to_jiffies(cputime.utime);
-	info.si_stime = cputime_to_jiffies(cputime.stime);
+	info.si_utime = cputime_to_clock_t(cputime_add(tsk->utime,
+				tsk->signal->utime));
+	info.si_stime = cputime_to_clock_t(cputime_add(tsk->stime,
+				tsk->signal->stime));
 
 	info.si_status = tsk->exit_code & 0x7f;
 	if (tsk->exit_code & 0x80)
-- 
cgit v1.2.3


From 4cd4c1b40d40447fb5e7ba80746c6d7ba91d7a53 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 5 Feb 2009 12:24:16 +0100
Subject: timers: split process wide cpu clocks/timers

Change the process wide cpu timers/clocks so that we:

 1) don't mess up the kernel with too many threads,
 2) don't have a per-cpu allocation for each process,
 3) have no impact when not used.

In order to accomplish this we're going to split it into two parts:

 - clocks; which can take all the time they want since they run
           from user context -- ie. sys_clock_gettime(CLOCK_PROCESS_CPUTIME_ID)

 - timers; which need constant time sampling but since they're
           explicity used, the user can pay the overhead.

The clock readout will go back to a full sum of the thread group, while the
timers will run of a global 'clock' that only runs when needed, so only
programs that make use of the facility pay the price.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Reviewed-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/init_task.h | 11 +++---
 include/linux/sched.h     | 54 +++++++++++++++------------
 kernel/itimer.c           |  4 +-
 kernel/posix-cpu-timers.c | 95 +++++++++++++++++++++++++++++++++++++++++++++--
 kernel/sched_stats.h      | 45 ++++++++++++----------
 5 files changed, 155 insertions(+), 54 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/init_task.h b/include/linux/init_task.h
index ea0ea1a4c36f..e752d973fa21 100644
--- a/include/linux/init_task.h
+++ b/include/linux/init_task.h
@@ -48,12 +48,11 @@ extern struct fs_struct init_fs;
 	.posix_timers	 = LIST_HEAD_INIT(sig.posix_timers),		\
 	.cpu_timers	= INIT_CPU_TIMERS(sig.cpu_timers),		\
 	.rlim		= INIT_RLIMITS,					\
-	.cputime	= { .totals = {					\
-		.utime = cputime_zero,					\
-		.stime = cputime_zero,					\
-		.sum_exec_runtime = 0,					\
-		.lock = __SPIN_LOCK_UNLOCKED(sig.cputime.totals.lock),	\
-	}, },								\
+	.cputimer	= { 						\
+		.cputime = INIT_CPUTIME,				\
+		.running = 0,						\
+		.lock = __SPIN_LOCK_UNLOCKED(sig.cputimer.lock),	\
+	},								\
 }
 
 extern struct nsproxy init_nsproxy;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2e0646a30314..082d7619b3a1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -443,7 +443,6 @@ struct pacct_struct {
  * @utime:		time spent in user mode, in &cputime_t units
  * @stime:		time spent in kernel mode, in &cputime_t units
  * @sum_exec_runtime:	total time spent on the CPU, in nanoseconds
- * @lock:		lock for fields in this struct
  *
  * This structure groups together three kinds of CPU time that are
  * tracked for threads and thread groups.  Most things considering
@@ -454,23 +453,33 @@ struct task_cputime {
 	cputime_t utime;
 	cputime_t stime;
 	unsigned long long sum_exec_runtime;
-	spinlock_t lock;
 };
 /* Alternate field names when used to cache expirations. */
 #define prof_exp	stime
 #define virt_exp	utime
 #define sched_exp	sum_exec_runtime
 
+#define INIT_CPUTIME	\
+	(struct task_cputime) {					\
+		.utime = cputime_zero,				\
+		.stime = cputime_zero,				\
+		.sum_exec_runtime = 0,				\
+	}
+
 /**
- * struct thread_group_cputime - thread group interval timer counts
- * @totals:		thread group interval timers; substructure for
- *			uniprocessor kernel, per-cpu for SMP kernel.
+ * struct thread_group_cputimer - thread group interval timer counts
+ * @cputime:		thread group interval timers.
+ * @running:		non-zero when there are timers running and
+ * 			@cputime receives updates.
+ * @lock:		lock for fields in this struct.
  *
  * This structure contains the version of task_cputime, above, that is
- * used for thread group CPU clock calculations.
+ * used for thread group CPU timer calculations.
  */
-struct thread_group_cputime {
-	struct task_cputime totals;
+struct thread_group_cputimer {
+	struct task_cputime cputime;
+	int running;
+	spinlock_t lock;
 };
 
 /*
@@ -519,10 +528,10 @@ struct signal_struct {
 	cputime_t it_prof_incr, it_virt_incr;
 
 	/*
-	 * Thread group totals for process CPU clocks.
-	 * See thread_group_cputime(), et al, for details.
+	 * Thread group totals for process CPU timers.
+	 * See thread_group_cputimer(), et al, for details.
 	 */
-	struct thread_group_cputime cputime;
+	struct thread_group_cputimer cputimer;
 
 	/* Earliest-expiration cache. */
 	struct task_cputime cputime_expires;
@@ -2191,27 +2200,26 @@ static inline int spin_needbreak(spinlock_t *lock)
 /*
  * Thread group CPU time accounting.
  */
+void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times);
 
 static inline
-void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
+void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
 {
-	struct task_cputime *totals = &tsk->signal->cputime.totals;
+	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
 	unsigned long flags;
 
-	spin_lock_irqsave(&totals->lock, flags);
-	*times = *totals;
-	spin_unlock_irqrestore(&totals->lock, flags);
+	WARN_ON(!cputimer->running);
+
+	spin_lock_irqsave(&cputimer->lock, flags);
+	*times = cputimer->cputime;
+	spin_unlock_irqrestore(&cputimer->lock, flags);
 }
 
 static inline void thread_group_cputime_init(struct signal_struct *sig)
 {
-	sig->cputime.totals = (struct task_cputime){
-		.utime = cputime_zero,
-		.stime = cputime_zero,
-		.sum_exec_runtime = 0,
-	};
-
-	spin_lock_init(&sig->cputime.totals.lock);
+	sig->cputimer.cputime = INIT_CPUTIME;
+	spin_lock_init(&sig->cputimer.lock);
+	sig->cputimer.running = 0;
 }
 
 static inline void thread_group_cputime_free(struct signal_struct *sig)
diff --git a/kernel/itimer.c b/kernel/itimer.c
index 6a5fe93dd8bd..58762f7077ec 100644
--- a/kernel/itimer.c
+++ b/kernel/itimer.c
@@ -62,7 +62,7 @@ int do_getitimer(int which, struct itimerval *value)
 			struct task_cputime cputime;
 			cputime_t utime;
 
-			thread_group_cputime(tsk, &cputime);
+			thread_group_cputimer(tsk, &cputime);
 			utime = cputime.utime;
 			if (cputime_le(cval, utime)) { /* about to fire */
 				cval = jiffies_to_cputime(1);
@@ -82,7 +82,7 @@ int do_getitimer(int which, struct itimerval *value)
 			struct task_cputime times;
 			cputime_t ptime;
 
-			thread_group_cputime(tsk, &times);
+			thread_group_cputimer(tsk, &times);
 			ptime = cputime_add(times.utime, times.stime);
 			if (cputime_le(cval, ptime)) { /* about to fire */
 				cval = jiffies_to_cputime(1);
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index fa07da94d7be..db107c9bbc05 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -230,6 +230,37 @@ static int cpu_clock_sample(const clockid_t which_clock, struct task_struct *p,
 	return 0;
 }
 
+void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
+{
+	struct sighand_struct *sighand;
+	struct signal_struct *sig;
+	struct task_struct *t;
+
+	*times = INIT_CPUTIME;
+
+	rcu_read_lock();
+	sighand = rcu_dereference(tsk->sighand);
+	if (!sighand)
+		goto out;
+
+	sig = tsk->signal;
+
+	t = tsk;
+	do {
+		times->utime = cputime_add(times->utime, t->utime);
+		times->stime = cputime_add(times->stime, t->stime);
+		times->sum_exec_runtime += t->se.sum_exec_runtime;
+
+		t = next_thread(t);
+	} while (t != tsk);
+
+	times->utime = cputime_add(times->utime, sig->utime);
+	times->stime = cputime_add(times->stime, sig->stime);
+	times->sum_exec_runtime += sig->sum_sched_runtime;
+out:
+	rcu_read_unlock();
+}
+
 /*
  * Sample a process (thread group) clock for the given group_leader task.
  * Must be called with tasklist_lock held for reading.
@@ -475,6 +506,29 @@ static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
 					     now);
 }
 
+/*
+ * Enable the process wide cpu timer accounting.
+ *
+ * serialized using ->sighand->siglock
+ */
+static void start_process_timers(struct task_struct *tsk)
+{
+	tsk->signal->cputimer.running = 1;
+	barrier();
+}
+
+/*
+ * Release the process wide timer accounting -- timer stops ticking when
+ * nobody cares about it.
+ *
+ * serialized using ->sighand->siglock
+ */
+static void stop_process_timers(struct task_struct *tsk)
+{
+	tsk->signal->cputimer.running = 0;
+	barrier();
+}
+
 /*
  * Insert the timer on the appropriate list before any timers that
  * expire later.  This must be called with the tasklist_lock held
@@ -495,6 +549,9 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
 	BUG_ON(!irqs_disabled());
 	spin_lock(&p->sighand->siglock);
 
+	if (!CPUCLOCK_PERTHREAD(timer->it_clock))
+		start_process_timers(p);
+
 	listpos = head;
 	if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) {
 		list_for_each_entry(next, head, entry) {
@@ -987,13 +1044,15 @@ static void check_process_timers(struct task_struct *tsk,
 	    sig->rlim[RLIMIT_CPU].rlim_cur == RLIM_INFINITY &&
 	    list_empty(&timers[CPUCLOCK_VIRT]) &&
 	    cputime_eq(sig->it_virt_expires, cputime_zero) &&
-	    list_empty(&timers[CPUCLOCK_SCHED]))
+	    list_empty(&timers[CPUCLOCK_SCHED])) {
+		stop_process_timers(tsk);
 		return;
+	}
 
 	/*
 	 * Collect the current process totals.
 	 */
-	thread_group_cputime(tsk, &cputime);
+	thread_group_cputimer(tsk, &cputime);
 	utime = cputime.utime;
 	ptime = cputime_add(utime, cputime.stime);
 	sum_sched_runtime = cputime.sum_exec_runtime;
@@ -1259,7 +1318,7 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
 	if (!task_cputime_zero(&sig->cputime_expires)) {
 		struct task_cputime group_sample;
 
-		thread_group_cputime(tsk, &group_sample);
+		thread_group_cputimer(tsk, &group_sample);
 		if (task_cputime_expired(&group_sample, &sig->cputime_expires))
 			return 1;
 	}
@@ -1328,6 +1387,33 @@ void run_posix_cpu_timers(struct task_struct *tsk)
 	}
 }
 
+/*
+ * Sample a process (thread group) timer for the given group_leader task.
+ * Must be called with tasklist_lock held for reading.
+ */
+static int cpu_timer_sample_group(const clockid_t which_clock,
+				  struct task_struct *p,
+				  union cpu_time_count *cpu)
+{
+	struct task_cputime cputime;
+
+	thread_group_cputimer(p, &cputime);
+	switch (CPUCLOCK_WHICH(which_clock)) {
+	default:
+		return -EINVAL;
+	case CPUCLOCK_PROF:
+		cpu->cpu = cputime_add(cputime.utime, cputime.stime);
+		break;
+	case CPUCLOCK_VIRT:
+		cpu->cpu = cputime.utime;
+		break;
+	case CPUCLOCK_SCHED:
+		cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p);
+		break;
+	}
+	return 0;
+}
+
 /*
  * Set one of the process-wide special case CPU timers.
  * The tsk->sighand->siglock must be held by the caller.
@@ -1341,7 +1427,8 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
 	struct list_head *head;
 
 	BUG_ON(clock_idx == CPUCLOCK_SCHED);
-	cpu_clock_sample_group(clock_idx, tsk, &now);
+	start_process_timers(tsk);
+	cpu_timer_sample_group(clock_idx, tsk, &now);
 
 	if (oldval) {
 		if (!cputime_eq(*oldval, cputime_zero)) {
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index 8ab0cef8ecab..a8f93dd374e1 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -296,19 +296,21 @@ sched_info_switch(struct task_struct *prev, struct task_struct *next)
 static inline void account_group_user_time(struct task_struct *tsk,
 					   cputime_t cputime)
 {
-	struct task_cputime *times;
-	struct signal_struct *sig;
+	struct thread_group_cputimer *cputimer;
 
 	/* tsk == current, ensure it is safe to use ->signal */
 	if (unlikely(tsk->exit_state))
 		return;
 
-	sig = tsk->signal;
-	times = &sig->cputime.totals;
+	cputimer = &tsk->signal->cputimer;
 
-	spin_lock(&times->lock);
-	times->utime = cputime_add(times->utime, cputime);
-	spin_unlock(&times->lock);
+	if (!cputimer->running)
+		return;
+
+	spin_lock(&cputimer->lock);
+	cputimer->cputime.utime =
+		cputime_add(cputimer->cputime.utime, cputime);
+	spin_unlock(&cputimer->lock);
 }
 
 /**
@@ -324,19 +326,21 @@ static inline void account_group_user_time(struct task_struct *tsk,
 static inline void account_group_system_time(struct task_struct *tsk,
 					     cputime_t cputime)
 {
-	struct task_cputime *times;
-	struct signal_struct *sig;
+	struct thread_group_cputimer *cputimer;
 
 	/* tsk == current, ensure it is safe to use ->signal */
 	if (unlikely(tsk->exit_state))
 		return;
 
-	sig = tsk->signal;
-	times = &sig->cputime.totals;
+	cputimer = &tsk->signal->cputimer;
+
+	if (!cputimer->running)
+		return;
 
-	spin_lock(&times->lock);
-	times->stime = cputime_add(times->stime, cputime);
-	spin_unlock(&times->lock);
+	spin_lock(&cputimer->lock);
+	cputimer->cputime.stime =
+		cputime_add(cputimer->cputime.stime, cputime);
+	spin_unlock(&cputimer->lock);
 }
 
 /**
@@ -352,7 +356,7 @@ static inline void account_group_system_time(struct task_struct *tsk,
 static inline void account_group_exec_runtime(struct task_struct *tsk,
 					      unsigned long long ns)
 {
-	struct task_cputime *times;
+	struct thread_group_cputimer *cputimer;
 	struct signal_struct *sig;
 
 	sig = tsk->signal;
@@ -361,9 +365,12 @@ static inline void account_group_exec_runtime(struct task_struct *tsk,
 	if (unlikely(!sig))
 		return;
 
-	times = &sig->cputime.totals;
+	cputimer = &sig->cputimer;
+
+	if (!cputimer->running)
+		return;
 
-	spin_lock(&times->lock);
-	times->sum_exec_runtime += ns;
-	spin_unlock(&times->lock);
+	spin_lock(&cputimer->lock);
+	cputimer->cputime.sum_exec_runtime += ns;
+	spin_unlock(&cputimer->lock);
 }
-- 
cgit v1.2.3


From 268ccda0cb4d1292029d07ee3dbd07117baf6ecb Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Wed, 4 Feb 2009 20:16:39 -0200
Subject: trace: assign defaults at register_ftrace_event

Impact: simplification of tracers

As all tracers are doing this we might as well do it in
register_ftrace_event and save one branch each time we call these
callbacks.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 block/blktrace.c            |  2 --
 kernel/trace/trace.c        | 13 +++++--------
 kernel/trace/trace_branch.c |  3 ---
 kernel/trace/trace_output.c | 13 +++++++++++--
 4 files changed, 16 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/block/blktrace.c b/block/blktrace.c
index c7698d1617a1..1ebd068061ec 100644
--- a/block/blktrace.c
+++ b/block/blktrace.c
@@ -1243,8 +1243,6 @@ static struct trace_event trace_blk_event = {
 	.type	 	= TRACE_BLK,
 	.trace		= blk_trace_event_print,
 	.latency_trace	= blk_trace_event_print,
-	.raw		= trace_nop_print,
-	.hex		= trace_nop_print,
 	.binary		= blk_trace_event_print_binary,
 };
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index fd51cf0b94c7..a5e4c0af9bb0 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1412,7 +1412,7 @@ static enum print_line_t print_lat_fmt(struct trace_iterator *iter)
 			goto partial;
 	}
 
-	if (event && event->latency_trace)
+	if (event)
 		return event->latency_trace(iter, sym_flags);
 
 	if (!trace_seq_printf(s, "Unknown type %d\n", entry->type))
@@ -1441,7 +1441,7 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
 			goto partial;
 	}
 
-	if (event && event->trace)
+	if (event)
 		return event->trace(iter, sym_flags);
 
 	if (!trace_seq_printf(s, "Unknown type %d\n", entry->type))
@@ -1467,7 +1467,7 @@ static enum print_line_t print_raw_fmt(struct trace_iterator *iter)
 	}
 
 	event = ftrace_find_event(entry->type);
-	if (event && event->raw)
+	if (event)
 		return event->raw(iter, 0);
 
 	if (!trace_seq_printf(s, "%d ?\n", entry->type))
@@ -1494,7 +1494,7 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
 	}
 
 	event = ftrace_find_event(entry->type);
-	if (event && event->hex) {
+	if (event) {
 		enum print_line_t ret = event->hex(iter, 0);
 		if (ret != TRACE_TYPE_HANDLED)
 			return ret;
@@ -1536,10 +1536,7 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
 	}
 
 	event = ftrace_find_event(entry->type);
-	if (event && event->binary)
-		return event->binary(iter, 0);
-
-	return TRACE_TYPE_HANDLED;
+	return event ? event->binary(iter, 0) : TRACE_TYPE_HANDLED;
 }
 
 static int trace_empty(struct trace_iterator *iter)
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 7ac72a44b2d3..297deb202b68 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -182,9 +182,6 @@ static struct trace_event trace_branch_event = {
 	.type	 	= TRACE_BRANCH,
 	.trace		= trace_branch_print,
 	.latency_trace	= trace_branch_print,
-	.raw		= trace_nop_print,
-	.hex		= trace_nop_print,
-	.binary		= trace_nop_print,
 };
 
 static struct tracer branch_trace __read_mostly =
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index b7380eee9fa1..b6e99af79214 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -435,6 +435,17 @@ int register_ftrace_event(struct trace_event *event)
 	if (ftrace_find_event(event->type))
 		goto out;
 
+	if (event->trace == NULL)
+		event->trace = trace_nop_print;
+	if (event->latency_trace == NULL)
+		event->latency_trace = trace_nop_print;
+	if (event->raw == NULL)
+		event->raw = trace_nop_print;
+	if (event->hex == NULL)
+		event->hex = trace_nop_print;
+	if (event->binary == NULL)
+		event->binary = trace_nop_print;
+
 	key = event->type & (EVENT_HASHSIZE - 1);
 
 	hlist_add_head_rcu(&event->node, &event_hash[key]);
@@ -874,8 +885,6 @@ static struct trace_event trace_print_event = {
 	.trace		= trace_print_print,
 	.latency_trace	= trace_print_print,
 	.raw		= trace_print_raw,
-	.hex		= trace_nop_print,
-	.binary		= trace_nop_print,
 };
 
 static struct trace_event *events[] __initdata = {
-- 
cgit v1.2.3


From 97e5b191ae7dc0f4f5b82b9db29782928b103b4d Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 5 Feb 2009 01:13:36 -0500
Subject: trace_branch: Remove unused function

Impact: cleanup

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_branch.c | 17 -----------------
 1 file changed, 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 297deb202b68..027e83690615 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -143,23 +143,6 @@ static void branch_trace_reset(struct trace_array *tr)
 	stop_branch_trace(tr);
 }
 
-static int
-trace_print_print(struct trace_seq *s, struct trace_entry *entry, int flags)
-{
-	struct print_entry *field;
-
-	trace_assign_type(field, entry);
-
-	if (seq_print_ip_sym(s, field->ip, flags))
-		goto partial;
-
-	if (trace_seq_printf(s, ": %s", field->buf))
-		goto partial;
-
- partial:
-	return TRACE_TYPE_PARTIAL_LINE;
-}
-
 static enum print_line_t trace_branch_print(struct trace_iterator *iter,
 					    int flags)
 {
-- 
cgit v1.2.3


From 7be421510b91491d5aa5a29fa1005712039b95af Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 5 Feb 2009 01:13:37 -0500
Subject: trace: Remove unused trace_array_cpu parameter

Impact: cleanup

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 block/blktrace.c                  |  2 +-
 kernel/trace/trace.c              | 47 +++++++++++++++------------------------
 kernel/trace/trace.h              |  4 ----
 kernel/trace/trace_functions.c    |  8 +++----
 kernel/trace/trace_irqsoff.c      | 10 ++++-----
 kernel/trace/trace_sched_switch.c |  4 ++--
 kernel/trace/trace_sched_wakeup.c | 12 +++++-----
 7 files changed, 35 insertions(+), 52 deletions(-)

(limited to 'kernel')

diff --git a/block/blktrace.c b/block/blktrace.c
index 1ebd068061ec..d9d7146ee023 100644
--- a/block/blktrace.c
+++ b/block/blktrace.c
@@ -245,7 +245,7 @@ record_it:
 			if (pid != 0 &&
 			    !(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC) &&
 			    (trace_flags & TRACE_ITER_STACKTRACE) != 0)
-				__trace_stack(blk_tr, NULL, flags, 5, pc);
+				__trace_stack(blk_tr, flags, 5, pc);
 			trace_wake_up();
 			return;
 		}
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a5e4c0af9bb0..1d4ff568cc4d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -776,7 +776,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
 }
 
 void
-trace_function(struct trace_array *tr, struct trace_array_cpu *data,
+trace_function(struct trace_array *tr,
 	       unsigned long ip, unsigned long parent_ip, unsigned long flags,
 	       int pc)
 {
@@ -802,7 +802,6 @@ trace_function(struct trace_array *tr, struct trace_array_cpu *data,
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 static void __trace_graph_entry(struct trace_array *tr,
-				struct trace_array_cpu *data,
 				struct ftrace_graph_ent *trace,
 				unsigned long flags,
 				int pc)
@@ -826,7 +825,6 @@ static void __trace_graph_entry(struct trace_array *tr,
 }
 
 static void __trace_graph_return(struct trace_array *tr,
-				struct trace_array_cpu *data,
 				struct ftrace_graph_ret *trace,
 				unsigned long flags,
 				int pc)
@@ -856,11 +854,10 @@ ftrace(struct trace_array *tr, struct trace_array_cpu *data,
        int pc)
 {
 	if (likely(!atomic_read(&data->disabled)))
-		trace_function(tr, data, ip, parent_ip, flags, pc);
+		trace_function(tr, ip, parent_ip, flags, pc);
 }
 
 static void __ftrace_trace_stack(struct trace_array *tr,
-				 struct trace_array_cpu *data,
 				 unsigned long flags,
 				 int skip, int pc)
 {
@@ -891,27 +888,24 @@ static void __ftrace_trace_stack(struct trace_array *tr,
 }
 
 static void ftrace_trace_stack(struct trace_array *tr,
-			       struct trace_array_cpu *data,
 			       unsigned long flags,
 			       int skip, int pc)
 {
 	if (!(trace_flags & TRACE_ITER_STACKTRACE))
 		return;
 
-	__ftrace_trace_stack(tr, data, flags, skip, pc);
+	__ftrace_trace_stack(tr, flags, skip, pc);
 }
 
 void __trace_stack(struct trace_array *tr,
-		   struct trace_array_cpu *data,
 		   unsigned long flags,
 		   int skip, int pc)
 {
-	__ftrace_trace_stack(tr, data, flags, skip, pc);
+	__ftrace_trace_stack(tr, flags, skip, pc);
 }
 
 static void ftrace_trace_userstack(struct trace_array *tr,
-		   struct trace_array_cpu *data,
-		   unsigned long flags, int pc)
+				   unsigned long flags, int pc)
 {
 #ifdef CONFIG_STACKTRACE
 	struct ring_buffer_event *event;
@@ -942,20 +936,17 @@ static void ftrace_trace_userstack(struct trace_array *tr,
 #endif
 }
 
-void __trace_userstack(struct trace_array *tr,
-		   struct trace_array_cpu *data,
-		   unsigned long flags)
+void __trace_userstack(struct trace_array *tr, unsigned long flags)
 {
-	ftrace_trace_userstack(tr, data, flags, preempt_count());
+	ftrace_trace_userstack(tr, flags, preempt_count());
 }
 
 static void
-ftrace_trace_special(void *__tr, void *__data,
+ftrace_trace_special(void *__tr,
 		     unsigned long arg1, unsigned long arg2, unsigned long arg3,
 		     int pc)
 {
 	struct ring_buffer_event *event;
-	struct trace_array_cpu *data = __data;
 	struct trace_array *tr = __tr;
 	struct special_entry *entry;
 	unsigned long irq_flags;
@@ -971,8 +962,8 @@ ftrace_trace_special(void *__tr, void *__data,
 	entry->arg2			= arg2;
 	entry->arg3			= arg3;
 	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
-	ftrace_trace_stack(tr, data, irq_flags, 4, pc);
-	ftrace_trace_userstack(tr, data, irq_flags, pc);
+	ftrace_trace_stack(tr, irq_flags, 4, pc);
+	ftrace_trace_userstack(tr, irq_flags, pc);
 
 	trace_wake_up();
 }
@@ -981,12 +972,11 @@ void
 __trace_special(void *__tr, void *__data,
 		unsigned long arg1, unsigned long arg2, unsigned long arg3)
 {
-	ftrace_trace_special(__tr, __data, arg1, arg2, arg3, preempt_count());
+	ftrace_trace_special(__tr, arg1, arg2, arg3, preempt_count());
 }
 
 void
 tracing_sched_switch_trace(struct trace_array *tr,
-			   struct trace_array_cpu *data,
 			   struct task_struct *prev,
 			   struct task_struct *next,
 			   unsigned long flags, int pc)
@@ -1010,13 +1000,12 @@ tracing_sched_switch_trace(struct trace_array *tr,
 	entry->next_state		= next->state;
 	entry->next_cpu	= task_cpu(next);
 	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
-	ftrace_trace_stack(tr, data, flags, 5, pc);
-	ftrace_trace_userstack(tr, data, flags, pc);
+	ftrace_trace_stack(tr, flags, 5, pc);
+	ftrace_trace_userstack(tr, flags, pc);
 }
 
 void
 tracing_sched_wakeup_trace(struct trace_array *tr,
-			   struct trace_array_cpu *data,
 			   struct task_struct *wakee,
 			   struct task_struct *curr,
 			   unsigned long flags, int pc)
@@ -1040,8 +1029,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 	entry->next_state		= wakee->state;
 	entry->next_cpu			= task_cpu(wakee);
 	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
-	ftrace_trace_stack(tr, data, flags, 6, pc);
-	ftrace_trace_userstack(tr, data, flags, pc);
+	ftrace_trace_stack(tr, flags, 6, pc);
+	ftrace_trace_userstack(tr, flags, pc);
 
 	trace_wake_up();
 }
@@ -1064,7 +1053,7 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3)
 	data = tr->data[cpu];
 
 	if (likely(atomic_inc_return(&data->disabled) == 1))
-		ftrace_trace_special(tr, data, arg1, arg2, arg3, pc);
+		ftrace_trace_special(tr, arg1, arg2, arg3, pc);
 
 	atomic_dec(&data->disabled);
 	local_irq_restore(flags);
@@ -1092,7 +1081,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
 	disabled = atomic_inc_return(&data->disabled);
 	if (likely(disabled == 1)) {
 		pc = preempt_count();
-		__trace_graph_entry(tr, data, trace, flags, pc);
+		__trace_graph_entry(tr, trace, flags, pc);
 	}
 	/* Only do the atomic if it is not already set */
 	if (!test_tsk_trace_graph(current))
@@ -1118,7 +1107,7 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
 	disabled = atomic_inc_return(&data->disabled);
 	if (likely(disabled == 1)) {
 		pc = preempt_count();
-		__trace_graph_return(tr, data, trace, flags, pc);
+		__trace_graph_return(tr, trace, flags, pc);
 	}
 	if (!trace->depth)
 		clear_tsk_trace_graph(current);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index f0c7a0f08cac..df627a948694 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -419,14 +419,12 @@ void ftrace(struct trace_array *tr,
 			    unsigned long parent_ip,
 			    unsigned long flags, int pc);
 void tracing_sched_switch_trace(struct trace_array *tr,
-				struct trace_array_cpu *data,
 				struct task_struct *prev,
 				struct task_struct *next,
 				unsigned long flags, int pc);
 void tracing_record_cmdline(struct task_struct *tsk);
 
 void tracing_sched_wakeup_trace(struct trace_array *tr,
-				struct trace_array_cpu *data,
 				struct task_struct *wakee,
 				struct task_struct *cur,
 				unsigned long flags, int pc);
@@ -436,7 +434,6 @@ void trace_special(struct trace_array *tr,
 		   unsigned long arg2,
 		   unsigned long arg3, int pc);
 void trace_function(struct trace_array *tr,
-		    struct trace_array_cpu *data,
 		    unsigned long ip,
 		    unsigned long parent_ip,
 		    unsigned long flags, int pc);
@@ -462,7 +459,6 @@ void update_max_tr_single(struct trace_array *tr,
 			  struct task_struct *tsk, int cpu);
 
 void __trace_stack(struct trace_array *tr,
-		   struct trace_array_cpu *data,
 		   unsigned long flags,
 		   int skip, int pc);
 
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index b3a320f8aba7..d067cea2ccc3 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -78,7 +78,7 @@ function_trace_call_preempt_only(unsigned long ip, unsigned long parent_ip)
 	disabled = atomic_inc_return(&data->disabled);
 
 	if (likely(disabled == 1))
-		trace_function(tr, data, ip, parent_ip, flags, pc);
+		trace_function(tr, ip, parent_ip, flags, pc);
 
 	atomic_dec(&data->disabled);
 	ftrace_preempt_enable(resched);
@@ -108,7 +108,7 @@ function_trace_call(unsigned long ip, unsigned long parent_ip)
 
 	if (likely(disabled == 1)) {
 		pc = preempt_count();
-		trace_function(tr, data, ip, parent_ip, flags, pc);
+		trace_function(tr, ip, parent_ip, flags, pc);
 	}
 
 	atomic_dec(&data->disabled);
@@ -139,7 +139,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip)
 
 	if (likely(disabled == 1)) {
 		pc = preempt_count();
-		trace_function(tr, data, ip, parent_ip, flags, pc);
+		trace_function(tr, ip, parent_ip, flags, pc);
 		/*
 		 * skip over 5 funcs:
 		 *    __ftrace_trace_stack,
@@ -148,7 +148,7 @@ function_stack_trace_call(unsigned long ip, unsigned long parent_ip)
 		 *    ftrace_list_func
 		 *    ftrace_call
 		 */
-		__trace_stack(tr, data, flags, 5, pc);
+		__trace_stack(tr, flags, 5, pc);
 	}
 
 	atomic_dec(&data->disabled);
diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index ed344b022a14..c6b442d88de8 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -95,7 +95,7 @@ irqsoff_tracer_call(unsigned long ip, unsigned long parent_ip)
 	disabled = atomic_inc_return(&data->disabled);
 
 	if (likely(disabled == 1))
-		trace_function(tr, data, ip, parent_ip, flags, preempt_count());
+		trace_function(tr, ip, parent_ip, flags, preempt_count());
 
 	atomic_dec(&data->disabled);
 }
@@ -153,7 +153,7 @@ check_critical_timing(struct trace_array *tr,
 	if (!report_latency(delta))
 		goto out_unlock;
 
-	trace_function(tr, data, CALLER_ADDR0, parent_ip, flags, pc);
+	trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
 
 	latency = nsecs_to_usecs(delta);
 
@@ -177,7 +177,7 @@ out:
 	data->critical_sequence = max_sequence;
 	data->preempt_timestamp = ftrace_now(cpu);
 	tracing_reset(tr, cpu);
-	trace_function(tr, data, CALLER_ADDR0, parent_ip, flags, pc);
+	trace_function(tr, CALLER_ADDR0, parent_ip, flags, pc);
 }
 
 static inline void
@@ -210,7 +210,7 @@ start_critical_timing(unsigned long ip, unsigned long parent_ip)
 
 	local_save_flags(flags);
 
-	trace_function(tr, data, ip, parent_ip, flags, preempt_count());
+	trace_function(tr, ip, parent_ip, flags, preempt_count());
 
 	per_cpu(tracing_cpu, cpu) = 1;
 
@@ -244,7 +244,7 @@ stop_critical_timing(unsigned long ip, unsigned long parent_ip)
 	atomic_inc(&data->disabled);
 
 	local_save_flags(flags);
-	trace_function(tr, data, ip, parent_ip, flags, preempt_count());
+	trace_function(tr, ip, parent_ip, flags, preempt_count());
 	check_critical_timing(tr, data, parent_ip ? : ip, cpu);
 	data->critical_start = 0;
 	atomic_dec(&data->disabled);
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index df175cb4564f..c4f9add5ec90 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -43,7 +43,7 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev,
 	data = ctx_trace->data[cpu];
 
 	if (likely(!atomic_read(&data->disabled)))
-		tracing_sched_switch_trace(ctx_trace, data, prev, next, flags, pc);
+		tracing_sched_switch_trace(ctx_trace, prev, next, flags, pc);
 
 	local_irq_restore(flags);
 }
@@ -66,7 +66,7 @@ probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success)
 	data = ctx_trace->data[cpu];
 
 	if (likely(!atomic_read(&data->disabled)))
-		tracing_sched_wakeup_trace(ctx_trace, data, wakee, current,
+		tracing_sched_wakeup_trace(ctx_trace, wakee, current,
 					   flags, pc);
 
 	local_irq_restore(flags);
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index a48c9b4b0c85..96d716485898 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -72,7 +72,7 @@ wakeup_tracer_call(unsigned long ip, unsigned long parent_ip)
 	if (task_cpu(wakeup_task) != cpu)
 		goto unlock;
 
-	trace_function(tr, data, ip, parent_ip, flags, pc);
+	trace_function(tr, ip, parent_ip, flags, pc);
 
  unlock:
 	__raw_spin_unlock(&wakeup_lock);
@@ -152,8 +152,8 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
 	if (unlikely(!tracer_enabled || next != wakeup_task))
 		goto out_unlock;
 
-	trace_function(wakeup_trace, data, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
-	tracing_sched_switch_trace(wakeup_trace, data, prev, next, flags, pc);
+	trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
+	tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
 
 	/*
 	 * usecs conversion is slow so we try to delay the conversion
@@ -254,10 +254,8 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success)
 
 	data = wakeup_trace->data[wakeup_cpu];
 	data->preempt_timestamp = ftrace_now(cpu);
-	tracing_sched_wakeup_trace(wakeup_trace, data, p, current,
-				   flags, pc);
-	trace_function(wakeup_trace, data, CALLER_ADDR1, CALLER_ADDR2,
-		       flags, pc);
+	tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc);
+	trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
 
 out_locked:
 	__raw_spin_unlock(&wakeup_lock);
-- 
cgit v1.2.3


From dac74940289f350c2590bec92737833bad608541 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 5 Feb 2009 01:13:38 -0500
Subject: trace: code style clean up

Ingo Molnar suggested using goto logic to keep the indentation
down and to be able to remove the nasty line breaks. This actually
makes the code a bit more readable.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 1d4ff568cc4d..3536ef41575d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -522,23 +522,24 @@ int register_tracer(struct tracer *type)
 	tracing_selftest_running = false;
 	mutex_unlock(&trace_types_lock);
 
-	if (!ret && default_bootup_tracer) {
-		if (!strncmp(default_bootup_tracer, type->name,
-			     BOOTUP_TRACER_SIZE)) {
-			printk(KERN_INFO "Starting tracer '%s'\n",
-			       type->name);
-			/* Do we want this tracer to start on bootup? */
-			tracing_set_tracer(type->name);
-			default_bootup_tracer = NULL;
-			/* disable other selftests, since this will break it. */
-			tracing_selftest_disabled = 1;
+	if (ret || !default_bootup_tracer)
+		goto out_unlock;
+
+	if (strncmp(default_bootup_tracer, type->name, BOOTUP_TRACER_SIZE))
+		goto out_unlock;
+
+	printk(KERN_INFO "Starting tracer '%s'\n", type->name);
+	/* Do we want this tracer to start on bootup? */
+	tracing_set_tracer(type->name);
+	default_bootup_tracer = NULL;
+	/* disable other selftests, since this will break it. */
+	tracing_selftest_disabled = 1;
 #ifdef CONFIG_FTRACE_STARTUP_TEST
-			printk(KERN_INFO "Disabling FTRACE selftests due"
-			       " to running tracer '%s'\n", type->name);
+	printk(KERN_INFO "Disabling FTRACE selftests due to running tracer '%s'\n",
+	       type->name);
 #endif
-		}
-	}
 
+ out_unlock:
 	lock_kernel();
 	return ret;
 }
-- 
cgit v1.2.3


From ce9dbe244bf2063c41792e40dae7745957b118e0 Mon Sep 17 00:00:00 2001
From: Mandeep Singh Baines <msb@google.com>
Date: Wed, 4 Feb 2009 20:35:48 -0800
Subject: softlockup: check all tasks in hung_task
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Impact: extend the scope of hung-task checks

Changed the default value of hung_task_check_count to PID_MAX_LIMIT.
hung_task_batch_count added to put an upper bound on the critical
section. Every hung_task_batch_count checks, the rcu lock is never
held for a too long time.

Keeping the critical section small minimizes time preemption is disabled
and keeps rcu grace periods small.

To prevent following a stale pointer, get_task_struct is called on g and t.
To verify that g and t have not been unhashed while outside the critical
section, the task states are checked.

The design was proposed by Frédéric Weisbecker.

Signed-off-by: Mandeep Singh Baines <msb@google.com>
Suggested-by: Frédéric Weisbecker <fweisbec@gmail.com>
Acked-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/hung_task.c | 39 +++++++++++++++++++++++++++++++++++++--
 1 file changed, 37 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index ba8ccd432963..481ca8b5c2bc 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -17,9 +17,18 @@
 #include <linux/sysctl.h>
 
 /*
- * Have a reasonable limit on the number of tasks checked:
+ * The number of tasks checked:
  */
-unsigned long __read_mostly sysctl_hung_task_check_count = 1024;
+unsigned long __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
+
+/*
+ * Limit number of tasks checked in a batch.
+ *
+ * This value controls the preemptibility of khungtaskd since preemption
+ * is disabled during the critical section. It also controls the size of
+ * the RCU grace period. So it needs to be upper-bound.
+ */
+#define HUNG_TASK_BATCHING 1024
 
 /*
  * Zero means infinite timeout - no checking done:
@@ -109,6 +118,24 @@ static void check_hung_task(struct task_struct *t, unsigned long now,
 		panic("hung_task: blocked tasks");
 }
 
+/*
+ * To avoid extending the RCU grace period for an unbounded amount of time,
+ * periodically exit the critical section and enter a new one.
+ *
+ * For preemptible RCU it is sufficient to call rcu_read_unlock in order
+ * exit the grace period. For classic RCU, a reschedule is required.
+ */
+static void rcu_lock_break(struct task_struct *g, struct task_struct *t)
+{
+	get_task_struct(g);
+	get_task_struct(t);
+	rcu_read_unlock();
+	cond_resched();
+	rcu_read_lock();
+	put_task_struct(t);
+	put_task_struct(g);
+}
+
 /*
  * Check whether a TASK_UNINTERRUPTIBLE does not get woken up for
  * a really long time (120 seconds). If that happens, print out
@@ -117,6 +144,7 @@ static void check_hung_task(struct task_struct *t, unsigned long now,
 static void check_hung_uninterruptible_tasks(unsigned long timeout)
 {
 	int max_count = sysctl_hung_task_check_count;
+	int batch_count = HUNG_TASK_BATCHING;
 	unsigned long now = get_timestamp();
 	struct task_struct *g, *t;
 
@@ -131,6 +159,13 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
 	do_each_thread(g, t) {
 		if (!--max_count)
 			goto unlock;
+		if (!--batch_count) {
+			batch_count = HUNG_TASK_BATCHING;
+			rcu_lock_break(g, t);
+			/* Exit if t or g was unhashed during refresh. */
+			if (t->state == TASK_DEAD || g->state == TASK_DEAD)
+				goto unlock;
+		}
 		/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
 		if (t->state == TASK_UNINTERRUPTIBLE)
 			check_hung_task(t, now, timeout);
-- 
cgit v1.2.3


From 94be52dc075a32af4aa73d7e10f68734d62d6af2 Mon Sep 17 00:00:00 2001
From: Mandeep Singh Baines <msb@google.com>
Date: Thu, 5 Feb 2009 09:56:08 -0800
Subject: softlockup: convert read_lock in hung_task to rcu_read_lock

Since the tasklist is protected by rcu list operations, it is safe
to convert the read_lock()s to rcu_read_lock().

Suggested-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Mandeep Singh Baines <msb@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/hung_task.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 481ca8b5c2bc..3951a80e7cbe 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -155,7 +155,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
 	if (test_taint(TAINT_DIE) || did_panic)
 		return;
 
-	read_lock(&tasklist_lock);
+	rcu_read_lock();
 	do_each_thread(g, t) {
 		if (!--max_count)
 			goto unlock;
@@ -171,7 +171,7 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
 			check_hung_task(t, now, timeout);
 	} while_each_thread(g, t);
  unlock:
-	read_unlock(&tasklist_lock);
+	rcu_read_unlock();
 }
 
 static void update_poll_jiffies(void)
-- 
cgit v1.2.3


From 0a9877514c4fed10a70720293b37213dd172ee3e Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 5 Feb 2009 16:12:56 -0200
Subject: ring_buffer: remove unused flags parameter
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Impact: API change, cleanup

>From ring_buffer_{lock_reserve,unlock_commit}.

$ codiff /tmp/vmlinux.before /tmp/vmlinux.after
linux-2.6-tip/kernel/trace/trace.c:
  trace_vprintk              |  -14
  trace_graph_return         |  -14
  trace_graph_entry          |  -10
  trace_function             |   -8
  __ftrace_trace_stack       |   -8
  ftrace_trace_userstack     |   -8
  tracing_sched_switch_trace |   -8
  ftrace_trace_special       |  -12
  tracing_sched_wakeup_trace |   -8
 9 functions changed, 90 bytes removed, diff: -90

linux-2.6-tip/block/blktrace.c:
  __blk_add_trace |   -1
 1 function changed, 1 bytes removed, diff: -1

/tmp/vmlinux.after:
 10 functions changed, 91 bytes removed, diff: -91

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Acked-by: Frédéric Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 block/blktrace.c                 |  8 +++---
 include/linux/ring_buffer.h      |  9 +++----
 kernel/trace/kmemtrace.c         | 12 +++------
 kernel/trace/ring_buffer.c       |  9 ++-----
 kernel/trace/trace.c             | 56 ++++++++++++++--------------------------
 kernel/trace/trace_boot.c        | 12 +++------
 kernel/trace/trace_branch.c      |  7 +++--
 kernel/trace/trace_hw_branches.c |  6 ++---
 kernel/trace/trace_mmiotrace.c   | 12 +++------
 kernel/trace/trace_power.c       | 12 +++------
 10 files changed, 51 insertions(+), 92 deletions(-)

(limited to 'kernel')

diff --git a/block/blktrace.c b/block/blktrace.c
index d9d7146ee023..8e52f24cc8f9 100644
--- a/block/blktrace.c
+++ b/block/blktrace.c
@@ -165,7 +165,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 	struct task_struct *tsk = current;
 	struct ring_buffer_event *event = NULL;
 	struct blk_io_trace *t;
-	unsigned long flags;
+	unsigned long flags = 0;
 	unsigned long *sequence;
 	pid_t pid;
 	int cpu, pc = 0;
@@ -191,7 +191,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 		tracing_record_cmdline(current);
 
 		event = ring_buffer_lock_reserve(blk_tr->buffer,
-						 sizeof(*t) + pdu_len, &flags);
+						 sizeof(*t) + pdu_len);
 		if (!event)
 			return;
 
@@ -241,11 +241,11 @@ record_it:
 			memcpy((void *) t + sizeof(*t), pdu_data, pdu_len);
 
 		if (blk_tr) {
-			ring_buffer_unlock_commit(blk_tr->buffer, event, flags);
+			ring_buffer_unlock_commit(blk_tr->buffer, event);
 			if (pid != 0 &&
 			    !(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC) &&
 			    (trace_flags & TRACE_ITER_STACKTRACE) != 0)
-				__trace_stack(blk_tr, flags, 5, pc);
+				__trace_stack(blk_tr, 0, 5, pc);
 			trace_wake_up();
 			return;
 		}
diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index b3b359660082..3110d92e7d81 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -74,13 +74,10 @@ void ring_buffer_free(struct ring_buffer *buffer);
 
 int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size);
 
-struct ring_buffer_event *
-ring_buffer_lock_reserve(struct ring_buffer *buffer,
-			 unsigned long length,
-			 unsigned long *flags);
+struct ring_buffer_event *ring_buffer_lock_reserve(struct ring_buffer *buffer,
+						   unsigned long length);
 int ring_buffer_unlock_commit(struct ring_buffer *buffer,
-			      struct ring_buffer_event *event,
-			      unsigned long flags);
+			      struct ring_buffer_event *event);
 int ring_buffer_write(struct ring_buffer *buffer,
 		      unsigned long length, void *data);
 
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index f04c0625f1cd..256749d1032a 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -272,13 +272,11 @@ void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id,
 	struct ring_buffer_event *event;
 	struct kmemtrace_alloc_entry *entry;
 	struct trace_array *tr = kmemtrace_array;
-	unsigned long irq_flags;
 
 	if (!kmem_tracing_enabled)
 		return;
 
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
-					 &irq_flags);
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
 	if (!event)
 		return;
 	entry	= ring_buffer_event_data(event);
@@ -292,7 +290,7 @@ void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id,
 	entry->gfp_flags = gfp_flags;
 	entry->node	=	node;
 
-	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+	ring_buffer_unlock_commit(tr->buffer, event);
 
 	trace_wake_up();
 }
@@ -305,13 +303,11 @@ void kmemtrace_mark_free(enum kmemtrace_type_id type_id,
 	struct ring_buffer_event *event;
 	struct kmemtrace_free_entry *entry;
 	struct trace_array *tr = kmemtrace_array;
-	unsigned long irq_flags;
 
 	if (!kmem_tracing_enabled)
 		return;
 
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
-					 &irq_flags);
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
 	if (!event)
 		return;
 	entry	= ring_buffer_event_data(event);
@@ -322,7 +318,7 @@ void kmemtrace_mark_free(enum kmemtrace_type_id type_id,
 	entry->call_site = call_site;
 	entry->ptr = ptr;
 
-	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+	ring_buffer_unlock_commit(tr->buffer, event);
 
 	trace_wake_up();
 }
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index b36d7374ceef..aee76b3eeed2 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1257,7 +1257,6 @@ static DEFINE_PER_CPU(int, rb_need_resched);
  * ring_buffer_lock_reserve - reserve a part of the buffer
  * @buffer: the ring buffer to reserve from
  * @length: the length of the data to reserve (excluding event header)
- * @flags: a pointer to save the interrupt flags
  *
  * Returns a reseverd event on the ring buffer to copy directly to.
  * The user of this interface will need to get the body to write into
@@ -1270,9 +1269,7 @@ static DEFINE_PER_CPU(int, rb_need_resched);
  * If NULL is returned, then nothing has been allocated or locked.
  */
 struct ring_buffer_event *
-ring_buffer_lock_reserve(struct ring_buffer *buffer,
-			 unsigned long length,
-			 unsigned long *flags)
+ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
 	struct ring_buffer_event *event;
@@ -1339,15 +1336,13 @@ static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer,
  * ring_buffer_unlock_commit - commit a reserved
  * @buffer: The buffer to commit to
  * @event: The event pointer to commit.
- * @flags: the interrupt flags received from ring_buffer_lock_reserve.
  *
  * This commits the data to the ring buffer, and releases any locks held.
  *
  * Must be paired with ring_buffer_lock_reserve.
  */
 int ring_buffer_unlock_commit(struct ring_buffer *buffer,
-			      struct ring_buffer_event *event,
-			      unsigned long flags)
+			      struct ring_buffer_event *event)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
 	int cpu = raw_smp_processor_id();
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 3536ef41575d..eb453a238a6f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -783,14 +783,12 @@ trace_function(struct trace_array *tr,
 {
 	struct ring_buffer_event *event;
 	struct ftrace_entry *entry;
-	unsigned long irq_flags;
 
 	/* If we are reading the ring buffer, don't trace */
 	if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
 		return;
 
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
-					 &irq_flags);
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
 	if (!event)
 		return;
 	entry	= ring_buffer_event_data(event);
@@ -798,7 +796,7 @@ trace_function(struct trace_array *tr,
 	entry->ent.type			= TRACE_FN;
 	entry->ip			= ip;
 	entry->parent_ip		= parent_ip;
-	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+	ring_buffer_unlock_commit(tr->buffer, event);
 }
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
@@ -809,20 +807,18 @@ static void __trace_graph_entry(struct trace_array *tr,
 {
 	struct ring_buffer_event *event;
 	struct ftrace_graph_ent_entry *entry;
-	unsigned long irq_flags;
 
 	if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
 		return;
 
-	event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry),
-					 &irq_flags);
+	event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry));
 	if (!event)
 		return;
 	entry	= ring_buffer_event_data(event);
 	tracing_generic_entry_update(&entry->ent, flags, pc);
 	entry->ent.type			= TRACE_GRAPH_ENT;
 	entry->graph_ent			= *trace;
-	ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags);
+	ring_buffer_unlock_commit(global_trace.buffer, event);
 }
 
 static void __trace_graph_return(struct trace_array *tr,
@@ -832,20 +828,18 @@ static void __trace_graph_return(struct trace_array *tr,
 {
 	struct ring_buffer_event *event;
 	struct ftrace_graph_ret_entry *entry;
-	unsigned long irq_flags;
 
 	if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
 		return;
 
-	event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry),
-					 &irq_flags);
+	event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry));
 	if (!event)
 		return;
 	entry	= ring_buffer_event_data(event);
 	tracing_generic_entry_update(&entry->ent, flags, pc);
 	entry->ent.type			= TRACE_GRAPH_RET;
 	entry->ret				= *trace;
-	ring_buffer_unlock_commit(global_trace.buffer, event, irq_flags);
+	ring_buffer_unlock_commit(global_trace.buffer, event);
 }
 #endif
 
@@ -866,10 +860,8 @@ static void __ftrace_trace_stack(struct trace_array *tr,
 	struct ring_buffer_event *event;
 	struct stack_entry *entry;
 	struct stack_trace trace;
-	unsigned long irq_flags;
 
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
-					 &irq_flags);
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
 	if (!event)
 		return;
 	entry	= ring_buffer_event_data(event);
@@ -884,7 +876,7 @@ static void __ftrace_trace_stack(struct trace_array *tr,
 	trace.entries		= entry->caller;
 
 	save_stack_trace(&trace);
-	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+	ring_buffer_unlock_commit(tr->buffer, event);
 #endif
 }
 
@@ -912,13 +904,11 @@ static void ftrace_trace_userstack(struct trace_array *tr,
 	struct ring_buffer_event *event;
 	struct userstack_entry *entry;
 	struct stack_trace trace;
-	unsigned long irq_flags;
 
 	if (!(trace_flags & TRACE_ITER_USERSTACKTRACE))
 		return;
 
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
-					 &irq_flags);
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
 	if (!event)
 		return;
 	entry	= ring_buffer_event_data(event);
@@ -933,7 +923,7 @@ static void ftrace_trace_userstack(struct trace_array *tr,
 	trace.entries		= entry->caller;
 
 	save_stack_trace_user(&trace);
-	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+	ring_buffer_unlock_commit(tr->buffer, event);
 #endif
 }
 
@@ -950,10 +940,8 @@ ftrace_trace_special(void *__tr,
 	struct ring_buffer_event *event;
 	struct trace_array *tr = __tr;
 	struct special_entry *entry;
-	unsigned long irq_flags;
 
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
-					 &irq_flags);
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
 	if (!event)
 		return;
 	entry	= ring_buffer_event_data(event);
@@ -962,9 +950,9 @@ ftrace_trace_special(void *__tr,
 	entry->arg1			= arg1;
 	entry->arg2			= arg2;
 	entry->arg3			= arg3;
-	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
-	ftrace_trace_stack(tr, irq_flags, 4, pc);
-	ftrace_trace_userstack(tr, irq_flags, pc);
+	ring_buffer_unlock_commit(tr->buffer, event);
+	ftrace_trace_stack(tr, 0, 4, pc);
+	ftrace_trace_userstack(tr, 0, pc);
 
 	trace_wake_up();
 }
@@ -984,10 +972,8 @@ tracing_sched_switch_trace(struct trace_array *tr,
 {
 	struct ring_buffer_event *event;
 	struct ctx_switch_entry *entry;
-	unsigned long irq_flags;
 
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
-					   &irq_flags);
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
 	if (!event)
 		return;
 	entry	= ring_buffer_event_data(event);
@@ -1000,7 +986,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
 	entry->next_prio		= next->prio;
 	entry->next_state		= next->state;
 	entry->next_cpu	= task_cpu(next);
-	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+	ring_buffer_unlock_commit(tr->buffer, event);
 	ftrace_trace_stack(tr, flags, 5, pc);
 	ftrace_trace_userstack(tr, flags, pc);
 }
@@ -1013,10 +999,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 {
 	struct ring_buffer_event *event;
 	struct ctx_switch_entry *entry;
-	unsigned long irq_flags;
 
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
-					   &irq_flags);
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
 	if (!event)
 		return;
 	entry	= ring_buffer_event_data(event);
@@ -1029,7 +1013,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 	entry->next_prio		= wakee->prio;
 	entry->next_state		= wakee->state;
 	entry->next_cpu			= task_cpu(wakee);
-	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+	ring_buffer_unlock_commit(tr->buffer, event);
 	ftrace_trace_stack(tr, flags, 6, pc);
 	ftrace_trace_userstack(tr, flags, pc);
 
@@ -2841,7 +2825,7 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
 	trace_buf[len] = 0;
 
 	size = sizeof(*entry) + len + 1;
-	event = ring_buffer_lock_reserve(tr->buffer, size, &irq_flags);
+	event = ring_buffer_lock_reserve(tr->buffer, size);
 	if (!event)
 		goto out_unlock;
 	entry = ring_buffer_event_data(event);
@@ -2852,7 +2836,7 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
 
 	memcpy(&entry->buf, trace_buf, len);
 	entry->buf[len] = 0;
-	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+	ring_buffer_unlock_commit(tr->buffer, event);
 
  out_unlock:
 	spin_unlock_irqrestore(&trace_buf_lock, irq_flags);
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index 1f07895977a0..4e08debf662d 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -132,7 +132,6 @@ void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
 {
 	struct ring_buffer_event *event;
 	struct trace_boot_call *entry;
-	unsigned long irq_flags;
 	struct trace_array *tr = boot_trace;
 
 	if (!tr || !pre_initcalls_finished)
@@ -144,15 +143,14 @@ void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
 	sprint_symbol(bt->func, (unsigned long)fn);
 	preempt_disable();
 
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
-					 &irq_flags);
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
 	if (!event)
 		goto out;
 	entry	= ring_buffer_event_data(event);
 	tracing_generic_entry_update(&entry->ent, 0, 0);
 	entry->ent.type = TRACE_BOOT_CALL;
 	entry->boot_call = *bt;
-	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+	ring_buffer_unlock_commit(tr->buffer, event);
 
 	trace_wake_up();
 
@@ -164,7 +162,6 @@ void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)
 {
 	struct ring_buffer_event *event;
 	struct trace_boot_ret *entry;
-	unsigned long irq_flags;
 	struct trace_array *tr = boot_trace;
 
 	if (!tr || !pre_initcalls_finished)
@@ -173,15 +170,14 @@ void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)
 	sprint_symbol(bt->func, (unsigned long)fn);
 	preempt_disable();
 
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
-					 &irq_flags);
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
 	if (!event)
 		goto out;
 	entry	= ring_buffer_event_data(event);
 	tracing_generic_entry_update(&entry->ent, 0, 0);
 	entry->ent.type = TRACE_BOOT_RET;
 	entry->boot_ret = *bt;
-	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+	ring_buffer_unlock_commit(tr->buffer, event);
 
 	trace_wake_up();
 
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 027e83690615..770e52acfc10 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -33,7 +33,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
 	struct trace_array *tr = branch_tracer;
 	struct ring_buffer_event *event;
 	struct trace_branch *entry;
-	unsigned long flags, irq_flags;
+	unsigned long flags;
 	int cpu, pc;
 	const char *p;
 
@@ -52,8 +52,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
 	if (atomic_inc_return(&tr->data[cpu]->disabled) != 1)
 		goto out;
 
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
-					 &irq_flags);
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
 	if (!event)
 		goto out;
 
@@ -75,7 +74,7 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
 	entry->line = f->line;
 	entry->correct = val == expect;
 
-	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+	ring_buffer_unlock_commit(tr->buffer, event);
 
  out:
 	atomic_dec(&tr->data[cpu]->disabled);
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index fff3545fc866..e720c001db2b 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -175,7 +175,7 @@ void trace_hw_branch(u64 from, u64 to)
 	struct trace_array *tr = hw_branch_trace;
 	struct ring_buffer_event *event;
 	struct hw_branch_entry *entry;
-	unsigned long irq1, irq2;
+	unsigned long irq1;
 	int cpu;
 
 	if (unlikely(!tr))
@@ -189,7 +189,7 @@ void trace_hw_branch(u64 from, u64 to)
 	if (atomic_inc_return(&tr->data[cpu]->disabled) != 1)
 		goto out;
 
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry), &irq2);
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
 	if (!event)
 		goto out;
 	entry	= ring_buffer_event_data(event);
@@ -198,7 +198,7 @@ void trace_hw_branch(u64 from, u64 to)
 	entry->ent.cpu = cpu;
 	entry->from = from;
 	entry->to   = to;
-	ring_buffer_unlock_commit(tr->buffer, event, irq2);
+	ring_buffer_unlock_commit(tr->buffer, event);
 
  out:
 	atomic_dec(&tr->data[cpu]->disabled);
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index ec78e244242e..104ddebc11d1 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -307,10 +307,8 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
 {
 	struct ring_buffer_event *event;
 	struct trace_mmiotrace_rw *entry;
-	unsigned long irq_flags;
 
-	event	= ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
-					   &irq_flags);
+	event	= ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
 	if (!event) {
 		atomic_inc(&dropped_count);
 		return;
@@ -319,7 +317,7 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
 	tracing_generic_entry_update(&entry->ent, 0, preempt_count());
 	entry->ent.type			= TRACE_MMIO_RW;
 	entry->rw			= *rw;
-	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+	ring_buffer_unlock_commit(tr->buffer, event);
 
 	trace_wake_up();
 }
@@ -337,10 +335,8 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
 {
 	struct ring_buffer_event *event;
 	struct trace_mmiotrace_map *entry;
-	unsigned long irq_flags;
 
-	event	= ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
-					   &irq_flags);
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
 	if (!event) {
 		atomic_inc(&dropped_count);
 		return;
@@ -349,7 +345,7 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
 	tracing_generic_entry_update(&entry->ent, 0, preempt_count());
 	entry->ent.type			= TRACE_MMIO_MAP;
 	entry->map			= *map;
-	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+	ring_buffer_unlock_commit(tr->buffer, event);
 
 	trace_wake_up();
 }
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
index faa6ab7a1f5c..3b1a292d12d2 100644
--- a/kernel/trace/trace_power.c
+++ b/kernel/trace/trace_power.c
@@ -115,7 +115,6 @@ void trace_power_end(struct power_trace *it)
 	struct ring_buffer_event *event;
 	struct trace_power *entry;
 	struct trace_array_cpu *data;
-	unsigned long irq_flags;
 	struct trace_array *tr = power_trace;
 
 	if (!trace_power_enabled)
@@ -125,15 +124,14 @@ void trace_power_end(struct power_trace *it)
 	it->end = ktime_get();
 	data = tr->data[smp_processor_id()];
 
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
-					 &irq_flags);
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
 	if (!event)
 		goto out;
 	entry	= ring_buffer_event_data(event);
 	tracing_generic_entry_update(&entry->ent, 0, 0);
 	entry->ent.type = TRACE_POWER;
 	entry->state_data = *it;
-	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+	ring_buffer_unlock_commit(tr->buffer, event);
 
 	trace_wake_up();
 
@@ -148,7 +146,6 @@ void trace_power_mark(struct power_trace *it, unsigned int type,
 	struct ring_buffer_event *event;
 	struct trace_power *entry;
 	struct trace_array_cpu *data;
-	unsigned long irq_flags;
 	struct trace_array *tr = power_trace;
 
 	if (!trace_power_enabled)
@@ -162,15 +159,14 @@ void trace_power_mark(struct power_trace *it, unsigned int type,
 	it->end = it->stamp;
 	data = tr->data[smp_processor_id()];
 
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
-					 &irq_flags);
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
 	if (!event)
 		goto out;
 	entry	= ring_buffer_event_data(event);
 	tracing_generic_entry_update(&entry->ent, 0, 0);
 	entry->ent.type = TRACE_POWER;
 	entry->state_data = *it;
-	ring_buffer_unlock_commit(tr->buffer, event, irq_flags);
+	ring_buffer_unlock_commit(tr->buffer, event);
 
 	trace_wake_up();
 
-- 
cgit v1.2.3


From 51a763dd84253bab1d0a1e68e11a7753d1b702ca Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 5 Feb 2009 16:14:13 -0200
Subject: tracing: Introduce trace_buffer_{lock_reserve,unlock_commit}
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Impact: new API

These new functions do what previously was being open coded, reducing
the number of details ftrace plugin writers have to worry about.

It also standardizes the handling of stacktrace, userstacktrace and
other trace options we may introduce in the future.

With this patch, for instance, the blk tracer (and some others already
in the tree) can use the "userstacktrace" /d/tracing/trace_options
facility.

$ codiff /tmp/vmlinux.before /tmp/vmlinux.after
linux-2.6-tip/kernel/trace/trace.c:
  trace_vprintk              |   -5
  trace_graph_return         |  -22
  trace_graph_entry          |  -26
  trace_function             |  -45
  __ftrace_trace_stack       |  -27
  ftrace_trace_userstack     |  -29
  tracing_sched_switch_trace |  -66
  tracing_stop               |   +1
  trace_seq_to_user          |   -1
  ftrace_trace_special       |  -63
  ftrace_special             |   +1
  tracing_sched_wakeup_trace |  -70
  tracing_reset_online_cpus  |   -1
 13 functions changed, 2 bytes added, 355 bytes removed, diff: -353

linux-2.6-tip/block/blktrace.c:
  __blk_add_trace |  -58
 1 function changed, 58 bytes removed, diff: -58

linux-2.6-tip/kernel/trace/trace.c:
  trace_buffer_lock_reserve  |  +88
  trace_buffer_unlock_commit |  +86
 2 functions changed, 174 bytes added, diff: +174

/tmp/vmlinux.after:
 16 functions changed, 176 bytes added, 413 bytes removed, diff: -237

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Acked-by: Frédéric Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 block/blktrace.c                 | 21 +++------
 kernel/trace/kmemtrace.c         | 19 +++-----
 kernel/trace/trace.c             | 94 ++++++++++++++++++++++------------------
 kernel/trace/trace.h             | 11 +++++
 kernel/trace/trace_boot.c        | 20 +++------
 kernel/trace/trace_branch.c      |  7 ++-
 kernel/trace/trace_hw_branches.c |  7 ++-
 kernel/trace/trace_mmiotrace.c   | 20 ++++-----
 kernel/trace/trace_power.c       | 20 +++------
 9 files changed, 102 insertions(+), 117 deletions(-)

(limited to 'kernel')

diff --git a/block/blktrace.c b/block/blktrace.c
index 8e52f24cc8f9..834cd84037b2 100644
--- a/block/blktrace.c
+++ b/block/blktrace.c
@@ -187,19 +187,15 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 	cpu = raw_smp_processor_id();
 
 	if (blk_tr) {
-		struct trace_entry *ent;
 		tracing_record_cmdline(current);
 
-		event = ring_buffer_lock_reserve(blk_tr->buffer,
-						 sizeof(*t) + pdu_len);
+		pc = preempt_count();
+		event = trace_buffer_lock_reserve(blk_tr, TRACE_BLK,
+						  sizeof(*t) + pdu_len,
+						  0, pc);
 		if (!event)
 			return;
-
-		ent = ring_buffer_event_data(event);
-		t = (struct blk_io_trace *)ent;
-		pc = preempt_count();
-		tracing_generic_entry_update(ent, 0, pc);
-		ent->type = TRACE_BLK;
+		t = ring_buffer_event_data(event);
 		goto record_it;
 	}
 
@@ -241,12 +237,7 @@ record_it:
 			memcpy((void *) t + sizeof(*t), pdu_data, pdu_len);
 
 		if (blk_tr) {
-			ring_buffer_unlock_commit(blk_tr->buffer, event);
-			if (pid != 0 &&
-			    !(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC) &&
-			    (trace_flags & TRACE_ITER_STACKTRACE) != 0)
-				__trace_stack(blk_tr, 0, 5, pc);
-			trace_wake_up();
+			trace_buffer_unlock_commit(blk_tr, event, 0, pc);
 			return;
 		}
 	}
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index 256749d1032a..ae201b3eda89 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -276,13 +276,12 @@ void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id,
 	if (!kmem_tracing_enabled)
 		return;
 
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
+	event = trace_buffer_lock_reserve(tr, TRACE_KMEM_ALLOC,
+					  sizeof(*entry), 0, 0);
 	if (!event)
 		return;
 	entry	= ring_buffer_event_data(event);
-	tracing_generic_entry_update(&entry->ent, 0, 0);
 
-	entry->ent.type = TRACE_KMEM_ALLOC;
 	entry->call_site = call_site;
 	entry->ptr = ptr;
 	entry->bytes_req = bytes_req;
@@ -290,9 +289,7 @@ void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id,
 	entry->gfp_flags = gfp_flags;
 	entry->node	=	node;
 
-	ring_buffer_unlock_commit(tr->buffer, event);
-
-	trace_wake_up();
+	trace_buffer_unlock_commit(tr, event, 0, 0);
 }
 EXPORT_SYMBOL(kmemtrace_mark_alloc_node);
 
@@ -307,20 +304,16 @@ void kmemtrace_mark_free(enum kmemtrace_type_id type_id,
 	if (!kmem_tracing_enabled)
 		return;
 
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
+	event = trace_buffer_lock_reserve(tr, TRACE_KMEM_FREE,
+					  sizeof(*entry), 0, 0);
 	if (!event)
 		return;
 	entry	= ring_buffer_event_data(event);
-	tracing_generic_entry_update(&entry->ent, 0, 0);
-
-	entry->ent.type = TRACE_KMEM_FREE;
 	entry->type_id	= type_id;
 	entry->call_site = call_site;
 	entry->ptr = ptr;
 
-	ring_buffer_unlock_commit(tr->buffer, event);
-
-	trace_wake_up();
+	trace_buffer_unlock_commit(tr, event, 0, 0);
 }
 EXPORT_SYMBOL(kmemtrace_mark_free);
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index eb453a238a6f..8fad3776e843 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -776,6 +776,39 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
 		(need_resched() ? TRACE_FLAG_NEED_RESCHED : 0);
 }
 
+struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr,
+						    unsigned char type,
+						    unsigned long len,
+						    unsigned long flags, int pc)
+{
+	struct ring_buffer_event *event;
+
+	event = ring_buffer_lock_reserve(tr->buffer, len);
+	if (event != NULL) {
+		struct trace_entry *ent = ring_buffer_event_data(event);
+
+		tracing_generic_entry_update(ent, flags, pc);
+		ent->type = type;
+	}
+
+	return event;
+}
+static void ftrace_trace_stack(struct trace_array *tr,
+			       unsigned long flags, int skip, int pc);
+static void ftrace_trace_userstack(struct trace_array *tr,
+				   unsigned long flags, int pc);
+
+void trace_buffer_unlock_commit(struct trace_array *tr,
+				struct ring_buffer_event *event,
+				unsigned long flags, int pc)
+{
+	ring_buffer_unlock_commit(tr->buffer, event);
+
+	ftrace_trace_stack(tr, flags, 6, pc);
+	ftrace_trace_userstack(tr, flags, pc);
+	trace_wake_up();
+}
+
 void
 trace_function(struct trace_array *tr,
 	       unsigned long ip, unsigned long parent_ip, unsigned long flags,
@@ -788,12 +821,11 @@ trace_function(struct trace_array *tr,
 	if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
 		return;
 
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
+	event = trace_buffer_lock_reserve(tr, TRACE_FN, sizeof(*entry),
+					  flags, pc);
 	if (!event)
 		return;
 	entry	= ring_buffer_event_data(event);
-	tracing_generic_entry_update(&entry->ent, flags, pc);
-	entry->ent.type			= TRACE_FN;
 	entry->ip			= ip;
 	entry->parent_ip		= parent_ip;
 	ring_buffer_unlock_commit(tr->buffer, event);
@@ -811,12 +843,11 @@ static void __trace_graph_entry(struct trace_array *tr,
 	if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
 		return;
 
-	event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry));
+	event = trace_buffer_lock_reserve(&global_trace, TRACE_GRAPH_ENT,
+					  sizeof(*entry), flags, pc);
 	if (!event)
 		return;
 	entry	= ring_buffer_event_data(event);
-	tracing_generic_entry_update(&entry->ent, flags, pc);
-	entry->ent.type			= TRACE_GRAPH_ENT;
 	entry->graph_ent			= *trace;
 	ring_buffer_unlock_commit(global_trace.buffer, event);
 }
@@ -832,12 +863,11 @@ static void __trace_graph_return(struct trace_array *tr,
 	if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
 		return;
 
-	event = ring_buffer_lock_reserve(global_trace.buffer, sizeof(*entry));
+	event = trace_buffer_lock_reserve(&global_trace, TRACE_GRAPH_RET,
+					  sizeof(*entry), flags, pc);
 	if (!event)
 		return;
 	entry	= ring_buffer_event_data(event);
-	tracing_generic_entry_update(&entry->ent, flags, pc);
-	entry->ent.type			= TRACE_GRAPH_RET;
 	entry->ret				= *trace;
 	ring_buffer_unlock_commit(global_trace.buffer, event);
 }
@@ -861,13 +891,11 @@ static void __ftrace_trace_stack(struct trace_array *tr,
 	struct stack_entry *entry;
 	struct stack_trace trace;
 
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
+	event = trace_buffer_lock_reserve(tr, TRACE_STACK,
+					  sizeof(*entry), flags, pc);
 	if (!event)
 		return;
 	entry	= ring_buffer_event_data(event);
-	tracing_generic_entry_update(&entry->ent, flags, pc);
-	entry->ent.type		= TRACE_STACK;
-
 	memset(&entry->caller, 0, sizeof(entry->caller));
 
 	trace.nr_entries	= 0;
@@ -908,12 +936,11 @@ static void ftrace_trace_userstack(struct trace_array *tr,
 	if (!(trace_flags & TRACE_ITER_USERSTACKTRACE))
 		return;
 
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
+	event = trace_buffer_lock_reserve(tr, TRACE_USER_STACK,
+					  sizeof(*entry), flags, pc);
 	if (!event)
 		return;
 	entry	= ring_buffer_event_data(event);
-	tracing_generic_entry_update(&entry->ent, flags, pc);
-	entry->ent.type		= TRACE_USER_STACK;
 
 	memset(&entry->caller, 0, sizeof(entry->caller));
 
@@ -941,20 +968,15 @@ ftrace_trace_special(void *__tr,
 	struct trace_array *tr = __tr;
 	struct special_entry *entry;
 
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
+	event = trace_buffer_lock_reserve(tr, TRACE_SPECIAL,
+					  sizeof(*entry), 0, pc);
 	if (!event)
 		return;
 	entry	= ring_buffer_event_data(event);
-	tracing_generic_entry_update(&entry->ent, 0, pc);
-	entry->ent.type			= TRACE_SPECIAL;
 	entry->arg1			= arg1;
 	entry->arg2			= arg2;
 	entry->arg3			= arg3;
-	ring_buffer_unlock_commit(tr->buffer, event);
-	ftrace_trace_stack(tr, 0, 4, pc);
-	ftrace_trace_userstack(tr, 0, pc);
-
-	trace_wake_up();
+	trace_buffer_unlock_commit(tr, event, 0, pc);
 }
 
 void
@@ -973,12 +995,11 @@ tracing_sched_switch_trace(struct trace_array *tr,
 	struct ring_buffer_event *event;
 	struct ctx_switch_entry *entry;
 
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
+	event = trace_buffer_lock_reserve(tr, TRACE_CTX,
+					  sizeof(*entry), flags, pc);
 	if (!event)
 		return;
 	entry	= ring_buffer_event_data(event);
-	tracing_generic_entry_update(&entry->ent, flags, pc);
-	entry->ent.type			= TRACE_CTX;
 	entry->prev_pid			= prev->pid;
 	entry->prev_prio		= prev->prio;
 	entry->prev_state		= prev->state;
@@ -986,9 +1007,7 @@ tracing_sched_switch_trace(struct trace_array *tr,
 	entry->next_prio		= next->prio;
 	entry->next_state		= next->state;
 	entry->next_cpu	= task_cpu(next);
-	ring_buffer_unlock_commit(tr->buffer, event);
-	ftrace_trace_stack(tr, flags, 5, pc);
-	ftrace_trace_userstack(tr, flags, pc);
+	trace_buffer_unlock_commit(tr, event, flags, pc);
 }
 
 void
@@ -1000,12 +1019,11 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 	struct ring_buffer_event *event;
 	struct ctx_switch_entry *entry;
 
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
+	event = trace_buffer_lock_reserve(tr, TRACE_WAKE,
+					  sizeof(*entry), flags, pc);
 	if (!event)
 		return;
 	entry	= ring_buffer_event_data(event);
-	tracing_generic_entry_update(&entry->ent, flags, pc);
-	entry->ent.type			= TRACE_WAKE;
 	entry->prev_pid			= curr->pid;
 	entry->prev_prio		= curr->prio;
 	entry->prev_state		= curr->state;
@@ -1013,11 +1031,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 	entry->next_prio		= wakee->prio;
 	entry->next_state		= wakee->state;
 	entry->next_cpu			= task_cpu(wakee);
-	ring_buffer_unlock_commit(tr->buffer, event);
-	ftrace_trace_stack(tr, flags, 6, pc);
-	ftrace_trace_userstack(tr, flags, pc);
-
-	trace_wake_up();
+	trace_buffer_unlock_commit(tr, event, flags, pc);
 }
 
 void
@@ -2825,12 +2839,10 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
 	trace_buf[len] = 0;
 
 	size = sizeof(*entry) + len + 1;
-	event = ring_buffer_lock_reserve(tr->buffer, size);
+	event = trace_buffer_lock_reserve(tr, TRACE_PRINT, size, irq_flags, pc);
 	if (!event)
 		goto out_unlock;
 	entry = ring_buffer_event_data(event);
-	tracing_generic_entry_update(&entry->ent, irq_flags, pc);
-	entry->ent.type			= TRACE_PRINT;
 	entry->ip			= ip;
 	entry->depth			= depth;
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index df627a948694..e03f157c772e 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -403,6 +403,17 @@ int tracing_open_generic(struct inode *inode, struct file *filp);
 struct dentry *tracing_init_dentry(void);
 void init_tracer_sysprof_debugfs(struct dentry *d_tracer);
 
+struct ring_buffer_event;
+
+struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr,
+						    unsigned char type,
+						    unsigned long len,
+						    unsigned long flags,
+						    int pc);
+void trace_buffer_unlock_commit(struct trace_array *tr,
+				struct ring_buffer_event *event,
+				unsigned long flags, int pc);
+
 struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
 						struct trace_array_cpu *data);
 
diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c
index 4e08debf662d..7a30fc4c3642 100644
--- a/kernel/trace/trace_boot.c
+++ b/kernel/trace/trace_boot.c
@@ -143,17 +143,13 @@ void trace_boot_call(struct boot_trace_call *bt, initcall_t fn)
 	sprint_symbol(bt->func, (unsigned long)fn);
 	preempt_disable();
 
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
+	event = trace_buffer_lock_reserve(tr, TRACE_BOOT_CALL,
+					  sizeof(*entry), 0, 0);
 	if (!event)
 		goto out;
 	entry	= ring_buffer_event_data(event);
-	tracing_generic_entry_update(&entry->ent, 0, 0);
-	entry->ent.type = TRACE_BOOT_CALL;
 	entry->boot_call = *bt;
-	ring_buffer_unlock_commit(tr->buffer, event);
-
-	trace_wake_up();
-
+	trace_buffer_unlock_commit(tr, event, 0, 0);
  out:
 	preempt_enable();
 }
@@ -170,17 +166,13 @@ void trace_boot_ret(struct boot_trace_ret *bt, initcall_t fn)
 	sprint_symbol(bt->func, (unsigned long)fn);
 	preempt_disable();
 
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
+	event = trace_buffer_lock_reserve(tr, TRACE_BOOT_RET,
+					  sizeof(*entry), 0, 0);
 	if (!event)
 		goto out;
 	entry	= ring_buffer_event_data(event);
-	tracing_generic_entry_update(&entry->ent, 0, 0);
-	entry->ent.type = TRACE_BOOT_RET;
 	entry->boot_ret = *bt;
-	ring_buffer_unlock_commit(tr->buffer, event);
-
-	trace_wake_up();
-
+	trace_buffer_unlock_commit(tr, event, 0, 0);
  out:
 	preempt_enable();
 }
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 770e52acfc10..48b2196abe37 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -52,14 +52,13 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect)
 	if (atomic_inc_return(&tr->data[cpu]->disabled) != 1)
 		goto out;
 
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
+	pc = preempt_count();
+	event = trace_buffer_lock_reserve(tr, TRACE_BRANCH,
+					  sizeof(*entry), flags, pc);
 	if (!event)
 		goto out;
 
-	pc = preempt_count();
 	entry	= ring_buffer_event_data(event);
-	tracing_generic_entry_update(&entry->ent, flags, pc);
-	entry->ent.type		= TRACE_BRANCH;
 
 	/* Strip off the path, only save the file */
 	p = f->file + strlen(f->file);
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index e720c001db2b..2aa1c9f4c7d8 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -189,16 +189,15 @@ void trace_hw_branch(u64 from, u64 to)
 	if (atomic_inc_return(&tr->data[cpu]->disabled) != 1)
 		goto out;
 
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
+	event = trace_buffer_lock_reserve(tr, TRACE_HW_BRANCHES,
+					  sizeof(*entry), 0, 0);
 	if (!event)
 		goto out;
 	entry	= ring_buffer_event_data(event);
-	tracing_generic_entry_update(&entry->ent, 0, from);
-	entry->ent.type = TRACE_HW_BRANCHES;
 	entry->ent.cpu = cpu;
 	entry->from = from;
 	entry->to   = to;
-	ring_buffer_unlock_commit(tr->buffer, event);
+	trace_buffer_unlock_commit(tr, event, 0, 0);
 
  out:
 	atomic_dec(&tr->data[cpu]->disabled);
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 104ddebc11d1..c401b908e805 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -307,19 +307,17 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
 {
 	struct ring_buffer_event *event;
 	struct trace_mmiotrace_rw *entry;
+	int pc = preempt_count();
 
-	event	= ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
+	event = trace_buffer_lock_reserve(tr, TRACE_MMIO_RW,
+					  sizeof(*entry), 0, pc);
 	if (!event) {
 		atomic_inc(&dropped_count);
 		return;
 	}
 	entry	= ring_buffer_event_data(event);
-	tracing_generic_entry_update(&entry->ent, 0, preempt_count());
-	entry->ent.type			= TRACE_MMIO_RW;
 	entry->rw			= *rw;
-	ring_buffer_unlock_commit(tr->buffer, event);
-
-	trace_wake_up();
+	trace_buffer_unlock_commit(tr, event, 0, pc);
 }
 
 void mmio_trace_rw(struct mmiotrace_rw *rw)
@@ -335,19 +333,17 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
 {
 	struct ring_buffer_event *event;
 	struct trace_mmiotrace_map *entry;
+	int pc = preempt_count();
 
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
+	event = trace_buffer_lock_reserve(tr, TRACE_MMIO_MAP,
+					  sizeof(*entry), 0, pc);
 	if (!event) {
 		atomic_inc(&dropped_count);
 		return;
 	}
 	entry	= ring_buffer_event_data(event);
-	tracing_generic_entry_update(&entry->ent, 0, preempt_count());
-	entry->ent.type			= TRACE_MMIO_MAP;
 	entry->map			= *map;
-	ring_buffer_unlock_commit(tr->buffer, event);
-
-	trace_wake_up();
+	trace_buffer_unlock_commit(tr, event, 0, pc);
 }
 
 void mmio_trace_mapping(struct mmiotrace_map *map)
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
index 3b1a292d12d2..bfc21f8079ab 100644
--- a/kernel/trace/trace_power.c
+++ b/kernel/trace/trace_power.c
@@ -124,17 +124,13 @@ void trace_power_end(struct power_trace *it)
 	it->end = ktime_get();
 	data = tr->data[smp_processor_id()];
 
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
+	event = trace_buffer_lock_reserve(tr, TRACE_POWER,
+					  sizeof(*entry), 0, 0);
 	if (!event)
 		goto out;
 	entry	= ring_buffer_event_data(event);
-	tracing_generic_entry_update(&entry->ent, 0, 0);
-	entry->ent.type = TRACE_POWER;
 	entry->state_data = *it;
-	ring_buffer_unlock_commit(tr->buffer, event);
-
-	trace_wake_up();
-
+	trace_buffer_unlock_commit(tr, event, 0, 0);
  out:
 	preempt_enable();
 }
@@ -159,17 +155,13 @@ void trace_power_mark(struct power_trace *it, unsigned int type,
 	it->end = it->stamp;
 	data = tr->data[smp_processor_id()];
 
-	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
+	event = trace_buffer_lock_reserve(tr, TRACE_POWER,
+					  sizeof(*entry), 0, 0);
 	if (!event)
 		goto out;
 	entry	= ring_buffer_event_data(event);
-	tracing_generic_entry_update(&entry->ent, 0, 0);
-	entry->ent.type = TRACE_POWER;
 	entry->state_data = *it;
-	ring_buffer_unlock_commit(tr->buffer, event);
-
-	trace_wake_up();
-
+	trace_buffer_unlock_commit(tr, event, 0, 0);
  out:
 	preempt_enable();
 }
-- 
cgit v1.2.3


From b6f11df26fdc28324cf9c9e3b77f2dc985c1bb13 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Thu, 5 Feb 2009 18:02:00 -0200
Subject: trace: Call tracing_reset_online_cpus before tracer->init()
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Impact: cleanup

To make it easy for ftrace plugin writers, as this was open coded in
the existing plugins

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Acked-by: Frédéric Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 block/blktrace.c                     |  2 --
 kernel/trace/trace.c                 |  8 +++++++-
 kernel/trace/trace.h                 |  1 +
 kernel/trace/trace_branch.c          |  1 -
 kernel/trace/trace_functions.c       | 17 +++--------------
 kernel/trace/trace_functions_graph.c |  1 -
 kernel/trace/trace_hw_branches.c     |  1 -
 kernel/trace/trace_nop.c             |  1 -
 kernel/trace/trace_sched_switch.c    |  8 +-------
 kernel/trace/trace_selftest.c        | 18 +++++++++---------
 kernel/trace/trace_sysprof.c         | 14 ++++----------
 11 files changed, 25 insertions(+), 47 deletions(-)

(limited to 'kernel')

diff --git a/block/blktrace.c b/block/blktrace.c
index 834cd84037b2..ca6d32061e4f 100644
--- a/block/blktrace.c
+++ b/block/blktrace.c
@@ -1086,8 +1086,6 @@ static void blk_tracer_print_header(struct seq_file *m)
 
 static void blk_tracer_start(struct trace_array *tr)
 {
-	tracing_reset_online_cpus(tr);
-
 	mutex_lock(&blk_probe_mutex);
 	if (atomic_add_return(1, &blk_probes_ref) == 1)
 		if (blk_register_tracepoints())
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8fad3776e843..ef4dbac95568 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2171,6 +2171,12 @@ tracing_set_trace_read(struct file *filp, char __user *ubuf,
 	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
 }
 
+int tracer_init(struct tracer *t, struct trace_array *tr)
+{
+	tracing_reset_online_cpus(tr);
+	return t->init(tr);
+}
+
 static int tracing_set_tracer(const char *buf)
 {
 	struct trace_array *tr = &global_trace;
@@ -2195,7 +2201,7 @@ static int tracing_set_tracer(const char *buf)
 
 	current_trace = t;
 	if (t->init) {
-		ret = t->init(tr);
+		ret = tracer_init(t, tr);
 		if (ret)
 			goto out;
 	}
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index e03f157c772e..f2742fb1575a 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -395,6 +395,7 @@ struct trace_iterator {
 	cpumask_var_t		started;
 };
 
+int tracer_init(struct tracer *t, struct trace_array *tr);
 int tracing_is_enabled(void);
 void trace_wake_up(void);
 void tracing_reset(struct trace_array *tr, int cpu);
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index 48b2196abe37..f8ae2c50e01d 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -131,7 +131,6 @@ static void stop_branch_trace(struct trace_array *tr)
 
 static int branch_trace_init(struct trace_array *tr)
 {
-	tracing_reset_online_cpus(tr);
 	start_branch_trace(tr);
 	return 0;
 }
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index d067cea2ccc3..36bf9568ccd9 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -24,32 +24,21 @@ static struct trace_array	*func_trace;
 static void tracing_start_function_trace(void);
 static void tracing_stop_function_trace(void);
 
-static void start_function_trace(struct trace_array *tr)
+static int function_trace_init(struct trace_array *tr)
 {
 	func_trace = tr;
 	tr->cpu = get_cpu();
-	tracing_reset_online_cpus(tr);
 	put_cpu();
 
 	tracing_start_cmdline_record();
 	tracing_start_function_trace();
-}
-
-static void stop_function_trace(struct trace_array *tr)
-{
-	tracing_stop_function_trace();
-	tracing_stop_cmdline_record();
-}
-
-static int function_trace_init(struct trace_array *tr)
-{
-	start_function_trace(tr);
 	return 0;
 }
 
 static void function_trace_reset(struct trace_array *tr)
 {
-	stop_function_trace(tr);
+	tracing_stop_function_trace();
+	tracing_stop_cmdline_record();
 }
 
 static void function_trace_start(struct trace_array *tr)
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index c97594d826bc..222f97d336a6 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -56,7 +56,6 @@ static int graph_trace_init(struct trace_array *tr)
 					&trace_graph_entry);
 	if (ret)
 		return ret;
-	tracing_reset_online_cpus(tr);
 	tracing_start_cmdline_record();
 
 	return 0;
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index 2aa1c9f4c7d8..ca4bbcfb9e2c 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -132,7 +132,6 @@ static int bts_trace_init(struct trace_array *tr)
 	hw_branch_trace = tr;
 
 	register_hotcpu_notifier(&bts_hotcpu_notifier);
-	tracing_reset_online_cpus(tr);
 	bts_trace_start(tr);
 
 	return 0;
diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c
index 087b6cbf4ea5..9aa84bde23cd 100644
--- a/kernel/trace/trace_nop.c
+++ b/kernel/trace/trace_nop.c
@@ -48,7 +48,6 @@ static void stop_nop_trace(struct trace_array *tr)
 static int nop_trace_init(struct trace_array *tr)
 {
 	ctx_trace = tr;
-	tracing_reset_online_cpus(tr);
 	start_nop_trace(tr);
 	return 0;
 }
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index c4f9add5ec90..30e14fe85896 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -185,12 +185,6 @@ void tracing_sched_switch_assign_trace(struct trace_array *tr)
 	ctx_trace = tr;
 }
 
-static void start_sched_trace(struct trace_array *tr)
-{
-	tracing_reset_online_cpus(tr);
-	tracing_start_sched_switch_record();
-}
-
 static void stop_sched_trace(struct trace_array *tr)
 {
 	tracing_stop_sched_switch_record();
@@ -199,7 +193,7 @@ static void stop_sched_trace(struct trace_array *tr)
 static int sched_switch_trace_init(struct trace_array *tr)
 {
 	ctx_trace = tr;
-	start_sched_trace(tr);
+	tracing_start_sched_switch_record();
 	return 0;
 }
 
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 5013812578b1..445700e51f6d 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -115,7 +115,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
 	ftrace_set_filter(func_name, strlen(func_name), 1);
 
 	/* enable tracing */
-	ret = trace->init(tr);
+	ret = tracer_init(trace, tr);
 	if (ret) {
 		warn_failed_init_tracer(trace, ret);
 		goto out;
@@ -189,7 +189,7 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
 	ftrace_enabled = 1;
 	tracer_enabled = 1;
 
-	ret = trace->init(tr);
+	ret = tracer_init(trace, tr);
 	if (ret) {
 		warn_failed_init_tracer(trace, ret);
 		goto out;
@@ -236,7 +236,7 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
 	int ret;
 
 	/* start the tracing */
-	ret = trace->init(tr);
+	ret = tracer_init(trace, tr);
 	if (ret) {
 		warn_failed_init_tracer(trace, ret);
 		return ret;
@@ -290,7 +290,7 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
 	}
 
 	/* start the tracing */
-	ret = trace->init(tr);
+	ret = tracer_init(trace, tr);
 	if (ret) {
 		warn_failed_init_tracer(trace, ret);
 		return ret;
@@ -344,7 +344,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
 	}
 
 	/* start the tracing */
-	ret = trace->init(tr);
+	ret = tracer_init(trace, tr);
 	if (ret) {
 		warn_failed_init_tracer(trace, ret);
 		goto out;
@@ -476,7 +476,7 @@ trace_selftest_startup_wakeup(struct tracer *trace, struct trace_array *tr)
 	wait_for_completion(&isrt);
 
 	/* start the tracing */
-	ret = trace->init(tr);
+	ret = tracer_init(trace, tr);
 	if (ret) {
 		warn_failed_init_tracer(trace, ret);
 		return ret;
@@ -537,7 +537,7 @@ trace_selftest_startup_sched_switch(struct tracer *trace, struct trace_array *tr
 	int ret;
 
 	/* start the tracing */
-	ret = trace->init(tr);
+	ret = tracer_init(trace, tr);
 	if (ret) {
 		warn_failed_init_tracer(trace, ret);
 		return ret;
@@ -569,7 +569,7 @@ trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr)
 	int ret;
 
 	/* start the tracing */
-	ret = trace->init(tr);
+	ret = tracer_init(trace, tr);
 	if (ret) {
 		warn_failed_init_tracer(trace, ret);
 		return 0;
@@ -596,7 +596,7 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
 	int ret;
 
 	/* start the tracing */
-	ret = trace->init(tr);
+	ret = tracer_init(trace, tr);
 	if (ret) {
 		warn_failed_init_tracer(trace, ret);
 		return ret;
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index eaca5ad803ff..84ca9d81e74d 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -226,15 +226,6 @@ static void stop_stack_timers(void)
 		stop_stack_timer(cpu);
 }
 
-static void start_stack_trace(struct trace_array *tr)
-{
-	mutex_lock(&sample_timer_lock);
-	tracing_reset_online_cpus(tr);
-	start_stack_timers();
-	tracer_enabled = 1;
-	mutex_unlock(&sample_timer_lock);
-}
-
 static void stop_stack_trace(struct trace_array *tr)
 {
 	mutex_lock(&sample_timer_lock);
@@ -247,7 +238,10 @@ static int stack_trace_init(struct trace_array *tr)
 {
 	sysprof_trace = tr;
 
-	start_stack_trace(tr);
+	mutex_lock(&sample_timer_lock);
+	start_stack_timers();
+	tracer_enabled = 1;
+	mutex_unlock(&sample_timer_lock);
 	return 0;
 }
 
-- 
cgit v1.2.3


From 1830b52d0de8c60c4f5dfbac134aa8f69d815801 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Sat, 7 Feb 2009 19:38:43 -0500
Subject: trace: remove deprecated entry->cpu

Impact: fix to prevent developers from using entry->cpu

With the new ring buffer infrastructure, the cpu for the entry is
implicit with which CPU buffer it is on.

The original code use to record the current cpu into the generic
entry header, which can be retrieved by entry->cpu. When the
ring buffer was introduced, the users were convert to use the
the cpu number of which cpu ring buffer was in use (this was passed
to the tracers by the iterator: iter->cpu).

Unfortunately, the cpu item in the entry structure was never removed.
This allowed for developers to use it instead of the proper iter->cpu,
unknowingly, using an uninitialized variable. This was not the fault
of the developers, since it would seem like the logical place to
retrieve the cpu identifier.

This patch removes the cpu item from the entry structure and fixes
all the users that should have been using iter->cpu.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c             | 2 +-
 kernel/trace/trace.h             | 1 -
 kernel/trace/trace_hw_branches.c | 3 +--
 kernel/trace/trace_output.c      | 6 +++---
 4 files changed, 5 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index fd51cf0b94c7..bd4d9f8818fa 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1531,7 +1531,7 @@ static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
 
 	if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
 		SEQ_PUT_FIELD_RET(s, entry->pid);
-		SEQ_PUT_FIELD_RET(s, entry->cpu);
+		SEQ_PUT_FIELD_RET(s, iter->cpu);
 		SEQ_PUT_FIELD_RET(s, iter->ts);
 	}
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index f0c7a0f08cac..5efc4c707f7e 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -45,7 +45,6 @@ enum trace_type {
  */
 struct trace_entry {
 	unsigned char		type;
-	unsigned char		cpu;
 	unsigned char		flags;
 	unsigned char		preempt_count;
 	int			pid;
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index fff3545fc866..549238a9b133 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -159,7 +159,7 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter)
 	trace_assign_type(it, entry);
 
 	if (entry->type == TRACE_HW_BRANCHES) {
-		if (trace_seq_printf(seq, "%4d  ", entry->cpu) &&
+		if (trace_seq_printf(seq, "%4d  ", iter->cpu) &&
 		    seq_print_ip_sym(seq, it->to, symflags) &&
 		    trace_seq_printf(seq, "\t  <-  ") &&
 		    seq_print_ip_sym(seq, it->from, symflags) &&
@@ -195,7 +195,6 @@ void trace_hw_branch(u64 from, u64 to)
 	entry	= ring_buffer_event_data(event);
 	tracing_generic_entry_update(&entry->ent, 0, from);
 	entry->ent.type = TRACE_HW_BRANCHES;
-	entry->ent.cpu = cpu;
 	entry->from = from;
 	entry->to   = to;
 	ring_buffer_unlock_commit(tr->buffer, event, irq2);
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index b7380eee9fa1..463a310b1d3b 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -333,7 +333,7 @@ int trace_print_context(struct trace_iterator *iter)
 	unsigned long secs = (unsigned long)t;
 
 	return trace_seq_printf(s, "%16s-%-5d [%03d] %5lu.%06lu: ",
-				comm, entry->pid, entry->cpu, secs, usec_rem);
+				comm, entry->pid, iter->cpu, secs, usec_rem);
 }
 
 int trace_print_lat_context(struct trace_iterator *iter)
@@ -356,7 +356,7 @@ int trace_print_lat_context(struct trace_iterator *iter)
 		char *comm = trace_find_cmdline(entry->pid);
 		ret = trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx [%08lx]"
 				       " %ld.%03ldms (+%ld.%03ldms): ", comm,
-				       entry->pid, entry->cpu, entry->flags,
+				       entry->pid, iter->cpu, entry->flags,
 				       entry->preempt_count, iter->idx,
 				       ns2usecs(iter->ts),
 				       abs_usecs / USEC_PER_MSEC,
@@ -364,7 +364,7 @@ int trace_print_lat_context(struct trace_iterator *iter)
 				       rel_usecs / USEC_PER_MSEC,
 				       rel_usecs % USEC_PER_MSEC);
 	} else {
-		ret = lat_print_generic(s, entry, entry->cpu);
+		ret = lat_print_generic(s, entry, iter->cpu);
 		if (ret)
 			ret = lat_print_timestamp(s, abs_usecs, rel_usecs);
 	}
-- 
cgit v1.2.3


From 78d904b46a72fcf15ea6a39672bbef92953876b5 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 5 Feb 2009 18:43:07 -0500
Subject: ring-buffer: add NMI protection for spinlocks

Impact: prevent deadlock in NMI

The ring buffers are not yet totally lockless with writing to
the buffer. When a writer crosses a page, it grabs a per cpu spinlock
to protect against a reader. The spinlocks taken by a writer are not
to protect against other writers, since a writer can only write to
its own per cpu buffer. The spinlocks protect against readers that
can touch any cpu buffer. The writers are made to be reentrant
with the spinlocks disabling interrupts.

The problem arises when an NMI writes to the buffer, and that write
crosses a page boundary. If it grabs a spinlock, it can be racing
with another writer (since disabling interrupts does not protect
against NMIs) or with a reader on the same CPU. Luckily, most of the
users are not reentrant and protects against this issue. But if a
user of the ring buffer becomes reentrant (which is what the ring
buffers do allow), if the NMI also writes to the ring buffer then
we risk the chance of a deadlock.

This patch moves the ftrace_nmi_enter called by nmi_enter() to the
ring buffer code. It replaces the current ftrace_nmi_enter that is
used by arch specific code to arch_ftrace_nmi_enter and updates
the Kconfig to handle it.

When an NMI is called, it will set a per cpu variable in the ring buffer
code and will clear it when the NMI exits. If a write to the ring buffer
crosses page boundaries inside an NMI, a trylock is used on the spin
lock instead. If the spinlock fails to be acquired, then the entry
is discarded.

This bug appeared in the ftrace work in the RT tree, where event tracing
is reentrant. This workaround solved the deadlocks that appeared there.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 arch/x86/Kconfig           |  1 +
 arch/x86/kernel/ftrace.c   |  8 ++++----
 include/linux/ftrace_irq.h | 10 +++++++++-
 kernel/trace/Kconfig       |  8 ++++++++
 kernel/trace/ring_buffer.c | 48 ++++++++++++++++++++++++++++++++++++++++++++--
 5 files changed, 68 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 73f7fe8fd4d1..a6be725cb049 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -34,6 +34,7 @@ config X86
 	select HAVE_FUNCTION_TRACER
 	select HAVE_FUNCTION_GRAPH_TRACER
 	select HAVE_FUNCTION_TRACE_MCOUNT_TEST
+	select HAVE_FTRACE_NMI_ENTER if DYNAMIC_FTRACE || FUNCTION_GRAPH_TRACER
 	select HAVE_KVM if ((X86_32 && !X86_VOYAGER && !X86_VISWS && !X86_NUMAQ) || X86_64)
 	select HAVE_ARCH_KGDB if !X86_VOYAGER
 	select HAVE_ARCH_TRACEHOOK
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 4d33224c055f..4c683587055b 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -113,7 +113,7 @@ static void ftrace_mod_code(void)
 					     MCOUNT_INSN_SIZE);
 }
 
-void ftrace_nmi_enter(void)
+void arch_ftrace_nmi_enter(void)
 {
 	atomic_inc(&in_nmi);
 	/* Must have in_nmi seen before reading write flag */
@@ -124,7 +124,7 @@ void ftrace_nmi_enter(void)
 	}
 }
 
-void ftrace_nmi_exit(void)
+void arch_ftrace_nmi_exit(void)
 {
 	/* Finish all executions before clearing in_nmi */
 	smp_wmb();
@@ -376,12 +376,12 @@ int ftrace_disable_ftrace_graph_caller(void)
  */
 static atomic_t in_nmi;
 
-void ftrace_nmi_enter(void)
+void arch_ftrace_nmi_enter(void)
 {
 	atomic_inc(&in_nmi);
 }
 
-void ftrace_nmi_exit(void)
+void arch_ftrace_nmi_exit(void)
 {
 	atomic_dec(&in_nmi);
 }
diff --git a/include/linux/ftrace_irq.h b/include/linux/ftrace_irq.h
index 366a054d0b05..29de6779a963 100644
--- a/include/linux/ftrace_irq.h
+++ b/include/linux/ftrace_irq.h
@@ -2,7 +2,15 @@
 #define _LINUX_FTRACE_IRQ_H
 
 
-#if defined(CONFIG_DYNAMIC_FTRACE) || defined(CONFIG_FUNCTION_GRAPH_TRACER)
+#ifdef CONFIG_FTRACE_NMI_ENTER
+extern void arch_ftrace_nmi_enter(void);
+extern void arch_ftrace_nmi_exit(void);
+#else
+static inline void arch_ftrace_nmi_enter(void) { }
+static inline void arch_ftrace_nmi_exit(void) { }
+#endif
+
+#ifdef CONFIG_RING_BUFFER
 extern void ftrace_nmi_enter(void);
 extern void ftrace_nmi_exit(void);
 #else
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 28f2644484d9..25131a5d5e4f 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -9,6 +9,9 @@ config USER_STACKTRACE_SUPPORT
 config NOP_TRACER
 	bool
 
+config HAVE_FTRACE_NMI_ENTER
+	bool
+
 config HAVE_FUNCTION_TRACER
 	bool
 
@@ -37,6 +40,11 @@ config TRACER_MAX_TRACE
 config RING_BUFFER
 	bool
 
+config FTRACE_NMI_ENTER
+       bool
+       depends on HAVE_FTRACE_NMI_ENTER
+       default y
+
 config TRACING
 	bool
 	select DEBUG_FS
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index b36d7374ceef..a60a6a852f42 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -4,6 +4,7 @@
  * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
  */
 #include <linux/ring_buffer.h>
+#include <linux/ftrace_irq.h>
 #include <linux/spinlock.h>
 #include <linux/debugfs.h>
 #include <linux/uaccess.h>
@@ -18,6 +19,35 @@
 
 #include "trace.h"
 
+/*
+ * Since the write to the buffer is still not fully lockless,
+ * we must be careful with NMIs. The locks in the writers
+ * are taken when a write crosses to a new page. The locks
+ * protect against races with the readers (this will soon
+ * be fixed with a lockless solution).
+ *
+ * Because we can not protect against NMIs, and we want to
+ * keep traces reentrant, we need to manage what happens
+ * when we are in an NMI.
+ */
+static DEFINE_PER_CPU(int, rb_in_nmi);
+
+void ftrace_nmi_enter(void)
+{
+	__get_cpu_var(rb_in_nmi)++;
+	/* call arch specific handler too */
+	arch_ftrace_nmi_enter();
+}
+
+void ftrace_nmi_exit(void)
+{
+	arch_ftrace_nmi_exit();
+	__get_cpu_var(rb_in_nmi)--;
+	/* NMIs are not recursive */
+	WARN_ON_ONCE(__get_cpu_var(rb_in_nmi));
+}
+
+
 /*
  * A fast way to enable or disable all ring buffers is to
  * call tracing_on or tracing_off. Turning off the ring buffers
@@ -982,6 +1012,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 	struct ring_buffer *buffer = cpu_buffer->buffer;
 	struct ring_buffer_event *event;
 	unsigned long flags;
+	bool lock_taken = false;
 
 	commit_page = cpu_buffer->commit_page;
 	/* we just need to protect against interrupts */
@@ -995,7 +1026,19 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 		struct buffer_page *next_page = tail_page;
 
 		local_irq_save(flags);
-		__raw_spin_lock(&cpu_buffer->lock);
+		/*
+		 * NMIs can happen after we take the lock.
+		 * If we are in an NMI, only take the lock
+		 * if it is not already taken. Otherwise
+		 * simply fail.
+		 */
+		if (unlikely(__get_cpu_var(rb_in_nmi))) {
+			if (!__raw_spin_trylock(&cpu_buffer->lock))
+				goto out_unlock;
+		} else
+			__raw_spin_lock(&cpu_buffer->lock);
+
+		lock_taken = true;
 
 		rb_inc_page(cpu_buffer, &next_page);
 
@@ -1097,7 +1140,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 	if (tail <= BUF_PAGE_SIZE)
 		local_set(&tail_page->write, tail);
 
-	__raw_spin_unlock(&cpu_buffer->lock);
+	if (likely(lock_taken))
+		__raw_spin_unlock(&cpu_buffer->lock);
 	local_irq_restore(flags);
 	return NULL;
 }
-- 
cgit v1.2.3


From a81bd80a0b0a405dc0483e2c428332d69da2c79f Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 6 Feb 2009 01:45:16 -0500
Subject: ring-buffer: use generic version of in_nmi

Impact: clean up

Now that a generic in_nmi is available, this patch removes the
special code in the ring_buffer and implements the in_nmi generic
version instead.

With this change, I was also able to rename the "arch_ftrace_nmi_enter"
back to "ftrace_nmi_enter" and remove the code from the ring buffer.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 arch/x86/kernel/ftrace.c   |  4 ++--
 include/linux/ftrace_irq.h |  8 --------
 kernel/trace/ring_buffer.c | 43 +++++++++++++------------------------------
 3 files changed, 15 insertions(+), 40 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 918073c6681b..d74d75e0952d 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -113,7 +113,7 @@ static void ftrace_mod_code(void)
 					     MCOUNT_INSN_SIZE);
 }
 
-void arch_ftrace_nmi_enter(void)
+void ftrace_nmi_enter(void)
 {
 	atomic_inc(&nmi_running);
 	/* Must have nmi_running seen before reading write flag */
@@ -124,7 +124,7 @@ void arch_ftrace_nmi_enter(void)
 	}
 }
 
-void arch_ftrace_nmi_exit(void)
+void ftrace_nmi_exit(void)
 {
 	/* Finish all executions before clearing nmi_running */
 	smp_wmb();
diff --git a/include/linux/ftrace_irq.h b/include/linux/ftrace_irq.h
index 29de6779a963..dca7bf8cffe2 100644
--- a/include/linux/ftrace_irq.h
+++ b/include/linux/ftrace_irq.h
@@ -3,14 +3,6 @@
 
 
 #ifdef CONFIG_FTRACE_NMI_ENTER
-extern void arch_ftrace_nmi_enter(void);
-extern void arch_ftrace_nmi_exit(void);
-#else
-static inline void arch_ftrace_nmi_enter(void) { }
-static inline void arch_ftrace_nmi_exit(void) { }
-#endif
-
-#ifdef CONFIG_RING_BUFFER
 extern void ftrace_nmi_enter(void);
 extern void ftrace_nmi_exit(void);
 #else
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index a60a6a852f42..5ee344417cd5 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -8,6 +8,7 @@
 #include <linux/spinlock.h>
 #include <linux/debugfs.h>
 #include <linux/uaccess.h>
+#include <linux/hardirq.h>
 #include <linux/module.h>
 #include <linux/percpu.h>
 #include <linux/mutex.h>
@@ -19,35 +20,6 @@
 
 #include "trace.h"
 
-/*
- * Since the write to the buffer is still not fully lockless,
- * we must be careful with NMIs. The locks in the writers
- * are taken when a write crosses to a new page. The locks
- * protect against races with the readers (this will soon
- * be fixed with a lockless solution).
- *
- * Because we can not protect against NMIs, and we want to
- * keep traces reentrant, we need to manage what happens
- * when we are in an NMI.
- */
-static DEFINE_PER_CPU(int, rb_in_nmi);
-
-void ftrace_nmi_enter(void)
-{
-	__get_cpu_var(rb_in_nmi)++;
-	/* call arch specific handler too */
-	arch_ftrace_nmi_enter();
-}
-
-void ftrace_nmi_exit(void)
-{
-	arch_ftrace_nmi_exit();
-	__get_cpu_var(rb_in_nmi)--;
-	/* NMIs are not recursive */
-	WARN_ON_ONCE(__get_cpu_var(rb_in_nmi));
-}
-
-
 /*
  * A fast way to enable or disable all ring buffers is to
  * call tracing_on or tracing_off. Turning off the ring buffers
@@ -1027,12 +999,23 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 
 		local_irq_save(flags);
 		/*
+		 * Since the write to the buffer is still not
+		 * fully lockless, we must be careful with NMIs.
+		 * The locks in the writers are taken when a write
+		 * crosses to a new page. The locks protect against
+		 * races with the readers (this will soon be fixed
+		 * with a lockless solution).
+		 *
+		 * Because we can not protect against NMIs, and we
+		 * want to keep traces reentrant, we need to manage
+		 * what happens when we are in an NMI.
+		 *
 		 * NMIs can happen after we take the lock.
 		 * If we are in an NMI, only take the lock
 		 * if it is not already taken. Otherwise
 		 * simply fail.
 		 */
-		if (unlikely(__get_cpu_var(rb_in_nmi))) {
+		if (unlikely(in_nmi())) {
 			if (!__raw_spin_trylock(&cpu_buffer->lock))
 				goto out_unlock;
 		} else
-- 
cgit v1.2.3


From 57794a9d48b63e34acbe63282628c9f029603308 Mon Sep 17 00:00:00 2001
From: Wenji Huang <wenji.huang@oracle.com>
Date: Fri, 6 Feb 2009 17:33:27 +0800
Subject: trace: trivial fixes in comment typos.

Impact: clean up

Fixed several typos in the comments.

Signed-off-by: Wenji Huang <wenji.huang@oracle.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 include/linux/ftrace.h | 2 +-
 kernel/trace/ftrace.c  | 6 +++---
 kernel/trace/trace.h   | 6 +++---
 3 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 7840e718c6c7..5e302d636fc2 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -140,7 +140,7 @@ static inline int ftrace_disable_ftrace_graph_caller(void) { return 0; }
 #endif
 
 /**
- * ftrace_make_nop - convert code into top
+ * ftrace_make_nop - convert code into nop
  * @mod: module structure if called by module load initialization
  * @rec: the mcount call site record
  * @addr: the address that the call site should be calling
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 68610031780b..1796e018fbff 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -465,7 +465,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
 	 * it is not enabled then do nothing.
 	 *
 	 * If this record is not to be traced and
-	 * it is enabled then disabled it.
+	 * it is enabled then disable it.
 	 *
 	 */
 	if (rec->flags & FTRACE_FL_NOTRACE) {
@@ -485,7 +485,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
 		if (fl == (FTRACE_FL_FILTER | FTRACE_FL_ENABLED))
 			return 0;
 
-		/* Record is not filtered and is not enabled do nothing */
+		/* Record is not filtered or enabled, do nothing */
 		if (!fl)
 			return 0;
 
@@ -507,7 +507,7 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
 
 		} else {
 
-			/* if record is not enabled do nothing */
+			/* if record is not enabled, do nothing */
 			if (!(rec->flags & FTRACE_FL_ENABLED))
 				return 0;
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 5efc4c707f7e..f92aba52a894 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -616,12 +616,12 @@ extern struct tracer nop_trace;
  * preempt_enable (after a disable), a schedule might take place
  * causing an infinite recursion.
  *
- * To prevent this, we read the need_recshed flag before
+ * To prevent this, we read the need_resched flag before
  * disabling preemption. When we want to enable preemption we
  * check the flag, if it is set, then we call preempt_enable_no_resched.
  * Otherwise, we call preempt_enable.
  *
- * The rational for doing the above is that if need resched is set
+ * The rational for doing the above is that if need_resched is set
  * and we have yet to reschedule, we are either in an atomic location
  * (where we do not need to check for scheduling) or we are inside
  * the scheduler and do not want to resched.
@@ -642,7 +642,7 @@ static inline int ftrace_preempt_disable(void)
  *
  * This is a scheduler safe way to enable preemption and not miss
  * any preemption checks. The disabled saved the state of preemption.
- * If resched is set, then we were either inside an atomic or
+ * If resched is set, then we are either inside an atomic or
  * are inside the scheduler (we would have already scheduled
  * otherwise). In this case, we do not want to call normal
  * preempt_enable, but preempt_enable_no_resched instead.
-- 
cgit v1.2.3


From 7a89bbc74937cd74a6bcf109cfc7c032109639be Mon Sep 17 00:00:00 2001
From: Cornelia Huck <cornelia.huck@de.ibm.com>
Date: Mon, 19 Jan 2009 13:45:28 +0100
Subject: async: Fix running list handling.

async_schedule() should pass in async_running as the running
list, and run_one_entry() should put the entry to be run on
the provided running list instead of always on the generic one.

Reported-by: Jonathan Corbet <corbet@lwn.net>
Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 kernel/async.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/async.c b/kernel/async.c
index 67a2be71f517..0c90d500ab68 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -133,7 +133,7 @@ static void run_one_entry(void)
 
 	/* 2) move it to the running queue */
 	list_del(&entry->list);
-	list_add_tail(&entry->list, &async_running);
+	list_add_tail(&entry->list, entry->running);
 	spin_unlock_irqrestore(&async_lock, flags);
 
 	/* 3) run it (and print duration)*/
@@ -210,7 +210,7 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l
 
 async_cookie_t async_schedule(async_func_ptr *ptr, void *data)
 {
-	return __async_schedule(ptr, data, &async_pending);
+	return __async_schedule(ptr, data, &async_running);
 }
 EXPORT_SYMBOL_GPL(async_schedule);
 
-- 
cgit v1.2.3


From 86532d8b167e71e24da8b564348b52977b76d15f Mon Sep 17 00:00:00 2001
From: Cornelia Huck <cornelia.huck@de.ibm.com>
Date: Mon, 19 Jan 2009 13:45:31 +0100
Subject: async: Handle kthread_run() return codes.

If we fail to create the manager thread, fall back to non-fastboot.
If we fail to create an async thread, try again after waiting for
a bit.

Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 kernel/async.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/async.c b/kernel/async.c
index 0c90d500ab68..078d5ed150d1 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -54,6 +54,7 @@ asynchronous and synchronous parts of the kernel.
 #include <linux/sched.h>
 #include <linux/init.h>
 #include <linux/kthread.h>
+#include <linux/delay.h>
 #include <asm/atomic.h>
 
 static async_cookie_t next_cookie = 1;
@@ -319,7 +320,11 @@ static int async_manager_thread(void *unused)
 		ec = atomic_read(&entry_count);
 
 		while (tc < ec && tc < MAX_THREADS) {
-			kthread_run(async_thread, NULL, "async/%i", tc);
+			if (IS_ERR(kthread_run(async_thread, NULL, "async/%i",
+					       tc))) {
+				msleep(100);
+				continue;
+			}
 			atomic_inc(&thread_count);
 			tc++;
 		}
@@ -334,7 +339,9 @@ static int async_manager_thread(void *unused)
 static int __init async_init(void)
 {
 	if (async_enabled)
-		kthread_run(async_manager_thread, NULL, "async/mgr");
+		if (IS_ERR(kthread_run(async_manager_thread, NULL,
+				       "async/mgr")))
+			async_enabled = 0;
 	return 0;
 }
 
-- 
cgit v1.2.3


From f30d5b307c694e03368ab55f2f96b0ca4131e775 Mon Sep 17 00:00:00 2001
From: Cornelia Huck <cornelia.huck@de.ibm.com>
Date: Mon, 19 Jan 2009 13:45:33 +0100
Subject: async: Add some documentation.

Add some kerneldoc to the async interface.

Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 kernel/async.c | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

(limited to 'kernel')

diff --git a/kernel/async.c b/kernel/async.c
index 078d5ed150d1..b5f0d4b94937 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -209,18 +209,43 @@ static async_cookie_t __async_schedule(async_func_ptr *ptr, void *data, struct l
 	return newcookie;
 }
 
+/**
+ * async_schedule - schedule a function for asynchronous execution
+ * @ptr: function to execute asynchronously
+ * @data: data pointer to pass to the function
+ *
+ * Returns an async_cookie_t that may be used for checkpointing later.
+ * Note: This function may be called from atomic or non-atomic contexts.
+ */
 async_cookie_t async_schedule(async_func_ptr *ptr, void *data)
 {
 	return __async_schedule(ptr, data, &async_running);
 }
 EXPORT_SYMBOL_GPL(async_schedule);
 
+/**
+ * async_schedule_special - schedule a function for asynchronous execution with a special running queue
+ * @ptr: function to execute asynchronously
+ * @data: data pointer to pass to the function
+ * @running: list head to add to while running
+ *
+ * Returns an async_cookie_t that may be used for checkpointing later.
+ * @running may be used in the async_synchronize_*_special() functions
+ * to wait on a special running queue rather than on the global running
+ * queue.
+ * Note: This function may be called from atomic or non-atomic contexts.
+ */
 async_cookie_t async_schedule_special(async_func_ptr *ptr, void *data, struct list_head *running)
 {
 	return __async_schedule(ptr, data, running);
 }
 EXPORT_SYMBOL_GPL(async_schedule_special);
 
+/**
+ * async_synchronize_full - synchronize all asynchronous function calls
+ *
+ * This function waits until all asynchronous function calls have been done.
+ */
 void async_synchronize_full(void)
 {
 	do {
@@ -229,12 +254,27 @@ void async_synchronize_full(void)
 }
 EXPORT_SYMBOL_GPL(async_synchronize_full);
 
+/**
+ * async_synchronize_full_special - synchronize all asynchronous function calls for a running list
+ * @list: running list to synchronize on
+ *
+ * This function waits until all asynchronous function calls for the running
+ * list @list have been done.
+ */
 void async_synchronize_full_special(struct list_head *list)
 {
 	async_synchronize_cookie_special(next_cookie, list);
 }
 EXPORT_SYMBOL_GPL(async_synchronize_full_special);
 
+/**
+ * async_synchronize_cookie_special - synchronize asynchronous function calls on a running list with cookie checkpointing
+ * @cookie: async_cookie_t to use as checkpoint
+ * @running: running list to synchronize on
+ *
+ * This function waits until all asynchronous function calls for the running
+ * list @list submitted prior to @cookie have been done.
+ */
 void async_synchronize_cookie_special(async_cookie_t cookie, struct list_head *running)
 {
 	ktime_t starttime, delta, endtime;
@@ -257,6 +297,13 @@ void async_synchronize_cookie_special(async_cookie_t cookie, struct list_head *r
 }
 EXPORT_SYMBOL_GPL(async_synchronize_cookie_special);
 
+/**
+ * async_synchronize_cookie - synchronize asynchronous function calls with cookie checkpointing
+ * @cookie: async_cookie_t to use as checkpoint
+ *
+ * This function waits until all asynchronous function calls prior to @cookie
+ * have been done.
+ */
 void async_synchronize_cookie(async_cookie_t cookie)
 {
 	async_synchronize_cookie_special(cookie, &async_running);
-- 
cgit v1.2.3


From 766ccb9ed406c230d13c145def08ebea1b932982 Mon Sep 17 00:00:00 2001
From: Cornelia Huck <cornelia.huck@de.ibm.com>
Date: Tue, 20 Jan 2009 15:31:31 +0100
Subject: async: Rename _special -> _domain for clarity.

Rename the async_*_special() functions to async_*_domain(), which
describes the purpose of these functions much better.
[Broke up long lines to silence checkpatch]

Signed-off-by: Cornelia Huck <cornelia.huck@de.ibm.com>
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 fs/super.c            |  4 ++--
 include/linux/async.h |  8 +++++---
 kernel/async.c        | 41 ++++++++++++++++++++++-------------------
 3 files changed, 29 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/fs/super.c b/fs/super.c
index 645e5403f2a0..61dce001dd57 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -301,7 +301,7 @@ void generic_shutdown_super(struct super_block *sb)
 		/*
 		 * wait for asynchronous fs operations to finish before going further
 		 */
-		async_synchronize_full_special(&sb->s_async_list);
+		async_synchronize_full_domain(&sb->s_async_list);
 
 		/* bad name - it should be evict_inodes() */
 		invalidate_inodes(sb);
@@ -470,7 +470,7 @@ restart:
 		sb->s_count++;
 		spin_unlock(&sb_lock);
 		down_read(&sb->s_umount);
-		async_synchronize_full_special(&sb->s_async_list);
+		async_synchronize_full_domain(&sb->s_async_list);
 		if (sb->s_root && (wait || sb->s_dirt))
 			sb->s_op->sync_fs(sb, wait);
 		up_read(&sb->s_umount);
diff --git a/include/linux/async.h b/include/linux/async.h
index c4ecacd0b327..68a9530196f2 100644
--- a/include/linux/async.h
+++ b/include/linux/async.h
@@ -17,9 +17,11 @@ typedef u64 async_cookie_t;
 typedef void (async_func_ptr) (void *data, async_cookie_t cookie);
 
 extern async_cookie_t async_schedule(async_func_ptr *ptr, void *data);
-extern async_cookie_t async_schedule_special(async_func_ptr *ptr, void *data, struct list_head *list);
+extern async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data,
+					    struct list_head *list);
 extern void async_synchronize_full(void);
-extern void async_synchronize_full_special(struct list_head *list);
+extern void async_synchronize_full_domain(struct list_head *list);
 extern void async_synchronize_cookie(async_cookie_t cookie);
-extern void async_synchronize_cookie_special(async_cookie_t cookie, struct list_head *list);
+extern void async_synchronize_cookie_domain(async_cookie_t cookie,
+					    struct list_head *list);
 
diff --git a/kernel/async.c b/kernel/async.c
index b5f0d4b94937..e23399d88bac 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -224,22 +224,23 @@ async_cookie_t async_schedule(async_func_ptr *ptr, void *data)
 EXPORT_SYMBOL_GPL(async_schedule);
 
 /**
- * async_schedule_special - schedule a function for asynchronous execution with a special running queue
+ * async_schedule_domain - schedule a function for asynchronous execution within a certain domain
  * @ptr: function to execute asynchronously
  * @data: data pointer to pass to the function
- * @running: list head to add to while running
+ * @running: running list for the domain
  *
  * Returns an async_cookie_t that may be used for checkpointing later.
- * @running may be used in the async_synchronize_*_special() functions
- * to wait on a special running queue rather than on the global running
- * queue.
+ * @running may be used in the async_synchronize_*_domain() functions
+ * to wait within a certain synchronization domain rather than globally.
+ * A synchronization domain is specified via the running queue @running to use.
  * Note: This function may be called from atomic or non-atomic contexts.
  */
-async_cookie_t async_schedule_special(async_func_ptr *ptr, void *data, struct list_head *running)
+async_cookie_t async_schedule_domain(async_func_ptr *ptr, void *data,
+				     struct list_head *running)
 {
 	return __async_schedule(ptr, data, running);
 }
-EXPORT_SYMBOL_GPL(async_schedule_special);
+EXPORT_SYMBOL_GPL(async_schedule_domain);
 
 /**
  * async_synchronize_full - synchronize all asynchronous function calls
@@ -255,27 +256,29 @@ void async_synchronize_full(void)
 EXPORT_SYMBOL_GPL(async_synchronize_full);
 
 /**
- * async_synchronize_full_special - synchronize all asynchronous function calls for a running list
+ * async_synchronize_full_domain - synchronize all asynchronous function within a certain domain
  * @list: running list to synchronize on
  *
- * This function waits until all asynchronous function calls for the running
- * list @list have been done.
+ * This function waits until all asynchronous function calls for the
+ * synchronization domain specified by the running list @list have been done.
  */
-void async_synchronize_full_special(struct list_head *list)
+void async_synchronize_full_domain(struct list_head *list)
 {
-	async_synchronize_cookie_special(next_cookie, list);
+	async_synchronize_cookie_domain(next_cookie, list);
 }
-EXPORT_SYMBOL_GPL(async_synchronize_full_special);
+EXPORT_SYMBOL_GPL(async_synchronize_full_domain);
 
 /**
- * async_synchronize_cookie_special - synchronize asynchronous function calls on a running list with cookie checkpointing
+ * async_synchronize_cookie_domain - synchronize asynchronous function calls within a certain domain with cookie checkpointing
  * @cookie: async_cookie_t to use as checkpoint
  * @running: running list to synchronize on
  *
- * This function waits until all asynchronous function calls for the running
- * list @list submitted prior to @cookie have been done.
+ * This function waits until all asynchronous function calls for the
+ * synchronization domain specified by the running list @list submitted
+ * prior to @cookie have been done.
  */
-void async_synchronize_cookie_special(async_cookie_t cookie, struct list_head *running)
+void async_synchronize_cookie_domain(async_cookie_t cookie,
+				     struct list_head *running)
 {
 	ktime_t starttime, delta, endtime;
 
@@ -295,7 +298,7 @@ void async_synchronize_cookie_special(async_cookie_t cookie, struct list_head *r
 			(long long)ktime_to_ns(delta) >> 10);
 	}
 }
-EXPORT_SYMBOL_GPL(async_synchronize_cookie_special);
+EXPORT_SYMBOL_GPL(async_synchronize_cookie_domain);
 
 /**
  * async_synchronize_cookie - synchronize asynchronous function calls with cookie checkpointing
@@ -306,7 +309,7 @@ EXPORT_SYMBOL_GPL(async_synchronize_cookie_special);
  */
 void async_synchronize_cookie(async_cookie_t cookie)
 {
-	async_synchronize_cookie_special(cookie, &async_running);
+	async_synchronize_cookie_domain(cookie, &async_running);
 }
 EXPORT_SYMBOL_GPL(async_synchronize_cookie);
 
-- 
cgit v1.2.3


From f7de7621f07495ad14fb23a812003bcf8f6af65a Mon Sep 17 00:00:00 2001
From: Stefan Richter <stefanr@s5r6.in-berlin.de>
Date: Mon, 2 Feb 2009 13:24:34 +0100
Subject: async: use list_move_tail

list.h provides a dedicated primitive for
"list_del followed by list_add_tail"... list_move_tail.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Stefan Richter <stefanr@s5r6.in-berlin.de>
---
 kernel/async.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/async.c b/kernel/async.c
index e23399d88bac..f565891f2c9b 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -133,8 +133,7 @@ static void run_one_entry(void)
 	entry = list_first_entry(&async_pending, struct async_entry, list);
 
 	/* 2) move it to the running queue */
-	list_del(&entry->list);
-	list_add_tail(&entry->list, entry->running);
+	list_move_tail(&entry->list, entry->running);
 	spin_unlock_irqrestore(&async_lock, flags);
 
 	/* 3) run it (and print duration)*/
-- 
cgit v1.2.3


From 0f3c2a89c1451cdf6328f99977bd9decd4f708e1 Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Sun, 8 Feb 2009 16:18:03 -0800
Subject: irq: clear kstat_irqs

Impact: get correct kstat_irqs [/proc/interrupts] for msi/msi-x etc

need to call clear_kstat_irqs(), so when we reuse that irq_desc,
we get correct kstat in /proc/interrupts.

This makes /proc/interrupts not have <NULL> entries.

Don't need to worry about arch that doesn't support genirq, because they
will not call dynamic_irq_cleanup().

v2: simplify and make clear_kstat_irqs more robust

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/irq/chip.c      | 1 +
 kernel/irq/handle.c    | 5 +++++
 kernel/irq/internals.h | 1 +
 3 files changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/irq/chip.c b/kernel/irq/chip.c
index f63c706d25e1..1310856cb22b 100644
--- a/kernel/irq/chip.c
+++ b/kernel/irq/chip.c
@@ -78,6 +78,7 @@ void dynamic_irq_cleanup(unsigned int irq)
 	desc->handle_irq = handle_bad_irq;
 	desc->chip = &no_irq_chip;
 	desc->name = NULL;
+	clear_kstat_irqs(desc);
 	spin_unlock_irqrestore(&desc->lock, flags);
 }
 
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 48299a8a22f8..1b473e7569aa 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -242,6 +242,11 @@ struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu)
 }
 #endif /* !CONFIG_SPARSE_IRQ */
 
+void clear_kstat_irqs(struct irq_desc *desc)
+{
+	memset(desc->kstat_irqs, 0, nr_cpu_ids * sizeof(*(desc->kstat_irqs)));
+}
+
 /*
  * What should we do if we get a hw irq event on an illegal vector?
  * Each architecture has to answer this themself.
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index e6d0a43cc125..b60950bf5a16 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -15,6 +15,7 @@ extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
 
 extern struct lock_class_key irq_desc_lock_class;
 extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr);
+extern void clear_kstat_irqs(struct irq_desc *desc);
 extern spinlock_t sparse_irq_lock;
 extern struct irq_desc *irq_desc_ptrs[NR_IRQS];
 
-- 
cgit v1.2.3


From 005bf0e6fa0e9543933fe2e36322af649df7cacb Mon Sep 17 00:00:00 2001
From: Yinghai Lu <yinghai@kernel.org>
Date: Sun, 8 Feb 2009 16:18:03 -0800
Subject: irq: optimize init_kstat_irqs/init_copy_kstat_irqs

Simplify and make init_kstat_irqs etc more type proof, suggested by
Andrew.

Signed-off-by: Yinghai Lu <yinghai@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/irq/handle.c       | 20 +++++++++++---------
 kernel/irq/numa_migrate.c | 11 +++--------
 2 files changed, 14 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 1b473e7569aa..49d642b62c64 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -71,19 +71,21 @@ static struct irq_desc irq_desc_init = {
 
 void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr)
 {
-	unsigned long bytes;
-	char *ptr;
 	int node;
-
-	/* Compute how many bytes we need per irq and allocate them */
-	bytes = nr * sizeof(unsigned int);
+	void *ptr;
 
 	node = cpu_to_node(cpu);
-	ptr = kzalloc_node(bytes, GFP_ATOMIC, node);
-	printk(KERN_DEBUG "  alloc kstat_irqs on cpu %d node %d\n", cpu, node);
+	ptr = kzalloc_node(nr * sizeof(*desc->kstat_irqs), GFP_ATOMIC, node);
 
-	if (ptr)
-		desc->kstat_irqs = (unsigned int *)ptr;
+	/*
+	 * don't overwite if can not get new one
+	 * init_copy_kstat_irqs() could still use old one
+	 */
+	if (ptr) {
+		printk(KERN_DEBUG "  alloc kstat_irqs on cpu %d node %d\n",
+			 cpu, node);
+		desc->kstat_irqs = ptr;
+	}
 }
 
 static void init_one_irq_desc(int irq, struct irq_desc *desc, int cpu)
diff --git a/kernel/irq/numa_migrate.c b/kernel/irq/numa_migrate.c
index ecf765c6a77a..c500cfe422b6 100644
--- a/kernel/irq/numa_migrate.c
+++ b/kernel/irq/numa_migrate.c
@@ -17,16 +17,11 @@ static void init_copy_kstat_irqs(struct irq_desc *old_desc,
 				 struct irq_desc *desc,
 				 int cpu, int nr)
 {
-	unsigned long bytes;
-
 	init_kstat_irqs(desc, cpu, nr);
 
-	if (desc->kstat_irqs != old_desc->kstat_irqs) {
-		/* Compute how many bytes we need per irq and allocate them */
-		bytes = nr * sizeof(unsigned int);
-
-		memcpy(desc->kstat_irqs, old_desc->kstat_irqs, bytes);
-	}
+	if (desc->kstat_irqs != old_desc->kstat_irqs)
+		memcpy(desc->kstat_irqs, old_desc->kstat_irqs,
+			 nr * sizeof(*desc->kstat_irqs));
 }
 
 static void free_kstat_irqs(struct irq_desc *old_desc, struct irq_desc *desc)
-- 
cgit v1.2.3


From 548c8933801c9ee347b6f1bad2491e4286a4f3a2 Mon Sep 17 00:00:00 2001
From: Hannes Eder <hannes@hanneseder.net>
Date: Sun, 8 Feb 2009 20:24:47 +0100
Subject: kernel/irq: fix sparse warning: make symbol static

While being at it make every occurrence of 'do_irq_select_affinity'
have the same signature in terms of signedness of the first argument.

Fix this sparse warning:
  kernel/irq/manage.c:112:5: warning: symbol 'do_irq_select_affinity' was not declared. Should it be static?

Also rename do_irq_select_affinity() to setup_affinity() - shorter name
and clearer naming.

Signed-off-by: Hannes Eder <hannes@hanneseder.net>
Acked-by: Matthew Wilcox <matthew@wil.cx>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/irq/manage.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 291f03664552..38008b80bd59 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -109,7 +109,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
 /*
  * Generic version of the affinity autoselector.
  */
-int do_irq_select_affinity(unsigned int irq, struct irq_desc *desc)
+static int setup_affinity(unsigned int irq, struct irq_desc *desc)
 {
 	if (!irq_can_set_affinity(irq))
 		return 0;
@@ -133,7 +133,7 @@ set_affinity:
 	return 0;
 }
 #else
-static inline int do_irq_select_affinity(unsigned int irq, struct irq_desc *d)
+static inline int setup_affinity(unsigned int irq, struct irq_desc *d)
 {
 	return irq_select_affinity(irq);
 }
@@ -149,14 +149,14 @@ int irq_select_affinity_usr(unsigned int irq)
 	int ret;
 
 	spin_lock_irqsave(&desc->lock, flags);
-	ret = do_irq_select_affinity(irq, desc);
+	ret = setup_affinity(irq, desc);
 	spin_unlock_irqrestore(&desc->lock, flags);
 
 	return ret;
 }
 
 #else
-static inline int do_irq_select_affinity(int irq, struct irq_desc *desc)
+static inline int setup_affinity(unsigned int irq, struct irq_desc *desc)
 {
 	return 0;
 }
@@ -488,7 +488,7 @@ __setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new)
 			desc->status |= IRQ_NO_BALANCING;
 
 		/* Set default affinity mask once everything is setup */
-		do_irq_select_affinity(irq, desc);
+		setup_affinity(irq, desc);
 
 	} else if ((new->flags & IRQF_TRIGGER_MASK)
 			&& (new->flags & IRQF_TRIGGER_MASK)
-- 
cgit v1.2.3


From 2db270a80b8f2238e536876cfb3987af02684df8 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 7 Feb 2009 20:46:45 +0100
Subject: tracing/blktrace: move the tracing file to kernel/trace

Impact: cleanup

Move blktrace.c to kernel/trace, also move its config entry.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Acked-by: Jens Axboe <jens.axboe@oracle.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 block/Kconfig           |   24 -
 block/Makefile          |    1 -
 block/blktrace.c        | 1538 -----------------------------------------------
 kernel/trace/Kconfig    |   23 +
 kernel/trace/Makefile   |    1 +
 kernel/trace/blktrace.c | 1538 +++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 1562 insertions(+), 1563 deletions(-)
 delete mode 100644 block/blktrace.c
 create mode 100644 kernel/trace/blktrace.c

(limited to 'kernel')

diff --git a/block/Kconfig b/block/Kconfig
index 7cdaa1d72252..e7d12782bcfb 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -44,30 +44,6 @@ config LBD
 
 	  If unsure, say N.
 
-config BLK_DEV_IO_TRACE
-	bool "Support for tracing block io actions"
-	depends on SYSFS
-	select RELAY
-	select DEBUG_FS
-	select TRACEPOINTS
-	select TRACING
-	select STACKTRACE
-	help
-	  Say Y here if you want to be able to trace the block layer actions
-	  on a given queue. Tracing allows you to see any traffic happening
-	  on a block device queue. For more information (and the userspace
-	  support tools needed), fetch the blktrace tools from:
-
-	  git://git.kernel.dk/blktrace.git
-
-	  Tracing also is possible using the ftrace interface, e.g.:
-
-	    echo 1 > /sys/block/sda/sda1/trace/enable
-	    echo blk > /sys/kernel/debug/tracing/current_tracer
-	    cat /sys/kernel/debug/tracing/trace_pipe
-
-	  If unsure, say N.
-
 config BLK_DEV_BSG
 	bool "Block layer SG support v4 (EXPERIMENTAL)"
 	depends on EXPERIMENTAL
diff --git a/block/Makefile b/block/Makefile
index bfe73049f939..e9fa4dd690f2 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -13,6 +13,5 @@ obj-$(CONFIG_IOSCHED_AS)	+= as-iosched.o
 obj-$(CONFIG_IOSCHED_DEADLINE)	+= deadline-iosched.o
 obj-$(CONFIG_IOSCHED_CFQ)	+= cfq-iosched.o
 
-obj-$(CONFIG_BLK_DEV_IO_TRACE)	+= blktrace.o
 obj-$(CONFIG_BLOCK_COMPAT)	+= compat_ioctl.o
 obj-$(CONFIG_BLK_DEV_INTEGRITY)	+= blk-integrity.o
diff --git a/block/blktrace.c b/block/blktrace.c
deleted file mode 100644
index ca6d32061e4f..000000000000
--- a/block/blktrace.c
+++ /dev/null
@@ -1,1538 +0,0 @@
-/*
- * Copyright (C) 2006 Jens Axboe <axboe@kernel.dk>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
- *
- */
-#include <linux/kernel.h>
-#include <linux/blkdev.h>
-#include <linux/blktrace_api.h>
-#include <linux/percpu.h>
-#include <linux/init.h>
-#include <linux/mutex.h>
-#include <linux/debugfs.h>
-#include <linux/time.h>
-#include <trace/block.h>
-#include <linux/uaccess.h>
-#include <../kernel/trace/trace_output.h>
-
-static unsigned int blktrace_seq __read_mostly = 1;
-
-static struct trace_array *blk_tr;
-static int __read_mostly  blk_tracer_enabled;
-
-/* Select an alternative, minimalistic output than the original one */
-#define TRACE_BLK_OPT_CLASSIC 	0x1
-
-static struct tracer_opt blk_tracer_opts[] = {
-	/* Default disable the minimalistic output */
-	{ TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC) },
-	{ }
-};
-
-static struct tracer_flags blk_tracer_flags = {
-	.val  = 0,
-	.opts = blk_tracer_opts,
-};
-
-/* Global reference count of probes */
-static DEFINE_MUTEX(blk_probe_mutex);
-static atomic_t blk_probes_ref = ATOMIC_INIT(0);
-
-static int blk_register_tracepoints(void);
-static void blk_unregister_tracepoints(void);
-
-/*
- * Send out a notify message.
- */
-static void trace_note(struct blk_trace *bt, pid_t pid, int action,
-		       const void *data, size_t len)
-{
-	struct blk_io_trace *t;
-
-	if (!bt->rchan)
-		return;
-
-	t = relay_reserve(bt->rchan, sizeof(*t) + len);
-	if (t) {
-		const int cpu = smp_processor_id();
-
-		t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
-		t->time = ktime_to_ns(ktime_get());
-		t->device = bt->dev;
-		t->action = action;
-		t->pid = pid;
-		t->cpu = cpu;
-		t->pdu_len = len;
-		memcpy((void *) t + sizeof(*t), data, len);
-	}
-}
-
-/*
- * Send out a notify for this process, if we haven't done so since a trace
- * started
- */
-static void trace_note_tsk(struct blk_trace *bt, struct task_struct *tsk)
-{
-	tsk->btrace_seq = blktrace_seq;
-	trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, sizeof(tsk->comm));
-}
-
-static void trace_note_time(struct blk_trace *bt)
-{
-	struct timespec now;
-	unsigned long flags;
-	u32 words[2];
-
-	getnstimeofday(&now);
-	words[0] = now.tv_sec;
-	words[1] = now.tv_nsec;
-
-	local_irq_save(flags);
-	trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words));
-	local_irq_restore(flags);
-}
-
-void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)
-{
-	int n;
-	va_list args;
-	unsigned long flags;
-	char *buf;
-
-	if (blk_tr) {
-		va_start(args, fmt);
-		ftrace_vprintk(fmt, args);
-		va_end(args);
-		return;
-	}
-
-	if (!bt->msg_data)
-		return;
-
-	local_irq_save(flags);
-	buf = per_cpu_ptr(bt->msg_data, smp_processor_id());
-	va_start(args, fmt);
-	n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args);
-	va_end(args);
-
-	trace_note(bt, 0, BLK_TN_MESSAGE, buf, n);
-	local_irq_restore(flags);
-}
-EXPORT_SYMBOL_GPL(__trace_note_message);
-
-static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
-			 pid_t pid)
-{
-	if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0)
-		return 1;
-	if (sector < bt->start_lba || sector > bt->end_lba)
-		return 1;
-	if (bt->pid && pid != bt->pid)
-		return 1;
-
-	return 0;
-}
-
-/*
- * Data direction bit lookup
- */
-static u32 ddir_act[2] __read_mostly = { BLK_TC_ACT(BLK_TC_READ),
-					 BLK_TC_ACT(BLK_TC_WRITE) };
-
-/* The ilog2() calls fall out because they're constant */
-#define MASK_TC_BIT(rw, __name) ((rw & (1 << BIO_RW_ ## __name)) << \
-	  (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - BIO_RW_ ## __name))
-
-/*
- * The worker for the various blk_add_trace*() types. Fills out a
- * blk_io_trace structure and places it in a per-cpu subbuffer.
- */
-static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
-		     int rw, u32 what, int error, int pdu_len, void *pdu_data)
-{
-	struct task_struct *tsk = current;
-	struct ring_buffer_event *event = NULL;
-	struct blk_io_trace *t;
-	unsigned long flags = 0;
-	unsigned long *sequence;
-	pid_t pid;
-	int cpu, pc = 0;
-
-	if (unlikely(bt->trace_state != Blktrace_running ||
-		     !blk_tracer_enabled))
-		return;
-
-	what |= ddir_act[rw & WRITE];
-	what |= MASK_TC_BIT(rw, BARRIER);
-	what |= MASK_TC_BIT(rw, SYNC);
-	what |= MASK_TC_BIT(rw, AHEAD);
-	what |= MASK_TC_BIT(rw, META);
-	what |= MASK_TC_BIT(rw, DISCARD);
-
-	pid = tsk->pid;
-	if (unlikely(act_log_check(bt, what, sector, pid)))
-		return;
-	cpu = raw_smp_processor_id();
-
-	if (blk_tr) {
-		tracing_record_cmdline(current);
-
-		pc = preempt_count();
-		event = trace_buffer_lock_reserve(blk_tr, TRACE_BLK,
-						  sizeof(*t) + pdu_len,
-						  0, pc);
-		if (!event)
-			return;
-		t = ring_buffer_event_data(event);
-		goto record_it;
-	}
-
-	/*
-	 * A word about the locking here - we disable interrupts to reserve
-	 * some space in the relay per-cpu buffer, to prevent an irq
-	 * from coming in and stepping on our toes.
-	 */
-	local_irq_save(flags);
-
-	if (unlikely(tsk->btrace_seq != blktrace_seq))
-		trace_note_tsk(bt, tsk);
-
-	t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len);
-	if (t) {
-		sequence = per_cpu_ptr(bt->sequence, cpu);
-
-		t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
-		t->sequence = ++(*sequence);
-		t->time = ktime_to_ns(ktime_get());
-record_it:
-		/*
-		 * These two are not needed in ftrace as they are in the
-		 * generic trace_entry, filled by tracing_generic_entry_update,
-		 * but for the trace_event->bin() synthesizer benefit we do it
-		 * here too.
-		 */
-		t->cpu = cpu;
-		t->pid = pid;
-
-		t->sector = sector;
-		t->bytes = bytes;
-		t->action = what;
-		t->device = bt->dev;
-		t->error = error;
-		t->pdu_len = pdu_len;
-
-		if (pdu_len)
-			memcpy((void *) t + sizeof(*t), pdu_data, pdu_len);
-
-		if (blk_tr) {
-			trace_buffer_unlock_commit(blk_tr, event, 0, pc);
-			return;
-		}
-	}
-
-	local_irq_restore(flags);
-}
-
-static struct dentry *blk_tree_root;
-static DEFINE_MUTEX(blk_tree_mutex);
-
-static void blk_trace_cleanup(struct blk_trace *bt)
-{
-	debugfs_remove(bt->msg_file);
-	debugfs_remove(bt->dropped_file);
-	relay_close(bt->rchan);
-	free_percpu(bt->sequence);
-	free_percpu(bt->msg_data);
-	kfree(bt);
-	mutex_lock(&blk_probe_mutex);
-	if (atomic_dec_and_test(&blk_probes_ref))
-		blk_unregister_tracepoints();
-	mutex_unlock(&blk_probe_mutex);
-}
-
-int blk_trace_remove(struct request_queue *q)
-{
-	struct blk_trace *bt;
-
-	bt = xchg(&q->blk_trace, NULL);
-	if (!bt)
-		return -EINVAL;
-
-	if (bt->trace_state == Blktrace_setup ||
-	    bt->trace_state == Blktrace_stopped)
-		blk_trace_cleanup(bt);
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(blk_trace_remove);
-
-static int blk_dropped_open(struct inode *inode, struct file *filp)
-{
-	filp->private_data = inode->i_private;
-
-	return 0;
-}
-
-static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,
-				size_t count, loff_t *ppos)
-{
-	struct blk_trace *bt = filp->private_data;
-	char buf[16];
-
-	snprintf(buf, sizeof(buf), "%u\n", atomic_read(&bt->dropped));
-
-	return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf));
-}
-
-static const struct file_operations blk_dropped_fops = {
-	.owner =	THIS_MODULE,
-	.open =		blk_dropped_open,
-	.read =		blk_dropped_read,
-};
-
-static int blk_msg_open(struct inode *inode, struct file *filp)
-{
-	filp->private_data = inode->i_private;
-
-	return 0;
-}
-
-static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,
-				size_t count, loff_t *ppos)
-{
-	char *msg;
-	struct blk_trace *bt;
-
-	if (count > BLK_TN_MAX_MSG)
-		return -EINVAL;
-
-	msg = kmalloc(count, GFP_KERNEL);
-	if (msg == NULL)
-		return -ENOMEM;
-
-	if (copy_from_user(msg, buffer, count)) {
-		kfree(msg);
-		return -EFAULT;
-	}
-
-	bt = filp->private_data;
-	__trace_note_message(bt, "%s", msg);
-	kfree(msg);
-
-	return count;
-}
-
-static const struct file_operations blk_msg_fops = {
-	.owner =	THIS_MODULE,
-	.open =		blk_msg_open,
-	.write =	blk_msg_write,
-};
-
-/*
- * Keep track of how many times we encountered a full subbuffer, to aid
- * the user space app in telling how many lost events there were.
- */
-static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf,
-				     void *prev_subbuf, size_t prev_padding)
-{
-	struct blk_trace *bt;
-
-	if (!relay_buf_full(buf))
-		return 1;
-
-	bt = buf->chan->private_data;
-	atomic_inc(&bt->dropped);
-	return 0;
-}
-
-static int blk_remove_buf_file_callback(struct dentry *dentry)
-{
-	struct dentry *parent = dentry->d_parent;
-	debugfs_remove(dentry);
-
-	/*
-	* this will fail for all but the last file, but that is ok. what we
-	* care about is the top level buts->name directory going away, when
-	* the last trace file is gone. Then we don't have to rmdir() that
-	* manually on trace stop, so it nicely solves the issue with
-	* force killing of running traces.
-	*/
-
-	debugfs_remove(parent);
-	return 0;
-}
-
-static struct dentry *blk_create_buf_file_callback(const char *filename,
-						   struct dentry *parent,
-						   int mode,
-						   struct rchan_buf *buf,
-						   int *is_global)
-{
-	return debugfs_create_file(filename, mode, parent, buf,
-					&relay_file_operations);
-}
-
-static struct rchan_callbacks blk_relay_callbacks = {
-	.subbuf_start		= blk_subbuf_start_callback,
-	.create_buf_file	= blk_create_buf_file_callback,
-	.remove_buf_file	= blk_remove_buf_file_callback,
-};
-
-/*
- * Setup everything required to start tracing
- */
-int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
-			struct blk_user_trace_setup *buts)
-{
-	struct blk_trace *old_bt, *bt = NULL;
-	struct dentry *dir = NULL;
-	int ret, i;
-
-	if (!buts->buf_size || !buts->buf_nr)
-		return -EINVAL;
-
-	strncpy(buts->name, name, BLKTRACE_BDEV_SIZE);
-	buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0';
-
-	/*
-	 * some device names have larger paths - convert the slashes
-	 * to underscores for this to work as expected
-	 */
-	for (i = 0; i < strlen(buts->name); i++)
-		if (buts->name[i] == '/')
-			buts->name[i] = '_';
-
-	ret = -ENOMEM;
-	bt = kzalloc(sizeof(*bt), GFP_KERNEL);
-	if (!bt)
-		goto err;
-
-	bt->sequence = alloc_percpu(unsigned long);
-	if (!bt->sequence)
-		goto err;
-
-	bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG);
-	if (!bt->msg_data)
-		goto err;
-
-	ret = -ENOENT;
-
-	if (!blk_tree_root) {
-		blk_tree_root = debugfs_create_dir("block", NULL);
-		if (!blk_tree_root)
-			return -ENOMEM;
-	}
-
-	dir = debugfs_create_dir(buts->name, blk_tree_root);
-
-	if (!dir)
-		goto err;
-
-	bt->dir = dir;
-	bt->dev = dev;
-	atomic_set(&bt->dropped, 0);
-
-	ret = -EIO;
-	bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt,
-					       &blk_dropped_fops);
-	if (!bt->dropped_file)
-		goto err;
-
-	bt->msg_file = debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops);
-	if (!bt->msg_file)
-		goto err;
-
-	bt->rchan = relay_open("trace", dir, buts->buf_size,
-				buts->buf_nr, &blk_relay_callbacks, bt);
-	if (!bt->rchan)
-		goto err;
-
-	bt->act_mask = buts->act_mask;
-	if (!bt->act_mask)
-		bt->act_mask = (u16) -1;
-
-	bt->start_lba = buts->start_lba;
-	bt->end_lba = buts->end_lba;
-	if (!bt->end_lba)
-		bt->end_lba = -1ULL;
-
-	bt->pid = buts->pid;
-	bt->trace_state = Blktrace_setup;
-
-	mutex_lock(&blk_probe_mutex);
-	if (atomic_add_return(1, &blk_probes_ref) == 1) {
-		ret = blk_register_tracepoints();
-		if (ret)
-			goto probe_err;
-	}
-	mutex_unlock(&blk_probe_mutex);
-
-	ret = -EBUSY;
-	old_bt = xchg(&q->blk_trace, bt);
-	if (old_bt) {
-		(void) xchg(&q->blk_trace, old_bt);
-		goto err;
-	}
-
-	return 0;
-probe_err:
-	atomic_dec(&blk_probes_ref);
-	mutex_unlock(&blk_probe_mutex);
-err:
-	if (bt) {
-		if (bt->msg_file)
-			debugfs_remove(bt->msg_file);
-		if (bt->dropped_file)
-			debugfs_remove(bt->dropped_file);
-		free_percpu(bt->sequence);
-		free_percpu(bt->msg_data);
-		if (bt->rchan)
-			relay_close(bt->rchan);
-		kfree(bt);
-	}
-	return ret;
-}
-
-int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
-		    char __user *arg)
-{
-	struct blk_user_trace_setup buts;
-	int ret;
-
-	ret = copy_from_user(&buts, arg, sizeof(buts));
-	if (ret)
-		return -EFAULT;
-
-	ret = do_blk_trace_setup(q, name, dev, &buts);
-	if (ret)
-		return ret;
-
-	if (copy_to_user(arg, &buts, sizeof(buts)))
-		return -EFAULT;
-
-	return 0;
-}
-EXPORT_SYMBOL_GPL(blk_trace_setup);
-
-int blk_trace_startstop(struct request_queue *q, int start)
-{
-	int ret;
-	struct blk_trace *bt = q->blk_trace;
-
-	if (bt == NULL)
-		return -EINVAL;
-
-	/*
-	 * For starting a trace, we can transition from a setup or stopped
-	 * trace. For stopping a trace, the state must be running
-	 */
-	ret = -EINVAL;
-	if (start) {
-		if (bt->trace_state == Blktrace_setup ||
-		    bt->trace_state == Blktrace_stopped) {
-			blktrace_seq++;
-			smp_mb();
-			bt->trace_state = Blktrace_running;
-
-			trace_note_time(bt);
-			ret = 0;
-		}
-	} else {
-		if (bt->trace_state == Blktrace_running) {
-			bt->trace_state = Blktrace_stopped;
-			relay_flush(bt->rchan);
-			ret = 0;
-		}
-	}
-
-	return ret;
-}
-EXPORT_SYMBOL_GPL(blk_trace_startstop);
-
-/**
- * blk_trace_ioctl: - handle the ioctls associated with tracing
- * @bdev:	the block device
- * @cmd: 	the ioctl cmd
- * @arg:	the argument data, if any
- *
- **/
-int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
-{
-	struct request_queue *q;
-	int ret, start = 0;
-	char b[BDEVNAME_SIZE];
-
-	q = bdev_get_queue(bdev);
-	if (!q)
-		return -ENXIO;
-
-	mutex_lock(&bdev->bd_mutex);
-
-	switch (cmd) {
-	case BLKTRACESETUP:
-		bdevname(bdev, b);
-		ret = blk_trace_setup(q, b, bdev->bd_dev, arg);
-		break;
-	case BLKTRACESTART:
-		start = 1;
-	case BLKTRACESTOP:
-		ret = blk_trace_startstop(q, start);
-		break;
-	case BLKTRACETEARDOWN:
-		ret = blk_trace_remove(q);
-		break;
-	default:
-		ret = -ENOTTY;
-		break;
-	}
-
-	mutex_unlock(&bdev->bd_mutex);
-	return ret;
-}
-
-/**
- * blk_trace_shutdown: - stop and cleanup trace structures
- * @q:    the request queue associated with the device
- *
- **/
-void blk_trace_shutdown(struct request_queue *q)
-{
-	if (q->blk_trace) {
-		blk_trace_startstop(q, 0);
-		blk_trace_remove(q);
-	}
-}
-
-/*
- * blktrace probes
- */
-
-/**
- * blk_add_trace_rq - Add a trace for a request oriented action
- * @q:		queue the io is for
- * @rq:		the source request
- * @what:	the action
- *
- * Description:
- *     Records an action against a request. Will log the bio offset + size.
- *
- **/
-static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
-				    u32 what)
-{
-	struct blk_trace *bt = q->blk_trace;
-	int rw = rq->cmd_flags & 0x03;
-
-	if (likely(!bt))
-		return;
-
-	if (blk_discard_rq(rq))
-		rw |= (1 << BIO_RW_DISCARD);
-
-	if (blk_pc_request(rq)) {
-		what |= BLK_TC_ACT(BLK_TC_PC);
-		__blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors,
-				sizeof(rq->cmd), rq->cmd);
-	} else  {
-		what |= BLK_TC_ACT(BLK_TC_FS);
-		__blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9,
-				rw, what, rq->errors, 0, NULL);
-	}
-}
-
-static void blk_add_trace_rq_abort(struct request_queue *q, struct request *rq)
-{
-	blk_add_trace_rq(q, rq, BLK_TA_ABORT);
-}
-
-static void blk_add_trace_rq_insert(struct request_queue *q, struct request *rq)
-{
-	blk_add_trace_rq(q, rq, BLK_TA_INSERT);
-}
-
-static void blk_add_trace_rq_issue(struct request_queue *q, struct request *rq)
-{
-	blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
-}
-
-static void blk_add_trace_rq_requeue(struct request_queue *q,
-				     struct request *rq)
-{
-	blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
-}
-
-static void blk_add_trace_rq_complete(struct request_queue *q,
-				      struct request *rq)
-{
-	blk_add_trace_rq(q, rq, BLK_TA_COMPLETE);
-}
-
-/**
- * blk_add_trace_bio - Add a trace for a bio oriented action
- * @q:		queue the io is for
- * @bio:	the source bio
- * @what:	the action
- *
- * Description:
- *     Records an action against a bio. Will log the bio offset + size.
- *
- **/
-static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
-				     u32 what)
-{
-	struct blk_trace *bt = q->blk_trace;
-
-	if (likely(!bt))
-		return;
-
-	__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what,
-			!bio_flagged(bio, BIO_UPTODATE), 0, NULL);
-}
-
-static void blk_add_trace_bio_bounce(struct request_queue *q, struct bio *bio)
-{
-	blk_add_trace_bio(q, bio, BLK_TA_BOUNCE);
-}
-
-static void blk_add_trace_bio_complete(struct request_queue *q, struct bio *bio)
-{
-	blk_add_trace_bio(q, bio, BLK_TA_COMPLETE);
-}
-
-static void blk_add_trace_bio_backmerge(struct request_queue *q,
-					struct bio *bio)
-{
-	blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
-}
-
-static void blk_add_trace_bio_frontmerge(struct request_queue *q,
-					 struct bio *bio)
-{
-	blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
-}
-
-static void blk_add_trace_bio_queue(struct request_queue *q, struct bio *bio)
-{
-	blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
-}
-
-static void blk_add_trace_getrq(struct request_queue *q,
-				struct bio *bio, int rw)
-{
-	if (bio)
-		blk_add_trace_bio(q, bio, BLK_TA_GETRQ);
-	else {
-		struct blk_trace *bt = q->blk_trace;
-
-		if (bt)
-			__blk_add_trace(bt, 0, 0, rw, BLK_TA_GETRQ, 0, 0, NULL);
-	}
-}
-
-
-static void blk_add_trace_sleeprq(struct request_queue *q,
-				  struct bio *bio, int rw)
-{
-	if (bio)
-		blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ);
-	else {
-		struct blk_trace *bt = q->blk_trace;
-
-		if (bt)
-			__blk_add_trace(bt, 0, 0, rw, BLK_TA_SLEEPRQ,
-					0, 0, NULL);
-	}
-}
-
-static void blk_add_trace_plug(struct request_queue *q)
-{
-	struct blk_trace *bt = q->blk_trace;
-
-	if (bt)
-		__blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
-}
-
-static void blk_add_trace_unplug_io(struct request_queue *q)
-{
-	struct blk_trace *bt = q->blk_trace;
-
-	if (bt) {
-		unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE];
-		__be64 rpdu = cpu_to_be64(pdu);
-
-		__blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_IO, 0,
-				sizeof(rpdu), &rpdu);
-	}
-}
-
-static void blk_add_trace_unplug_timer(struct request_queue *q)
-{
-	struct blk_trace *bt = q->blk_trace;
-
-	if (bt) {
-		unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE];
-		__be64 rpdu = cpu_to_be64(pdu);
-
-		__blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_TIMER, 0,
-				sizeof(rpdu), &rpdu);
-	}
-}
-
-static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
-				unsigned int pdu)
-{
-	struct blk_trace *bt = q->blk_trace;
-
-	if (bt) {
-		__be64 rpdu = cpu_to_be64(pdu);
-
-		__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw,
-				BLK_TA_SPLIT, !bio_flagged(bio, BIO_UPTODATE),
-				sizeof(rpdu), &rpdu);
-	}
-}
-
-/**
- * blk_add_trace_remap - Add a trace for a remap operation
- * @q:		queue the io is for
- * @bio:	the source bio
- * @dev:	target device
- * @from:	source sector
- * @to:		target sector
- *
- * Description:
- *     Device mapper or raid target sometimes need to split a bio because
- *     it spans a stripe (or similar). Add a trace for that action.
- *
- **/
-static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
-				       dev_t dev, sector_t from, sector_t to)
-{
-	struct blk_trace *bt = q->blk_trace;
-	struct blk_io_trace_remap r;
-
-	if (likely(!bt))
-		return;
-
-	r.device = cpu_to_be32(dev);
-	r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev);
-	r.sector = cpu_to_be64(to);
-
-	__blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP,
-			!bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
-}
-
-/**
- * blk_add_driver_data - Add binary message with driver-specific data
- * @q:		queue the io is for
- * @rq:		io request
- * @data:	driver-specific data
- * @len:	length of driver-specific data
- *
- * Description:
- *     Some drivers might want to write driver-specific data per request.
- *
- **/
-void blk_add_driver_data(struct request_queue *q,
-			 struct request *rq,
-			 void *data, size_t len)
-{
-	struct blk_trace *bt = q->blk_trace;
-
-	if (likely(!bt))
-		return;
-
-	if (blk_pc_request(rq))
-		__blk_add_trace(bt, 0, rq->data_len, 0, BLK_TA_DRV_DATA,
-				rq->errors, len, data);
-	else
-		__blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9,
-				0, BLK_TA_DRV_DATA, rq->errors, len, data);
-}
-EXPORT_SYMBOL_GPL(blk_add_driver_data);
-
-static int blk_register_tracepoints(void)
-{
-	int ret;
-
-	ret = register_trace_block_rq_abort(blk_add_trace_rq_abort);
-	WARN_ON(ret);
-	ret = register_trace_block_rq_insert(blk_add_trace_rq_insert);
-	WARN_ON(ret);
-	ret = register_trace_block_rq_issue(blk_add_trace_rq_issue);
-	WARN_ON(ret);
-	ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue);
-	WARN_ON(ret);
-	ret = register_trace_block_rq_complete(blk_add_trace_rq_complete);
-	WARN_ON(ret);
-	ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce);
-	WARN_ON(ret);
-	ret = register_trace_block_bio_complete(blk_add_trace_bio_complete);
-	WARN_ON(ret);
-	ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge);
-	WARN_ON(ret);
-	ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge);
-	WARN_ON(ret);
-	ret = register_trace_block_bio_queue(blk_add_trace_bio_queue);
-	WARN_ON(ret);
-	ret = register_trace_block_getrq(blk_add_trace_getrq);
-	WARN_ON(ret);
-	ret = register_trace_block_sleeprq(blk_add_trace_sleeprq);
-	WARN_ON(ret);
-	ret = register_trace_block_plug(blk_add_trace_plug);
-	WARN_ON(ret);
-	ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer);
-	WARN_ON(ret);
-	ret = register_trace_block_unplug_io(blk_add_trace_unplug_io);
-	WARN_ON(ret);
-	ret = register_trace_block_split(blk_add_trace_split);
-	WARN_ON(ret);
-	ret = register_trace_block_remap(blk_add_trace_remap);
-	WARN_ON(ret);
-	return 0;
-}
-
-static void blk_unregister_tracepoints(void)
-{
-	unregister_trace_block_remap(blk_add_trace_remap);
-	unregister_trace_block_split(blk_add_trace_split);
-	unregister_trace_block_unplug_io(blk_add_trace_unplug_io);
-	unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer);
-	unregister_trace_block_plug(blk_add_trace_plug);
-	unregister_trace_block_sleeprq(blk_add_trace_sleeprq);
-	unregister_trace_block_getrq(blk_add_trace_getrq);
-	unregister_trace_block_bio_queue(blk_add_trace_bio_queue);
-	unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge);
-	unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge);
-	unregister_trace_block_bio_complete(blk_add_trace_bio_complete);
-	unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce);
-	unregister_trace_block_rq_complete(blk_add_trace_rq_complete);
-	unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue);
-	unregister_trace_block_rq_issue(blk_add_trace_rq_issue);
-	unregister_trace_block_rq_insert(blk_add_trace_rq_insert);
-	unregister_trace_block_rq_abort(blk_add_trace_rq_abort);
-
-	tracepoint_synchronize_unregister();
-}
-
-/*
- * struct blk_io_tracer formatting routines
- */
-
-static void fill_rwbs(char *rwbs, const struct blk_io_trace *t)
-{
-	int i = 0;
-
-	if (t->action & BLK_TC_DISCARD)
-		rwbs[i++] = 'D';
-	else if (t->action & BLK_TC_WRITE)
-		rwbs[i++] = 'W';
-	else if (t->bytes)
-		rwbs[i++] = 'R';
-	else
-		rwbs[i++] = 'N';
-
-	if (t->action & BLK_TC_AHEAD)
-		rwbs[i++] = 'A';
-	if (t->action & BLK_TC_BARRIER)
-		rwbs[i++] = 'B';
-	if (t->action & BLK_TC_SYNC)
-		rwbs[i++] = 'S';
-	if (t->action & BLK_TC_META)
-		rwbs[i++] = 'M';
-
-	rwbs[i] = '\0';
-}
-
-static inline
-const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent)
-{
-	return (const struct blk_io_trace *)ent;
-}
-
-static inline const void *pdu_start(const struct trace_entry *ent)
-{
-	return te_blk_io_trace(ent) + 1;
-}
-
-static inline u32 t_sec(const struct trace_entry *ent)
-{
-	return te_blk_io_trace(ent)->bytes >> 9;
-}
-
-static inline unsigned long long t_sector(const struct trace_entry *ent)
-{
-	return te_blk_io_trace(ent)->sector;
-}
-
-static inline __u16 t_error(const struct trace_entry *ent)
-{
-	return te_blk_io_trace(ent)->sector;
-}
-
-static __u64 get_pdu_int(const struct trace_entry *ent)
-{
-	const __u64 *val = pdu_start(ent);
-	return be64_to_cpu(*val);
-}
-
-static void get_pdu_remap(const struct trace_entry *ent,
-			  struct blk_io_trace_remap *r)
-{
-	const struct blk_io_trace_remap *__r = pdu_start(ent);
-	__u64 sector = __r->sector;
-
-	r->device = be32_to_cpu(__r->device);
-	r->device_from = be32_to_cpu(__r->device_from);
-	r->sector = be64_to_cpu(sector);
-}
-
-static int blk_log_action_iter(struct trace_iterator *iter, const char *act)
-{
-	char rwbs[6];
-	unsigned long long ts  = ns2usecs(iter->ts);
-	unsigned long usec_rem = do_div(ts, USEC_PER_SEC);
-	unsigned secs	       = (unsigned long)ts;
-	const struct trace_entry *ent = iter->ent;
-	const struct blk_io_trace *t = (const struct blk_io_trace *)ent;
-
-	fill_rwbs(rwbs, t);
-
-	return trace_seq_printf(&iter->seq,
-				"%3d,%-3d %2d %5d.%06lu %5u %2s %3s ",
-				MAJOR(t->device), MINOR(t->device), iter->cpu,
-				secs, usec_rem, ent->pid, act, rwbs);
-}
-
-static int blk_log_action_seq(struct trace_seq *s, const struct blk_io_trace *t,
-			      const char *act)
-{
-	char rwbs[6];
-	fill_rwbs(rwbs, t);
-	return trace_seq_printf(s, "%3d,%-3d %2s %3s ",
-				MAJOR(t->device), MINOR(t->device), act, rwbs);
-}
-
-static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
-{
-	const char *cmd = trace_find_cmdline(ent->pid);
-
-	if (t_sec(ent))
-		return trace_seq_printf(s, "%llu + %u [%s]\n",
-					t_sector(ent), t_sec(ent), cmd);
-	return trace_seq_printf(s, "[%s]\n", cmd);
-}
-
-static int blk_log_with_error(struct trace_seq *s,
-			      const struct trace_entry *ent)
-{
-	if (t_sec(ent))
-		return trace_seq_printf(s, "%llu + %u [%d]\n", t_sector(ent),
-					t_sec(ent), t_error(ent));
-	return trace_seq_printf(s, "%llu [%d]\n", t_sector(ent), t_error(ent));
-}
-
-static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent)
-{
-	struct blk_io_trace_remap r = { .device = 0, };
-
-	get_pdu_remap(ent, &r);
-	return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n",
-			       t_sector(ent),
-			       t_sec(ent), MAJOR(r.device), MINOR(r.device),
-			       (unsigned long long)r.sector);
-}
-
-static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent)
-{
-	return trace_seq_printf(s, "[%s]\n", trace_find_cmdline(ent->pid));
-}
-
-static int blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent)
-{
-	return trace_seq_printf(s, "[%s] %llu\n", trace_find_cmdline(ent->pid),
-				get_pdu_int(ent));
-}
-
-static int blk_log_split(struct trace_seq *s, const struct trace_entry *ent)
-{
-	return trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent),
-				get_pdu_int(ent), trace_find_cmdline(ent->pid));
-}
-
-/*
- * struct tracer operations
- */
-
-static void blk_tracer_print_header(struct seq_file *m)
-{
-	if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC))
-		return;
-	seq_puts(m, "# DEV   CPU TIMESTAMP     PID ACT FLG\n"
-		    "#  |     |     |           |   |   |\n");
-}
-
-static void blk_tracer_start(struct trace_array *tr)
-{
-	mutex_lock(&blk_probe_mutex);
-	if (atomic_add_return(1, &blk_probes_ref) == 1)
-		if (blk_register_tracepoints())
-			atomic_dec(&blk_probes_ref);
-	mutex_unlock(&blk_probe_mutex);
-	trace_flags &= ~TRACE_ITER_CONTEXT_INFO;
-}
-
-static int blk_tracer_init(struct trace_array *tr)
-{
-	blk_tr = tr;
-	blk_tracer_start(tr);
-	mutex_lock(&blk_probe_mutex);
-	blk_tracer_enabled++;
-	mutex_unlock(&blk_probe_mutex);
-	return 0;
-}
-
-static void blk_tracer_stop(struct trace_array *tr)
-{
-	trace_flags |= TRACE_ITER_CONTEXT_INFO;
-	mutex_lock(&blk_probe_mutex);
-	if (atomic_dec_and_test(&blk_probes_ref))
-		blk_unregister_tracepoints();
-	mutex_unlock(&blk_probe_mutex);
-}
-
-static void blk_tracer_reset(struct trace_array *tr)
-{
-	if (!atomic_read(&blk_probes_ref))
-		return;
-
-	mutex_lock(&blk_probe_mutex);
-	blk_tracer_enabled--;
-	WARN_ON(blk_tracer_enabled < 0);
-	mutex_unlock(&blk_probe_mutex);
-
-	blk_tracer_stop(tr);
-}
-
-static struct {
-	const char *act[2];
-	int 	   (*print)(struct trace_seq *s, const struct trace_entry *ent);
-} what2act[] __read_mostly = {
-	[__BLK_TA_QUEUE]	= {{  "Q", "queue" }, 	   blk_log_generic },
-	[__BLK_TA_BACKMERGE]	= {{  "M", "backmerge" },  blk_log_generic },
-	[__BLK_TA_FRONTMERGE]	= {{  "F", "frontmerge" }, blk_log_generic },
-	[__BLK_TA_GETRQ]	= {{  "G", "getrq" },	   blk_log_generic },
-	[__BLK_TA_SLEEPRQ]	= {{  "S", "sleeprq" },	   blk_log_generic },
-	[__BLK_TA_REQUEUE]	= {{  "R", "requeue" },	   blk_log_with_error },
-	[__BLK_TA_ISSUE]	= {{  "D", "issue" },	   blk_log_generic },
-	[__BLK_TA_COMPLETE]	= {{  "C", "complete" },   blk_log_with_error },
-	[__BLK_TA_PLUG]		= {{  "P", "plug" },	   blk_log_plug },
-	[__BLK_TA_UNPLUG_IO]	= {{  "U", "unplug_io" },  blk_log_unplug },
-	[__BLK_TA_UNPLUG_TIMER]	= {{ "UT", "unplug_timer" }, blk_log_unplug },
-	[__BLK_TA_INSERT]	= {{  "I", "insert" },	   blk_log_generic },
-	[__BLK_TA_SPLIT]	= {{  "X", "split" },	   blk_log_split },
-	[__BLK_TA_BOUNCE]	= {{  "B", "bounce" },	   blk_log_generic },
-	[__BLK_TA_REMAP]	= {{  "A", "remap" },	   blk_log_remap },
-};
-
-static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
-					       int flags)
-{
-	struct trace_seq *s = &iter->seq;
-	const struct blk_io_trace *t = (struct blk_io_trace *)iter->ent;
-	const u16 what = t->action & ((1 << BLK_TC_SHIFT) - 1);
-	int ret;
-
-	if (!trace_print_context(iter))
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	if (unlikely(what == 0 || what > ARRAY_SIZE(what2act)))
-		ret = trace_seq_printf(s, "Bad pc action %x\n", what);
-	else {
-		const bool long_act = !!(trace_flags & TRACE_ITER_VERBOSE);
-		ret = blk_log_action_seq(s, t, what2act[what].act[long_act]);
-		if (ret)
-			ret = what2act[what].print(s, iter->ent);
-	}
-
-	return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
-}
-
-static int blk_trace_synthesize_old_trace(struct trace_iterator *iter)
-{
-	struct trace_seq *s = &iter->seq;
-	struct blk_io_trace *t = (struct blk_io_trace *)iter->ent;
-	const int offset = offsetof(struct blk_io_trace, sector);
-	struct blk_io_trace old = {
-		.magic	  = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION,
-		.time     = ns2usecs(iter->ts),
-	};
-
-	if (!trace_seq_putmem(s, &old, offset))
-		return 0;
-	return trace_seq_putmem(s, &t->sector,
-				sizeof(old) - offset + t->pdu_len);
-}
-
-static enum print_line_t
-blk_trace_event_print_binary(struct trace_iterator *iter, int flags)
-{
-	return blk_trace_synthesize_old_trace(iter) ?
-			TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
-}
-
-static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter)
-{
-	const struct blk_io_trace *t;
-	u16 what;
-	int ret;
-
-	if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC))
-		return TRACE_TYPE_UNHANDLED;
-
-	t = (const struct blk_io_trace *)iter->ent;
-	what = t->action & ((1 << BLK_TC_SHIFT) - 1);
-
-	if (unlikely(what == 0 || what > ARRAY_SIZE(what2act)))
-		ret = trace_seq_printf(&iter->seq, "Bad pc action %x\n", what);
-	else {
-		const bool long_act = !!(trace_flags & TRACE_ITER_VERBOSE);
-		ret = blk_log_action_iter(iter, what2act[what].act[long_act]);
-		if (ret)
-			ret = what2act[what].print(&iter->seq, iter->ent);
-	}
-
-	return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
-}
-
-static struct tracer blk_tracer __read_mostly = {
-	.name		= "blk",
-	.init		= blk_tracer_init,
-	.reset		= blk_tracer_reset,
-	.start		= blk_tracer_start,
-	.stop		= blk_tracer_stop,
-	.print_header	= blk_tracer_print_header,
-	.print_line	= blk_tracer_print_line,
-	.flags		= &blk_tracer_flags,
-};
-
-static struct trace_event trace_blk_event = {
-	.type	 	= TRACE_BLK,
-	.trace		= blk_trace_event_print,
-	.latency_trace	= blk_trace_event_print,
-	.binary		= blk_trace_event_print_binary,
-};
-
-static int __init init_blk_tracer(void)
-{
-	if (!register_ftrace_event(&trace_blk_event)) {
-		pr_warning("Warning: could not register block events\n");
-		return 1;
-	}
-
-	if (register_tracer(&blk_tracer) != 0) {
-		pr_warning("Warning: could not register the block tracer\n");
-		unregister_ftrace_event(&trace_blk_event);
-		return 1;
-	}
-
-	return 0;
-}
-
-device_initcall(init_blk_tracer);
-
-static int blk_trace_remove_queue(struct request_queue *q)
-{
-	struct blk_trace *bt;
-
-	bt = xchg(&q->blk_trace, NULL);
-	if (bt == NULL)
-		return -EINVAL;
-
-	kfree(bt);
-	return 0;
-}
-
-/*
- * Setup everything required to start tracing
- */
-static int blk_trace_setup_queue(struct request_queue *q, dev_t dev)
-{
-	struct blk_trace *old_bt, *bt = NULL;
-	int ret;
-
-	ret = -ENOMEM;
-	bt = kzalloc(sizeof(*bt), GFP_KERNEL);
-	if (!bt)
-		goto err;
-
-	bt->dev = dev;
-	bt->act_mask = (u16)-1;
-	bt->end_lba = -1ULL;
-	bt->trace_state = Blktrace_running;
-
-	old_bt = xchg(&q->blk_trace, bt);
-	if (old_bt != NULL) {
-		(void)xchg(&q->blk_trace, old_bt);
-		kfree(bt);
-		ret = -EBUSY;
-	}
-	return 0;
-err:
-	return ret;
-}
-
-/*
- * sysfs interface to enable and configure tracing
- */
-
-static ssize_t sysfs_blk_trace_enable_show(struct device *dev,
-					   struct device_attribute *attr,
-					   char *buf)
-{
-	struct hd_struct *p = dev_to_part(dev);
-	struct block_device *bdev;
-	ssize_t ret = -ENXIO;
-
-	lock_kernel();
-	bdev = bdget(part_devt(p));
-	if (bdev != NULL) {
-		struct request_queue *q = bdev_get_queue(bdev);
-
-		if (q != NULL) {
-			mutex_lock(&bdev->bd_mutex);
-			ret = sprintf(buf, "%u\n", !!q->blk_trace);
-			mutex_unlock(&bdev->bd_mutex);
-		}
-
-		bdput(bdev);
-	}
-
-	unlock_kernel();
-	return ret;
-}
-
-static ssize_t sysfs_blk_trace_enable_store(struct device *dev,
-					    struct device_attribute *attr,
-					    const char *buf, size_t count)
-{
-	struct block_device *bdev;
-	struct request_queue *q;
-	struct hd_struct *p;
-	int value;
-	ssize_t ret = -ENXIO;
-
-	if (count == 0 || sscanf(buf, "%d", &value) != 1)
-		goto out;
-
-	lock_kernel();
-	p = dev_to_part(dev);
-	bdev = bdget(part_devt(p));
-	if (bdev == NULL)
-		goto out_unlock_kernel;
-
-	q = bdev_get_queue(bdev);
-	if (q == NULL)
-		goto out_bdput;
-
-	mutex_lock(&bdev->bd_mutex);
-	if (value)
-		ret = blk_trace_setup_queue(q, bdev->bd_dev);
-	else
-		ret = blk_trace_remove_queue(q);
-	mutex_unlock(&bdev->bd_mutex);
-
-	if (ret == 0)
-		ret = count;
-out_bdput:
-	bdput(bdev);
-out_unlock_kernel:
-	unlock_kernel();
-out:
-	return ret;
-}
-
-static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
-					 struct device_attribute *attr,
-					 char *buf);
-static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
-					  struct device_attribute *attr,
-					  const char *buf, size_t count);
-#define BLK_TRACE_DEVICE_ATTR(_name) \
-	DEVICE_ATTR(_name, S_IRUGO | S_IWUSR, \
-		    sysfs_blk_trace_attr_show, \
-		    sysfs_blk_trace_attr_store)
-
-static DEVICE_ATTR(enable, S_IRUGO | S_IWUSR,
-		   sysfs_blk_trace_enable_show, sysfs_blk_trace_enable_store);
-static BLK_TRACE_DEVICE_ATTR(act_mask);
-static BLK_TRACE_DEVICE_ATTR(pid);
-static BLK_TRACE_DEVICE_ATTR(start_lba);
-static BLK_TRACE_DEVICE_ATTR(end_lba);
-
-static struct attribute *blk_trace_attrs[] = {
-	&dev_attr_enable.attr,
-	&dev_attr_act_mask.attr,
-	&dev_attr_pid.attr,
-	&dev_attr_start_lba.attr,
-	&dev_attr_end_lba.attr,
-	NULL
-};
-
-struct attribute_group blk_trace_attr_group = {
-	.name  = "trace",
-	.attrs = blk_trace_attrs,
-};
-
-static int blk_str2act_mask(const char *str)
-{
-	int mask = 0;
-	char *copy = kstrdup(str, GFP_KERNEL), *s;
-
-	if (copy == NULL)
-		return -ENOMEM;
-
-	s = strstrip(copy);
-
-	while (1) {
-		char *sep = strchr(s, ',');
-
-		if (sep != NULL)
-			*sep = '\0';
-
-		if (strcasecmp(s, "barrier") == 0)
-			mask |= BLK_TC_BARRIER;
-		else if (strcasecmp(s, "complete") == 0)
-			mask |= BLK_TC_COMPLETE;
-		else if (strcasecmp(s, "fs") == 0)
-			mask |= BLK_TC_FS;
-		else if (strcasecmp(s, "issue") == 0)
-			mask |= BLK_TC_ISSUE;
-		else if (strcasecmp(s, "pc") == 0)
-			mask |= BLK_TC_PC;
-		else if (strcasecmp(s, "queue") == 0)
-			mask |= BLK_TC_QUEUE;
-		else if (strcasecmp(s, "read") == 0)
-			mask |= BLK_TC_READ;
-		else if (strcasecmp(s, "requeue") == 0)
-			mask |= BLK_TC_REQUEUE;
-		else if (strcasecmp(s, "sync") == 0)
-			mask |= BLK_TC_SYNC;
-		else if (strcasecmp(s, "write") == 0)
-			mask |= BLK_TC_WRITE;
-
-		if (sep == NULL)
-			break;
-
-		s = sep + 1;
-	}
-	kfree(copy);
-
-	return mask;
-}
-
-static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
-					 struct device_attribute *attr,
-					 char *buf)
-{
-	struct hd_struct *p = dev_to_part(dev);
-	struct request_queue *q;
-	struct block_device *bdev;
-	ssize_t ret = -ENXIO;
-
-	lock_kernel();
-	bdev = bdget(part_devt(p));
-	if (bdev == NULL)
-		goto out_unlock_kernel;
-
-	q = bdev_get_queue(bdev);
-	if (q == NULL)
-		goto out_bdput;
-	mutex_lock(&bdev->bd_mutex);
-	if (q->blk_trace == NULL)
-		ret = sprintf(buf, "disabled\n");
-	else if (attr == &dev_attr_act_mask)
-		ret = sprintf(buf, "%#x\n", q->blk_trace->act_mask);
-	else if (attr == &dev_attr_pid)
-		ret = sprintf(buf, "%u\n", q->blk_trace->pid);
-	else if (attr == &dev_attr_start_lba)
-		ret = sprintf(buf, "%llu\n", q->blk_trace->start_lba);
-	else if (attr == &dev_attr_end_lba)
-		ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba);
-	mutex_unlock(&bdev->bd_mutex);
-out_bdput:
-	bdput(bdev);
-out_unlock_kernel:
-	unlock_kernel();
-	return ret;
-}
-
-static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
-					  struct device_attribute *attr,
-					  const char *buf, size_t count)
-{
-	struct block_device *bdev;
-	struct request_queue *q;
-	struct hd_struct *p;
-	u64 value;
-	ssize_t ret = -ENXIO;
-
-	if (count == 0)
-		goto out;
-
-	if (attr == &dev_attr_act_mask) {
-		if (sscanf(buf, "%llx", &value) != 1) {
-			/* Assume it is a list of trace category names */
-			value = blk_str2act_mask(buf);
-			if (value < 0)
-				goto out;
-		}
-	} else if (sscanf(buf, "%llu", &value) != 1)
-		goto out;
-
-	lock_kernel();
-	p = dev_to_part(dev);
-	bdev = bdget(part_devt(p));
-	if (bdev == NULL)
-		goto out_unlock_kernel;
-
-	q = bdev_get_queue(bdev);
-	if (q == NULL)
-		goto out_bdput;
-
-	mutex_lock(&bdev->bd_mutex);
-	ret = 0;
-	if (q->blk_trace == NULL)
-		ret = blk_trace_setup_queue(q, bdev->bd_dev);
-
-	if (ret == 0) {
-		if (attr == &dev_attr_act_mask)
-			q->blk_trace->act_mask = value;
-		else if (attr == &dev_attr_pid)
-			q->blk_trace->pid = value;
-		else if (attr == &dev_attr_start_lba)
-			q->blk_trace->start_lba = value;
-		else if (attr == &dev_attr_end_lba)
-			q->blk_trace->end_lba = value;
-		ret = count;
-	}
-	mutex_unlock(&bdev->bd_mutex);
-out_bdput:
-	bdput(bdev);
-out_unlock_kernel:
-	unlock_kernel();
-out:
-	return ret;
-}
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 25131a5d5e4f..4fee43c01942 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -302,6 +302,29 @@ config WORKQUEUE_TRACER
           For example it can help a developer to decide whether he should
           choose a per cpu workqueue instead of a singlethreaded one.
 
+config BLK_DEV_IO_TRACE
+	bool "Support for tracing block io actions"
+	depends on SYSFS
+	select RELAY
+	select DEBUG_FS
+	select TRACEPOINTS
+	select TRACING
+	select STACKTRACE
+	help
+	  Say Y here if you want to be able to trace the block layer actions
+	  on a given queue. Tracing allows you to see any traffic happening
+	  on a block device queue. For more information (and the userspace
+	  support tools needed), fetch the blktrace tools from:
+
+	  git://git.kernel.dk/blktrace.git
+
+	  Tracing also is possible using the ftrace interface, e.g.:
+
+	    echo 1 > /sys/block/sda/sda1/trace/enable
+	    echo blk > /sys/kernel/debug/tracing/current_tracer
+	    cat /sys/kernel/debug/tracing/trace_pipe
+
+	  If unsure, say N.
 
 config DYNAMIC_FTRACE
 	bool "enable/disable ftrace tracepoints dynamically"
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index f76d48f3527d..627090bc262d 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -37,5 +37,6 @@ obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
 obj-$(CONFIG_POWER_TRACER) += trace_power.o
 obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
 obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
+obj-$(CONFIG_BLK_DEV_IO_TRACE)	+= blktrace.o
 
 libftrace-y := ftrace.o
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
new file mode 100644
index 000000000000..3b91da064820
--- /dev/null
+++ b/kernel/trace/blktrace.c
@@ -0,0 +1,1538 @@
+/*
+ * Copyright (C) 2006 Jens Axboe <axboe@kernel.dk>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/blkdev.h>
+#include <linux/blktrace_api.h>
+#include <linux/percpu.h>
+#include <linux/init.h>
+#include <linux/mutex.h>
+#include <linux/debugfs.h>
+#include <linux/time.h>
+#include <trace/block.h>
+#include <linux/uaccess.h>
+#include "trace_output.h"
+
+static unsigned int blktrace_seq __read_mostly = 1;
+
+static struct trace_array *blk_tr;
+static int __read_mostly  blk_tracer_enabled;
+
+/* Select an alternative, minimalistic output than the original one */
+#define TRACE_BLK_OPT_CLASSIC 	0x1
+
+static struct tracer_opt blk_tracer_opts[] = {
+	/* Default disable the minimalistic output */
+	{ TRACER_OPT(blk_classic, TRACE_BLK_OPT_CLASSIC) },
+	{ }
+};
+
+static struct tracer_flags blk_tracer_flags = {
+	.val  = 0,
+	.opts = blk_tracer_opts,
+};
+
+/* Global reference count of probes */
+static DEFINE_MUTEX(blk_probe_mutex);
+static atomic_t blk_probes_ref = ATOMIC_INIT(0);
+
+static int blk_register_tracepoints(void);
+static void blk_unregister_tracepoints(void);
+
+/*
+ * Send out a notify message.
+ */
+static void trace_note(struct blk_trace *bt, pid_t pid, int action,
+		       const void *data, size_t len)
+{
+	struct blk_io_trace *t;
+
+	if (!bt->rchan)
+		return;
+
+	t = relay_reserve(bt->rchan, sizeof(*t) + len);
+	if (t) {
+		const int cpu = smp_processor_id();
+
+		t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
+		t->time = ktime_to_ns(ktime_get());
+		t->device = bt->dev;
+		t->action = action;
+		t->pid = pid;
+		t->cpu = cpu;
+		t->pdu_len = len;
+		memcpy((void *) t + sizeof(*t), data, len);
+	}
+}
+
+/*
+ * Send out a notify for this process, if we haven't done so since a trace
+ * started
+ */
+static void trace_note_tsk(struct blk_trace *bt, struct task_struct *tsk)
+{
+	tsk->btrace_seq = blktrace_seq;
+	trace_note(bt, tsk->pid, BLK_TN_PROCESS, tsk->comm, sizeof(tsk->comm));
+}
+
+static void trace_note_time(struct blk_trace *bt)
+{
+	struct timespec now;
+	unsigned long flags;
+	u32 words[2];
+
+	getnstimeofday(&now);
+	words[0] = now.tv_sec;
+	words[1] = now.tv_nsec;
+
+	local_irq_save(flags);
+	trace_note(bt, 0, BLK_TN_TIMESTAMP, words, sizeof(words));
+	local_irq_restore(flags);
+}
+
+void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)
+{
+	int n;
+	va_list args;
+	unsigned long flags;
+	char *buf;
+
+	if (blk_tr) {
+		va_start(args, fmt);
+		ftrace_vprintk(fmt, args);
+		va_end(args);
+		return;
+	}
+
+	if (!bt->msg_data)
+		return;
+
+	local_irq_save(flags);
+	buf = per_cpu_ptr(bt->msg_data, smp_processor_id());
+	va_start(args, fmt);
+	n = vscnprintf(buf, BLK_TN_MAX_MSG, fmt, args);
+	va_end(args);
+
+	trace_note(bt, 0, BLK_TN_MESSAGE, buf, n);
+	local_irq_restore(flags);
+}
+EXPORT_SYMBOL_GPL(__trace_note_message);
+
+static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
+			 pid_t pid)
+{
+	if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0)
+		return 1;
+	if (sector < bt->start_lba || sector > bt->end_lba)
+		return 1;
+	if (bt->pid && pid != bt->pid)
+		return 1;
+
+	return 0;
+}
+
+/*
+ * Data direction bit lookup
+ */
+static u32 ddir_act[2] __read_mostly = { BLK_TC_ACT(BLK_TC_READ),
+					 BLK_TC_ACT(BLK_TC_WRITE) };
+
+/* The ilog2() calls fall out because they're constant */
+#define MASK_TC_BIT(rw, __name) ((rw & (1 << BIO_RW_ ## __name)) << \
+	  (ilog2(BLK_TC_ ## __name) + BLK_TC_SHIFT - BIO_RW_ ## __name))
+
+/*
+ * The worker for the various blk_add_trace*() types. Fills out a
+ * blk_io_trace structure and places it in a per-cpu subbuffer.
+ */
+static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
+		     int rw, u32 what, int error, int pdu_len, void *pdu_data)
+{
+	struct task_struct *tsk = current;
+	struct ring_buffer_event *event = NULL;
+	struct blk_io_trace *t;
+	unsigned long flags = 0;
+	unsigned long *sequence;
+	pid_t pid;
+	int cpu, pc = 0;
+
+	if (unlikely(bt->trace_state != Blktrace_running ||
+		     !blk_tracer_enabled))
+		return;
+
+	what |= ddir_act[rw & WRITE];
+	what |= MASK_TC_BIT(rw, BARRIER);
+	what |= MASK_TC_BIT(rw, SYNC);
+	what |= MASK_TC_BIT(rw, AHEAD);
+	what |= MASK_TC_BIT(rw, META);
+	what |= MASK_TC_BIT(rw, DISCARD);
+
+	pid = tsk->pid;
+	if (unlikely(act_log_check(bt, what, sector, pid)))
+		return;
+	cpu = raw_smp_processor_id();
+
+	if (blk_tr) {
+		tracing_record_cmdline(current);
+
+		pc = preempt_count();
+		event = trace_buffer_lock_reserve(blk_tr, TRACE_BLK,
+						  sizeof(*t) + pdu_len,
+						  0, pc);
+		if (!event)
+			return;
+		t = ring_buffer_event_data(event);
+		goto record_it;
+	}
+
+	/*
+	 * A word about the locking here - we disable interrupts to reserve
+	 * some space in the relay per-cpu buffer, to prevent an irq
+	 * from coming in and stepping on our toes.
+	 */
+	local_irq_save(flags);
+
+	if (unlikely(tsk->btrace_seq != blktrace_seq))
+		trace_note_tsk(bt, tsk);
+
+	t = relay_reserve(bt->rchan, sizeof(*t) + pdu_len);
+	if (t) {
+		sequence = per_cpu_ptr(bt->sequence, cpu);
+
+		t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
+		t->sequence = ++(*sequence);
+		t->time = ktime_to_ns(ktime_get());
+record_it:
+		/*
+		 * These two are not needed in ftrace as they are in the
+		 * generic trace_entry, filled by tracing_generic_entry_update,
+		 * but for the trace_event->bin() synthesizer benefit we do it
+		 * here too.
+		 */
+		t->cpu = cpu;
+		t->pid = pid;
+
+		t->sector = sector;
+		t->bytes = bytes;
+		t->action = what;
+		t->device = bt->dev;
+		t->error = error;
+		t->pdu_len = pdu_len;
+
+		if (pdu_len)
+			memcpy((void *) t + sizeof(*t), pdu_data, pdu_len);
+
+		if (blk_tr) {
+			trace_buffer_unlock_commit(blk_tr, event, 0, pc);
+			return;
+		}
+	}
+
+	local_irq_restore(flags);
+}
+
+static struct dentry *blk_tree_root;
+static DEFINE_MUTEX(blk_tree_mutex);
+
+static void blk_trace_cleanup(struct blk_trace *bt)
+{
+	debugfs_remove(bt->msg_file);
+	debugfs_remove(bt->dropped_file);
+	relay_close(bt->rchan);
+	free_percpu(bt->sequence);
+	free_percpu(bt->msg_data);
+	kfree(bt);
+	mutex_lock(&blk_probe_mutex);
+	if (atomic_dec_and_test(&blk_probes_ref))
+		blk_unregister_tracepoints();
+	mutex_unlock(&blk_probe_mutex);
+}
+
+int blk_trace_remove(struct request_queue *q)
+{
+	struct blk_trace *bt;
+
+	bt = xchg(&q->blk_trace, NULL);
+	if (!bt)
+		return -EINVAL;
+
+	if (bt->trace_state == Blktrace_setup ||
+	    bt->trace_state == Blktrace_stopped)
+		blk_trace_cleanup(bt);
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(blk_trace_remove);
+
+static int blk_dropped_open(struct inode *inode, struct file *filp)
+{
+	filp->private_data = inode->i_private;
+
+	return 0;
+}
+
+static ssize_t blk_dropped_read(struct file *filp, char __user *buffer,
+				size_t count, loff_t *ppos)
+{
+	struct blk_trace *bt = filp->private_data;
+	char buf[16];
+
+	snprintf(buf, sizeof(buf), "%u\n", atomic_read(&bt->dropped));
+
+	return simple_read_from_buffer(buffer, count, ppos, buf, strlen(buf));
+}
+
+static const struct file_operations blk_dropped_fops = {
+	.owner =	THIS_MODULE,
+	.open =		blk_dropped_open,
+	.read =		blk_dropped_read,
+};
+
+static int blk_msg_open(struct inode *inode, struct file *filp)
+{
+	filp->private_data = inode->i_private;
+
+	return 0;
+}
+
+static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,
+				size_t count, loff_t *ppos)
+{
+	char *msg;
+	struct blk_trace *bt;
+
+	if (count > BLK_TN_MAX_MSG)
+		return -EINVAL;
+
+	msg = kmalloc(count, GFP_KERNEL);
+	if (msg == NULL)
+		return -ENOMEM;
+
+	if (copy_from_user(msg, buffer, count)) {
+		kfree(msg);
+		return -EFAULT;
+	}
+
+	bt = filp->private_data;
+	__trace_note_message(bt, "%s", msg);
+	kfree(msg);
+
+	return count;
+}
+
+static const struct file_operations blk_msg_fops = {
+	.owner =	THIS_MODULE,
+	.open =		blk_msg_open,
+	.write =	blk_msg_write,
+};
+
+/*
+ * Keep track of how many times we encountered a full subbuffer, to aid
+ * the user space app in telling how many lost events there were.
+ */
+static int blk_subbuf_start_callback(struct rchan_buf *buf, void *subbuf,
+				     void *prev_subbuf, size_t prev_padding)
+{
+	struct blk_trace *bt;
+
+	if (!relay_buf_full(buf))
+		return 1;
+
+	bt = buf->chan->private_data;
+	atomic_inc(&bt->dropped);
+	return 0;
+}
+
+static int blk_remove_buf_file_callback(struct dentry *dentry)
+{
+	struct dentry *parent = dentry->d_parent;
+	debugfs_remove(dentry);
+
+	/*
+	* this will fail for all but the last file, but that is ok. what we
+	* care about is the top level buts->name directory going away, when
+	* the last trace file is gone. Then we don't have to rmdir() that
+	* manually on trace stop, so it nicely solves the issue with
+	* force killing of running traces.
+	*/
+
+	debugfs_remove(parent);
+	return 0;
+}
+
+static struct dentry *blk_create_buf_file_callback(const char *filename,
+						   struct dentry *parent,
+						   int mode,
+						   struct rchan_buf *buf,
+						   int *is_global)
+{
+	return debugfs_create_file(filename, mode, parent, buf,
+					&relay_file_operations);
+}
+
+static struct rchan_callbacks blk_relay_callbacks = {
+	.subbuf_start		= blk_subbuf_start_callback,
+	.create_buf_file	= blk_create_buf_file_callback,
+	.remove_buf_file	= blk_remove_buf_file_callback,
+};
+
+/*
+ * Setup everything required to start tracing
+ */
+int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
+			struct blk_user_trace_setup *buts)
+{
+	struct blk_trace *old_bt, *bt = NULL;
+	struct dentry *dir = NULL;
+	int ret, i;
+
+	if (!buts->buf_size || !buts->buf_nr)
+		return -EINVAL;
+
+	strncpy(buts->name, name, BLKTRACE_BDEV_SIZE);
+	buts->name[BLKTRACE_BDEV_SIZE - 1] = '\0';
+
+	/*
+	 * some device names have larger paths - convert the slashes
+	 * to underscores for this to work as expected
+	 */
+	for (i = 0; i < strlen(buts->name); i++)
+		if (buts->name[i] == '/')
+			buts->name[i] = '_';
+
+	ret = -ENOMEM;
+	bt = kzalloc(sizeof(*bt), GFP_KERNEL);
+	if (!bt)
+		goto err;
+
+	bt->sequence = alloc_percpu(unsigned long);
+	if (!bt->sequence)
+		goto err;
+
+	bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG);
+	if (!bt->msg_data)
+		goto err;
+
+	ret = -ENOENT;
+
+	if (!blk_tree_root) {
+		blk_tree_root = debugfs_create_dir("block", NULL);
+		if (!blk_tree_root)
+			return -ENOMEM;
+	}
+
+	dir = debugfs_create_dir(buts->name, blk_tree_root);
+
+	if (!dir)
+		goto err;
+
+	bt->dir = dir;
+	bt->dev = dev;
+	atomic_set(&bt->dropped, 0);
+
+	ret = -EIO;
+	bt->dropped_file = debugfs_create_file("dropped", 0444, dir, bt,
+					       &blk_dropped_fops);
+	if (!bt->dropped_file)
+		goto err;
+
+	bt->msg_file = debugfs_create_file("msg", 0222, dir, bt, &blk_msg_fops);
+	if (!bt->msg_file)
+		goto err;
+
+	bt->rchan = relay_open("trace", dir, buts->buf_size,
+				buts->buf_nr, &blk_relay_callbacks, bt);
+	if (!bt->rchan)
+		goto err;
+
+	bt->act_mask = buts->act_mask;
+	if (!bt->act_mask)
+		bt->act_mask = (u16) -1;
+
+	bt->start_lba = buts->start_lba;
+	bt->end_lba = buts->end_lba;
+	if (!bt->end_lba)
+		bt->end_lba = -1ULL;
+
+	bt->pid = buts->pid;
+	bt->trace_state = Blktrace_setup;
+
+	mutex_lock(&blk_probe_mutex);
+	if (atomic_add_return(1, &blk_probes_ref) == 1) {
+		ret = blk_register_tracepoints();
+		if (ret)
+			goto probe_err;
+	}
+	mutex_unlock(&blk_probe_mutex);
+
+	ret = -EBUSY;
+	old_bt = xchg(&q->blk_trace, bt);
+	if (old_bt) {
+		(void) xchg(&q->blk_trace, old_bt);
+		goto err;
+	}
+
+	return 0;
+probe_err:
+	atomic_dec(&blk_probes_ref);
+	mutex_unlock(&blk_probe_mutex);
+err:
+	if (bt) {
+		if (bt->msg_file)
+			debugfs_remove(bt->msg_file);
+		if (bt->dropped_file)
+			debugfs_remove(bt->dropped_file);
+		free_percpu(bt->sequence);
+		free_percpu(bt->msg_data);
+		if (bt->rchan)
+			relay_close(bt->rchan);
+		kfree(bt);
+	}
+	return ret;
+}
+
+int blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
+		    char __user *arg)
+{
+	struct blk_user_trace_setup buts;
+	int ret;
+
+	ret = copy_from_user(&buts, arg, sizeof(buts));
+	if (ret)
+		return -EFAULT;
+
+	ret = do_blk_trace_setup(q, name, dev, &buts);
+	if (ret)
+		return ret;
+
+	if (copy_to_user(arg, &buts, sizeof(buts)))
+		return -EFAULT;
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(blk_trace_setup);
+
+int blk_trace_startstop(struct request_queue *q, int start)
+{
+	int ret;
+	struct blk_trace *bt = q->blk_trace;
+
+	if (bt == NULL)
+		return -EINVAL;
+
+	/*
+	 * For starting a trace, we can transition from a setup or stopped
+	 * trace. For stopping a trace, the state must be running
+	 */
+	ret = -EINVAL;
+	if (start) {
+		if (bt->trace_state == Blktrace_setup ||
+		    bt->trace_state == Blktrace_stopped) {
+			blktrace_seq++;
+			smp_mb();
+			bt->trace_state = Blktrace_running;
+
+			trace_note_time(bt);
+			ret = 0;
+		}
+	} else {
+		if (bt->trace_state == Blktrace_running) {
+			bt->trace_state = Blktrace_stopped;
+			relay_flush(bt->rchan);
+			ret = 0;
+		}
+	}
+
+	return ret;
+}
+EXPORT_SYMBOL_GPL(blk_trace_startstop);
+
+/**
+ * blk_trace_ioctl: - handle the ioctls associated with tracing
+ * @bdev:	the block device
+ * @cmd: 	the ioctl cmd
+ * @arg:	the argument data, if any
+ *
+ **/
+int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg)
+{
+	struct request_queue *q;
+	int ret, start = 0;
+	char b[BDEVNAME_SIZE];
+
+	q = bdev_get_queue(bdev);
+	if (!q)
+		return -ENXIO;
+
+	mutex_lock(&bdev->bd_mutex);
+
+	switch (cmd) {
+	case BLKTRACESETUP:
+		bdevname(bdev, b);
+		ret = blk_trace_setup(q, b, bdev->bd_dev, arg);
+		break;
+	case BLKTRACESTART:
+		start = 1;
+	case BLKTRACESTOP:
+		ret = blk_trace_startstop(q, start);
+		break;
+	case BLKTRACETEARDOWN:
+		ret = blk_trace_remove(q);
+		break;
+	default:
+		ret = -ENOTTY;
+		break;
+	}
+
+	mutex_unlock(&bdev->bd_mutex);
+	return ret;
+}
+
+/**
+ * blk_trace_shutdown: - stop and cleanup trace structures
+ * @q:    the request queue associated with the device
+ *
+ **/
+void blk_trace_shutdown(struct request_queue *q)
+{
+	if (q->blk_trace) {
+		blk_trace_startstop(q, 0);
+		blk_trace_remove(q);
+	}
+}
+
+/*
+ * blktrace probes
+ */
+
+/**
+ * blk_add_trace_rq - Add a trace for a request oriented action
+ * @q:		queue the io is for
+ * @rq:		the source request
+ * @what:	the action
+ *
+ * Description:
+ *     Records an action against a request. Will log the bio offset + size.
+ *
+ **/
+static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
+				    u32 what)
+{
+	struct blk_trace *bt = q->blk_trace;
+	int rw = rq->cmd_flags & 0x03;
+
+	if (likely(!bt))
+		return;
+
+	if (blk_discard_rq(rq))
+		rw |= (1 << BIO_RW_DISCARD);
+
+	if (blk_pc_request(rq)) {
+		what |= BLK_TC_ACT(BLK_TC_PC);
+		__blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors,
+				sizeof(rq->cmd), rq->cmd);
+	} else  {
+		what |= BLK_TC_ACT(BLK_TC_FS);
+		__blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9,
+				rw, what, rq->errors, 0, NULL);
+	}
+}
+
+static void blk_add_trace_rq_abort(struct request_queue *q, struct request *rq)
+{
+	blk_add_trace_rq(q, rq, BLK_TA_ABORT);
+}
+
+static void blk_add_trace_rq_insert(struct request_queue *q, struct request *rq)
+{
+	blk_add_trace_rq(q, rq, BLK_TA_INSERT);
+}
+
+static void blk_add_trace_rq_issue(struct request_queue *q, struct request *rq)
+{
+	blk_add_trace_rq(q, rq, BLK_TA_ISSUE);
+}
+
+static void blk_add_trace_rq_requeue(struct request_queue *q,
+				     struct request *rq)
+{
+	blk_add_trace_rq(q, rq, BLK_TA_REQUEUE);
+}
+
+static void blk_add_trace_rq_complete(struct request_queue *q,
+				      struct request *rq)
+{
+	blk_add_trace_rq(q, rq, BLK_TA_COMPLETE);
+}
+
+/**
+ * blk_add_trace_bio - Add a trace for a bio oriented action
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ * @what:	the action
+ *
+ * Description:
+ *     Records an action against a bio. Will log the bio offset + size.
+ *
+ **/
+static void blk_add_trace_bio(struct request_queue *q, struct bio *bio,
+				     u32 what)
+{
+	struct blk_trace *bt = q->blk_trace;
+
+	if (likely(!bt))
+		return;
+
+	__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, what,
+			!bio_flagged(bio, BIO_UPTODATE), 0, NULL);
+}
+
+static void blk_add_trace_bio_bounce(struct request_queue *q, struct bio *bio)
+{
+	blk_add_trace_bio(q, bio, BLK_TA_BOUNCE);
+}
+
+static void blk_add_trace_bio_complete(struct request_queue *q, struct bio *bio)
+{
+	blk_add_trace_bio(q, bio, BLK_TA_COMPLETE);
+}
+
+static void blk_add_trace_bio_backmerge(struct request_queue *q,
+					struct bio *bio)
+{
+	blk_add_trace_bio(q, bio, BLK_TA_BACKMERGE);
+}
+
+static void blk_add_trace_bio_frontmerge(struct request_queue *q,
+					 struct bio *bio)
+{
+	blk_add_trace_bio(q, bio, BLK_TA_FRONTMERGE);
+}
+
+static void blk_add_trace_bio_queue(struct request_queue *q, struct bio *bio)
+{
+	blk_add_trace_bio(q, bio, BLK_TA_QUEUE);
+}
+
+static void blk_add_trace_getrq(struct request_queue *q,
+				struct bio *bio, int rw)
+{
+	if (bio)
+		blk_add_trace_bio(q, bio, BLK_TA_GETRQ);
+	else {
+		struct blk_trace *bt = q->blk_trace;
+
+		if (bt)
+			__blk_add_trace(bt, 0, 0, rw, BLK_TA_GETRQ, 0, 0, NULL);
+	}
+}
+
+
+static void blk_add_trace_sleeprq(struct request_queue *q,
+				  struct bio *bio, int rw)
+{
+	if (bio)
+		blk_add_trace_bio(q, bio, BLK_TA_SLEEPRQ);
+	else {
+		struct blk_trace *bt = q->blk_trace;
+
+		if (bt)
+			__blk_add_trace(bt, 0, 0, rw, BLK_TA_SLEEPRQ,
+					0, 0, NULL);
+	}
+}
+
+static void blk_add_trace_plug(struct request_queue *q)
+{
+	struct blk_trace *bt = q->blk_trace;
+
+	if (bt)
+		__blk_add_trace(bt, 0, 0, 0, BLK_TA_PLUG, 0, 0, NULL);
+}
+
+static void blk_add_trace_unplug_io(struct request_queue *q)
+{
+	struct blk_trace *bt = q->blk_trace;
+
+	if (bt) {
+		unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE];
+		__be64 rpdu = cpu_to_be64(pdu);
+
+		__blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_IO, 0,
+				sizeof(rpdu), &rpdu);
+	}
+}
+
+static void blk_add_trace_unplug_timer(struct request_queue *q)
+{
+	struct blk_trace *bt = q->blk_trace;
+
+	if (bt) {
+		unsigned int pdu = q->rq.count[READ] + q->rq.count[WRITE];
+		__be64 rpdu = cpu_to_be64(pdu);
+
+		__blk_add_trace(bt, 0, 0, 0, BLK_TA_UNPLUG_TIMER, 0,
+				sizeof(rpdu), &rpdu);
+	}
+}
+
+static void blk_add_trace_split(struct request_queue *q, struct bio *bio,
+				unsigned int pdu)
+{
+	struct blk_trace *bt = q->blk_trace;
+
+	if (bt) {
+		__be64 rpdu = cpu_to_be64(pdu);
+
+		__blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw,
+				BLK_TA_SPLIT, !bio_flagged(bio, BIO_UPTODATE),
+				sizeof(rpdu), &rpdu);
+	}
+}
+
+/**
+ * blk_add_trace_remap - Add a trace for a remap operation
+ * @q:		queue the io is for
+ * @bio:	the source bio
+ * @dev:	target device
+ * @from:	source sector
+ * @to:		target sector
+ *
+ * Description:
+ *     Device mapper or raid target sometimes need to split a bio because
+ *     it spans a stripe (or similar). Add a trace for that action.
+ *
+ **/
+static void blk_add_trace_remap(struct request_queue *q, struct bio *bio,
+				       dev_t dev, sector_t from, sector_t to)
+{
+	struct blk_trace *bt = q->blk_trace;
+	struct blk_io_trace_remap r;
+
+	if (likely(!bt))
+		return;
+
+	r.device = cpu_to_be32(dev);
+	r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev);
+	r.sector = cpu_to_be64(to);
+
+	__blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP,
+			!bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r);
+}
+
+/**
+ * blk_add_driver_data - Add binary message with driver-specific data
+ * @q:		queue the io is for
+ * @rq:		io request
+ * @data:	driver-specific data
+ * @len:	length of driver-specific data
+ *
+ * Description:
+ *     Some drivers might want to write driver-specific data per request.
+ *
+ **/
+void blk_add_driver_data(struct request_queue *q,
+			 struct request *rq,
+			 void *data, size_t len)
+{
+	struct blk_trace *bt = q->blk_trace;
+
+	if (likely(!bt))
+		return;
+
+	if (blk_pc_request(rq))
+		__blk_add_trace(bt, 0, rq->data_len, 0, BLK_TA_DRV_DATA,
+				rq->errors, len, data);
+	else
+		__blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9,
+				0, BLK_TA_DRV_DATA, rq->errors, len, data);
+}
+EXPORT_SYMBOL_GPL(blk_add_driver_data);
+
+static int blk_register_tracepoints(void)
+{
+	int ret;
+
+	ret = register_trace_block_rq_abort(blk_add_trace_rq_abort);
+	WARN_ON(ret);
+	ret = register_trace_block_rq_insert(blk_add_trace_rq_insert);
+	WARN_ON(ret);
+	ret = register_trace_block_rq_issue(blk_add_trace_rq_issue);
+	WARN_ON(ret);
+	ret = register_trace_block_rq_requeue(blk_add_trace_rq_requeue);
+	WARN_ON(ret);
+	ret = register_trace_block_rq_complete(blk_add_trace_rq_complete);
+	WARN_ON(ret);
+	ret = register_trace_block_bio_bounce(blk_add_trace_bio_bounce);
+	WARN_ON(ret);
+	ret = register_trace_block_bio_complete(blk_add_trace_bio_complete);
+	WARN_ON(ret);
+	ret = register_trace_block_bio_backmerge(blk_add_trace_bio_backmerge);
+	WARN_ON(ret);
+	ret = register_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge);
+	WARN_ON(ret);
+	ret = register_trace_block_bio_queue(blk_add_trace_bio_queue);
+	WARN_ON(ret);
+	ret = register_trace_block_getrq(blk_add_trace_getrq);
+	WARN_ON(ret);
+	ret = register_trace_block_sleeprq(blk_add_trace_sleeprq);
+	WARN_ON(ret);
+	ret = register_trace_block_plug(blk_add_trace_plug);
+	WARN_ON(ret);
+	ret = register_trace_block_unplug_timer(blk_add_trace_unplug_timer);
+	WARN_ON(ret);
+	ret = register_trace_block_unplug_io(blk_add_trace_unplug_io);
+	WARN_ON(ret);
+	ret = register_trace_block_split(blk_add_trace_split);
+	WARN_ON(ret);
+	ret = register_trace_block_remap(blk_add_trace_remap);
+	WARN_ON(ret);
+	return 0;
+}
+
+static void blk_unregister_tracepoints(void)
+{
+	unregister_trace_block_remap(blk_add_trace_remap);
+	unregister_trace_block_split(blk_add_trace_split);
+	unregister_trace_block_unplug_io(blk_add_trace_unplug_io);
+	unregister_trace_block_unplug_timer(blk_add_trace_unplug_timer);
+	unregister_trace_block_plug(blk_add_trace_plug);
+	unregister_trace_block_sleeprq(blk_add_trace_sleeprq);
+	unregister_trace_block_getrq(blk_add_trace_getrq);
+	unregister_trace_block_bio_queue(blk_add_trace_bio_queue);
+	unregister_trace_block_bio_frontmerge(blk_add_trace_bio_frontmerge);
+	unregister_trace_block_bio_backmerge(blk_add_trace_bio_backmerge);
+	unregister_trace_block_bio_complete(blk_add_trace_bio_complete);
+	unregister_trace_block_bio_bounce(blk_add_trace_bio_bounce);
+	unregister_trace_block_rq_complete(blk_add_trace_rq_complete);
+	unregister_trace_block_rq_requeue(blk_add_trace_rq_requeue);
+	unregister_trace_block_rq_issue(blk_add_trace_rq_issue);
+	unregister_trace_block_rq_insert(blk_add_trace_rq_insert);
+	unregister_trace_block_rq_abort(blk_add_trace_rq_abort);
+
+	tracepoint_synchronize_unregister();
+}
+
+/*
+ * struct blk_io_tracer formatting routines
+ */
+
+static void fill_rwbs(char *rwbs, const struct blk_io_trace *t)
+{
+	int i = 0;
+
+	if (t->action & BLK_TC_DISCARD)
+		rwbs[i++] = 'D';
+	else if (t->action & BLK_TC_WRITE)
+		rwbs[i++] = 'W';
+	else if (t->bytes)
+		rwbs[i++] = 'R';
+	else
+		rwbs[i++] = 'N';
+
+	if (t->action & BLK_TC_AHEAD)
+		rwbs[i++] = 'A';
+	if (t->action & BLK_TC_BARRIER)
+		rwbs[i++] = 'B';
+	if (t->action & BLK_TC_SYNC)
+		rwbs[i++] = 'S';
+	if (t->action & BLK_TC_META)
+		rwbs[i++] = 'M';
+
+	rwbs[i] = '\0';
+}
+
+static inline
+const struct blk_io_trace *te_blk_io_trace(const struct trace_entry *ent)
+{
+	return (const struct blk_io_trace *)ent;
+}
+
+static inline const void *pdu_start(const struct trace_entry *ent)
+{
+	return te_blk_io_trace(ent) + 1;
+}
+
+static inline u32 t_sec(const struct trace_entry *ent)
+{
+	return te_blk_io_trace(ent)->bytes >> 9;
+}
+
+static inline unsigned long long t_sector(const struct trace_entry *ent)
+{
+	return te_blk_io_trace(ent)->sector;
+}
+
+static inline __u16 t_error(const struct trace_entry *ent)
+{
+	return te_blk_io_trace(ent)->sector;
+}
+
+static __u64 get_pdu_int(const struct trace_entry *ent)
+{
+	const __u64 *val = pdu_start(ent);
+	return be64_to_cpu(*val);
+}
+
+static void get_pdu_remap(const struct trace_entry *ent,
+			  struct blk_io_trace_remap *r)
+{
+	const struct blk_io_trace_remap *__r = pdu_start(ent);
+	__u64 sector = __r->sector;
+
+	r->device = be32_to_cpu(__r->device);
+	r->device_from = be32_to_cpu(__r->device_from);
+	r->sector = be64_to_cpu(sector);
+}
+
+static int blk_log_action_iter(struct trace_iterator *iter, const char *act)
+{
+	char rwbs[6];
+	unsigned long long ts  = ns2usecs(iter->ts);
+	unsigned long usec_rem = do_div(ts, USEC_PER_SEC);
+	unsigned secs	       = (unsigned long)ts;
+	const struct trace_entry *ent = iter->ent;
+	const struct blk_io_trace *t = (const struct blk_io_trace *)ent;
+
+	fill_rwbs(rwbs, t);
+
+	return trace_seq_printf(&iter->seq,
+				"%3d,%-3d %2d %5d.%06lu %5u %2s %3s ",
+				MAJOR(t->device), MINOR(t->device), iter->cpu,
+				secs, usec_rem, ent->pid, act, rwbs);
+}
+
+static int blk_log_action_seq(struct trace_seq *s, const struct blk_io_trace *t,
+			      const char *act)
+{
+	char rwbs[6];
+	fill_rwbs(rwbs, t);
+	return trace_seq_printf(s, "%3d,%-3d %2s %3s ",
+				MAJOR(t->device), MINOR(t->device), act, rwbs);
+}
+
+static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
+{
+	const char *cmd = trace_find_cmdline(ent->pid);
+
+	if (t_sec(ent))
+		return trace_seq_printf(s, "%llu + %u [%s]\n",
+					t_sector(ent), t_sec(ent), cmd);
+	return trace_seq_printf(s, "[%s]\n", cmd);
+}
+
+static int blk_log_with_error(struct trace_seq *s,
+			      const struct trace_entry *ent)
+{
+	if (t_sec(ent))
+		return trace_seq_printf(s, "%llu + %u [%d]\n", t_sector(ent),
+					t_sec(ent), t_error(ent));
+	return trace_seq_printf(s, "%llu [%d]\n", t_sector(ent), t_error(ent));
+}
+
+static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent)
+{
+	struct blk_io_trace_remap r = { .device = 0, };
+
+	get_pdu_remap(ent, &r);
+	return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n",
+			       t_sector(ent),
+			       t_sec(ent), MAJOR(r.device), MINOR(r.device),
+			       (unsigned long long)r.sector);
+}
+
+static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent)
+{
+	return trace_seq_printf(s, "[%s]\n", trace_find_cmdline(ent->pid));
+}
+
+static int blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent)
+{
+	return trace_seq_printf(s, "[%s] %llu\n", trace_find_cmdline(ent->pid),
+				get_pdu_int(ent));
+}
+
+static int blk_log_split(struct trace_seq *s, const struct trace_entry *ent)
+{
+	return trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent),
+				get_pdu_int(ent), trace_find_cmdline(ent->pid));
+}
+
+/*
+ * struct tracer operations
+ */
+
+static void blk_tracer_print_header(struct seq_file *m)
+{
+	if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC))
+		return;
+	seq_puts(m, "# DEV   CPU TIMESTAMP     PID ACT FLG\n"
+		    "#  |     |     |           |   |   |\n");
+}
+
+static void blk_tracer_start(struct trace_array *tr)
+{
+	mutex_lock(&blk_probe_mutex);
+	if (atomic_add_return(1, &blk_probes_ref) == 1)
+		if (blk_register_tracepoints())
+			atomic_dec(&blk_probes_ref);
+	mutex_unlock(&blk_probe_mutex);
+	trace_flags &= ~TRACE_ITER_CONTEXT_INFO;
+}
+
+static int blk_tracer_init(struct trace_array *tr)
+{
+	blk_tr = tr;
+	blk_tracer_start(tr);
+	mutex_lock(&blk_probe_mutex);
+	blk_tracer_enabled++;
+	mutex_unlock(&blk_probe_mutex);
+	return 0;
+}
+
+static void blk_tracer_stop(struct trace_array *tr)
+{
+	trace_flags |= TRACE_ITER_CONTEXT_INFO;
+	mutex_lock(&blk_probe_mutex);
+	if (atomic_dec_and_test(&blk_probes_ref))
+		blk_unregister_tracepoints();
+	mutex_unlock(&blk_probe_mutex);
+}
+
+static void blk_tracer_reset(struct trace_array *tr)
+{
+	if (!atomic_read(&blk_probes_ref))
+		return;
+
+	mutex_lock(&blk_probe_mutex);
+	blk_tracer_enabled--;
+	WARN_ON(blk_tracer_enabled < 0);
+	mutex_unlock(&blk_probe_mutex);
+
+	blk_tracer_stop(tr);
+}
+
+static struct {
+	const char *act[2];
+	int 	   (*print)(struct trace_seq *s, const struct trace_entry *ent);
+} what2act[] __read_mostly = {
+	[__BLK_TA_QUEUE]	= {{  "Q", "queue" }, 	   blk_log_generic },
+	[__BLK_TA_BACKMERGE]	= {{  "M", "backmerge" },  blk_log_generic },
+	[__BLK_TA_FRONTMERGE]	= {{  "F", "frontmerge" }, blk_log_generic },
+	[__BLK_TA_GETRQ]	= {{  "G", "getrq" },	   blk_log_generic },
+	[__BLK_TA_SLEEPRQ]	= {{  "S", "sleeprq" },	   blk_log_generic },
+	[__BLK_TA_REQUEUE]	= {{  "R", "requeue" },	   blk_log_with_error },
+	[__BLK_TA_ISSUE]	= {{  "D", "issue" },	   blk_log_generic },
+	[__BLK_TA_COMPLETE]	= {{  "C", "complete" },   blk_log_with_error },
+	[__BLK_TA_PLUG]		= {{  "P", "plug" },	   blk_log_plug },
+	[__BLK_TA_UNPLUG_IO]	= {{  "U", "unplug_io" },  blk_log_unplug },
+	[__BLK_TA_UNPLUG_TIMER]	= {{ "UT", "unplug_timer" }, blk_log_unplug },
+	[__BLK_TA_INSERT]	= {{  "I", "insert" },	   blk_log_generic },
+	[__BLK_TA_SPLIT]	= {{  "X", "split" },	   blk_log_split },
+	[__BLK_TA_BOUNCE]	= {{  "B", "bounce" },	   blk_log_generic },
+	[__BLK_TA_REMAP]	= {{  "A", "remap" },	   blk_log_remap },
+};
+
+static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
+					       int flags)
+{
+	struct trace_seq *s = &iter->seq;
+	const struct blk_io_trace *t = (struct blk_io_trace *)iter->ent;
+	const u16 what = t->action & ((1 << BLK_TC_SHIFT) - 1);
+	int ret;
+
+	if (!trace_print_context(iter))
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	if (unlikely(what == 0 || what > ARRAY_SIZE(what2act)))
+		ret = trace_seq_printf(s, "Bad pc action %x\n", what);
+	else {
+		const bool long_act = !!(trace_flags & TRACE_ITER_VERBOSE);
+		ret = blk_log_action_seq(s, t, what2act[what].act[long_act]);
+		if (ret)
+			ret = what2act[what].print(s, iter->ent);
+	}
+
+	return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
+}
+
+static int blk_trace_synthesize_old_trace(struct trace_iterator *iter)
+{
+	struct trace_seq *s = &iter->seq;
+	struct blk_io_trace *t = (struct blk_io_trace *)iter->ent;
+	const int offset = offsetof(struct blk_io_trace, sector);
+	struct blk_io_trace old = {
+		.magic	  = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION,
+		.time     = ns2usecs(iter->ts),
+	};
+
+	if (!trace_seq_putmem(s, &old, offset))
+		return 0;
+	return trace_seq_putmem(s, &t->sector,
+				sizeof(old) - offset + t->pdu_len);
+}
+
+static enum print_line_t
+blk_trace_event_print_binary(struct trace_iterator *iter, int flags)
+{
+	return blk_trace_synthesize_old_trace(iter) ?
+			TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
+}
+
+static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter)
+{
+	const struct blk_io_trace *t;
+	u16 what;
+	int ret;
+
+	if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC))
+		return TRACE_TYPE_UNHANDLED;
+
+	t = (const struct blk_io_trace *)iter->ent;
+	what = t->action & ((1 << BLK_TC_SHIFT) - 1);
+
+	if (unlikely(what == 0 || what > ARRAY_SIZE(what2act)))
+		ret = trace_seq_printf(&iter->seq, "Bad pc action %x\n", what);
+	else {
+		const bool long_act = !!(trace_flags & TRACE_ITER_VERBOSE);
+		ret = blk_log_action_iter(iter, what2act[what].act[long_act]);
+		if (ret)
+			ret = what2act[what].print(&iter->seq, iter->ent);
+	}
+
+	return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
+}
+
+static struct tracer blk_tracer __read_mostly = {
+	.name		= "blk",
+	.init		= blk_tracer_init,
+	.reset		= blk_tracer_reset,
+	.start		= blk_tracer_start,
+	.stop		= blk_tracer_stop,
+	.print_header	= blk_tracer_print_header,
+	.print_line	= blk_tracer_print_line,
+	.flags		= &blk_tracer_flags,
+};
+
+static struct trace_event trace_blk_event = {
+	.type	 	= TRACE_BLK,
+	.trace		= blk_trace_event_print,
+	.latency_trace	= blk_trace_event_print,
+	.binary		= blk_trace_event_print_binary,
+};
+
+static int __init init_blk_tracer(void)
+{
+	if (!register_ftrace_event(&trace_blk_event)) {
+		pr_warning("Warning: could not register block events\n");
+		return 1;
+	}
+
+	if (register_tracer(&blk_tracer) != 0) {
+		pr_warning("Warning: could not register the block tracer\n");
+		unregister_ftrace_event(&trace_blk_event);
+		return 1;
+	}
+
+	return 0;
+}
+
+device_initcall(init_blk_tracer);
+
+static int blk_trace_remove_queue(struct request_queue *q)
+{
+	struct blk_trace *bt;
+
+	bt = xchg(&q->blk_trace, NULL);
+	if (bt == NULL)
+		return -EINVAL;
+
+	kfree(bt);
+	return 0;
+}
+
+/*
+ * Setup everything required to start tracing
+ */
+static int blk_trace_setup_queue(struct request_queue *q, dev_t dev)
+{
+	struct blk_trace *old_bt, *bt = NULL;
+	int ret;
+
+	ret = -ENOMEM;
+	bt = kzalloc(sizeof(*bt), GFP_KERNEL);
+	if (!bt)
+		goto err;
+
+	bt->dev = dev;
+	bt->act_mask = (u16)-1;
+	bt->end_lba = -1ULL;
+	bt->trace_state = Blktrace_running;
+
+	old_bt = xchg(&q->blk_trace, bt);
+	if (old_bt != NULL) {
+		(void)xchg(&q->blk_trace, old_bt);
+		kfree(bt);
+		ret = -EBUSY;
+	}
+	return 0;
+err:
+	return ret;
+}
+
+/*
+ * sysfs interface to enable and configure tracing
+ */
+
+static ssize_t sysfs_blk_trace_enable_show(struct device *dev,
+					   struct device_attribute *attr,
+					   char *buf)
+{
+	struct hd_struct *p = dev_to_part(dev);
+	struct block_device *bdev;
+	ssize_t ret = -ENXIO;
+
+	lock_kernel();
+	bdev = bdget(part_devt(p));
+	if (bdev != NULL) {
+		struct request_queue *q = bdev_get_queue(bdev);
+
+		if (q != NULL) {
+			mutex_lock(&bdev->bd_mutex);
+			ret = sprintf(buf, "%u\n", !!q->blk_trace);
+			mutex_unlock(&bdev->bd_mutex);
+		}
+
+		bdput(bdev);
+	}
+
+	unlock_kernel();
+	return ret;
+}
+
+static ssize_t sysfs_blk_trace_enable_store(struct device *dev,
+					    struct device_attribute *attr,
+					    const char *buf, size_t count)
+{
+	struct block_device *bdev;
+	struct request_queue *q;
+	struct hd_struct *p;
+	int value;
+	ssize_t ret = -ENXIO;
+
+	if (count == 0 || sscanf(buf, "%d", &value) != 1)
+		goto out;
+
+	lock_kernel();
+	p = dev_to_part(dev);
+	bdev = bdget(part_devt(p));
+	if (bdev == NULL)
+		goto out_unlock_kernel;
+
+	q = bdev_get_queue(bdev);
+	if (q == NULL)
+		goto out_bdput;
+
+	mutex_lock(&bdev->bd_mutex);
+	if (value)
+		ret = blk_trace_setup_queue(q, bdev->bd_dev);
+	else
+		ret = blk_trace_remove_queue(q);
+	mutex_unlock(&bdev->bd_mutex);
+
+	if (ret == 0)
+		ret = count;
+out_bdput:
+	bdput(bdev);
+out_unlock_kernel:
+	unlock_kernel();
+out:
+	return ret;
+}
+
+static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
+					 struct device_attribute *attr,
+					 char *buf);
+static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
+					  struct device_attribute *attr,
+					  const char *buf, size_t count);
+#define BLK_TRACE_DEVICE_ATTR(_name) \
+	DEVICE_ATTR(_name, S_IRUGO | S_IWUSR, \
+		    sysfs_blk_trace_attr_show, \
+		    sysfs_blk_trace_attr_store)
+
+static DEVICE_ATTR(enable, S_IRUGO | S_IWUSR,
+		   sysfs_blk_trace_enable_show, sysfs_blk_trace_enable_store);
+static BLK_TRACE_DEVICE_ATTR(act_mask);
+static BLK_TRACE_DEVICE_ATTR(pid);
+static BLK_TRACE_DEVICE_ATTR(start_lba);
+static BLK_TRACE_DEVICE_ATTR(end_lba);
+
+static struct attribute *blk_trace_attrs[] = {
+	&dev_attr_enable.attr,
+	&dev_attr_act_mask.attr,
+	&dev_attr_pid.attr,
+	&dev_attr_start_lba.attr,
+	&dev_attr_end_lba.attr,
+	NULL
+};
+
+struct attribute_group blk_trace_attr_group = {
+	.name  = "trace",
+	.attrs = blk_trace_attrs,
+};
+
+static int blk_str2act_mask(const char *str)
+{
+	int mask = 0;
+	char *copy = kstrdup(str, GFP_KERNEL), *s;
+
+	if (copy == NULL)
+		return -ENOMEM;
+
+	s = strstrip(copy);
+
+	while (1) {
+		char *sep = strchr(s, ',');
+
+		if (sep != NULL)
+			*sep = '\0';
+
+		if (strcasecmp(s, "barrier") == 0)
+			mask |= BLK_TC_BARRIER;
+		else if (strcasecmp(s, "complete") == 0)
+			mask |= BLK_TC_COMPLETE;
+		else if (strcasecmp(s, "fs") == 0)
+			mask |= BLK_TC_FS;
+		else if (strcasecmp(s, "issue") == 0)
+			mask |= BLK_TC_ISSUE;
+		else if (strcasecmp(s, "pc") == 0)
+			mask |= BLK_TC_PC;
+		else if (strcasecmp(s, "queue") == 0)
+			mask |= BLK_TC_QUEUE;
+		else if (strcasecmp(s, "read") == 0)
+			mask |= BLK_TC_READ;
+		else if (strcasecmp(s, "requeue") == 0)
+			mask |= BLK_TC_REQUEUE;
+		else if (strcasecmp(s, "sync") == 0)
+			mask |= BLK_TC_SYNC;
+		else if (strcasecmp(s, "write") == 0)
+			mask |= BLK_TC_WRITE;
+
+		if (sep == NULL)
+			break;
+
+		s = sep + 1;
+	}
+	kfree(copy);
+
+	return mask;
+}
+
+static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
+					 struct device_attribute *attr,
+					 char *buf)
+{
+	struct hd_struct *p = dev_to_part(dev);
+	struct request_queue *q;
+	struct block_device *bdev;
+	ssize_t ret = -ENXIO;
+
+	lock_kernel();
+	bdev = bdget(part_devt(p));
+	if (bdev == NULL)
+		goto out_unlock_kernel;
+
+	q = bdev_get_queue(bdev);
+	if (q == NULL)
+		goto out_bdput;
+	mutex_lock(&bdev->bd_mutex);
+	if (q->blk_trace == NULL)
+		ret = sprintf(buf, "disabled\n");
+	else if (attr == &dev_attr_act_mask)
+		ret = sprintf(buf, "%#x\n", q->blk_trace->act_mask);
+	else if (attr == &dev_attr_pid)
+		ret = sprintf(buf, "%u\n", q->blk_trace->pid);
+	else if (attr == &dev_attr_start_lba)
+		ret = sprintf(buf, "%llu\n", q->blk_trace->start_lba);
+	else if (attr == &dev_attr_end_lba)
+		ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba);
+	mutex_unlock(&bdev->bd_mutex);
+out_bdput:
+	bdput(bdev);
+out_unlock_kernel:
+	unlock_kernel();
+	return ret;
+}
+
+static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
+					  struct device_attribute *attr,
+					  const char *buf, size_t count)
+{
+	struct block_device *bdev;
+	struct request_queue *q;
+	struct hd_struct *p;
+	u64 value;
+	ssize_t ret = -ENXIO;
+
+	if (count == 0)
+		goto out;
+
+	if (attr == &dev_attr_act_mask) {
+		if (sscanf(buf, "%llx", &value) != 1) {
+			/* Assume it is a list of trace category names */
+			value = blk_str2act_mask(buf);
+			if (value < 0)
+				goto out;
+		}
+	} else if (sscanf(buf, "%llu", &value) != 1)
+		goto out;
+
+	lock_kernel();
+	p = dev_to_part(dev);
+	bdev = bdget(part_devt(p));
+	if (bdev == NULL)
+		goto out_unlock_kernel;
+
+	q = bdev_get_queue(bdev);
+	if (q == NULL)
+		goto out_bdput;
+
+	mutex_lock(&bdev->bd_mutex);
+	ret = 0;
+	if (q->blk_trace == NULL)
+		ret = blk_trace_setup_queue(q, bdev->bd_dev);
+
+	if (ret == 0) {
+		if (attr == &dev_attr_act_mask)
+			q->blk_trace->act_mask = value;
+		else if (attr == &dev_attr_pid)
+			q->blk_trace->pid = value;
+		else if (attr == &dev_attr_start_lba)
+			q->blk_trace->start_lba = value;
+		else if (attr == &dev_attr_end_lba)
+			q->blk_trace->end_lba = value;
+		ret = count;
+	}
+	mutex_unlock(&bdev->bd_mutex);
+out_bdput:
+	bdput(bdev);
+out_unlock_kernel:
+	unlock_kernel();
+out:
+	return ret;
+}
-- 
cgit v1.2.3


From 7447dce96f2233d250bc39a4a10a42f7c3dd46fc Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 7 Feb 2009 21:33:57 +0100
Subject: tracing/function-graph-tracer: provide a selftest for the function
 graph tracer

Making it more easy to do a basic regression test for this tracer.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.h                 |  2 ++
 kernel/trace/trace_functions_graph.c |  3 +++
 kernel/trace/trace_selftest.c        | 50 ++++++++++++++++++++++++++++++++++++
 3 files changed, 55 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index b9838f4a6929..a011ec062225 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -500,6 +500,8 @@ extern int DYN_FTRACE_TEST_NAME(void);
 #ifdef CONFIG_FTRACE_STARTUP_TEST
 extern int trace_selftest_startup_function(struct tracer *trace,
 					   struct trace_array *tr);
+extern int trace_selftest_startup_function_graph(struct tracer *trace,
+						 struct trace_array *tr);
 extern int trace_selftest_startup_irqsoff(struct tracer *trace,
 					  struct trace_array *tr);
 extern int trace_selftest_startup_preemptoff(struct tracer *trace,
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 222f97d336a6..88f8d9d80a93 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -750,6 +750,9 @@ static struct tracer graph_trace __read_mostly = {
 	.print_line	= print_graph_function,
 	.print_header	= print_graph_headers,
 	.flags		= &tracer_flags,
+#ifdef CONFIG_FTRACE_SELFTEST
+	.selftest	= trace_selftest_startup_function_graph,
+#endif
 };
 
 static __init int init_graph_trace(void)
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 445700e51f6d..0c9aa1457e51 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -13,6 +13,8 @@ static inline int trace_valid_entry(struct trace_entry *entry)
 	case TRACE_PRINT:
 	case TRACE_SPECIAL:
 	case TRACE_BRANCH:
+	case TRACE_GRAPH_ENT:
+	case TRACE_GRAPH_RET:
 		return 1;
 	}
 	return 0;
@@ -227,6 +229,54 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
 }
 #endif /* CONFIG_FUNCTION_TRACER */
 
+
+#ifdef CONFIG_FUNCTION_GRAPH_TRACER
+/*
+ * Pretty much the same than for the function tracer from which the selftest
+ * has been borrowed.
+ */
+int
+trace_selftest_startup_function_graph(struct tracer *trace,
+					struct trace_array *tr)
+{
+	int ret;
+	unsigned long count;
+
+	ret = tracer_init(trace, tr);
+	if (ret) {
+		warn_failed_init_tracer(trace, ret);
+		goto out;
+	}
+
+	/* Sleep for a 1/10 of a second */
+	msleep(100);
+
+	tracing_stop();
+
+	/* check the trace buffer */
+	ret = trace_test_buffer(tr, &count);
+
+	trace->reset(tr);
+	tracing_start();
+
+	if (!ret && !count) {
+		printk(KERN_CONT ".. no entries found ..");
+		ret = -1;
+		goto out;
+	}
+
+	/* Don't test dynamic tracing, the function tracer already did */
+
+out:
+	/* Stop it if we failed */
+	if (ret)
+		ftrace_graph_stop();
+
+	return ret;
+}
+#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
+
+
 #ifdef CONFIG_IRQSOFF_TRACER
 int
 trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
-- 
cgit v1.2.3


From 1292211058aaf872eeb2a0e2677d237916b4501f Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 7 Feb 2009 22:16:12 +0100
Subject: tracing/power: move the power trace headers to a dedicated file

Impact: cleanup

Move the power tracer headers to trace/power.h to keep ftrace.h and power bits
more easy to maintain as separated topics.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Arjan van de Ven <arjan@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c |  2 +-
 arch/x86/kernel/process.c                  |  2 +-
 include/linux/ftrace.h                     | 30 -------------------------
 include/trace/power.h                      | 35 ++++++++++++++++++++++++++++++
 kernel/trace/trace.h                       |  1 +
 kernel/trace/trace_power.c                 |  2 +-
 6 files changed, 39 insertions(+), 33 deletions(-)
 create mode 100644 include/trace/power.h

(limited to 'kernel')

diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 4b1c319d30c3..7ed925edf4d2 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -33,7 +33,7 @@
 #include <linux/cpufreq.h>
 #include <linux/compiler.h>
 #include <linux/dmi.h>
-#include <linux/ftrace.h>
+#include <trace/power.h>
 
 #include <linux/acpi.h>
 #include <acpi/processor.h>
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index e68bb9e30864..026819ffcb0c 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -8,7 +8,7 @@
 #include <linux/module.h>
 #include <linux/pm.h>
 #include <linux/clockchips.h>
-#include <linux/ftrace.h>
+#include <trace/power.h>
 #include <asm/system.h>
 #include <asm/apic.h>
 
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 5e302d636fc2..106b7909d500 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -339,36 +339,6 @@ ftrace_init_module(struct module *mod,
 		   unsigned long *start, unsigned long *end) { }
 #endif
 
-enum {
-	POWER_NONE = 0,
-	POWER_CSTATE = 1,
-	POWER_PSTATE = 2,
-};
-
-struct power_trace {
-#ifdef CONFIG_POWER_TRACER
-	ktime_t			stamp;
-	ktime_t			end;
-	int			type;
-	int			state;
-#endif
-};
-
-#ifdef CONFIG_POWER_TRACER
-extern void trace_power_start(struct power_trace *it, unsigned int type,
-					unsigned int state);
-extern void trace_power_mark(struct power_trace *it, unsigned int type,
-					unsigned int state);
-extern void trace_power_end(struct power_trace *it);
-#else
-static inline void trace_power_start(struct power_trace *it, unsigned int type,
-					unsigned int state) { }
-static inline void trace_power_mark(struct power_trace *it, unsigned int type,
-					unsigned int state) { }
-static inline void trace_power_end(struct power_trace *it) { }
-#endif
-
-
 /*
  * Structure that defines an entry function trace.
  */
diff --git a/include/trace/power.h b/include/trace/power.h
new file mode 100644
index 000000000000..c7cefbcdaea4
--- /dev/null
+++ b/include/trace/power.h
@@ -0,0 +1,35 @@
+#ifndef _TRACE_POWER_H
+#define _TRACE_POWER_H
+
+#include <linux/ktime.h>
+
+enum {
+	POWER_NONE = 0,
+	POWER_CSTATE = 1,
+	POWER_PSTATE = 2,
+};
+
+struct power_trace {
+#ifdef CONFIG_POWER_TRACER
+	ktime_t			stamp;
+	ktime_t			end;
+	int			type;
+	int			state;
+#endif
+};
+
+#ifdef CONFIG_POWER_TRACER
+extern void trace_power_start(struct power_trace *it, unsigned int type,
+					unsigned int state);
+extern void trace_power_mark(struct power_trace *it, unsigned int type,
+					unsigned int state);
+extern void trace_power_end(struct power_trace *it);
+#else
+static inline void trace_power_start(struct power_trace *it, unsigned int type,
+					unsigned int state) { }
+static inline void trace_power_mark(struct power_trace *it, unsigned int type,
+					unsigned int state) { }
+static inline void trace_power_end(struct power_trace *it) { }
+#endif
+
+#endif /* _TRACE_POWER_H */
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index a011ec062225..1ecfb9d2b365 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -10,6 +10,7 @@
 #include <linux/ftrace.h>
 #include <trace/boot.h>
 #include <trace/kmemtrace.h>
+#include <trace/power.h>
 
 enum trace_type {
 	__TRACE_FIRST_TYPE = 0,
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
index bfc21f8079ab..b1d0d087d3a6 100644
--- a/kernel/trace/trace_power.c
+++ b/kernel/trace/trace_power.c
@@ -11,7 +11,7 @@
 
 #include <linux/init.h>
 #include <linux/debugfs.h>
-#include <linux/ftrace.h>
+#include <trace/power.h>
 #include <linux/kallsyms.h>
 #include <linux/module.h>
 
-- 
cgit v1.2.3


From 3861a17bcc0af815f684c6178bc9ec2d790c350e Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 8 Feb 2009 00:04:02 +0100
Subject: tracing/function-graph-tracer: drop the kernel_text_address check

When the function graph tracer picks a return address, it ensures this address
is really a kernel text one by calling __kernel_text_address()

Actually this path has never been taken.Its role was more likely to debug the tracer
on the beginning of its development but this function is wasteful since it is called
for every traced function.

The fault check is already sufficient.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/ftrace.c | 7 -------
 kernel/extable.c         | 4 ++--
 kernel/module.c          | 2 +-
 3 files changed, 3 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index d74d75e0952d..18828aee8781 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -491,13 +491,6 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
 		return;
 	}
 
-	if (unlikely(!__kernel_text_address(old))) {
-		ftrace_graph_stop();
-		*parent = old;
-		WARN_ON(1);
-		return;
-	}
-
 	calltime = cpu_clock(raw_smp_processor_id());
 
 	if (push_return_trace(old, calltime,
diff --git a/kernel/extable.c b/kernel/extable.c
index e136ed8d82ba..0df6253730be 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -41,7 +41,7 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr)
 	return e;
 }
 
-__notrace_funcgraph int core_kernel_text(unsigned long addr)
+int core_kernel_text(unsigned long addr)
 {
 	if (addr >= (unsigned long)_stext &&
 	    addr <= (unsigned long)_etext)
@@ -54,7 +54,7 @@ __notrace_funcgraph int core_kernel_text(unsigned long addr)
 	return 0;
 }
 
-__notrace_funcgraph int __kernel_text_address(unsigned long addr)
+int __kernel_text_address(unsigned long addr)
 {
 	if (core_kernel_text(addr))
 		return 1;
diff --git a/kernel/module.c b/kernel/module.c
index ba22484a987e..22d7379709da 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2735,7 +2735,7 @@ int is_module_address(unsigned long addr)
 
 
 /* Is this a valid kernel address? */
-__notrace_funcgraph struct module *__module_text_address(unsigned long addr)
+struct module *__module_text_address(unsigned long addr)
 {
 	struct module *mod;
 
-- 
cgit v1.2.3


From b5db03c4355e568f1567758287b30a6a262d5057 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Sat, 7 Feb 2009 18:52:59 -0200
Subject: tracing: handle unregistering the current tracer

Impact: simplification

Instead of requiring that plugins have the sequence:

  my_tracer_stop(my_trace_array);
  unregister_tracer(my_tracer);

it should be possible just do a:

  unregister_tracer(my_tracer);

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 03fbd4c20bc2..93040f1bef13 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -559,6 +559,15 @@ void unregister_tracer(struct tracer *type)
 
  found:
 	*t = (*t)->next;
+
+	if (type == current_trace && tracer_enabled) {
+		tracer_enabled = 0;
+		tracing_stop();
+		if (current_trace->stop)
+			current_trace->stop(&global_trace);
+		current_trace = &nop_trace;
+	}
+
 	if (strlen(type->name) != max_tracer_type_len)
 		goto out;
 
-- 
cgit v1.2.3


From 17406b82d621930cca8ccc1272cdac9a7dae8e40 Mon Sep 17 00:00:00 2001
From: Mandeep Singh Baines <msb@google.com>
Date: Fri, 6 Feb 2009 15:37:47 -0800
Subject: softlockup: remove timestamp checking from hung_task

Impact: saves sizeof(long) bytes per task_struct

By guaranteeing that sysctl_hung_task_timeout_secs have elapsed between
tasklist scans we can avoid using timestamps.

Signed-off-by: Mandeep Singh Baines <msb@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h |  1 -
 kernel/fork.c         |  8 +++-----
 kernel/hung_task.c    | 48 +++++++++---------------------------------------
 3 files changed, 12 insertions(+), 45 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 2a2811c6239d..e0d723fea9f5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1241,7 +1241,6 @@ struct task_struct {
 #endif
 #ifdef CONFIG_DETECT_HUNG_TASK
 /* hung task detection */
-	unsigned long last_switch_timestamp;
 	unsigned long last_switch_count;
 #endif
 /* CPU-specific state of this task */
diff --git a/kernel/fork.c b/kernel/fork.c
index fb9444282836..bf582f75014b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -639,6 +639,9 @@ static int copy_mm(unsigned long clone_flags, struct task_struct * tsk)
 
 	tsk->min_flt = tsk->maj_flt = 0;
 	tsk->nvcsw = tsk->nivcsw = 0;
+#ifdef CONFIG_DETECT_HUNG_TASK
+	tsk->last_switch_count = tsk->nvcsw + tsk->nivcsw;
+#endif
 
 	tsk->mm = NULL;
 	tsk->active_mm = NULL;
@@ -1041,11 +1044,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 
 	p->default_timer_slack_ns = current->timer_slack_ns;
 
-#ifdef CONFIG_DETECT_HUNG_TASK
-	p->last_switch_count = 0;
-	p->last_switch_timestamp = 0;
-#endif
-
 	task_io_accounting_init(&p->ioac);
 	acct_clear_integrals(p);
 
diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 3951a80e7cbe..0c924de58cb2 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -34,7 +34,6 @@ unsigned long __read_mostly sysctl_hung_task_check_count = PID_MAX_LIMIT;
  * Zero means infinite timeout - no checking done:
  */
 unsigned long __read_mostly sysctl_hung_task_timeout_secs = 120;
-static unsigned long __read_mostly hung_task_poll_jiffies;
 
 unsigned long __read_mostly sysctl_hung_task_warnings = 10;
 
@@ -69,33 +68,17 @@ static struct notifier_block panic_block = {
 	.notifier_call = hung_task_panic,
 };
 
-/*
- * Returns seconds, approximately.  We don't need nanosecond
- * resolution, and we don't need to waste time with a big divide when
- * 2^30ns == 1.074s.
- */
-static unsigned long get_timestamp(void)
-{
-	int this_cpu = raw_smp_processor_id();
-
-	return cpu_clock(this_cpu) >> 30LL;  /* 2^30 ~= 10^9 */
-}
-
-static void check_hung_task(struct task_struct *t, unsigned long now,
-			    unsigned long timeout)
+static void check_hung_task(struct task_struct *t, unsigned long timeout)
 {
 	unsigned long switch_count = t->nvcsw + t->nivcsw;
 
 	if (t->flags & PF_FROZEN)
 		return;
 
-	if (switch_count != t->last_switch_count || !t->last_switch_timestamp) {
+	if (switch_count != t->last_switch_count) {
 		t->last_switch_count = switch_count;
-		t->last_switch_timestamp = now;
 		return;
 	}
-	if ((long)(now - t->last_switch_timestamp) < timeout)
-		return;
 	if (!sysctl_hung_task_warnings)
 		return;
 	sysctl_hung_task_warnings--;
@@ -111,7 +94,6 @@ static void check_hung_task(struct task_struct *t, unsigned long now,
 	sched_show_task(t);
 	__debug_show_held_locks(t);
 
-	t->last_switch_timestamp = now;
 	touch_nmi_watchdog();
 
 	if (sysctl_hung_task_panic)
@@ -145,7 +127,6 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
 {
 	int max_count = sysctl_hung_task_check_count;
 	int batch_count = HUNG_TASK_BATCHING;
-	unsigned long now = get_timestamp();
 	struct task_struct *g, *t;
 
 	/*
@@ -168,19 +149,16 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout)
 		}
 		/* use "==" to skip the TASK_KILLABLE tasks waiting on NFS */
 		if (t->state == TASK_UNINTERRUPTIBLE)
-			check_hung_task(t, now, timeout);
+			check_hung_task(t, timeout);
 	} while_each_thread(g, t);
  unlock:
 	rcu_read_unlock();
 }
 
-static void update_poll_jiffies(void)
+static unsigned long timeout_jiffies(unsigned long timeout)
 {
 	/* timeout of 0 will disable the watchdog */
-	if (sysctl_hung_task_timeout_secs == 0)
-		hung_task_poll_jiffies = MAX_SCHEDULE_TIMEOUT;
-	else
-		hung_task_poll_jiffies = sysctl_hung_task_timeout_secs * HZ / 2;
+	return timeout ? timeout * HZ : MAX_SCHEDULE_TIMEOUT;
 }
 
 /*
@@ -197,8 +175,6 @@ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
 	if (ret || !write)
 		goto out;
 
-	update_poll_jiffies();
-
 	wake_up_process(watchdog_task);
 
  out:
@@ -211,20 +187,14 @@ int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
 static int watchdog(void *dummy)
 {
 	set_user_nice(current, 0);
-	update_poll_jiffies();
 
 	for ( ; ; ) {
-		unsigned long timeout;
+		unsigned long timeout = sysctl_hung_task_timeout_secs;
 
-		while (schedule_timeout_interruptible(hung_task_poll_jiffies));
+		while (schedule_timeout_interruptible(timeout_jiffies(timeout)))
+			timeout = sysctl_hung_task_timeout_secs;
 
-		/*
-		 * Need to cache timeout here to avoid timeout being set
-		 * to 0 via sysctl while inside check_hung_*_tasks().
-		 */
-		timeout = sysctl_hung_task_timeout_secs;
-		if (timeout)
-			check_hung_uninterruptible_tasks(timeout);
+		check_hung_uninterruptible_tasks(timeout);
 	}
 
 	return 0;
-- 
cgit v1.2.3


From 1dfba05d0f1a9b4245bb242a7c17fe448811a520 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 9 Feb 2009 12:06:54 +0100
Subject: tracing/blktrace: move the tracing file to kernel/trace, fix

Impact: build fix

The BLK_DEV_IO_TRACE entry used to be in block/Kconfig - which
file itself was dependent on CONFIG_BLOCK. But now the entry is
in kernel/trace/Kconfig - which is present even on !CONFIG_BLOCK.

So add a 'depends on BLOCK' to BLK_DEV_IO_TRACE.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 4fee43c01942..3a331289457a 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -305,6 +305,7 @@ config WORKQUEUE_TRACER
 config BLK_DEV_IO_TRACE
 	bool "Support for tracing block io actions"
 	depends on SYSFS
+	depends on BLOCK
 	select RELAY
 	select DEBUG_FS
 	select TRACEPOINTS
-- 
cgit v1.2.3


From b91facc367366b3f71375f337eb5997ec9ab4e69 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 6 Feb 2009 18:30:44 +0100
Subject: tracing/function-graph-tracer: handle the leaf functions from
 trace_pipe

When one cats the trace file, the leaf functions are printed without brackets:

 function();

whereas in the trace_pipe file we'll see the following:

 function() {
 }

This is because the ring_buffer handling is not the same between those two files.
On the trace file, when an entry is printed, the iterator advanced and then we can
check the next entry.

There is no iterator with trace_pipe, the current entry to print has been peeked
and not consumed. So checking the next entry will still return the current one while
we don't consume it.

This patch introduces a new value for the output callbacks to ask the tracing
core to not consume the current entry after printing it.

We need it because we will have to consume the current entry ourself to check
the next one.

Now the trace_pipe is able to handle well the leaf functions.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c                 |  4 +--
 kernel/trace/trace.h                 |  7 +++---
 kernel/trace/trace_functions_graph.c | 48 ++++++++++++++++++++++--------------
 3 files changed, 36 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 93040f1bef13..5b1e9a9e9906 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2468,8 +2468,8 @@ waitagain:
 			iter->seq.len = len;
 			break;
 		}
-
-		trace_consume(iter);
+		if (ret != TRACE_TYPE_NO_CONSUME)
+			trace_consume(iter);
 
 		if (iter->seq.len >= cnt)
 			break;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 1ecfb9d2b365..7b0518adf6d7 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -63,13 +63,13 @@ struct ftrace_entry {
 
 /* Function call entry */
 struct ftrace_graph_ent_entry {
-	struct trace_entry			ent;
+	struct trace_entry		ent;
 	struct ftrace_graph_ent		graph_ent;
 };
 
 /* Function return entry */
 struct ftrace_graph_ret_entry {
-	struct trace_entry			ent;
+	struct trace_entry		ent;
 	struct ftrace_graph_ret		ret;
 };
 extern struct tracer boot_tracer;
@@ -309,7 +309,8 @@ extern void __ftrace_bad_type(void);
 enum print_line_t {
 	TRACE_TYPE_PARTIAL_LINE	= 0,	/* Retry after flushing the seq */
 	TRACE_TYPE_HANDLED	= 1,
-	TRACE_TYPE_UNHANDLED	= 2	/* Relay to other output functions */
+	TRACE_TYPE_UNHANDLED	= 2,	/* Relay to other output functions */
+	TRACE_TYPE_NO_CONSUME	= 3	/* Handled but ask to not consume */
 };
 
 
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 88f8d9d80a93..782ec0fdf453 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -212,8 +212,8 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, pid_t *last_pids_cpu)
 	return ret;
 }
 
-static bool
-trace_branch_is_leaf(struct trace_iterator *iter,
+static struct ftrace_graph_ret_entry *
+get_return_for_leaf(struct trace_iterator *iter,
 		struct ftrace_graph_ent_entry *curr)
 {
 	struct ring_buffer_iter *ring_iter;
@@ -222,24 +222,33 @@ trace_branch_is_leaf(struct trace_iterator *iter,
 
 	ring_iter = iter->buffer_iter[iter->cpu];
 
-	if (!ring_iter)
-		return false;
-
-	event = ring_buffer_iter_peek(ring_iter, NULL);
+	/* First peek to compare current entry and the next one */
+	if (ring_iter)
+		event = ring_buffer_iter_peek(ring_iter, NULL);
+	else {
+	/* We need to consume the current entry to see the next one */
+		ring_buffer_consume(iter->tr->buffer, iter->cpu, NULL);
+		event = ring_buffer_peek(iter->tr->buffer, iter->cpu,
+					NULL);
+	}
 
 	if (!event)
-		return false;
+		return NULL;
 
 	next = ring_buffer_event_data(event);
 
 	if (next->ent.type != TRACE_GRAPH_RET)
-		return false;
+		return NULL;
 
 	if (curr->ent.pid != next->ent.pid ||
 			curr->graph_ent.func != next->ret.func)
-		return false;
+		return NULL;
 
-	return true;
+	/* this is a leaf, now advance the iterator */
+	if (ring_iter)
+		ring_buffer_read(ring_iter, NULL);
+
+	return next;
 }
 
 /* Signal a overhead of time execution to the output */
@@ -376,18 +385,15 @@ static int print_graph_abs_time(u64 t, struct trace_seq *s)
 /* Case of a leaf function on its call entry */
 static enum print_line_t
 print_graph_entry_leaf(struct trace_iterator *iter,
-		struct ftrace_graph_ent_entry *entry, struct trace_seq *s)
+		struct ftrace_graph_ent_entry *entry,
+		struct ftrace_graph_ret_entry *ret_entry, struct trace_seq *s)
 {
-	struct ftrace_graph_ret_entry *ret_entry;
 	struct ftrace_graph_ret *graph_ret;
-	struct ring_buffer_event *event;
 	struct ftrace_graph_ent *call;
 	unsigned long long duration;
 	int ret;
 	int i;
 
-	event = ring_buffer_read(iter->buffer_iter[iter->cpu], NULL);
-	ret_entry = ring_buffer_event_data(event);
 	graph_ret = &ret_entry->ret;
 	call = &entry->graph_ent;
 	duration = graph_ret->rettime - graph_ret->calltime;
@@ -457,7 +463,11 @@ print_graph_entry_nested(struct ftrace_graph_ent_entry *entry,
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
-	return TRACE_TYPE_HANDLED;
+	/*
+	 * we already consumed the current entry to check the next one
+	 * and see if this is a leaf.
+	 */
+	return TRACE_TYPE_NO_CONSUME;
 }
 
 static enum print_line_t
@@ -469,6 +479,7 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
 	pid_t *last_entry = iter->private;
 	struct trace_entry *ent = iter->ent;
 	struct ftrace_graph_ent *call = &field->graph_ent;
+	struct ftrace_graph_ret_entry *leaf_ret;
 
 	/* Pid */
 	if (verif_pid(s, ent->pid, cpu, last_entry) == TRACE_TYPE_PARTIAL_LINE)
@@ -504,8 +515,9 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
 			return TRACE_TYPE_PARTIAL_LINE;
 	}
 
-	if (trace_branch_is_leaf(iter, field))
-		return print_graph_entry_leaf(iter, field, s);
+	leaf_ret = get_return_for_leaf(iter, field);
+	if (leaf_ret)
+		return print_graph_entry_leaf(iter, field, leaf_ret, s);
 	else
 		return print_graph_entry_nested(field, s, iter->ent->pid, cpu);
 
-- 
cgit v1.2.3


From 3c56819b14b00dd449bd776303e61f8532fad09f Mon Sep 17 00:00:00 2001
From: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Date: Mon, 9 Feb 2009 08:15:56 +0200
Subject: tracing: splice support for tracing_pipe

Added and implemented tracing_pipe_fops->splice_read(). This allows
userspace programs to get tracing data more efficiently.

Signed-off-by: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c | 136 +++++++++++++++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace.h |   6 +++
 2 files changed, 142 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 5b1e9a9e9906..9e29fdb0dfe5 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -31,6 +31,7 @@
 #include <linux/fs.h>
 #include <linux/kprobes.h>
 #include <linux/writeback.h>
+#include <linux/splice.h>
 
 #include <linux/stacktrace.h>
 #include <linux/ring_buffer.h>
@@ -364,6 +365,25 @@ ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)
 	return cnt;
 }
 
+ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
+{
+	int len;
+	void *ret;
+
+	if (s->len <= s->readpos)
+		return -EBUSY;
+
+	len = s->len - s->readpos;
+	if (cnt > len)
+		cnt = len;
+	ret = memcpy(buf, s->buffer + s->readpos, cnt);
+	if (!ret)
+		return -EFAULT;
+
+	s->readpos += len;
+	return cnt;
+}
+
 static void
 trace_print_seq(struct seq_file *m, struct trace_seq *s)
 {
@@ -2493,6 +2513,121 @@ out:
 	return sret;
 }
 
+static void tracing_pipe_buf_release(struct pipe_inode_info *pipe,
+				     struct pipe_buffer *buf)
+{
+	__free_page(buf->page);
+}
+
+static void tracing_spd_release_pipe(struct splice_pipe_desc *spd,
+				     unsigned int idx)
+{
+	__free_page(spd->pages[idx]);
+}
+
+static struct pipe_buf_operations tracing_pipe_buf_ops = {
+	.can_merge = 0,
+	.map = generic_pipe_buf_map,
+	.unmap = generic_pipe_buf_unmap,
+	.confirm = generic_pipe_buf_confirm,
+	.release = tracing_pipe_buf_release,
+	.steal = generic_pipe_buf_steal,
+	.get = generic_pipe_buf_get,
+};
+
+static ssize_t tracing_splice_read_pipe(struct file *filp,
+					loff_t *ppos,
+					struct pipe_inode_info *pipe,
+					size_t len,
+					unsigned int flags)
+{
+	struct page *pages[PIPE_BUFFERS];
+	struct partial_page partial[PIPE_BUFFERS];
+	struct trace_iterator *iter = filp->private_data;
+	struct splice_pipe_desc spd = {
+		.pages = pages,
+		.partial = partial,
+		.nr_pages = 0, /* This gets updated below. */
+		.flags = flags,
+		.ops = &tracing_pipe_buf_ops,
+		.spd_release = tracing_spd_release_pipe,
+	};
+	ssize_t ret;
+	size_t count, rem;
+	unsigned int i;
+
+	mutex_lock(&trace_types_lock);
+
+	if (iter->trace->splice_read) {
+		ret = iter->trace->splice_read(iter, filp,
+					       ppos, pipe, len, flags);
+		if (ret)
+			goto out;
+	}
+
+	ret = tracing_wait_pipe(filp);
+	if (ret <= 0)
+		goto out;
+
+	if (!iter->ent && !find_next_entry_inc(iter)) {
+		ret = -EFAULT;
+		goto out;
+	}
+
+	/* Fill as many pages as possible. */
+	for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) {
+		pages[i] = alloc_page(GFP_KERNEL);
+
+		/* Seq buffer is page-sized, exactly what we need. */
+		for (;;) {
+			count = iter->seq.len;
+			ret = print_trace_line(iter);
+			count = iter->seq.len - count;
+			if (rem < count) {
+				rem = 0;
+				iter->seq.len -= count;
+				break;
+			}
+			if (ret == TRACE_TYPE_PARTIAL_LINE) {
+				iter->seq.len -= count;
+				break;
+			}
+
+			trace_consume(iter);
+			rem -= count;
+			if (!find_next_entry_inc(iter))	{
+				rem = 0;
+				iter->ent = NULL;
+				break;
+			}
+		}
+
+		/* Copy the data into the page, so we can start over. */
+		ret = trace_seq_to_buffer(&iter->seq,
+					  page_address(pages[i]),
+					  iter->seq.len);
+		if (ret < 0) {
+			__free_page(pages[i]);
+			break;
+		}
+		partial[i].offset = 0;
+		partial[i].len = iter->seq.len;
+
+		trace_seq_reset(&iter->seq);
+	}
+
+	mutex_unlock(&trace_types_lock);
+
+	spd.nr_pages = i;
+
+	return splice_to_pipe(pipe, &spd);
+
+out:
+	mutex_unlock(&trace_types_lock);
+
+	return ret;
+}
+
 static ssize_t
 tracing_entries_read(struct file *filp, char __user *ubuf,
 		     size_t cnt, loff_t *ppos)
@@ -2656,6 +2791,7 @@ static struct file_operations tracing_pipe_fops = {
 	.open		= tracing_open_pipe,
 	.poll		= tracing_poll_pipe,
 	.read		= tracing_read_pipe,
+	.splice_read	= tracing_splice_read_pipe,
 	.release	= tracing_release_pipe,
 };
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 7b0518adf6d7..dbff0207b213 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -353,6 +353,12 @@ struct tracer {
 	ssize_t			(*read)(struct trace_iterator *iter,
 					struct file *filp, char __user *ubuf,
 					size_t cnt, loff_t *ppos);
+	ssize_t			(*splice_read)(struct trace_iterator *iter,
+					       struct file *filp,
+					       loff_t *ppos,
+					       struct pipe_inode_info *pipe,
+					       size_t len,
+					       unsigned int flags);
 #ifdef CONFIG_FTRACE_STARTUP_TEST
 	int			(*selftest)(struct tracer *trace,
 					    struct trace_array *tr);
-- 
cgit v1.2.3


From ff98781bab2735e6c89793034173e0cb5007a7e5 Mon Sep 17 00:00:00 2001
From: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Date: Mon, 9 Feb 2009 08:15:55 +0200
Subject: tracing: Move pipe waiting code out of tracing_read_pipe().

This moves the pipe waiting code from tracing_read_pipe() into
tracing_wait_pipe(), which is useful to implement other fops, like
splice_read.

Signed-off-by: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c | 69 ++++++++++++++++++++++++++++++----------------------
 1 file changed, 40 insertions(+), 29 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 9e29fdb0dfe5..11fde0ad9cb6 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2388,37 +2388,15 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table)
 	}
 }
 
-/*
- * Consumer reader.
- */
-static ssize_t
-tracing_read_pipe(struct file *filp, char __user *ubuf,
-		  size_t cnt, loff_t *ppos)
+/* Must be called with trace_types_lock mutex held. */
+static int tracing_wait_pipe(struct file *filp)
 {
 	struct trace_iterator *iter = filp->private_data;
-	ssize_t sret;
-
-	/* return any leftover data */
-	sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
-	if (sret != -EBUSY)
-		return sret;
-
-	trace_seq_reset(&iter->seq);
 
-	mutex_lock(&trace_types_lock);
-	if (iter->trace->read) {
-		sret = iter->trace->read(iter, filp, ubuf, cnt, ppos);
-		if (sret)
-			goto out;
-	}
-
-waitagain:
-	sret = 0;
 	while (trace_empty(iter)) {
 
 		if ((filp->f_flags & O_NONBLOCK)) {
-			sret = -EAGAIN;
-			goto out;
+			return -EAGAIN;
 		}
 
 		/*
@@ -2443,12 +2421,11 @@ waitagain:
 		iter->tr->waiter = NULL;
 
 		if (signal_pending(current)) {
-			sret = -EINTR;
-			goto out;
+			return -EINTR;
 		}
 
 		if (iter->trace != current_trace)
-			goto out;
+			return 0;
 
 		/*
 		 * We block until we read something and tracing is disabled.
@@ -2465,9 +2442,43 @@ waitagain:
 		continue;
 	}
 
+	return 1;
+}
+
+/*
+ * Consumer reader.
+ */
+static ssize_t
+tracing_read_pipe(struct file *filp, char __user *ubuf,
+		  size_t cnt, loff_t *ppos)
+{
+	struct trace_iterator *iter = filp->private_data;
+	ssize_t sret;
+
+	/* return any leftover data */
+	sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
+	if (sret != -EBUSY)
+		return sret;
+
+	trace_seq_reset(&iter->seq);
+
+	mutex_lock(&trace_types_lock);
+	if (iter->trace->read) {
+		sret = iter->trace->read(iter, filp, ubuf, cnt, ppos);
+		if (sret)
+			goto out;
+	}
+
+waitagain:
+	sret = tracing_wait_pipe(filp);
+	if (sret <= 0)
+		goto out;
+
 	/* stop when tracing is finished */
-	if (trace_empty(iter))
+	if (trace_empty(iter)) {
+		sret = 0;
 		goto out;
+	}
 
 	if (cnt >= PAGE_SIZE)
 		cnt = PAGE_SIZE - 1;
-- 
cgit v1.2.3


From 34cd4998d38f9bd04f34b78a7cb0c7f1bee00bd9 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 9 Feb 2009 12:06:29 -0500
Subject: tracing: clean up splice code

Ingo Molnar suggested a series of clean ups for the splice code.
This patch implements those suggestions.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c | 96 ++++++++++++++++++++++++++++++----------------------
 1 file changed, 55 insertions(+), 41 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 11fde0ad9cb6..d89821283b47 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2537,15 +2537,49 @@ static void tracing_spd_release_pipe(struct splice_pipe_desc *spd,
 }
 
 static struct pipe_buf_operations tracing_pipe_buf_ops = {
-	.can_merge = 0,
-	.map = generic_pipe_buf_map,
-	.unmap = generic_pipe_buf_unmap,
-	.confirm = generic_pipe_buf_confirm,
-	.release = tracing_pipe_buf_release,
-	.steal = generic_pipe_buf_steal,
-	.get = generic_pipe_buf_get,
+	.can_merge		= 0,
+	.map			= generic_pipe_buf_map,
+	.unmap			= generic_pipe_buf_unmap,
+	.confirm		= generic_pipe_buf_confirm,
+	.release		= tracing_pipe_buf_release,
+	.steal			= generic_pipe_buf_steal,
+	.get			= generic_pipe_buf_get,
 };
 
+static size_t
+tracing_fill_pipe_page(struct page *pages, size_t rem,
+			struct trace_iterator *iter)
+{
+	size_t count;
+	int ret;
+
+	/* Seq buffer is page-sized, exactly what we need. */
+	for (;;) {
+		count = iter->seq.len;
+		ret = print_trace_line(iter);
+		count = iter->seq.len - count;
+		if (rem < count) {
+			rem = 0;
+			iter->seq.len -= count;
+			break;
+		}
+		if (ret == TRACE_TYPE_PARTIAL_LINE) {
+			iter->seq.len -= count;
+			break;
+		}
+
+		trace_consume(iter);
+		rem -= count;
+		if (!find_next_entry_inc(iter))	{
+			rem = 0;
+			iter->ent = NULL;
+			break;
+		}
+	}
+
+	return rem;
+}
+
 static ssize_t tracing_splice_read_pipe(struct file *filp,
 					loff_t *ppos,
 					struct pipe_inode_info *pipe,
@@ -2556,15 +2590,15 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
 	struct partial_page partial[PIPE_BUFFERS];
 	struct trace_iterator *iter = filp->private_data;
 	struct splice_pipe_desc spd = {
-		.pages = pages,
-		.partial = partial,
-		.nr_pages = 0, /* This gets updated below. */
-		.flags = flags,
-		.ops = &tracing_pipe_buf_ops,
-		.spd_release = tracing_spd_release_pipe,
+		.pages		= pages,
+		.partial	= partial,
+		.nr_pages	= 0, /* This gets updated below. */
+		.flags		= flags,
+		.ops		= &tracing_pipe_buf_ops,
+		.spd_release	= tracing_spd_release_pipe,
 	};
 	ssize_t ret;
-	size_t count, rem;
+	size_t rem;
 	unsigned int i;
 
 	mutex_lock(&trace_types_lock);
@@ -2573,45 +2607,25 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
 		ret = iter->trace->splice_read(iter, filp,
 					       ppos, pipe, len, flags);
 		if (ret)
-			goto out;
+			goto out_err;
 	}
 
 	ret = tracing_wait_pipe(filp);
 	if (ret <= 0)
-		goto out;
+		goto out_err;
 
 	if (!iter->ent && !find_next_entry_inc(iter)) {
 		ret = -EFAULT;
-		goto out;
+		goto out_err;
 	}
 
 	/* Fill as many pages as possible. */
 	for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) {
 		pages[i] = alloc_page(GFP_KERNEL);
+		if (!pages[i])
+			break;
 
-		/* Seq buffer is page-sized, exactly what we need. */
-		for (;;) {
-			count = iter->seq.len;
-			ret = print_trace_line(iter);
-			count = iter->seq.len - count;
-			if (rem < count) {
-				rem = 0;
-				iter->seq.len -= count;
-				break;
-			}
-			if (ret == TRACE_TYPE_PARTIAL_LINE) {
-				iter->seq.len -= count;
-				break;
-			}
-
-			trace_consume(iter);
-			rem -= count;
-			if (!find_next_entry_inc(iter))	{
-				rem = 0;
-				iter->ent = NULL;
-				break;
-			}
-		}
+		rem = tracing_fill_pipe_page(pages[i], rem, iter);
 
 		/* Copy the data into the page, so we can start over. */
 		ret = trace_seq_to_buffer(&iter->seq,
@@ -2633,7 +2647,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
 
 	return splice_to_pipe(pipe, &spd);
 
-out:
+out_err:
 	mutex_unlock(&trace_types_lock);
 
 	return ret;
-- 
cgit v1.2.3


From 6cd61c0baa8bce32271226198b46c67a7a05d108 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 9 Feb 2009 22:17:39 +0900
Subject: elf: add ELF_CORE_COPY_KERNEL_REGS()

ELF core dump is used for both user land core dump and kernel crash
dump.  Depending on architecture, register might need to be accessed
differently for userland and kernel.  Allow architectures to define
ELF_CORE_COPY_KERNEL_REGS() and use different operation for kernel
register dump.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/elfcore.h | 9 +++++++++
 kernel/kexec.c          | 2 +-
 2 files changed, 10 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/elfcore.h b/include/linux/elfcore.h
index 5ca54d77079f..7605c5e9589f 100644
--- a/include/linux/elfcore.h
+++ b/include/linux/elfcore.h
@@ -111,6 +111,15 @@ static inline void elf_core_copy_regs(elf_gregset_t *elfregs, struct pt_regs *re
 #endif
 }
 
+static inline void elf_core_copy_kernel_regs(elf_gregset_t *elfregs, struct pt_regs *regs)
+{
+#ifdef ELF_CORE_COPY_KERNEL_REGS
+	ELF_CORE_COPY_KERNEL_REGS((*elfregs), regs);
+#else
+	elf_core_copy_regs(elfregs, regs);
+#endif
+}
+
 static inline int elf_core_copy_task_regs(struct task_struct *t, elf_gregset_t* elfregs)
 {
 #ifdef ELF_CORE_COPY_TASK_REGS
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 8a6d7b08864e..795e7b67a228 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1130,7 +1130,7 @@ void crash_save_cpu(struct pt_regs *regs, int cpu)
 		return;
 	memset(&prstatus, 0, sizeof(prstatus));
 	prstatus.pr_pid = current->pid;
-	elf_core_copy_regs(&prstatus.pr_reg, regs);
+	elf_core_copy_kernel_regs(&prstatus.pr_reg, regs);
 	buf = append_elf_note(buf, KEXEC_CORE_NOTE_NAME, NT_PRSTATUS,
 		      	      &prstatus, sizeof(prstatus));
 	final_note(buf);
-- 
cgit v1.2.3


From 5d707e9c8ef2a3596ed5c975c6ff05cec890c2b4 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Mon, 9 Feb 2009 22:17:39 +0900
Subject: stackprotector: update make rules

Impact: no default -fno-stack-protector if stackp is enabled, cleanup

Stackprotector make rules had the following problems.

* cc support test and warning are scattered across makefile and
  kernel/panic.c.

* -fno-stack-protector was always added regardless of configuration.

Update such that cc support test and warning are contained in makefile
and -fno-stack-protector is added iff stackp is turned off.  While at
it, prepare for 32bit support.

Signed-off-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Makefile                                  |  3 ++-
 arch/x86/Makefile                         | 17 ++++++++++-------
 kernel/panic.c                            |  4 ----
 scripts/gcc-x86_64-has-stack-protector.sh |  4 +++-
 4 files changed, 15 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/Makefile b/Makefile
index 681c1d23b4d4..77a006dae2da 100644
--- a/Makefile
+++ b/Makefile
@@ -532,8 +532,9 @@ KBUILD_CFLAGS += $(call cc-option,-Wframe-larger-than=${CONFIG_FRAME_WARN})
 endif
 
 # Force gcc to behave correct even for buggy distributions
-# Arch Makefiles may override this setting
+ifndef CONFIG_CC_STACKPROTECTOR
 KBUILD_CFLAGS += $(call cc-option, -fno-stack-protector)
+endif
 
 ifdef CONFIG_FRAME_POINTER
 KBUILD_CFLAGS	+= -fno-omit-frame-pointer -fno-optimize-sibling-calls
diff --git a/arch/x86/Makefile b/arch/x86/Makefile
index cacee981d166..ab48ab497e5a 100644
--- a/arch/x86/Makefile
+++ b/arch/x86/Makefile
@@ -70,14 +70,17 @@ else
         # this works around some issues with generating unwind tables in older gccs
         # newer gccs do it by default
         KBUILD_CFLAGS += -maccumulate-outgoing-args
+endif
 
-        stackp := $(CONFIG_SHELL) $(srctree)/scripts/gcc-x86_64-has-stack-protector.sh
-        stackp-$(CONFIG_CC_STACKPROTECTOR) := $(shell $(stackp) \
-                "$(CC)" "-fstack-protector -DGCC_HAS_SP" )
-        stackp-$(CONFIG_CC_STACKPROTECTOR_ALL) += $(shell $(stackp) \
-                "$(CC)" -fstack-protector-all )
-
-        KBUILD_CFLAGS += $(stackp-y)
+ifdef CONFIG_CC_STACKPROTECTOR
+	cc_has_sp := $(srctree)/scripts/gcc-x86_$(BITS)-has-stack-protector.sh
+        ifeq ($(shell $(CONFIG_SHELL) $(cc_has_sp) $(CC)),y)
+                stackp-y := -fstack-protector
+                stackp-$(CONFIG_CC_STACKPROTECTOR_ALL) += -fstack-protector-all
+                KBUILD_CFLAGS += $(stackp-y)
+        else
+                $(warning stack protector enabled but no compiler support)
+        endif
 endif
 
 # Stackpointer is addressed different for 32 bit and 64 bit x86
diff --git a/kernel/panic.c b/kernel/panic.c
index 33cab3de1763..32fe4eff1b89 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -359,10 +359,6 @@ EXPORT_SYMBOL(warn_slowpath);
 
 #ifdef CONFIG_CC_STACKPROTECTOR
 
-#ifndef GCC_HAS_SP
-#warning You have selected the CONFIG_CC_STACKPROTECTOR option, but the gcc used does not support this.
-#endif
-
 /*
  * Called when gcc's -fstack-protector feature is used, and
  * gcc detects corruption of the on-stack canary value
diff --git a/scripts/gcc-x86_64-has-stack-protector.sh b/scripts/gcc-x86_64-has-stack-protector.sh
index 325c0a1b03b6..2d69fcdc5609 100644
--- a/scripts/gcc-x86_64-has-stack-protector.sh
+++ b/scripts/gcc-x86_64-has-stack-protector.sh
@@ -2,5 +2,7 @@
 
 echo "int foo(void) { char X[200]; return 3; }" | $1 -S -xc -c -O0 -mcmodel=kernel -fstack-protector - -o - 2> /dev/null | grep -q "%gs"
 if [ "$?" -eq "0" ] ; then
-	echo $2
+	echo y
+else
+	echo n
 fi
-- 
cgit v1.2.3


From acd895795d35d7c6405f20301a846d16998795ec Mon Sep 17 00:00:00 2001
From: Hugh Dickins <hugh@veritas.com>
Date: Mon, 9 Feb 2009 19:20:50 +0000
Subject: profiling: fix broken profiling regression

Impact: fix broken /proc/profile on UP machines

Commit c309b917cab55799ea489d7b5f1b77025d9f8462 "cpumask: convert
kernel/profile.c" broke profiling.  prof_cpu_mask was previously
initialized to CPU_MASK_ALL, but left uninitialized in that commit.
We need to copy cpu_possible_mask (cpu_online_mask is not enough).

Signed-off-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/profile.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/profile.c b/kernel/profile.c
index 784933acf5b8..7724e0409bae 100644
--- a/kernel/profile.c
+++ b/kernel/profile.c
@@ -114,12 +114,15 @@ int __ref profile_init(void)
 	if (!slab_is_available()) {
 		prof_buffer = alloc_bootmem(buffer_bytes);
 		alloc_bootmem_cpumask_var(&prof_cpu_mask);
+		cpumask_copy(prof_cpu_mask, cpu_possible_mask);
 		return 0;
 	}
 
 	if (!alloc_cpumask_var(&prof_cpu_mask, GFP_KERNEL))
 		return -ENOMEM;
 
+	cpumask_copy(prof_cpu_mask, cpu_possible_mask);
+
 	prof_buffer = kzalloc(buffer_bytes, GFP_KERNEL);
 	if (prof_buffer)
 		return 0;
-- 
cgit v1.2.3


From b85fa01ed958ca59523a2db3c2ee647b98745d6a Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Mon, 9 Feb 2009 14:21:14 +0800
Subject: ring_buffer: fix typing mistake

Impact: Fix bug

I found several very very curious line.
It's so curious that it may be brought by typing mistake.

When (cpu_buffer->reader_page == cpu_buffer->commit_page):

1) We haven't copied it for bpage is changed:
   bpage = cpu_buffer->reader_page->page;
   memcpy(bpage->data, cpu_buffer->reader_page->page->data + read ... )
2) We need update cpu_buffer->reader_page->read, but
   "cpu_buffer->reader_page += read;" is not right.

[
  This bug was a typo. The commit->reader_page is a page pointer
  and not an index into the page. The line should have been
  commit->reader_page->read += read.  The other changes
  by Lai are nice clean ups to the code.  - SDR
]

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ring_buffer.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 53ba3a6d16d0..eca282720838 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2406,7 +2406,7 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data)
  * to swap with a page in the ring buffer.
  *
  * for example:
- *	rpage = ring_buffer_alloc_page(buffer);
+ *	rpage = ring_buffer_alloc_read_page(buffer);
  *	if (!rpage)
  *		return error;
  *	ret = ring_buffer_read_page(buffer, &rpage, cpu, 0);
@@ -2461,18 +2461,17 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 	 */
 	if (cpu_buffer->reader_page == cpu_buffer->commit_page) {
 		unsigned int read = cpu_buffer->reader_page->read;
+		unsigned int commit = rb_page_commit(cpu_buffer->reader_page);
 
 		if (full)
 			goto out;
 		/* The writer is still on the reader page, we must copy */
-		bpage = cpu_buffer->reader_page->page;
 		memcpy(bpage->data,
 		       cpu_buffer->reader_page->page->data + read,
-		       local_read(&bpage->commit) - read);
+		       commit - read);
 
 		/* consume what was read */
-		cpu_buffer->reader_page += read;
-
+		cpu_buffer->reader_page->read = commit;
 	} else {
 		/* swap the pages */
 		rb_init_page(bpage);
-- 
cgit v1.2.3


From 667d24125839b6f3363d8177d7ed9fab8a40e45f Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Mon, 9 Feb 2009 14:21:17 +0800
Subject: ring_buffer: fix ring_buffer_read_page()

Impact: change API and init bpage when copy

ring_buffer_read_page()/rb_remove_entries() may be called for
a partially consumed page.

Add a parameter for rb_remove_entries() and make it update
cpu_buffer->entries correctly for partially consumed pages.

ring_buffer_read_page() now returns the offset to the next event.

Init the bpage's time_stamp when return value is 0.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ring_buffer.c | 33 ++++++++++++++++++++-------------
 1 file changed, 20 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index eca282720838..10d202ea06f3 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2332,13 +2332,14 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
 EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
 
 static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer,
-			      struct buffer_data_page *bpage)
+			      struct buffer_data_page *bpage,
+			      unsigned int offset)
 {
 	struct ring_buffer_event *event;
 	unsigned long head;
 
 	__raw_spin_lock(&cpu_buffer->lock);
-	for (head = 0; head < local_read(&bpage->commit);
+	for (head = offset; head < local_read(&bpage->commit);
 	     head += rb_event_length(event)) {
 
 		event = __rb_data_page_index(bpage, head);
@@ -2410,8 +2411,8 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data)
  *	if (!rpage)
  *		return error;
  *	ret = ring_buffer_read_page(buffer, &rpage, cpu, 0);
- *	if (ret)
- *		process_page(rpage);
+ *	if (ret >= 0)
+ *		process_page(rpage, ret);
  *
  * When @full is set, the function will not return true unless
  * the writer is off the reader page.
@@ -2422,8 +2423,8 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data)
  *  responsible for that.
  *
  * Returns:
- *  1 if data has been transferred
- *  0 if no data has been transferred.
+ *  >=0 if data has been transferred, returns the offset of consumed data.
+ *  <0 if no data has been transferred.
  */
 int ring_buffer_read_page(struct ring_buffer *buffer,
 			    void **data_page, int cpu, int full)
@@ -2432,7 +2433,8 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 	struct ring_buffer_event *event;
 	struct buffer_data_page *bpage;
 	unsigned long flags;
-	int ret = 0;
+	unsigned int read;
+	int ret = -1;
 
 	if (!data_page)
 		return 0;
@@ -2454,24 +2456,29 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 	/* check for data */
 	if (!local_read(&cpu_buffer->reader_page->page->commit))
 		goto out;
+
+	read = cpu_buffer->reader_page->read;
 	/*
 	 * If the writer is already off of the read page, then simply
 	 * switch the read page with the given page. Otherwise
 	 * we need to copy the data from the reader to the writer.
 	 */
 	if (cpu_buffer->reader_page == cpu_buffer->commit_page) {
-		unsigned int read = cpu_buffer->reader_page->read;
 		unsigned int commit = rb_page_commit(cpu_buffer->reader_page);
+		struct buffer_data_page *rpage = cpu_buffer->reader_page->page;
 
 		if (full)
 			goto out;
 		/* The writer is still on the reader page, we must copy */
-		memcpy(bpage->data,
-		       cpu_buffer->reader_page->page->data + read,
-		       commit - read);
+		memcpy(bpage->data + read, rpage->data + read, commit - read);
 
 		/* consume what was read */
 		cpu_buffer->reader_page->read = commit;
+
+		/* update bpage */
+		local_set(&bpage->commit, commit);
+		if (!read)
+			bpage->time_stamp = rpage->time_stamp;
 	} else {
 		/* swap the pages */
 		rb_init_page(bpage);
@@ -2480,10 +2487,10 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 		cpu_buffer->reader_page->read = 0;
 		*data_page = bpage;
 	}
-	ret = 1;
+	ret = read;
 
 	/* update the entry counter */
-	rb_remove_entries(cpu_buffer, bpage);
+	rb_remove_entries(cpu_buffer, bpage, read);
  out:
 	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
-- 
cgit v1.2.3


From 4543ae7ce1cb8b5ff27a59009e7991ea63791a71 Mon Sep 17 00:00:00 2001
From: Tobias Klauser <tklauser@distanz.ch>
Date: Mon, 9 Feb 2009 23:09:32 +0100
Subject: tracing: storage class should be before const qualifier

The C99 specification states in section 6.11.5:

The placement of a storage-class specifier other than at the beginning
of the declaration specifiers in a declaration is an obsolescent
feature.

Signed-off-by: Tobias Klauser <tklauser@distanz.ch>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_sysprof.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index 84ca9d81e74d..4e2de4de1d65 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -88,7 +88,7 @@ static void backtrace_address(void *data, unsigned long addr, int reliable)
 	}
 }
 
-const static struct stacktrace_ops backtrace_ops = {
+static const struct stacktrace_ops backtrace_ops = {
 	.warning		= backtrace_warning,
 	.warning_symbol		= backtrace_warning_symbol,
 	.stack			= backtrace_stack,
-- 
cgit v1.2.3


From f54fc98aa656f334c1571df6e3ca9178ea223847 Mon Sep 17 00:00:00 2001
From: Wenji Huang <wenji.huang@oracle.com>
Date: Tue, 10 Feb 2009 01:02:46 -0500
Subject: tracing: remove unneeded variable

Impact: clean up.

Remove the unnecessary variable ret.

Signed-off-by: Wenji Huang <wenji.huang@oracle.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_branch.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index f8ae2c50e01d..c2e68d440c4d 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -91,8 +91,6 @@ void trace_likely_condition(struct ftrace_branch_data *f, int val, int expect)
 
 int enable_branch_tracing(struct trace_array *tr)
 {
-	int ret = 0;
-
 	mutex_lock(&branch_tracing_mutex);
 	branch_tracer = tr;
 	/*
@@ -103,7 +101,7 @@ int enable_branch_tracing(struct trace_array *tr)
 	branch_tracing_enabled++;
 	mutex_unlock(&branch_tracing_mutex);
 
-	return ret;
+	return 0;
 }
 
 void disable_branch_tracing(void)
-- 
cgit v1.2.3


From 810dc73265cd690b2bc6010489b4317bba2cda39 Mon Sep 17 00:00:00 2001
From: Wenji Huang <wenji.huang@oracle.com>
Date: Tue, 10 Feb 2009 01:03:05 -0500
Subject: tracing: provide correct return value after outputting the event

This patch is to make the function return early on failure, and give
correct return value on success.

Signed-off-by: Wenji Huang <wenji.huang@oracle.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_functions_graph.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 782ec0fdf453..519a0cab1530 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -186,30 +186,30 @@ verif_pid(struct trace_seq *s, pid_t pid, int cpu, pid_t *last_pids_cpu)
 	ret = trace_seq_printf(s,
 		" ------------------------------------------\n");
 	if (!ret)
-		TRACE_TYPE_PARTIAL_LINE;
+		return TRACE_TYPE_PARTIAL_LINE;
 
 	ret = print_graph_cpu(s, cpu);
 	if (ret == TRACE_TYPE_PARTIAL_LINE)
-		TRACE_TYPE_PARTIAL_LINE;
+		return TRACE_TYPE_PARTIAL_LINE;
 
 	ret = print_graph_proc(s, prev_pid);
 	if (ret == TRACE_TYPE_PARTIAL_LINE)
-		TRACE_TYPE_PARTIAL_LINE;
+		return TRACE_TYPE_PARTIAL_LINE;
 
 	ret = trace_seq_printf(s, " => ");
 	if (!ret)
-		TRACE_TYPE_PARTIAL_LINE;
+		return TRACE_TYPE_PARTIAL_LINE;
 
 	ret = print_graph_proc(s, pid);
 	if (ret == TRACE_TYPE_PARTIAL_LINE)
-		TRACE_TYPE_PARTIAL_LINE;
+		return TRACE_TYPE_PARTIAL_LINE;
 
 	ret = trace_seq_printf(s,
 		"\n ------------------------------------------\n\n");
 	if (!ret)
-		TRACE_TYPE_PARTIAL_LINE;
+		return TRACE_TYPE_PARTIAL_LINE;
 
-	return ret;
+	return TRACE_TYPE_HANDLED;
 }
 
 static struct ftrace_graph_ret_entry *
-- 
cgit v1.2.3


From c3706f005c3aaf570e71f0f083fdbb59a5a9fa2e Mon Sep 17 00:00:00 2001
From: Wenji Huang <wenji.huang@oracle.com>
Date: Tue, 10 Feb 2009 01:03:18 -0500
Subject: tracing: fix typos in comments

Impact: clean up.

Fix typos in the comments.

Signed-off-by: Wenji Huang <wenji.huang@oracle.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 include/linux/ring_buffer.h      | 2 +-
 kernel/trace/ring_buffer.c       | 8 ++++----
 kernel/trace/trace.c             | 2 +-
 kernel/trace/trace_hw_branches.c | 2 +-
 4 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 3c103d636da3..8e6646a54acf 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -8,7 +8,7 @@ struct ring_buffer;
 struct ring_buffer_iter;
 
 /*
- * Don't reference this struct directly, use functions below.
+ * Don't refer to this struct directly, use functions below.
  */
 struct ring_buffer_event {
 	u32		type:2, len:3, time_delta:27;
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 10d202ea06f3..fa64e1f003eb 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -91,7 +91,7 @@ EXPORT_SYMBOL_GPL(tracing_off);
  * tracing_off_permanent - permanently disable ring buffers
  *
  * This function, once called, will disable all ring buffers
- * permanenty.
+ * permanently.
  */
 void tracing_off_permanent(void)
 {
@@ -210,7 +210,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_data);
 
 struct buffer_data_page {
 	u64		 time_stamp;	/* page time stamp */
-	local_t		 commit;	/* write commited index */
+	local_t		 commit;	/* write committed index */
 	unsigned char	 data[];	/* data of buffer page */
 };
 
@@ -260,7 +260,7 @@ struct ring_buffer_per_cpu {
 	struct list_head		pages;
 	struct buffer_page		*head_page;	/* read from head */
 	struct buffer_page		*tail_page;	/* write to tail */
-	struct buffer_page		*commit_page;	/* commited pages */
+	struct buffer_page		*commit_page;	/* committed pages */
 	struct buffer_page		*reader_page;
 	unsigned long			overrun;
 	unsigned long			entries;
@@ -303,7 +303,7 @@ struct ring_buffer_iter {
  * check_pages - integrity check of buffer pages
  * @cpu_buffer: CPU buffer with pages to test
  *
- * As a safty measure we check to make sure the data pages have not
+ * As a safety measure we check to make sure the data pages have not
  * been corrupted.
  */
 static int rb_check_pages(struct ring_buffer_per_cpu *cpu_buffer)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d89821283b47..d7c175a442df 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1963,7 +1963,7 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf,
 	struct tracer_opt *trace_opts = current_trace->flags->opts;
 
 
-	/* calulate max size */
+	/* calculate max size */
 	for (i = 0; trace_options[i]; i++) {
 		len += strlen(trace_options[i]);
 		len += 3; /* "no" and space */
diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index e3e7db61c067..0794dd33f27b 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -75,7 +75,7 @@ static void bts_trace_start(struct trace_array *tr)
 }
 
 /*
- * Start tracing on the current cpu.
+ * Stop tracing on the current cpu.
  * The argument is ignored.
  *
  * pre: bts_tracer_mutex must be locked.
-- 
cgit v1.2.3


From 4fd2735881bf4d8bf5e30979f31fc2f1b1d505fa Mon Sep 17 00:00:00 2001
From: Hannes Eder <hannes@hanneseder.net>
Date: Tue, 10 Feb 2009 19:44:12 +0100
Subject: tracing: fix sparse warnings: make symbols static

Impact: make global variables and a global function static

The function '__trace_userstack' does not seem to have a caller, so it
is commented out.

Fix this sparse warnings:
  kernel/trace/trace.c:82:5: warning: symbol 'tracing_disabled' was not declared. Should it be static?
  kernel/trace/trace.c:600:10: warning: symbol 'trace_record_cmdline_disabled' was not declared. Should it be static?
  kernel/trace/trace.c:957:6: warning: symbol '__trace_userstack' was not declared. Should it be static?
  kernel/trace/trace.c:1694:5: warning: symbol 'tracing_release' was not declared. Should it be static?

Signed-off-by: Hannes Eder <hannes@hanneseder.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d7c175a442df..c157ba70f30c 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -80,7 +80,7 @@ static int dummy_set_flag(u32 old_flags, u32 bit, int set)
  * of the tracer is successful. But that is the only place that sets
  * this back to zero.
  */
-int tracing_disabled = 1;
+static int tracing_disabled = 1;
 
 static DEFINE_PER_CPU(local_t, ftrace_cpu_disabled);
 
@@ -626,7 +626,7 @@ static int cmdline_idx;
 static DEFINE_SPINLOCK(trace_cmdline_lock);
 
 /* temporary disable recording */
-atomic_t trace_record_cmdline_disabled __read_mostly;
+static atomic_t trace_record_cmdline_disabled __read_mostly;
 
 static void trace_init_cmdlines(void)
 {
@@ -983,10 +983,12 @@ static void ftrace_trace_userstack(struct trace_array *tr,
 #endif
 }
 
-void __trace_userstack(struct trace_array *tr, unsigned long flags)
+#ifdef UNUSED
+static void __trace_userstack(struct trace_array *tr, unsigned long flags)
 {
 	ftrace_trace_userstack(tr, flags, preempt_count());
 }
+#endif /* UNUSED */
 
 static void
 ftrace_trace_special(void *__tr,
@@ -1720,7 +1722,7 @@ int tracing_open_generic(struct inode *inode, struct file *filp)
 	return 0;
 }
 
-int tracing_release(struct inode *inode, struct file *file)
+static int tracing_release(struct inode *inode, struct file *file)
 {
 	struct seq_file *m = (struct seq_file *)file->private_data;
 	struct trace_iterator *iter = m->private;
-- 
cgit v1.2.3


From 5e39841c45cf5e6ea930ede1b0303309e03037a2 Mon Sep 17 00:00:00 2001
From: Hannes Eder <hannes@hanneseder.net>
Date: Tue, 10 Feb 2009 19:44:34 +0100
Subject: tracing: fix sparse warnings: fix (un-)signedness

Fix these sparse warnings:

  kernel/trace/ring_buffer.c:70:37: warning: incorrect type in argument 2 (different signedness)
  kernel/trace/ring_buffer.c:84:39: warning: incorrect type in argument 2 (different signedness)
  kernel/trace/ring_buffer.c:96:43: warning: incorrect type in argument 2 (different signedness)
  kernel/trace/ring_buffer.c:2475:13: warning: incorrect type in argument 2 (different signedness)
  kernel/trace/ring_buffer.c:2475:13: warning: incorrect type in argument 2 (different signedness)
  kernel/trace/ring_buffer.c:2478:42: warning: incorrect type in argument 2 (different signedness)
  kernel/trace/ring_buffer.c:2478:42: warning: incorrect type in argument 2 (different signedness)
  kernel/trace/ring_buffer.c:2500:40: warning: incorrect type in argument 3 (different signedness)
  kernel/trace/ring_buffer.c:2505:44: warning: incorrect type in argument 2 (different signedness)
  kernel/trace/ring_buffer.c:2507:46: warning: incorrect type in argument 2 (different signedness)
  kernel/trace/trace.c:2130:40: warning: incorrect type in argument 3 (different signedness)
  kernel/trace/trace.c:2280:40: warning: incorrect type in argument 3 (different signedness)

Signed-off-by: Hannes Eder <hannes@hanneseder.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ring_buffer.c | 8 ++++----
 kernel/trace/trace.c       | 6 +++---
 2 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index fa64e1f003eb..dc18b5b9ccb4 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -59,7 +59,7 @@ enum {
 	RB_BUFFERS_DISABLED	= 1 << RB_BUFFERS_DISABLED_BIT,
 };
 
-static long ring_buffer_flags __read_mostly = RB_BUFFERS_ON;
+static unsigned long ring_buffer_flags __read_mostly = RB_BUFFERS_ON;
 
 /**
  * tracing_on - enable all tracing buffers
@@ -2501,7 +2501,7 @@ static ssize_t
 rb_simple_read(struct file *filp, char __user *ubuf,
 	       size_t cnt, loff_t *ppos)
 {
-	long *p = filp->private_data;
+	unsigned long *p = filp->private_data;
 	char buf[64];
 	int r;
 
@@ -2517,9 +2517,9 @@ static ssize_t
 rb_simple_write(struct file *filp, const char __user *ubuf,
 		size_t cnt, loff_t *ppos)
 {
-	long *p = filp->private_data;
+	unsigned long *p = filp->private_data;
 	char buf[64];
-	long val;
+	unsigned long val;
 	int ret;
 
 	if (cnt >= sizeof(buf))
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c157ba70f30c..e2434990277c 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2147,7 +2147,7 @@ tracing_ctrl_write(struct file *filp, const char __user *ubuf,
 {
 	struct trace_array *tr = filp->private_data;
 	char buf[64];
-	long val;
+	unsigned long val;
 	int ret;
 
 	if (cnt >= sizeof(buf))
@@ -2295,9 +2295,9 @@ static ssize_t
 tracing_max_lat_write(struct file *filp, const char __user *ubuf,
 		      size_t cnt, loff_t *ppos)
 {
-	long *ptr = filp->private_data;
+	unsigned long *ptr = filp->private_data;
 	char buf[64];
-	long val;
+	unsigned long val;
 	int ret;
 
 	if (cnt >= sizeof(buf))
-- 
cgit v1.2.3


From e7669b8e329255bbcb40af65b38e342825d97a46 Mon Sep 17 00:00:00 2001
From: Hannes Eder <hannes@hanneseder.net>
Date: Tue, 10 Feb 2009 19:44:45 +0100
Subject: tracing: fix sparse warning: attribute function with
 __acquires/__releases

Fix this sparse warning:

  kernel/trace/trace.c:458:9: warning: context imbalance in 'register_tracer' - unexpected unlock

Signed-off-by: Hannes Eder <hannes@hanneseder.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index e2434990277c..95f99a7abf2f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -459,6 +459,8 @@ update_max_tr_single(struct trace_array *tr, struct task_struct *tsk, int cpu)
  * Register a new plugin tracer.
  */
 int register_tracer(struct tracer *type)
+__releases(kernel_lock)
+__acquires(kernel_lock)
 {
 	struct tracer *t;
 	int len;
-- 
cgit v1.2.3


From ad0b0fd554dfc126b5750d14908dccc3bbf602be Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@infradead.org>
Date: Tue, 10 Feb 2009 11:42:26 -0800
Subject: sched, latencytop: incorporate review feedback from Andrew Morton

Andrew had some suggestions for the latencytop file; this patch takes care
of most of these:

* Add documentation
* Turn account_scheduler_latency into an inline function
* Don't report negative values to userspace
* Make the file operations struct const
* Fix a few checkpatch.pl warnings

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/latencytop.h | 10 +++++-
 kernel/latencytop.c        | 83 +++++++++++++++++++++++++++++++++++++++-------
 2 files changed, 80 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/latencytop.h b/include/linux/latencytop.h
index 901c2d6377a8..b0e99898527c 100644
--- a/include/linux/latencytop.h
+++ b/include/linux/latencytop.h
@@ -9,6 +9,7 @@
 #ifndef _INCLUDE_GUARD_LATENCYTOP_H_
 #define _INCLUDE_GUARD_LATENCYTOP_H_
 
+#include <linux/compiler.h>
 #ifdef CONFIG_LATENCYTOP
 
 #define LT_SAVECOUNT		32
@@ -24,7 +25,14 @@ struct latency_record {
 
 struct task_struct;
 
-void account_scheduler_latency(struct task_struct *task, int usecs, int inter);
+extern int latencytop_enabled;
+void __account_scheduler_latency(struct task_struct *task, int usecs, int inter);
+static inline void
+account_scheduler_latency(struct task_struct *task, int usecs, int inter)
+{
+	if (unlikely(latencytop_enabled))
+		__account_scheduler_latency(task, usecs, inter);
+}
 
 void clear_all_latency_tracing(struct task_struct *p);
 
diff --git a/kernel/latencytop.c b/kernel/latencytop.c
index 449db466bdbc..ca07c5c0c914 100644
--- a/kernel/latencytop.c
+++ b/kernel/latencytop.c
@@ -9,6 +9,44 @@
  * as published by the Free Software Foundation; version 2
  * of the License.
  */
+
+/*
+ * CONFIG_LATENCYTOP enables a kernel latency tracking infrastructure that is
+ * used by the "latencytop" userspace tool. The latency that is tracked is not
+ * the 'traditional' interrupt latency (which is primarily caused by something
+ * else consuming CPU), but instead, it is the latency an application encounters
+ * because the kernel sleeps on its behalf for various reasons.
+ *
+ * This code tracks 2 levels of statistics:
+ * 1) System level latency
+ * 2) Per process latency
+ *
+ * The latency is stored in fixed sized data structures in an accumulated form;
+ * if the "same" latency cause is hit twice, this will be tracked as one entry
+ * in the data structure. Both the count, total accumulated latency and maximum
+ * latency are tracked in this data structure. When the fixed size structure is
+ * full, no new causes are tracked until the buffer is flushed by writing to
+ * the /proc file; the userspace tool does this on a regular basis.
+ *
+ * A latency cause is identified by a stringified backtrace at the point that
+ * the scheduler gets invoked. The userland tool will use this string to
+ * identify the cause of the latency in human readable form.
+ *
+ * The information is exported via /proc/latency_stats and /proc/<pid>/latency.
+ * These files look like this:
+ *
+ * Latency Top version : v0.1
+ * 70 59433 4897 i915_irq_wait drm_ioctl vfs_ioctl do_vfs_ioctl sys_ioctl
+ * |    |    |    |
+ * |    |    |    +----> the stringified backtrace
+ * |    |    +---------> The maximum latency for this entry in microseconds
+ * |    +--------------> The accumulated latency for this entry (microseconds)
+ * +-------------------> The number of times this entry is hit
+ *
+ * (note: the average latency is the accumulated latency divided by the number
+ * of times)
+ */
+
 #include <linux/latencytop.h>
 #include <linux/kallsyms.h>
 #include <linux/seq_file.h>
@@ -72,7 +110,7 @@ account_global_scheduler_latency(struct task_struct *tsk, struct latency_record
 				firstnonnull = i;
 			continue;
 		}
-		for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) {
+		for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
 			unsigned long record = lat->backtrace[q];
 
 			if (latency_record[i].backtrace[q] != record) {
@@ -101,31 +139,52 @@ account_global_scheduler_latency(struct task_struct *tsk, struct latency_record
 	memcpy(&latency_record[i], lat, sizeof(struct latency_record));
 }
 
-static inline void store_stacktrace(struct task_struct *tsk, struct latency_record *lat)
+/*
+ * Iterator to store a backtrace into a latency record entry
+ */
+static inline void store_stacktrace(struct task_struct *tsk,
+					struct latency_record *lat)
 {
 	struct stack_trace trace;
 
 	memset(&trace, 0, sizeof(trace));
 	trace.max_entries = LT_BACKTRACEDEPTH;
 	trace.entries = &lat->backtrace[0];
-	trace.skip = 0;
 	save_stack_trace_tsk(tsk, &trace);
 }
 
+/**
+ * __account_scheduler_latency - record an occured latency
+ * @tsk - the task struct of the task hitting the latency
+ * @usecs - the duration of the latency in microseconds
+ * @inter - 1 if the sleep was interruptible, 0 if uninterruptible
+ *
+ * This function is the main entry point for recording latency entries
+ * as called by the scheduler.
+ *
+ * This function has a few special cases to deal with normal 'non-latency'
+ * sleeps: specifically, interruptible sleep longer than 5 msec is skipped
+ * since this usually is caused by waiting for events via select() and co.
+ *
+ * Negative latencies (caused by time going backwards) are also explicitly
+ * skipped.
+ */
 void __sched
-account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
+__account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
 {
 	unsigned long flags;
 	int i, q;
 	struct latency_record lat;
 
-	if (!latencytop_enabled)
-		return;
-
 	/* Long interruptible waits are generally user requested... */
 	if (inter && usecs > 5000)
 		return;
 
+	/* Negative sleeps are time going backwards */
+	/* Zero-time sleeps are non-interesting */
+	if (usecs <= 0)
+		return;
+
 	memset(&lat, 0, sizeof(lat));
 	lat.count = 1;
 	lat.time = usecs;
@@ -143,12 +202,12 @@ account_scheduler_latency(struct task_struct *tsk, int usecs, int inter)
 	if (tsk->latency_record_count >= LT_SAVECOUNT)
 		goto out_unlock;
 
-	for (i = 0; i < LT_SAVECOUNT ; i++) {
+	for (i = 0; i < LT_SAVECOUNT; i++) {
 		struct latency_record *mylat;
 		int same = 1;
 
 		mylat = &tsk->latency_record[i];
-		for (q = 0 ; q < LT_BACKTRACEDEPTH ; q++) {
+		for (q = 0; q < LT_BACKTRACEDEPTH; q++) {
 			unsigned long record = lat.backtrace[q];
 
 			if (mylat->backtrace[q] != record) {
@@ -186,7 +245,7 @@ static int lstats_show(struct seq_file *m, void *v)
 	for (i = 0; i < MAXLR; i++) {
 		if (latency_record[i].backtrace[0]) {
 			int q;
-			seq_printf(m, "%i %li %li ",
+			seq_printf(m, "%i %lu %lu ",
 				latency_record[i].count,
 				latency_record[i].time,
 				latency_record[i].max);
@@ -223,7 +282,7 @@ static int lstats_open(struct inode *inode, struct file *filp)
 	return single_open(filp, lstats_show, NULL);
 }
 
-static struct file_operations lstats_fops = {
+static const struct file_operations lstats_fops = {
 	.open		= lstats_open,
 	.read		= seq_read,
 	.write		= lstats_write,
@@ -236,4 +295,4 @@ static int __init init_lstats_procfs(void)
 	proc_create("latency_stats", 0644, NULL, &lstats_fops);
 	return 0;
 }
-__initcall(init_lstats_procfs);
+device_initcall(init_lstats_procfs);
-- 
cgit v1.2.3


From 06eb23b1ba39c61ee5d5faeb42a097635693e370 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 9 Feb 2009 02:02:33 +0100
Subject: ptrace, x86: fix the usage of ptrace_fork()

I noticed by pure accident we have ptrace_fork() and friends. This was
added by "x86, bts: add fork and exit handling", commit
bf53de907dfdaac178c92d774aae7370d7b97d20.

I can't test this, ds_request_bts() returns -EOPNOTSUPP, but I strongly
believe this needs the fix. I think something like this program

	int main(void)
	{
		int pid = fork();

		if (!pid) {
			ptrace(PTRACE_TRACEME, 0, NULL, NULL);
			kill(getpid(), SIGSTOP);
			fork();
		} else {
			struct ptrace_bts_config bts = {
				.flags = PTRACE_BTS_O_ALLOC,
				.size  = 4 * 4096,
			};

			wait(NULL);

			ptrace(PTRACE_SETOPTIONS, pid, NULL, PTRACE_O_TRACEFORK);
			ptrace(PTRACE_BTS_CONFIG, pid, &bts, sizeof(bts));
			ptrace(PTRACE_CONT, pid, NULL, NULL);

			sleep(1);
		}

		return 0;
	}

should crash the kernel.

If the task is traced by its natural parent ptrace_reparented() returns 0
but we should clear ->btsxxx anyway.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Markus Metzger <markus.t.metzger@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/fork.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 242a706e7721..43c039d55e95 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1093,7 +1093,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 #ifdef CONFIG_DEBUG_MUTEXES
 	p->blocked_on = NULL; /* not blocked yet */
 #endif
-	if (unlikely(ptrace_reparented(current)))
+	if (unlikely(current->ptrace))
 		ptrace_fork(p, clone_flags);
 
 	/* Perform scheduler related setup. Assign this task to a CPU. */
-- 
cgit v1.2.3


From cf2592f59c0e8ed4308adbdb2e0a88655379d579 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 10 Feb 2009 16:52:37 +0100
Subject: softlockup: ensure the task has been switched out once

When we check if a task has been switched out since the last scan, we might
have a race condition on the following scenario:

- the task is freshly created and scheduled

- it puts its state to TASK_UNINTERRUPTIBLE and is not yet switched out

- check_hung_task() scans this task and will report a false positive because
  t->nvcsw + t->nivcsw == t->last_switch_count == 0

Add a check for such cases.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Mandeep Singh Baines <msb@google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/hung_task.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/hung_task.c b/kernel/hung_task.c
index 0c924de58cb2..022a4927b785 100644
--- a/kernel/hung_task.c
+++ b/kernel/hung_task.c
@@ -72,7 +72,13 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout)
 {
 	unsigned long switch_count = t->nvcsw + t->nivcsw;
 
-	if (t->flags & PF_FROZEN)
+	/*
+	 * Ensure the task is not frozen.
+	 * Also, when a freshly created task is scheduled once, changes
+	 * its state to TASK_UNINTERRUPTIBLE without having ever been
+	 * switched out once, it musn't be checked.
+	 */
+	if (unlikely(t->flags & PF_FROZEN || !switch_count))
 		return;
 
 	if (switch_count != t->last_switch_count) {
-- 
cgit v1.2.3


From b22f4858126a6aa852ad745b94f6b25dbdea708e Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 10 Feb 2009 15:49:11 +0100
Subject: tracing/sysprof: add missing tracing_{start,stop}_record_cmdline()

Add the missing pair tracing_{start,stop}_record_cmdline() to record well
the cmdline associated with pid.

Changes in v2:

- fix a build error, the sched_switch tracer is needed to record the
  cmdline.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/Kconfig         | 1 +
 kernel/trace/trace_sysprof.c | 3 +++
 2 files changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 3a331289457a..6ff928acd453 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -134,6 +134,7 @@ config SYSPROF_TRACER
 	bool "Sysprof Tracer"
 	depends on X86
 	select TRACING
+	select CONTEXT_SWITCH_TRACER
 	help
 	  This tracer provides the trace needed by the 'Sysprof' userspace
 	  tool.
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index 84ca9d81e74d..9902c15997ad 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -238,6 +238,8 @@ static int stack_trace_init(struct trace_array *tr)
 {
 	sysprof_trace = tr;
 
+	tracing_start_cmdline_record();
+
 	mutex_lock(&sample_timer_lock);
 	start_stack_timers();
 	tracer_enabled = 1;
@@ -247,6 +249,7 @@ static int stack_trace_init(struct trace_array *tr)
 
 static void stack_trace_reset(struct trace_array *tr)
 {
+	tracing_stop_cmdline_record();
 	stop_stack_trace(tr);
 }
 
-- 
cgit v1.2.3


From 00f62f614bb713027b9296068d1879fbca511eb7 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 9 Feb 2009 17:04:06 -0200
Subject: ring_buffer: pahole struct ring_buffer

While fixing some bugs in pahole (built-in.o files were not being
processed due to relocation problems) I found out about these packable
structures:

$ pahole --packable kernel/trace/ring_buffer.o  | grep ring
ring_buffer	72	64	8
ring_buffer_per_cpu	112	104	8

If we take a look at the current layout of struct ring_buffer we can see
that we have two 4 bytes holes.

$ pahole -C ring_buffer kernel/trace/ring_buffer.o
struct ring_buffer {
	unsigned int               pages;           /*     0     4 */
	unsigned int               flags;           /*     4     4 */
	int                        cpus;            /*     8     4 */

	/* XXX 4 bytes hole, try to pack */

	cpumask_var_t              cpumask;         /*    16     8 */
	atomic_t                   record_disabled; /*    24     4 */

	/* XXX 4 bytes hole, try to pack */

	struct mutex               mutex;           /*    32    32 */
	/* --- cacheline 1 boundary (64 bytes) --- */
	struct ring_buffer_per_cpu * * buffers;     /*    64     8 */

	/* size: 72, cachelines: 2, members: 7 */
	/* sum members: 64, holes: 2, sum holes: 8 */
	/* last cacheline: 8 bytes */
};

So, if I ask pahole to reorganize it:

$ pahole -C ring_buffer --reorganize kernel/trace/ring_buffer.o

struct ring_buffer {
	unsigned int               pages;           /*     0     4 */
	unsigned int               flags;           /*     4     4 */
	int                        cpus;            /*     8     4 */
	atomic_t                   record_disabled; /*    12     4 */
	cpumask_var_t              cpumask;         /*    16     8 */
	struct mutex               mutex;           /*    24    32 */
	struct ring_buffer_per_cpu * * buffers;     /*    56     8 */
	/* --- cacheline 1 boundary (64 bytes) --- */

	/* size: 64, cachelines: 1, members: 7 */
};   /* saved 8 bytes and 1 cacheline! */

We get it using just one 64 bytes cacheline.

To see what it did:

$ pahole -C ring_buffer --reorganize --show_reorg_steps \
	kernel/trace/ring_buffer.o | grep \/
/* Moving 'record_disabled' from after 'cpumask' to after 'cpus' */

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ring_buffer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 53ba3a6d16d0..27ef3bf13ed2 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -273,8 +273,8 @@ struct ring_buffer {
 	unsigned			pages;
 	unsigned			flags;
 	int				cpus;
-	cpumask_var_t			cpumask;
 	atomic_t			record_disabled;
+	cpumask_var_t			cpumask;
 
 	struct mutex			mutex;
 
-- 
cgit v1.2.3


From 3fccfd67df79c6351a156eb25a7a514e5f39c4d9 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 10 Feb 2009 16:37:31 +0100
Subject: timers: split process wide cpu clocks/timers, fix

To decrease the chance of a missed enable, always enable the timer when we
sample it, we'll always disable it when we find that there are no active timers
in the jiffy tick.

This fixes a flood of warnings reported by Mike Galbraith.

Reported-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h     |  1 +
 kernel/posix-cpu-timers.c | 42 ++++++++++++++----------------------------
 2 files changed, 15 insertions(+), 28 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 79392916d6c9..5d10fa0b6002 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2209,6 +2209,7 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
 	unsigned long flags;
 
 	spin_lock_irqsave(&cputimer->lock, flags);
+	cputimer->running = 1;
 	*times = cputimer->cputime;
 	spin_unlock_irqrestore(&cputimer->lock, flags);
 }
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index db107c9bbc05..e5d7bfdfa7d4 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -488,7 +488,7 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk)
 {
 	struct task_cputime cputime;
 
-	thread_group_cputime(tsk, &cputime);
+	thread_group_cputimer(tsk, &cputime);
 	cleanup_timers(tsk->signal->cpu_timers,
 		       cputime.utime, cputime.stime, cputime.sum_exec_runtime);
 }
@@ -506,29 +506,6 @@ static void clear_dead_task(struct k_itimer *timer, union cpu_time_count now)
 					     now);
 }
 
-/*
- * Enable the process wide cpu timer accounting.
- *
- * serialized using ->sighand->siglock
- */
-static void start_process_timers(struct task_struct *tsk)
-{
-	tsk->signal->cputimer.running = 1;
-	barrier();
-}
-
-/*
- * Release the process wide timer accounting -- timer stops ticking when
- * nobody cares about it.
- *
- * serialized using ->sighand->siglock
- */
-static void stop_process_timers(struct task_struct *tsk)
-{
-	tsk->signal->cputimer.running = 0;
-	barrier();
-}
-
 /*
  * Insert the timer on the appropriate list before any timers that
  * expire later.  This must be called with the tasklist_lock held
@@ -549,9 +526,6 @@ static void arm_timer(struct k_itimer *timer, union cpu_time_count now)
 	BUG_ON(!irqs_disabled());
 	spin_lock(&p->sighand->siglock);
 
-	if (!CPUCLOCK_PERTHREAD(timer->it_clock))
-		start_process_timers(p);
-
 	listpos = head;
 	if (CPUCLOCK_WHICH(timer->it_clock) == CPUCLOCK_SCHED) {
 		list_for_each_entry(next, head, entry) {
@@ -1021,6 +995,19 @@ static void check_thread_timers(struct task_struct *tsk,
 	}
 }
 
+static void stop_process_timers(struct task_struct *tsk)
+{
+	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
+	unsigned long flags;
+
+	if (!cputimer->running)
+		return;
+
+	spin_lock_irqsave(&cputimer->lock, flags);
+	cputimer->running = 0;
+	spin_unlock_irqrestore(&cputimer->lock, flags);
+}
+
 /*
  * Check for any per-thread CPU timers that have fired and move them
  * off the tsk->*_timers list onto the firing list.  Per-thread timers
@@ -1427,7 +1414,6 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx,
 	struct list_head *head;
 
 	BUG_ON(clock_idx == CPUCLOCK_SCHED);
-	start_process_timers(tsk);
 	cpu_timer_sample_group(clock_idx, tsk, &now);
 
 	if (oldval) {
-- 
cgit v1.2.3


From 4da94d49b2ecb0a26e716a8811c3ecc542c2a65d Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 11 Feb 2009 11:30:27 +0100
Subject: timers: fix TIMER_ABSTIME for process wide cpu timers

The POSIX timer interface allows for absolute time expiry values through the
TIMER_ABSTIME flag, therefore we have to synchronize the timer to the clock
every time we start it.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h     | 13 +------------
 kernel/posix-cpu-timers.c | 34 ++++++++++++++++++++++++++++++++++
 2 files changed, 35 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 5d10fa0b6002..8981e52c714f 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2201,18 +2201,7 @@ static inline int spin_needbreak(spinlock_t *lock)
  * Thread group CPU time accounting.
  */
 void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times);
-
-static inline
-void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
-{
-	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
-	unsigned long flags;
-
-	spin_lock_irqsave(&cputimer->lock, flags);
-	cputimer->running = 1;
-	*times = cputimer->cputime;
-	spin_unlock_irqrestore(&cputimer->lock, flags);
-}
+void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times);
 
 static inline void thread_group_cputime_init(struct signal_struct *sig)
 {
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index e5d7bfdfa7d4..2313a4cc14ea 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -261,6 +261,40 @@ out:
 	rcu_read_unlock();
 }
 
+static void update_gt_cputime(struct task_cputime *a, struct task_cputime *b)
+{
+	if (cputime_gt(b->utime, a->utime))
+		a->utime = b->utime;
+
+	if (cputime_gt(b->stime, a->stime))
+		a->stime = b->stime;
+
+	if (b->sum_exec_runtime > a->sum_exec_runtime)
+		a->sum_exec_runtime = b->sum_exec_runtime;
+}
+
+void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times)
+{
+	struct thread_group_cputimer *cputimer = &tsk->signal->cputimer;
+	struct task_cputime sum;
+	unsigned long flags;
+
+	spin_lock_irqsave(&cputimer->lock, flags);
+	if (!cputimer->running) {
+		cputimer->running = 1;
+		/*
+		 * The POSIX timer interface allows for absolute time expiry
+		 * values through the TIMER_ABSTIME flag, therefore we have
+		 * to synchronize the timer to the clock every time we start
+		 * it.
+		 */
+		thread_group_cputime(tsk, &sum);
+		update_gt_cputime(&cputimer->cputime, &sum);
+	}
+	*times = cputimer->cputime;
+	spin_unlock_irqrestore(&cputimer->lock, flags);
+}
+
 /*
  * Sample a process (thread group) clock for the given group_leader task.
  * Must be called with tasklist_lock held for reading.
-- 
cgit v1.2.3


From fc631c82e1734d718ff0832558f64c8f5d185f26 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 11 Feb 2009 14:27:17 +0100
Subject: sched: revert recent sync wakeup changes

Intel reported a 10% regression (mysql+sysbench) on a 16-way machine
with these patches:

  1596e29: sched: symmetric sync vs avg_overlap
  d942fb6: sched: fix sync wakeups

Revert them.

Reported-by: "Zhang, Yanmin" <yanmin_zhang@linux.intel.com>
Bisected-by: Lin Ming <ming.m.lin@intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c      | 10 ----------
 kernel/sched_fair.c | 11 +++++++++--
 2 files changed, 9 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index e1fc67d0674c..f11c02b86c73 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2266,16 +2266,6 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 	if (!sched_feat(SYNC_WAKEUPS))
 		sync = 0;
 
-	if (!sync) {
-		if (current->se.avg_overlap < sysctl_sched_migration_cost &&
-			  p->se.avg_overlap < sysctl_sched_migration_cost)
-			sync = 1;
-	} else {
-		if (current->se.avg_overlap >= sysctl_sched_migration_cost ||
-			  p->se.avg_overlap >= sysctl_sched_migration_cost)
-			sync = 0;
-	}
-
 #ifdef CONFIG_SMP
 	if (sched_feat(LB_WAKEUP_UPDATE)) {
 		struct sched_domain *sd;
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index a7e50ba185ac..0566f2a03c42 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1191,15 +1191,20 @@ wake_affine(struct sched_domain *this_sd, struct rq *this_rq,
 	    int idx, unsigned long load, unsigned long this_load,
 	    unsigned int imbalance)
 {
+	struct task_struct *curr = this_rq->curr;
+	struct task_group *tg;
 	unsigned long tl = this_load;
 	unsigned long tl_per_task;
-	struct task_group *tg;
 	unsigned long weight;
 	int balanced;
 
 	if (!(this_sd->flags & SD_WAKE_AFFINE) || !sched_feat(AFFINE_WAKEUPS))
 		return 0;
 
+	if (sync && (curr->se.avg_overlap > sysctl_sched_migration_cost ||
+			p->se.avg_overlap > sysctl_sched_migration_cost))
+		sync = 0;
+
 	/*
 	 * If sync wakeup then subtract the (maximum possible)
 	 * effect of the currently running task from the load
@@ -1426,7 +1431,9 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync)
 	if (!sched_feat(WAKEUP_PREEMPT))
 		return;
 
-	if (sched_feat(WAKEUP_OVERLAP) && sync) {
+	if (sched_feat(WAKEUP_OVERLAP) && (sync ||
+			(se->avg_overlap < sysctl_sched_migration_cost &&
+			 pse->avg_overlap < sysctl_sched_migration_cost))) {
 		resched_task(curr);
 		return;
 	}
-- 
cgit v1.2.3


From 2fff78c784ed97a8e5aa225ef5228f0a6d862d82 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 11 Feb 2009 18:10:10 +0100
Subject: futex: fix reference leak

Catalin noticed that (38d47c1b7075: futex: rely on get_user_pages() for
shared futexes) caused an mm_struct leak.

Some tracing with the function graph tracer quickly pointed out that
futex_wait() has exit paths with unbalanced reference counts.

This regression was discovered by kmemleak.

Reported-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Tested-by: "Pallipadi, Venkatesh" <venkatesh.pallipadi@intel.com>
Tested-by: Catalin Marinas <catalin.marinas@arm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/futex.c | 53 ++++++++++++++++++++++++++++-------------------------
 1 file changed, 28 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index f89d373a9c6d..438701adce23 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1165,6 +1165,7 @@ static int futex_wait(u32 __user *uaddr, int fshared,
 		      u32 val, ktime_t *abs_time, u32 bitset, int clockrt)
 {
 	struct task_struct *curr = current;
+	struct restart_block *restart;
 	DECLARE_WAITQUEUE(wait, curr);
 	struct futex_hash_bucket *hb;
 	struct futex_q q;
@@ -1216,11 +1217,13 @@ retry:
 
 		if (!ret)
 			goto retry;
-		return ret;
+		goto out;
 	}
 	ret = -EWOULDBLOCK;
-	if (uval != val)
-		goto out_unlock_put_key;
+	if (unlikely(uval != val)) {
+		queue_unlock(&q, hb);
+		goto out_put_key;
+	}
 
 	/* Only actually queue if *uaddr contained val.  */
 	queue_me(&q, hb);
@@ -1284,38 +1287,38 @@ retry:
 	 */
 
 	/* If we were woken (and unqueued), we succeeded, whatever. */
+	ret = 0;
 	if (!unqueue_me(&q))
-		return 0;
+		goto out_put_key;
+	ret = -ETIMEDOUT;
 	if (rem)
-		return -ETIMEDOUT;
+		goto out_put_key;
 
 	/*
 	 * We expect signal_pending(current), but another thread may
 	 * have handled it for us already.
 	 */
+	ret = -ERESTARTSYS;
 	if (!abs_time)
-		return -ERESTARTSYS;
-	else {
-		struct restart_block *restart;
-		restart = &current_thread_info()->restart_block;
-		restart->fn = futex_wait_restart;
-		restart->futex.uaddr = (u32 *)uaddr;
-		restart->futex.val = val;
-		restart->futex.time = abs_time->tv64;
-		restart->futex.bitset = bitset;
-		restart->futex.flags = 0;
-
-		if (fshared)
-			restart->futex.flags |= FLAGS_SHARED;
-		if (clockrt)
-			restart->futex.flags |= FLAGS_CLOCKRT;
-		return -ERESTART_RESTARTBLOCK;
-	}
+		goto out_put_key;
 
-out_unlock_put_key:
-	queue_unlock(&q, hb);
-	put_futex_key(fshared, &q.key);
+	restart = &current_thread_info()->restart_block;
+	restart->fn = futex_wait_restart;
+	restart->futex.uaddr = (u32 *)uaddr;
+	restart->futex.val = val;
+	restart->futex.time = abs_time->tv64;
+	restart->futex.bitset = bitset;
+	restart->futex.flags = 0;
+
+	if (fshared)
+		restart->futex.flags |= FLAGS_SHARED;
+	if (clockrt)
+		restart->futex.flags |= FLAGS_CLOCKRT;
 
+	ret = -ERESTART_RESTARTBLOCK;
+
+out_put_key:
+	put_futex_key(fshared, &q.key);
 out:
 	return ret;
 }
-- 
cgit v1.2.3


From fc3501d411d34823fb9be248a95a0c44f945866f Mon Sep 17 00:00:00 2001
From: Sven Wegener <sven.wegener@stealer.net>
Date: Wed, 11 Feb 2009 13:04:23 -0800
Subject: mm: fix dirty_bytes/dirty_background_bytes sysctls on 64bit arches

We need to pass an unsigned long as the minimum, because it gets casted
to an unsigned long in the sysctl handler. If we pass an int, we'll
access four more bytes on 64bit arches, resulting in a random minimum
value.

[rientjes@google.com: fix type of `old_bytes']
Signed-off-by: Sven Wegener <sven.wegener@stealer.net>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Dave Chinner <david@fromorbit.com>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sysctl.c     | 5 +++--
 mm/page-writeback.c | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 790f9d785663..c5ef44ff850f 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -101,6 +101,7 @@ static int two = 2;
 
 static int zero;
 static int one = 1;
+static unsigned long one_ul = 1;
 static int one_hundred = 100;
 
 /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
@@ -974,7 +975,7 @@ static struct ctl_table vm_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &dirty_background_bytes_handler,
 		.strategy	= &sysctl_intvec,
-		.extra1		= &one,
+		.extra1		= &one_ul,
 	},
 	{
 		.ctl_name	= VM_DIRTY_RATIO,
@@ -995,7 +996,7 @@ static struct ctl_table vm_table[] = {
 		.mode		= 0644,
 		.proc_handler	= &dirty_bytes_handler,
 		.strategy	= &sysctl_intvec,
-		.extra1		= &one,
+		.extra1		= &one_ul,
 	},
 	{
 		.procname	= "dirty_writeback_centisecs",
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index dc32dae01e5f..c17005e73974 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -209,7 +209,7 @@ int dirty_bytes_handler(struct ctl_table *table, int write,
 		struct file *filp, void __user *buffer, size_t *lenp,
 		loff_t *ppos)
 {
-	int old_bytes = vm_dirty_bytes;
+	unsigned long old_bytes = vm_dirty_bytes;
 	int ret;
 
 	ret = proc_doulongvec_minmax(table, write, filp, buffer, lenp, ppos);
-- 
cgit v1.2.3


From cfebe563bd0a3ff97e1bc167123120d59c7a84db Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 11 Feb 2009 13:04:36 -0800
Subject: cgroups: fix lockdep subclasses overflow

I enabled all cgroup subsystems when compiling kernel, and then:
 # mount -t cgroup -o net_cls xxx /mnt
 # mkdir /mnt/0

This showed up immediately:
 BUG: MAX_LOCKDEP_SUBCLASSES too low!
 turning off the locking correctness validator.

It's caused by the cgroup hierarchy lock:
	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
		struct cgroup_subsys *ss = subsys[i];
		if (ss->root == root)
			mutex_lock_nested(&ss->hierarchy_mutex, i);
	}

Now we have 9 cgroup subsystems, and the above 'i' for net_cls is 8, but
MAX_LOCKDEP_SUBCLASSES is 8.

This patch uses different lockdep keys for different subsystems.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/cgroup.h | 1 +
 kernel/cgroup.c        | 3 ++-
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index e4e8e117d27d..499900d0cee7 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -378,6 +378,7 @@ struct cgroup_subsys {
 	 * - initiating hotplug events
 	 */
 	struct mutex hierarchy_mutex;
+	struct lock_class_key subsys_key;
 
 	/*
 	 * Link to parent, and list entry in parent's children.
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 5a54ff42874e..e14db9c089b9 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -2351,7 +2351,7 @@ static void cgroup_lock_hierarchy(struct cgroupfs_root *root)
 	for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) {
 		struct cgroup_subsys *ss = subsys[i];
 		if (ss->root == root)
-			mutex_lock_nested(&ss->hierarchy_mutex, i);
+			mutex_lock(&ss->hierarchy_mutex);
 	}
 }
 
@@ -2637,6 +2637,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss)
 	BUG_ON(!list_empty(&init_task.tasks));
 
 	mutex_init(&ss->hierarchy_mutex);
+	lockdep_set_class(&ss->hierarchy_mutex, &ss->subsys_key);
 	ss->active = 1;
 }
 
-- 
cgit v1.2.3


From a0490fa35dc0022ef95f64802e8edf18c411c790 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 12 Feb 2009 11:35:40 +0100
Subject: sched: cpu hotplug fix

rq_attach_root() does a kfree() with the runqueue lock held.

That's not a very wise move, fix it.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index c1d0ed360088..410eec404133 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -6944,20 +6944,26 @@ static void free_rootdomain(struct root_domain *rd)
 
 static void rq_attach_root(struct rq *rq, struct root_domain *rd)
 {
+	struct root_domain *old_rd = NULL;
 	unsigned long flags;
 
 	spin_lock_irqsave(&rq->lock, flags);
 
 	if (rq->rd) {
-		struct root_domain *old_rd = rq->rd;
+		old_rd = rq->rd;
 
 		if (cpumask_test_cpu(rq->cpu, old_rd->online))
 			set_rq_offline(rq);
 
 		cpumask_clear_cpu(rq->cpu, old_rd->span);
 
-		if (atomic_dec_and_test(&old_rd->refcount))
-			free_rootdomain(old_rd);
+		/*
+		 * If we dont want to free the old_rt yet then
+		 * set old_rd to NULL to skip the freeing later
+		 * in this function:
+		 */
+		if (!atomic_dec_and_test(&old_rd->refcount))
+			old_rd = NULL;
 	}
 
 	atomic_inc(&rd->refcount);
@@ -6968,6 +6974,9 @@ static void rq_attach_root(struct rq *rq, struct root_domain *rd)
 		set_rq_online(rq);
 
 	spin_unlock_irqrestore(&rq->lock, flags);
+
+	if (old_rd)
+		free_rootdomain(old_rd);
 }
 
 static int __init_refok init_rootdomain(struct root_domain *rd, bool bootmem)
-- 
cgit v1.2.3


From 9f8d979f082a3ee1b27f32b7e0811b51c3ad1d15 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 12 Feb 2009 13:10:17 +0100
Subject: softlockup: move 'one' to the softlockup section in sysctl.c

CONFIG_SOFTLOCKUP=y || CONFIG_DETECT_HUNG_TASKS=y is now the only user
of the 'one' constant in kernel/sysctl.c. Move it to the softlockup
block of constants.

This fixes a GCC warning.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sysctl.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 3b6b54c8ac0d..6d2aeff92b3d 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -90,6 +90,9 @@ extern int rcutorture_runnable;
 #endif /* #ifdef CONFIG_RCU_TORTURE_TEST */
 
 /* Constants used for minimum and  maximum */
+#if defined(CONFIG_DETECT_HUNG_TASK) || defined(CONFIG_DETECT_SOFTLOCKUP) || defined(CONFIG_HIGHMEM)
+static int one = 1;
+#endif
 #ifdef CONFIG_DETECT_SOFTLOCKUP
 static int sixty = 60;
 static int neg_one = -1;
@@ -100,7 +103,6 @@ static int two = 2;
 #endif
 
 static int zero;
-static int one = 1;
 static unsigned long one_ul = 1;
 static int one_hundred = 100;
 
-- 
cgit v1.2.3


From 45141d4667d208421ca787a3301542b6a5e0b112 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 12 Feb 2009 13:19:48 -0500
Subject: ring-buffer: rename label out_unlock to out_reset

Impact: clean up

While reviewing the ring buffer code, I thougth I saw a bug with

	if (!__raw_spin_trylock(&cpu_buffer->lock))
		goto out_unlock;

But I forgot that we use a variable "lock_taken" that is set if
the spinlock is taken, and only unlock it if that variable is set.

To avoid further confusion from other reviewers, this patch
renames the label out_unlock with out_reset, which is the more
appropriate name.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ring_buffer.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index dc18b5b9ccb4..f39d7e9a4305 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1017,7 +1017,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 		 */
 		if (unlikely(in_nmi())) {
 			if (!__raw_spin_trylock(&cpu_buffer->lock))
-				goto out_unlock;
+				goto out_reset;
 		} else
 			__raw_spin_lock(&cpu_buffer->lock);
 
@@ -1030,7 +1030,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 
 		/* we grabbed the lock before incrementing */
 		if (RB_WARN_ON(cpu_buffer, next_page == reader_page))
-			goto out_unlock;
+			goto out_reset;
 
 		/*
 		 * If for some reason, we had an interrupt storm that made
@@ -1039,12 +1039,12 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 		 */
 		if (unlikely(next_page == commit_page)) {
 			WARN_ON_ONCE(1);
-			goto out_unlock;
+			goto out_reset;
 		}
 
 		if (next_page == head_page) {
 			if (!(buffer->flags & RB_FL_OVERWRITE))
-				goto out_unlock;
+				goto out_reset;
 
 			/* tail_page has not moved yet? */
 			if (tail_page == cpu_buffer->tail_page) {
@@ -1118,7 +1118,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 
 	return event;
 
- out_unlock:
+ out_reset:
 	/* reset write */
 	if (tail <= BUF_PAGE_SIZE)
 		local_set(&tail_page->write, tail);
-- 
cgit v1.2.3


From 0e43785c57fee50fbc00ea0378e941efb61fa0c2 Mon Sep 17 00:00:00 2001
From: Johannes Weiner <hannes@cmpxchg.org>
Date: Fri, 13 Feb 2009 04:38:04 +0100
Subject: irq: use GFP_KERNEL for action allocation in request_irq()

request_irq() calls into proc code via __setup_irq() which is not safe
in an atomic context, so request_irq() can itself use the more
reliable GFP_KERNEL allocation for the action descriptor.

Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/irq/manage.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index cd0cd8dcb345..1c5055069170 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -717,7 +717,7 @@ int request_irq(unsigned int irq, irq_handler_t handler,
 	if (!handler)
 		return -EINVAL;
 
-	action = kmalloc(sizeof(struct irqaction), GFP_ATOMIC);
+	action = kmalloc(sizeof(struct irqaction), GFP_KERNEL);
 	if (!action)
 		return -ENOMEM;
 
-- 
cgit v1.2.3


From 3997ad317fdf9ecdb5702e2b4fd1f8229814ff8c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 12 Feb 2009 15:00:52 +0100
Subject: timers: more consistently use clock vs timer

While reviewing the manpages, I noticed I'd missed some clock vs timer sites.

Make sure that all timer functions call cpu_timer_sample_group() and not
cpu_clock_sample_group(). This ensures that we enable the process wide timer
in time, and therefore pay the O(n) thread group cost from the syscall.

Not doing it here, will result in the first jiffy tick after setting the timer
doing this, resulting in a very expensive tick (but only once) and a delay in
actually starting the timer.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/posix-cpu-timers.c | 60 +++++++++++++++++++++++------------------------
 1 file changed, 30 insertions(+), 30 deletions(-)

(limited to 'kernel')

diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 2313a4cc14ea..e976e505648d 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -680,6 +680,33 @@ static void cpu_timer_fire(struct k_itimer *timer)
 	}
 }
 
+/*
+ * Sample a process (thread group) timer for the given group_leader task.
+ * Must be called with tasklist_lock held for reading.
+ */
+static int cpu_timer_sample_group(const clockid_t which_clock,
+				  struct task_struct *p,
+				  union cpu_time_count *cpu)
+{
+	struct task_cputime cputime;
+
+	thread_group_cputimer(p, &cputime);
+	switch (CPUCLOCK_WHICH(which_clock)) {
+	default:
+		return -EINVAL;
+	case CPUCLOCK_PROF:
+		cpu->cpu = cputime_add(cputime.utime, cputime.stime);
+		break;
+	case CPUCLOCK_VIRT:
+		cpu->cpu = cputime.utime;
+		break;
+	case CPUCLOCK_SCHED:
+		cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p);
+		break;
+	}
+	return 0;
+}
+
 /*
  * Guts of sys_timer_settime for CPU timers.
  * This is called with the timer locked and interrupts disabled.
@@ -741,7 +768,7 @@ int posix_cpu_timer_set(struct k_itimer *timer, int flags,
 	if (CPUCLOCK_PERTHREAD(timer->it_clock)) {
 		cpu_clock_sample(timer->it_clock, p, &val);
 	} else {
-		cpu_clock_sample_group(timer->it_clock, p, &val);
+		cpu_timer_sample_group(timer->it_clock, p, &val);
 	}
 
 	if (old) {
@@ -889,7 +916,7 @@ void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp)
 			read_unlock(&tasklist_lock);
 			goto dead;
 		} else {
-			cpu_clock_sample_group(timer->it_clock, p, &now);
+			cpu_timer_sample_group(timer->it_clock, p, &now);
 			clear_dead = (unlikely(p->exit_state) &&
 				      thread_group_empty(p));
 		}
@@ -1244,7 +1271,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer)
 			clear_dead_task(timer, now);
 			goto out_unlock;
 		}
-		cpu_clock_sample_group(timer->it_clock, p, &now);
+		cpu_timer_sample_group(timer->it_clock, p, &now);
 		bump_cpu_timer(timer, now);
 		/* Leave the tasklist_lock locked for the call below.  */
 	}
@@ -1408,33 +1435,6 @@ void run_posix_cpu_timers(struct task_struct *tsk)
 	}
 }
 
-/*
- * Sample a process (thread group) timer for the given group_leader task.
- * Must be called with tasklist_lock held for reading.
- */
-static int cpu_timer_sample_group(const clockid_t which_clock,
-				  struct task_struct *p,
-				  union cpu_time_count *cpu)
-{
-	struct task_cputime cputime;
-
-	thread_group_cputimer(p, &cputime);
-	switch (CPUCLOCK_WHICH(which_clock)) {
-	default:
-		return -EINVAL;
-	case CPUCLOCK_PROF:
-		cpu->cpu = cputime_add(cputime.utime, cputime.stime);
-		break;
-	case CPUCLOCK_VIRT:
-		cpu->cpu = cputime.utime;
-		break;
-	case CPUCLOCK_SCHED:
-		cpu->sched = cputime.sum_exec_runtime + task_delta_exec(p);
-		break;
-	}
-	return 0;
-}
-
 /*
  * Set one of the process-wide special case CPU timers.
  * The tsk->sighand->siglock must be held by the caller.
-- 
cgit v1.2.3


From b5f9fd0f8a05c9bafb91a9a85b9110938d8e585b Mon Sep 17 00:00:00 2001
From: Jason Baron <jbaron@redhat.com>
Date: Wed, 11 Feb 2009 13:57:25 -0500
Subject: tracing: convert c/p state power tracer to use tracepoints

Convert the c/p state "power" tracer to use tracepoints. Avoids a
function call when the tracer is disabled.

Signed-off-by: Jason Baron <jbaron@redhat.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c |   2 +
 arch/x86/kernel/process.c                  |   3 +
 include/trace/power.h                      |  25 ++---
 kernel/trace/trace_power.c                 | 173 +++++++++++++++++------------
 4 files changed, 119 insertions(+), 84 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 7ed925edf4d2..c5d737cdb365 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -70,6 +70,8 @@ struct acpi_cpufreq_data {
 
 static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data);
 
+DEFINE_TRACE(power_mark);
+
 /* acpi_perf_data is a pointer to percpu data. */
 static struct acpi_processor_performance *acpi_perf_data;
 
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 026819ffcb0c..e0d0fd7ab514 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -19,6 +19,9 @@ EXPORT_SYMBOL(idle_nomwait);
 
 struct kmem_cache *task_xstate_cachep;
 
+DEFINE_TRACE(power_start);
+DEFINE_TRACE(power_end);
+
 int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 {
 	*dst = *src;
diff --git a/include/trace/power.h b/include/trace/power.h
index c7cefbcdaea4..2c733e58e89c 100644
--- a/include/trace/power.h
+++ b/include/trace/power.h
@@ -2,6 +2,7 @@
 #define _TRACE_POWER_H
 
 #include <linux/ktime.h>
+#include <linux/tracepoint.h>
 
 enum {
 	POWER_NONE = 0,
@@ -18,18 +19,16 @@ struct power_trace {
 #endif
 };
 
-#ifdef CONFIG_POWER_TRACER
-extern void trace_power_start(struct power_trace *it, unsigned int type,
-					unsigned int state);
-extern void trace_power_mark(struct power_trace *it, unsigned int type,
-					unsigned int state);
-extern void trace_power_end(struct power_trace *it);
-#else
-static inline void trace_power_start(struct power_trace *it, unsigned int type,
-					unsigned int state) { }
-static inline void trace_power_mark(struct power_trace *it, unsigned int type,
-					unsigned int state) { }
-static inline void trace_power_end(struct power_trace *it) { }
-#endif
+DECLARE_TRACE(power_start,
+	TPPROTO(struct power_trace *it, unsigned int type, unsigned int state),
+		TPARGS(it, type, state));
+
+DECLARE_TRACE(power_mark,
+	TPPROTO(struct power_trace *it, unsigned int type, unsigned int state),
+		TPARGS(it, type, state));
+
+DECLARE_TRACE(power_end,
+	TPPROTO(struct power_trace *it),
+		TPARGS(it));
 
 #endif /* _TRACE_POWER_H */
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
index b1d0d087d3a6..91ce672fb037 100644
--- a/kernel/trace/trace_power.c
+++ b/kernel/trace/trace_power.c
@@ -21,15 +21,116 @@
 static struct trace_array *power_trace;
 static int __read_mostly trace_power_enabled;
 
+static void probe_power_start(struct power_trace *it, unsigned int type,
+				unsigned int level)
+{
+	if (!trace_power_enabled)
+		return;
+
+	memset(it, 0, sizeof(struct power_trace));
+	it->state = level;
+	it->type = type;
+	it->stamp = ktime_get();
+}
+
+
+static void probe_power_end(struct power_trace *it)
+{
+	struct ring_buffer_event *event;
+	struct trace_power *entry;
+	struct trace_array_cpu *data;
+	struct trace_array *tr = power_trace;
+
+	if (!trace_power_enabled)
+		return;
+
+	preempt_disable();
+	it->end = ktime_get();
+	data = tr->data[smp_processor_id()];
+
+	event = trace_buffer_lock_reserve(tr, TRACE_POWER,
+					  sizeof(*entry), 0, 0);
+	if (!event)
+		goto out;
+	entry	= ring_buffer_event_data(event);
+	entry->state_data = *it;
+	trace_buffer_unlock_commit(tr, event, 0, 0);
+ out:
+	preempt_enable();
+}
+
+static void probe_power_mark(struct power_trace *it, unsigned int type,
+				unsigned int level)
+{
+	struct ring_buffer_event *event;
+	struct trace_power *entry;
+	struct trace_array_cpu *data;
+	struct trace_array *tr = power_trace;
+
+	if (!trace_power_enabled)
+		return;
+
+	memset(it, 0, sizeof(struct power_trace));
+	it->state = level;
+	it->type = type;
+	it->stamp = ktime_get();
+	preempt_disable();
+	it->end = it->stamp;
+	data = tr->data[smp_processor_id()];
+
+	event = trace_buffer_lock_reserve(tr, TRACE_POWER,
+					  sizeof(*entry), 0, 0);
+	if (!event)
+		goto out;
+	entry	= ring_buffer_event_data(event);
+	entry->state_data = *it;
+	trace_buffer_unlock_commit(tr, event, 0, 0);
+ out:
+	preempt_enable();
+}
+
+static int tracing_power_register(void)
+{
+	int ret;
+
+	ret = register_trace_power_start(probe_power_start);
+	if (ret) {
+		pr_info("power trace: Couldn't activate tracepoint"
+			" probe to trace_power_start\n");
+		return ret;
+	}
+	ret = register_trace_power_end(probe_power_end);
+	if (ret) {
+		pr_info("power trace: Couldn't activate tracepoint"
+			" probe to trace_power_end\n");
+		goto fail_start;
+	}
+	ret = register_trace_power_mark(probe_power_mark);
+	if (ret) {
+		pr_info("power trace: Couldn't activate tracepoint"
+			" probe to trace_power_mark\n");
+		goto fail_end;
+	}
+	return ret;
+fail_end:
+	unregister_trace_power_end(probe_power_end);
+fail_start:
+	unregister_trace_power_start(probe_power_start);
+	return ret;
+}
 
 static void start_power_trace(struct trace_array *tr)
 {
 	trace_power_enabled = 1;
+	tracing_power_register();
 }
 
 static void stop_power_trace(struct trace_array *tr)
 {
 	trace_power_enabled = 0;
+	unregister_trace_power_start(probe_power_start);
+	unregister_trace_power_end(probe_power_end);
+	unregister_trace_power_mark(probe_power_mark);
 }
 
 
@@ -39,6 +140,7 @@ static int power_trace_init(struct trace_array *tr)
 	power_trace = tr;
 
 	trace_power_enabled = 1;
+	tracing_power_register();
 
 	for_each_cpu(cpu, cpu_possible_mask)
 		tracing_reset(tr, cpu);
@@ -95,74 +197,3 @@ static int init_power_trace(void)
 	return register_tracer(&power_tracer);
 }
 device_initcall(init_power_trace);
-
-void trace_power_start(struct power_trace *it, unsigned int type,
-			 unsigned int level)
-{
-	if (!trace_power_enabled)
-		return;
-
-	memset(it, 0, sizeof(struct power_trace));
-	it->state = level;
-	it->type = type;
-	it->stamp = ktime_get();
-}
-EXPORT_SYMBOL_GPL(trace_power_start);
-
-
-void trace_power_end(struct power_trace *it)
-{
-	struct ring_buffer_event *event;
-	struct trace_power *entry;
-	struct trace_array_cpu *data;
-	struct trace_array *tr = power_trace;
-
-	if (!trace_power_enabled)
-		return;
-
-	preempt_disable();
-	it->end = ktime_get();
-	data = tr->data[smp_processor_id()];
-
-	event = trace_buffer_lock_reserve(tr, TRACE_POWER,
-					  sizeof(*entry), 0, 0);
-	if (!event)
-		goto out;
-	entry	= ring_buffer_event_data(event);
-	entry->state_data = *it;
-	trace_buffer_unlock_commit(tr, event, 0, 0);
- out:
-	preempt_enable();
-}
-EXPORT_SYMBOL_GPL(trace_power_end);
-
-void trace_power_mark(struct power_trace *it, unsigned int type,
-			 unsigned int level)
-{
-	struct ring_buffer_event *event;
-	struct trace_power *entry;
-	struct trace_array_cpu *data;
-	struct trace_array *tr = power_trace;
-
-	if (!trace_power_enabled)
-		return;
-
-	memset(it, 0, sizeof(struct power_trace));
-	it->state = level;
-	it->type = type;
-	it->stamp = ktime_get();
-	preempt_disable();
-	it->end = it->stamp;
-	data = tr->data[smp_processor_id()];
-
-	event = trace_buffer_lock_reserve(tr, TRACE_POWER,
-					  sizeof(*entry), 0, 0);
-	if (!event)
-		goto out;
-	entry	= ring_buffer_event_data(event);
-	entry->state_data = *it;
-	trace_buffer_unlock_commit(tr, event, 0, 0);
- out:
-	preempt_enable();
-}
-EXPORT_SYMBOL_GPL(trace_power_mark);
-- 
cgit v1.2.3


From fb5ae64fdde29236e1a15e0366946df7060f41f2 Mon Sep 17 00:00:00 2001
From: "Serge E. Hallyn" <serue@us.ibm.com>
Date: Fri, 13 Feb 2009 14:04:21 +0000
Subject: User namespaces: Only put the userns when we unhash the uid

uids in namespaces other than init don't get a sysfs entry.

For those in the init namespace, while we're waiting to remove
the sysfs entry for the uid the uid is still hashed, and
alloc_uid() may re-grab that uid without getting a new
reference to the user_ns, which we've already put in free_user
before scheduling remove_user_sysfs_dir().

Reported-and-tested-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
Acked-by: David Howells <dhowells@redhat.com>
Tested-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/user.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/user.c b/kernel/user.c
index 477b6660f447..3551ac742395 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -72,6 +72,7 @@ static void uid_hash_insert(struct user_struct *up, struct hlist_head *hashent)
 static void uid_hash_remove(struct user_struct *up)
 {
 	hlist_del_init(&up->uidhash_node);
+	put_user_ns(up->user_ns);
 }
 
 static struct user_struct *uid_hash_find(uid_t uid, struct hlist_head *hashent)
@@ -334,7 +335,6 @@ static void free_user(struct user_struct *up, unsigned long flags)
 	atomic_inc(&up->__count);
 	spin_unlock_irqrestore(&uidhash_lock, flags);
 
-	put_user_ns(up->user_ns);
 	INIT_WORK(&up->work, remove_user_sysfs_dir);
 	schedule_work(&up->work);
 }
@@ -357,7 +357,6 @@ static void free_user(struct user_struct *up, unsigned long flags)
 	sched_destroy_user(up);
 	key_put(up->uid_keyring);
 	key_put(up->session_keyring);
-	put_user_ns(up->user_ns);
 	kmem_cache_free(uid_cachep, up);
 }
 
-- 
cgit v1.2.3


From 327ec5699c29454322d0136375f717f509c145b6 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 15 Feb 2009 11:21:37 +0100
Subject: irq: clean up manage.c

- make printk message git-greppable
- fix a few style details

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/irq/manage.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 1c5055069170..8f4bc61f0df9 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -397,7 +397,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
  * allocate special interrupts that are part of the architecture.
  */
 static int
-__setup_irq(unsigned int irq, struct irq_desc * desc, struct irqaction *new)
+__setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 {
 	struct irqaction *old, **p;
 	const char *old_name = NULL;
@@ -687,11 +687,12 @@ int request_irq(unsigned int irq, irq_handler_t handler,
 	 * the behavior is classified as "will not fix" so we need to
 	 * start nudging drivers away from using that idiom.
 	 */
-	if ((irqflags & (IRQF_SHARED|IRQF_DISABLED))
-			== (IRQF_SHARED|IRQF_DISABLED))
-		pr_warning("IRQ %d/%s: IRQF_DISABLED is not "
-				"guaranteed on shared IRQs\n",
-				irq, devname);
+	if ((irqflags & (IRQF_SHARED|IRQF_DISABLED)) ==
+					(IRQF_SHARED|IRQF_DISABLED)) {
+		pr_warning(
+		  "IRQ %d/%s: IRQF_DISABLED is not guaranteed on shared IRQs\n",
+			irq, devname);
+	}
 
 #ifdef CONFIG_LOCKDEP
 	/*
-- 
cgit v1.2.3


From ae88a23b32fa7e0dc9fa7ce735966e68eb41b0bc Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 15 Feb 2009 11:29:50 +0100
Subject: irq: refactor and clean up the free_irq() code flow

Impact: cleanup

- separate out the loop from the actual freeing logic, this wins us
  two indentation levels allowing a number of followup prettifications

- turn the WARN_ON() into a more informative WARN().

- clean up the comments and the code flow some more

Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/irq/manage.c | 101 ++++++++++++++++++++++++++++------------------------
 1 file changed, 54 insertions(+), 47 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 8f4bc61f0df9..7a954b860c07 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -575,72 +575,79 @@ int setup_irq(unsigned int irq, struct irqaction *act)
 void free_irq(unsigned int irq, void *dev_id)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
-	struct irqaction **p;
+	struct irqaction *action, **p, **pp;
 	unsigned long flags;
 
-	WARN_ON(in_interrupt());
+	WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq);
 
 	if (!desc)
 		return;
 
 	spin_lock_irqsave(&desc->lock, flags);
+
+	/*
+	 * There can be multiple actions per IRQ descriptor, find the right
+	 * one based on the dev_id:
+	 */
 	p = &desc->action;
 	for (;;) {
-		struct irqaction *action = *p;
+		action = *p;
+		pp = p;
+
+		if (!action) {
+			WARN(1, "Trying to free already-free IRQ %d\n", irq);
+			spin_unlock_irqrestore(&desc->lock, flags);
+
+			return;
+		}
 
-		if (action) {
-			struct irqaction **pp = p;
+		p = &action->next;
+		if (action->dev_id != dev_id)
+			continue;
 
-			p = &action->next;
-			if (action->dev_id != dev_id)
-				continue;
+		break;
+	}
 
-			/* Found it - now remove it from the list of entries */
-			*pp = action->next;
+	/* Found it - now remove it from the list of entries: */
+	*pp = action->next;
 
-			/* Currently used only by UML, might disappear one day.*/
+	/* Currently used only by UML, might disappear one day: */
 #ifdef CONFIG_IRQ_RELEASE_METHOD
-			if (desc->chip->release)
-				desc->chip->release(irq, dev_id);
+	if (desc->chip->release)
+		desc->chip->release(irq, dev_id);
 #endif
 
-			if (!desc->action) {
-				desc->status |= IRQ_DISABLED;
-				if (desc->chip->shutdown)
-					desc->chip->shutdown(irq);
-				else
-					desc->chip->disable(irq);
-			}
-			spin_unlock_irqrestore(&desc->lock, flags);
-			unregister_handler_proc(irq, action);
+	/* If this was the last handler, shut down the IRQ line: */
+	if (!desc->action) {
+		desc->status |= IRQ_DISABLED;
+		if (desc->chip->shutdown)
+			desc->chip->shutdown(irq);
+		else
+			desc->chip->disable(irq);
+	}
+	spin_unlock_irqrestore(&desc->lock, flags);
+
+	unregister_handler_proc(irq, action);
+
+	/* Make sure it's not being used on another CPU: */
+	synchronize_irq(irq);
 
-			/* Make sure it's not being used on another CPU */
-			synchronize_irq(irq);
-#ifdef CONFIG_DEBUG_SHIRQ
-			/*
-			 * It's a shared IRQ -- the driver ought to be
-			 * prepared for it to happen even now it's
-			 * being freed, so let's make sure....  We do
-			 * this after actually deregistering it, to
-			 * make sure that a 'real' IRQ doesn't run in
-			 * parallel with our fake
-			 */
-			if (action->flags & IRQF_SHARED) {
-				local_irq_save(flags);
-				action->handler(irq, dev_id);
-				local_irq_restore(flags);
-			}
-#endif
-			kfree(action);
-			return;
-		}
-		printk(KERN_ERR "Trying to free already-free IRQ %d\n", irq);
 #ifdef CONFIG_DEBUG_SHIRQ
-		dump_stack();
-#endif
-		spin_unlock_irqrestore(&desc->lock, flags);
-		return;
+	/*
+	 * It's a shared IRQ -- the driver ought to be prepared for an IRQ
+	 * event to happen even now it's being freed, so let's make sure that
+	 * is so by doing an extra call to the handler ....
+	 *
+	 * ( We do this after actually deregistering it, to make sure that a
+	 *   'real' IRQ doesn't run in * parallel with our fake. )
+	 */
+	if (action->flags & IRQF_SHARED) {
+		local_irq_save(flags);
+		action->handler(irq, dev_id);
+		local_irq_restore(flags);
 	}
+#endif
+	kfree(action);
 }
 EXPORT_SYMBOL(free_irq);
 
-- 
cgit v1.2.3


From 391b170f10e669dd429aa47bce998c2fa839404c Mon Sep 17 00:00:00 2001
From: Pekka Paalanen <pq@iki.fi>
Date: Tue, 6 Jan 2009 13:57:11 +0200
Subject: mmiotrace: count events lost due to not recording

Impact: enhances lost events counting in mmiotrace

The tracing framework, or the ring buffer facility it uses, has a switch
to stop recording data. When recording is off, the trace events will be
lost. The framework does not count these, so mmiotrace has to count them
itself.

Signed-off-by: Pekka Paalanen <pq@iki.fi>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_mmiotrace.c | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index fffcb069f1dc..80e503ef6136 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -9,6 +9,7 @@
 #include <linux/kernel.h>
 #include <linux/mmiotrace.h>
 #include <linux/pci.h>
+#include <asm/atomic.h>
 
 #include "trace.h"
 
@@ -19,6 +20,7 @@ struct header_iter {
 static struct trace_array *mmio_trace_array;
 static bool overrun_detected;
 static unsigned long prev_overruns;
+static atomic_t dropped_count;
 
 static void mmio_reset_data(struct trace_array *tr)
 {
@@ -121,11 +123,11 @@ static void mmio_close(struct trace_iterator *iter)
 
 static unsigned long count_overruns(struct trace_iterator *iter)
 {
-	unsigned long cnt = 0;
+	unsigned long cnt = atomic_xchg(&dropped_count, 0);
 	unsigned long over = ring_buffer_overruns(iter->tr->buffer);
 
 	if (over > prev_overruns)
-		cnt = over - prev_overruns;
+		cnt += over - prev_overruns;
 	prev_overruns = over;
 	return cnt;
 }
@@ -310,8 +312,10 @@ static void __trace_mmiotrace_rw(struct trace_array *tr,
 
 	event	= ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
 					   &irq_flags);
-	if (!event)
+	if (!event) {
+		atomic_inc(&dropped_count);
 		return;
+	}
 	entry	= ring_buffer_event_data(event);
 	tracing_generic_entry_update(&entry->ent, 0, preempt_count());
 	entry->ent.type			= TRACE_MMIO_RW;
@@ -338,8 +342,10 @@ static void __trace_mmiotrace_map(struct trace_array *tr,
 
 	event	= ring_buffer_lock_reserve(tr->buffer, sizeof(*entry),
 					   &irq_flags);
-	if (!event)
+	if (!event) {
+		atomic_inc(&dropped_count);
 		return;
+	}
 	entry	= ring_buffer_event_data(event);
 	tracing_generic_entry_update(&entry->ent, 0, preempt_count());
 	entry->ent.type			= TRACE_MMIO_MAP;
-- 
cgit v1.2.3


From 6bc5c366b1a45ca18fba6851f62db5743b3f6db5 Mon Sep 17 00:00:00 2001
From: Pekka Paalanen <pq@iki.fi>
Date: Sat, 3 Jan 2009 21:23:51 +0200
Subject: trace: mmiotrace to the tracer menu in Kconfig

Impact: cosmetic change in Kconfig menu layout

This patch was originally suggested by Peter Zijlstra, but seems it
was forgotten.

CONFIG_MMIOTRACE and CONFIG_MMIOTRACE_TEST were selectable
directly under the Kernel hacking / debugging menu in the kernel
configuration system. They were present only for x86 and x86_64.

Other tracers that use the ftrace tracing framework are in their own
sub-menu. This patch moves the mmiotrace configuration options there.
Since the Kconfig file, where the tracer menu is, is not architecture
specific, HAVE_MMIOTRACE_SUPPORT is introduced and provided only by
x86/x86_64. CONFIG_MMIOTRACE now depends on it.

Signed-off-by: Pekka Paalanen <pq@iki.fi>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/Kconfig.debug | 24 ++----------------------
 kernel/trace/Kconfig   | 23 +++++++++++++++++++++++
 2 files changed, 25 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 10d6cc3fd052..e1983fa025d2 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -174,28 +174,8 @@ config IOMMU_LEAK
 	  Add a simple leak tracer to the IOMMU code. This is useful when you
 	  are debugging a buggy device driver that leaks IOMMU mappings.
 
-config MMIOTRACE
-	bool "Memory mapped IO tracing"
-	depends on DEBUG_KERNEL && PCI
-	select TRACING
-	help
-	  Mmiotrace traces Memory Mapped I/O access and is meant for
-	  debugging and reverse engineering. It is called from the ioremap
-	  implementation and works via page faults. Tracing is disabled by
-	  default and can be enabled at run-time.
-
-	  See Documentation/tracers/mmiotrace.txt.
-	  If you are not helping to develop drivers, say N.
-
-config MMIOTRACE_TEST
-	tristate "Test module for mmiotrace"
-	depends on MMIOTRACE && m
-	help
-	  This is a dumb module for testing mmiotrace. It is very dangerous
-	  as it will write garbage to IO memory starting at a given address.
-	  However, it should be safe to use on e.g. unused portion of VRAM.
-
-	  Say N, unless you absolutely know what you are doing.
+config HAVE_MMIOTRACE_SUPPORT
+	def_bool y
 
 #
 # IO delay types:
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index e2a4ff6fc3a6..58a93fbd68aa 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -302,4 +302,27 @@ config FTRACE_STARTUP_TEST
 	  functioning properly. It will do tests on all the configured
 	  tracers of ftrace.
 
+config MMIOTRACE
+	bool "Memory mapped IO tracing"
+	depends on HAVE_MMIOTRACE_SUPPORT && DEBUG_KERNEL && PCI
+	select TRACING
+	help
+	  Mmiotrace traces Memory Mapped I/O access and is meant for
+	  debugging and reverse engineering. It is called from the ioremap
+	  implementation and works via page faults. Tracing is disabled by
+	  default and can be enabled at run-time.
+
+	  See Documentation/tracers/mmiotrace.txt.
+	  If you are not helping to develop drivers, say N.
+
+config MMIOTRACE_TEST
+	tristate "Test module for mmiotrace"
+	depends on MMIOTRACE && m
+	help
+	  This is a dumb module for testing mmiotrace. It is very dangerous
+	  as it will write garbage to IO memory starting at a given address.
+	  However, it should be safe to use on e.g. unused portion of VRAM.
+
+	  Say N, unless you absolutely know what you are doing.
+
 endmenu
-- 
cgit v1.2.3


From a234aa9ecdf47a5461573a21dc0b154278df5ba8 Mon Sep 17 00:00:00 2001
From: Rakib Mullick <rakib.mullick@gmail.com>
Date: Sat, 14 Feb 2009 09:36:00 +0600
Subject: tracing: fix section mismatch in trace_hw_branches.c

The function bts_trace_init() references a variable
bts_hotcpu_notifier which is marked
as __cpuinitdata. Thus causes section mismatch. This patch fixes it.

   LD      kernel/trace/built-in.o
 WARNING: kernel/trace/built-in.o(.text+0xc90c): Section mismatch in
 reference from the function bts_trace_init() to the variable
 .cpuinit.data:bts_hotcpu_notifier
 The function bts_trace_init() references
 the variable __cpuinitdata bts_hotcpu_notifier.
 This is often because bts_trace_init lacks a __cpuinitdata
 annotation or the annotation of bts_hotcpu_notifier is wrong.

 WARNING: kernel/trace/built-in.o(.text+0xc92a): Section mismatch in
 reference from the function bts_trace_reset() to the variable
 .cpuinit.data:bts_hotcpu_notifier
 The function bts_trace_reset() references
 the variable __cpuinitdata bts_hotcpu_notifier.
 This is often because bts_trace_reset lacks a __cpuinitdata
 annotation or the annotation of bts_hotcpu_notifier is wrong.

Signed-off-by: Rakib Mullick <rakib.mullick@gmail.com>
Cc: markus.t.metzger@gmail.com
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_hw_branches.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index 0794dd33f27b..3561aace075c 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -127,7 +127,7 @@ static struct notifier_block bts_hotcpu_notifier __cpuinitdata = {
 	.notifier_call = bts_hotcpu_handler
 };
 
-static int bts_trace_init(struct trace_array *tr)
+static int __cpuinit bts_trace_init(struct trace_array *tr)
 {
 	hw_branch_trace = tr;
 
@@ -137,7 +137,7 @@ static int bts_trace_init(struct trace_array *tr)
 	return 0;
 }
 
-static void bts_trace_reset(struct trace_array *tr)
+static void __cpuinit bts_trace_reset(struct trace_array *tr)
 {
 	bts_trace_stop(tr);
 	unregister_hotcpu_notifier(&bts_hotcpu_notifier);
-- 
cgit v1.2.3


From a0a522ce3d6d8c907e45d4f2730ee8573484cc88 Mon Sep 17 00:00:00 2001
From: Henrik Austad <henrik@austad.us>
Date: Fri, 13 Feb 2009 20:35:45 +0100
Subject: sched: idle_at_tick is only used when CONFIG_SMP is set

Impact: struct rq size optimization

The idle_at_tick in struct rq is only used in SMP settings
and it does not make sense to have this in the rq in an UP setup.

Signed-off-by: Henrik Austad <henrik@austad.us>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 5faf5d482fcd..648154cf1117 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -555,7 +555,6 @@ struct rq {
 	unsigned long nr_running;
 	#define CPU_LOAD_IDX_MAX 5
 	unsigned long cpu_load[CPU_LOAD_IDX_MAX];
-	unsigned char idle_at_tick;
 #ifdef CONFIG_NO_HZ
 	unsigned long last_tick_seen;
 	unsigned char in_nohz_recently;
@@ -596,6 +595,7 @@ struct rq {
 	struct root_domain *rd;
 	struct sched_domain *sd;
 
+	unsigned char idle_at_tick;
 	/* For active balancing */
 	int active_balance;
 	int push_cpu;
-- 
cgit v1.2.3


From a038a353c3de4040d8445ec568acebdac144436f Mon Sep 17 00:00:00 2001
From: Patrick Ohly <patrick.ohly@intel.com>
Date: Thu, 12 Feb 2009 05:03:34 +0000
Subject: clocksource: allow usage independent of timekeeping.c

So far struct clocksource acted as the interface between time/timekeeping.c
and hardware. This patch generalizes the concept so that a similar
interface can also be used in other contexts. For that it introduces
new structures and related functions *without* touching the existing
struct clocksource.

The reasons for adding these new structures to clocksource.[ch] are
* the APIs are clearly related
* struct clocksource could be cleaned up to use the new structs
* avoids proliferation of files with similar names (timesource.h?
  timecounter.h?)

As outlined in the discussion with John Stultz, this patch adds
* struct cyclecounter: stateless API to hardware which counts clock cycles
* struct timecounter: stateful utility code built on a cyclecounter which
  provides a nanosecond counter
* only the function to read the nanosecond counter; deltas are used internally
  and not exposed to users of timecounter

The code does no locking of the shared state. It must be called at least
as often as the cycle counter wraps around to detect these wrap arounds.
Both is the responsibility of the timecounter user.

Acked-by: John Stultz <johnstul@us.ibm.com>
Signed-off-by: Patrick Ohly <patrick.ohly@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/clocksource.h | 101 ++++++++++++++++++++++++++++++++++++++++++++
 kernel/time/clocksource.c   |  76 +++++++++++++++++++++++++++++++++
 2 files changed, 177 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h
index f88d32f8ff7c..573819ef4cc0 100644
--- a/include/linux/clocksource.h
+++ b/include/linux/clocksource.h
@@ -21,9 +21,110 @@
 typedef u64 cycle_t;
 struct clocksource;
 
+/**
+ * struct cyclecounter - hardware abstraction for a free running counter
+ *	Provides completely state-free accessors to the underlying hardware.
+ *	Depending on which hardware it reads, the cycle counter may wrap
+ *	around quickly. Locking rules (if necessary) have to be defined
+ *	by the implementor and user of specific instances of this API.
+ *
+ * @read:		returns the current cycle value
+ * @mask:		bitmask for two's complement
+ *			subtraction of non 64 bit counters,
+ *			see CLOCKSOURCE_MASK() helper macro
+ * @mult:		cycle to nanosecond multiplier
+ * @shift:		cycle to nanosecond divisor (power of two)
+ */
+struct cyclecounter {
+	cycle_t (*read)(const struct cyclecounter *cc);
+	cycle_t mask;
+	u32 mult;
+	u32 shift;
+};
+
+/**
+ * struct timecounter - layer above a %struct cyclecounter which counts nanoseconds
+ *	Contains the state needed by timecounter_read() to detect
+ *	cycle counter wrap around. Initialize with
+ *	timecounter_init(). Also used to convert cycle counts into the
+ *	corresponding nanosecond counts with timecounter_cyc2time(). Users
+ *	of this code are responsible for initializing the underlying
+ *	cycle counter hardware, locking issues and reading the time
+ *	more often than the cycle counter wraps around. The nanosecond
+ *	counter will only wrap around after ~585 years.
+ *
+ * @cc:			the cycle counter used by this instance
+ * @cycle_last:		most recent cycle counter value seen by
+ *			timecounter_read()
+ * @nsec:		continuously increasing count
+ */
+struct timecounter {
+	const struct cyclecounter *cc;
+	cycle_t cycle_last;
+	u64 nsec;
+};
+
+/**
+ * cyclecounter_cyc2ns - converts cycle counter cycles to nanoseconds
+ * @tc:		Pointer to cycle counter.
+ * @cycles:	Cycles
+ *
+ * XXX - This could use some mult_lxl_ll() asm optimization. Same code
+ * as in cyc2ns, but with unsigned result.
+ */
+static inline u64 cyclecounter_cyc2ns(const struct cyclecounter *cc,
+				      cycle_t cycles)
+{
+	u64 ret = (u64)cycles;
+	ret = (ret * cc->mult) >> cc->shift;
+	return ret;
+}
+
+/**
+ * timecounter_init - initialize a time counter
+ * @tc:			Pointer to time counter which is to be initialized/reset
+ * @cc:			A cycle counter, ready to be used.
+ * @start_tstamp:	Arbitrary initial time stamp.
+ *
+ * After this call the current cycle register (roughly) corresponds to
+ * the initial time stamp. Every call to timecounter_read() increments
+ * the time stamp counter by the number of elapsed nanoseconds.
+ */
+extern void timecounter_init(struct timecounter *tc,
+			     const struct cyclecounter *cc,
+			     u64 start_tstamp);
+
+/**
+ * timecounter_read - return nanoseconds elapsed since timecounter_init()
+ *                    plus the initial time stamp
+ * @tc:          Pointer to time counter.
+ *
+ * In other words, keeps track of time since the same epoch as
+ * the function which generated the initial time stamp.
+ */
+extern u64 timecounter_read(struct timecounter *tc);
+
+/**
+ * timecounter_cyc2time - convert a cycle counter to same
+ *                        time base as values returned by
+ *                        timecounter_read()
+ * @tc:		Pointer to time counter.
+ * @cycle:	a value returned by tc->cc->read()
+ *
+ * Cycle counts that are converted correctly as long as they
+ * fall into the interval [-1/2 max cycle count, +1/2 max cycle count],
+ * with "max cycle count" == cs->mask+1.
+ *
+ * This allows conversion of cycle counter values which were generated
+ * in the past.
+ */
+extern u64 timecounter_cyc2time(struct timecounter *tc,
+				cycle_t cycle_tstamp);
+
 /**
  * struct clocksource - hardware abstraction for a free running counter
  *	Provides mostly state-free accessors to the underlying hardware.
+ *	This is the structure used for system time.
  *
  * @name:		ptr to clocksource name
  * @list:		list head for registration
diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index ca89e1593f08..c46c931a7fe7 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -31,6 +31,82 @@
 #include <linux/sched.h> /* for spin_unlock_irq() using preempt_count() m68k */
 #include <linux/tick.h>
 
+void timecounter_init(struct timecounter *tc,
+		      const struct cyclecounter *cc,
+		      u64 start_tstamp)
+{
+	tc->cc = cc;
+	tc->cycle_last = cc->read(cc);
+	tc->nsec = start_tstamp;
+}
+EXPORT_SYMBOL(timecounter_init);
+
+/**
+ * timecounter_read_delta - get nanoseconds since last call of this function
+ * @tc:         Pointer to time counter
+ *
+ * When the underlying cycle counter runs over, this will be handled
+ * correctly as long as it does not run over more than once between
+ * calls.
+ *
+ * The first call to this function for a new time counter initializes
+ * the time tracking and returns an undefined result.
+ */
+static u64 timecounter_read_delta(struct timecounter *tc)
+{
+	cycle_t cycle_now, cycle_delta;
+	u64 ns_offset;
+
+	/* read cycle counter: */
+	cycle_now = tc->cc->read(tc->cc);
+
+	/* calculate the delta since the last timecounter_read_delta(): */
+	cycle_delta = (cycle_now - tc->cycle_last) & tc->cc->mask;
+
+	/* convert to nanoseconds: */
+	ns_offset = cyclecounter_cyc2ns(tc->cc, cycle_delta);
+
+	/* update time stamp of timecounter_read_delta() call: */
+	tc->cycle_last = cycle_now;
+
+	return ns_offset;
+}
+
+u64 timecounter_read(struct timecounter *tc)
+{
+	u64 nsec;
+
+	/* increment time by nanoseconds since last call */
+	nsec = timecounter_read_delta(tc);
+	nsec += tc->nsec;
+	tc->nsec = nsec;
+
+	return nsec;
+}
+EXPORT_SYMBOL(timecounter_read);
+
+u64 timecounter_cyc2time(struct timecounter *tc,
+			 cycle_t cycle_tstamp)
+{
+	u64 cycle_delta = (cycle_tstamp - tc->cycle_last) & tc->cc->mask;
+	u64 nsec;
+
+	/*
+	 * Instead of always treating cycle_tstamp as more recent
+	 * than tc->cycle_last, detect when it is too far in the
+	 * future and treat it as old time stamp instead.
+	 */
+	if (cycle_delta > tc->cc->mask / 2) {
+		cycle_delta = (tc->cycle_last - cycle_tstamp) & tc->cc->mask;
+		nsec = tc->nsec - cyclecounter_cyc2ns(tc->cc, cycle_delta);
+	} else {
+		nsec = cyclecounter_cyc2ns(tc->cc, cycle_delta) + tc->nsec;
+	}
+
+	return nsec;
+}
+EXPORT_SYMBOL(timecounter_cyc2time);
+
 /* XXX - Would like a better way for initializing curr_clocksource */
 extern struct clocksource clocksource_jiffies;
 
-- 
cgit v1.2.3


From a75244c3d519fcb490ca2bf3f123c98017f1e8d0 Mon Sep 17 00:00:00 2001
From: Patrick Ohly <patrick.ohly@intel.com>
Date: Thu, 12 Feb 2009 05:03:35 +0000
Subject: timecompare: generic infrastructure to map between two time bases

Mapping from a struct timecounter to a time returned by functions like
ktime_get_real() is implemented. This is sufficient to use this code
in a network device driver which wants to support hardware time
stamping and transformation of hardware time stamps to system time.

The interface could have been made more versatile by not depending on
a time counter, but this wasn't done to avoid writing glue code
elsewhere.

The method implemented here is the one used and analyzed under the name
"assisted PTP" in the LCI PTP paper:
http://www.linuxclustersinstitute.org/conferences/archive/2008/PDF/Ohly_92221.pdf

Acked-by: John Stultz <johnstul@us.ibm.com>
Signed-off-by: Patrick Ohly <patrick.ohly@intel.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
---
 include/linux/timecompare.h | 125 +++++++++++++++++++++++++++++
 kernel/time/Makefile        |   2 +-
 kernel/time/timecompare.c   | 191 ++++++++++++++++++++++++++++++++++++++++++++
 3 files changed, 317 insertions(+), 1 deletion(-)
 create mode 100644 include/linux/timecompare.h
 create mode 100644 kernel/time/timecompare.c

(limited to 'kernel')

diff --git a/include/linux/timecompare.h b/include/linux/timecompare.h
new file mode 100644
index 000000000000..546e2234e4b3
--- /dev/null
+++ b/include/linux/timecompare.h
@@ -0,0 +1,125 @@
+/*
+ * Utility code which helps transforming between two different time
+ * bases, called "source" and "target" time in this code.
+ *
+ * Source time has to be provided via the timecounter API while target
+ * time is accessed via a function callback whose prototype
+ * intentionally matches ktime_get() and ktime_get_real(). These
+ * interfaces where chosen like this so that the code serves its
+ * initial purpose without additional glue code.
+ *
+ * This purpose is synchronizing a hardware clock in a NIC with system
+ * time, in order to implement the Precision Time Protocol (PTP,
+ * IEEE1588) with more accurate hardware assisted time stamping.  In
+ * that context only synchronization against system time (=
+ * ktime_get_real()) is currently needed. But this utility code might
+ * become useful in other situations, which is why it was written as
+ * general purpose utility code.
+ *
+ * The source timecounter is assumed to return monotonically
+ * increasing time (but this code does its best to compensate if that
+ * is not the case) whereas target time may jump.
+ *
+ * The target time corresponding to a source time is determined by
+ * reading target time, reading source time, reading target time
+ * again, then assuming that average target time corresponds to source
+ * time. In other words, the assumption is that reading the source
+ * time is slow and involves equal time for sending the request and
+ * receiving the reply, whereas reading target time is assumed to be
+ * fast.
+ *
+ * Copyright (C) 2009 Intel Corporation.
+ * Author: Patrick Ohly <patrick.ohly@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. * See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc.,
+ * 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
+ */
+#ifndef _LINUX_TIMECOMPARE_H
+#define _LINUX_TIMECOMPARE_H
+
+#include <linux/clocksource.h>
+#include <linux/ktime.h>
+
+/**
+ * struct timecompare - stores state and configuration for the two clocks
+ *
+ * Initialize to zero, then set source/target/num_samples.
+ *
+ * Transformation between source time and target time is done with:
+ * target_time = source_time + offset +
+ *               (source_time - last_update) * skew /
+ *               TIMECOMPARE_SKEW_RESOLUTION
+ *
+ * @source:          used to get source time stamps via timecounter_read()
+ * @target:          function returning target time (for example, ktime_get
+ *                   for monotonic time, or ktime_get_real for wall clock)
+ * @num_samples:     number of times that source time and target time are to
+ *                   be compared when determining their offset
+ * @offset:          (target time - source time) at the time of the last update
+ * @skew:            average (target time - source time) / delta source time *
+ *                   TIMECOMPARE_SKEW_RESOLUTION
+ * @last_update:     last source time stamp when time offset was measured
+ */
+struct timecompare {
+	struct timecounter *source;
+	ktime_t (*target)(void);
+	int num_samples;
+
+	s64 offset;
+	s64 skew;
+	u64 last_update;
+};
+
+/**
+ * timecompare_transform - transform source time stamp into target time base
+ * @sync:            context for time sync
+ * @source_tstamp:   the result of timecounter_read() or
+ *                   timecounter_cyc2time()
+ */
+extern ktime_t timecompare_transform(struct timecompare *sync,
+				     u64 source_tstamp);
+
+/**
+ * timecompare_offset - measure current (target time - source time) offset
+ * @sync:            context for time sync
+ * @offset:          average offset during sample period returned here
+ * @source_tstamp:   average source time during sample period returned here
+ *
+ * Returns number of samples used. Might be zero (= no result) in the
+ * unlikely case that target time was monotonically decreasing for all
+ * samples (= broken).
+ */
+extern int timecompare_offset(struct timecompare *sync,
+			      s64 *offset,
+			      u64 *source_tstamp);
+
+extern void __timecompare_update(struct timecompare *sync,
+				 u64 source_tstamp);
+
+/**
+ * timecompare_update - update offset and skew by measuring current offset
+ * @sync:            context for time sync
+ * @source_tstamp:   the result of timecounter_read() or
+ *                   timecounter_cyc2time(), pass zero to force update
+ *
+ * Updates are only done at most once per second.
+ */
+static inline void timecompare_update(struct timecompare *sync,
+				      u64 source_tstamp)
+{
+	if (!source_tstamp ||
+	    (s64)(source_tstamp - sync->last_update) >= NSEC_PER_SEC)
+		__timecompare_update(sync, source_tstamp);
+}
+
+#endif /* _LINUX_TIMECOMPARE_H */
diff --git a/kernel/time/Makefile b/kernel/time/Makefile
index 905b0b50792d..0b0a6366c9d4 100644
--- a/kernel/time/Makefile
+++ b/kernel/time/Makefile
@@ -1,4 +1,4 @@
-obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o
+obj-y += timekeeping.o ntp.o clocksource.o jiffies.o timer_list.o timecompare.o
 
 obj-$(CONFIG_GENERIC_CLOCKEVENTS_BUILD)		+= clockevents.o
 obj-$(CONFIG_GENERIC_CLOCKEVENTS)		+= tick-common.o
diff --git a/kernel/time/timecompare.c b/kernel/time/timecompare.c
new file mode 100644
index 000000000000..71e7f1a19156
--- /dev/null
+++ b/kernel/time/timecompare.c
@@ -0,0 +1,191 @@
+/*
+ * Copyright (C) 2009 Intel Corporation.
+ * Author: Patrick Ohly <patrick.ohly@intel.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/timecompare.h>
+#include <linux/module.h>
+#include <linux/math64.h>
+
+/*
+ * fixed point arithmetic scale factor for skew
+ *
+ * Usually one would measure skew in ppb (parts per billion, 1e9), but
+ * using a factor of 2 simplifies the math.
+ */
+#define TIMECOMPARE_SKEW_RESOLUTION (((s64)1)<<30)
+
+ktime_t timecompare_transform(struct timecompare *sync,
+			      u64 source_tstamp)
+{
+	u64 nsec;
+
+	nsec = source_tstamp + sync->offset;
+	nsec += (s64)(source_tstamp - sync->last_update) * sync->skew /
+		TIMECOMPARE_SKEW_RESOLUTION;
+
+	return ns_to_ktime(nsec);
+}
+EXPORT_SYMBOL(timecompare_transform);
+
+int timecompare_offset(struct timecompare *sync,
+		       s64 *offset,
+		       u64 *source_tstamp)
+{
+	u64 start_source = 0, end_source = 0;
+	struct {
+		s64 offset;
+		s64 duration_target;
+	} buffer[10], sample, *samples;
+	int counter = 0, i;
+	int used;
+	int index;
+	int num_samples = sync->num_samples;
+
+	if (num_samples > sizeof(buffer)/sizeof(buffer[0])) {
+		samples = kmalloc(sizeof(*samples) * num_samples, GFP_ATOMIC);
+		if (!samples) {
+			samples = buffer;
+			num_samples = sizeof(buffer)/sizeof(buffer[0]);
+		}
+	} else {
+		samples = buffer;
+	}
+
+	/* run until we have enough valid samples, but do not try forever */
+	i = 0;
+	counter = 0;
+	while (1) {
+		u64 ts;
+		ktime_t start, end;
+
+		start = sync->target();
+		ts = timecounter_read(sync->source);
+		end = sync->target();
+
+		if (!i)
+			start_source = ts;
+
+		/* ignore negative durations */
+		sample.duration_target = ktime_to_ns(ktime_sub(end, start));
+		if (sample.duration_target >= 0) {
+			/*
+			 * assume symetric delay to and from source:
+			 * average target time corresponds to measured
+			 * source time
+			 */
+			sample.offset =
+				ktime_to_ns(ktime_add(end, start)) / 2 -
+				ts;
+
+			/* simple insertion sort based on duration */
+			index = counter - 1;
+			while (index >= 0) {
+				if (samples[index].duration_target <
+				    sample.duration_target)
+					break;
+				samples[index + 1] = samples[index];
+				index--;
+			}
+			samples[index + 1] = sample;
+			counter++;
+		}
+
+		i++;
+		if (counter >= num_samples || i >= 100000) {
+			end_source = ts;
+			break;
+		}
+	}
+
+	*source_tstamp = (end_source + start_source) / 2;
+
+	/* remove outliers by only using 75% of the samples */
+	used = counter * 3 / 4;
+	if (!used)
+		used = counter;
+	if (used) {
+		/* calculate average */
+		s64 off = 0;
+		for (index = 0; index < used; index++)
+			off += samples[index].offset;
+		*offset = div_s64(off, used);
+	}
+
+	if (samples && samples != buffer)
+		kfree(samples);
+
+	return used;
+}
+EXPORT_SYMBOL(timecompare_offset);
+
+void __timecompare_update(struct timecompare *sync,
+			  u64 source_tstamp)
+{
+	s64 offset;
+	u64 average_time;
+
+	if (!timecompare_offset(sync, &offset, &average_time))
+		return;
+
+	if (!sync->last_update) {
+		sync->last_update = average_time;
+		sync->offset = offset;
+		sync->skew = 0;
+	} else {
+		s64 delta_nsec = average_time - sync->last_update;
+
+		/* avoid division by negative or small deltas */
+		if (delta_nsec >= 10000) {
+			s64 delta_offset_nsec = offset - sync->offset;
+			s64 skew; /* delta_offset_nsec *
+				     TIMECOMPARE_SKEW_RESOLUTION /
+				     delta_nsec */
+			u64 divisor;
+
+			/* div_s64() is limited to 32 bit divisor */
+			skew = delta_offset_nsec * TIMECOMPARE_SKEW_RESOLUTION;
+			divisor = delta_nsec;
+			while (unlikely(divisor >= ((s64)1) << 32)) {
+				/* divide both by 2; beware, right shift
+				   of negative value has undefined
+				   behavior and can only be used for
+				   the positive divisor */
+				skew = div_s64(skew, 2);
+				divisor >>= 1;
+			}
+			skew = div_s64(skew, divisor);
+
+			/*
+			 * Calculate new overall skew as 4/16 the
+			 * old value and 12/16 the new one. This is
+			 * a rather arbitrary tradeoff between
+			 * only using the latest measurement (0/16 and
+			 * 16/16) and even more weight on past measurements.
+			 */
+#define TIMECOMPARE_NEW_SKEW_PER_16 12
+			sync->skew =
+				div_s64((16 - TIMECOMPARE_NEW_SKEW_PER_16) *
+					sync->skew +
+					TIMECOMPARE_NEW_SKEW_PER_16 * skew,
+					16);
+			sync->last_update = average_time;
+			sync->offset = offset;
+		}
+	}
+}
+EXPORT_SYMBOL(__timecompare_update);
-- 
cgit v1.2.3


From 2b8f836fb196acede88b6cc772e9057e0a9c0223 Mon Sep 17 00:00:00 2001
From: Américo Wang <xiyou.wangcong@gmail.com>
Date: Mon, 16 Feb 2009 18:54:21 +0800
Subject: sched: use TASK_NICE for task_struct

#define TASK_NICE(p)            PRIO_TO_NICE((p)->static_prio)

So it's better to use TASK_NICE here.

Signed-off-by: WANG Cong <wangcong@zeuux.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 648154cf1117..5475d56a20f1 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5236,7 +5236,7 @@ SYSCALL_DEFINE1(nice, int, increment)
 	if (increment > 40)
 		increment = 40;
 
-	nice = PRIO_TO_NICE(current->static_prio) + increment;
+	nice = TASK_NICE(current) + increment;
 	if (nice < -20)
 		nice = -20;
 	if (nice > 19)
-- 
cgit v1.2.3


From 0c75a3ed633419d75d823d5dcb05d42924c6ae61 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 16 Feb 2009 11:21:52 -0500
Subject: ftrace: state that all functions are enabled in set_ftrace_filter

Impact: clean up, make set_ftrace_filter less confusing

The set_ftrace_filter shows only the functions that will be traced.
But when it is empty, it will trace all functions. This can be a bit
confusing.

This patch makes set_ftrace_filter show:

  #### all functions enabled ####

When all functions will be traced, and we do not filter only a select
few.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ftrace.c | 23 +++++++++++++++++++++++
 1 file changed, 23 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 1796e018fbff..369fb78bd4ab 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -773,6 +773,7 @@ enum {
 	FTRACE_ITER_CONT	= (1 << 1),
 	FTRACE_ITER_NOTRACE	= (1 << 2),
 	FTRACE_ITER_FAILURES	= (1 << 3),
+	FTRACE_ITER_PRINTALL	= (1 << 4),
 };
 
 #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
@@ -794,6 +795,9 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
 
 	(*pos)++;
 
+	if (iter->flags & FTRACE_ITER_PRINTALL)
+		return NULL;
+
 	/* should not be called from interrupt context */
 	spin_lock(&ftrace_lock);
  retry:
@@ -834,6 +838,19 @@ static void *t_start(struct seq_file *m, loff_t *pos)
 	struct ftrace_iterator *iter = m->private;
 	void *p = NULL;
 
+	/*
+	 * For set_ftrace_filter reading, if we have the filter
+	 * off, we can short cut and just print out that all
+	 * functions are enabled.
+	 */
+	if (iter->flags & FTRACE_ITER_FILTER && !ftrace_filtered) {
+		if (*pos > 0)
+			return NULL;
+		iter->flags |= FTRACE_ITER_PRINTALL;
+		(*pos)++;
+		return iter;
+	}
+
 	if (*pos > 0) {
 		if (iter->idx < 0)
 			return p;
@@ -852,9 +869,15 @@ static void t_stop(struct seq_file *m, void *p)
 
 static int t_show(struct seq_file *m, void *v)
 {
+	struct ftrace_iterator *iter = m->private;
 	struct dyn_ftrace *rec = v;
 	char str[KSYM_SYMBOL_LEN];
 
+	if (iter->flags & FTRACE_ITER_PRINTALL) {
+		seq_printf(m, "#### all functions enabled ####\n");
+		return 0;
+	}
+
 	if (!rec)
 		return 0;
 
-- 
cgit v1.2.3


From 265c831cb03d533cbe159af45798ac9fef534260 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 13 Feb 2009 12:43:56 -0500
Subject: ftrace: add do_for_each_ftrace_rec and while_for_each_ftrace_rec

Impact: clean up

To iterate over all the functions that dynamic trace knows about
it requires two for loops. One to iterate over the pages and the
other to iterate over the records within the page.

There are several duplications of these loops in ftrace.c. This
patch creates the macros do_for_each_ftrace_rec and
while_for_each_ftrace_rec to handle this logic, and removes the
duplicate code.

While making this change, I also discovered and fixed a small
bug that one of the iterations should exit the loop after it found the
record it was searching for. This used a break when it should have
used a goto, since there were two loops it needed to break out
from.  No real harm was done by this bug since it would only continue
to search the other records, and the code was in a slow path anyway.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ftrace.c | 208 ++++++++++++++++++++++++--------------------------
 1 file changed, 101 insertions(+), 107 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 369fb78bd4ab..fed1ebc3a133 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -297,6 +297,19 @@ static struct ftrace_page	*ftrace_pages;
 
 static struct dyn_ftrace *ftrace_free_records;
 
+/*
+ * This is a double for. Do not use 'break' to break out of the loop,
+ * you must use a goto.
+ */
+#define do_for_each_ftrace_rec(pg, rec)					\
+	for (pg = ftrace_pages_start; pg; pg = pg->next) {		\
+		int _____i;						\
+		for (_____i = 0; _____i < pg->index; _____i++) {	\
+			rec = &pg->records[_____i];
+
+#define while_for_each_ftrace_rec()		\
+		}				\
+	}
 
 #ifdef CONFIG_KPROBES
 
@@ -341,7 +354,6 @@ void ftrace_release(void *start, unsigned long size)
 	struct ftrace_page *pg;
 	unsigned long s = (unsigned long)start;
 	unsigned long e = s + size;
-	int i;
 
 	if (ftrace_disabled || !start)
 		return;
@@ -349,14 +361,11 @@ void ftrace_release(void *start, unsigned long size)
 	/* should not be called from interrupt context */
 	spin_lock(&ftrace_lock);
 
-	for (pg = ftrace_pages_start; pg; pg = pg->next) {
-		for (i = 0; i < pg->index; i++) {
-			rec = &pg->records[i];
+	do_for_each_ftrace_rec(pg, rec) {
+		if ((rec->ip >= s) && (rec->ip < e))
+			ftrace_free_rec(rec);
+	} while_for_each_ftrace_rec();
 
-			if ((rec->ip >= s) && (rec->ip < e))
-				ftrace_free_rec(rec);
-		}
-	}
 	spin_unlock(&ftrace_lock);
 }
 
@@ -523,41 +532,37 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
 
 static void ftrace_replace_code(int enable)
 {
-	int i, failed;
+	int failed;
 	struct dyn_ftrace *rec;
 	struct ftrace_page *pg;
 
-	for (pg = ftrace_pages_start; pg; pg = pg->next) {
-		for (i = 0; i < pg->index; i++) {
-			rec = &pg->records[i];
-
-			/*
-			 * Skip over free records and records that have
-			 * failed.
-			 */
-			if (rec->flags & FTRACE_FL_FREE ||
-			    rec->flags & FTRACE_FL_FAILED)
-				continue;
-
-			/* ignore updates to this record's mcount site */
-			if (get_kprobe((void *)rec->ip)) {
-				freeze_record(rec);
-				continue;
-			} else {
-				unfreeze_record(rec);
-			}
+	do_for_each_ftrace_rec(pg, rec) {
+		/*
+		 * Skip over free records and records that have
+		 * failed.
+		 */
+		if (rec->flags & FTRACE_FL_FREE ||
+		    rec->flags & FTRACE_FL_FAILED)
+			continue;
 
-			failed = __ftrace_replace_code(rec, enable);
-			if (failed && (rec->flags & FTRACE_FL_CONVERTED)) {
-				rec->flags |= FTRACE_FL_FAILED;
-				if ((system_state == SYSTEM_BOOTING) ||
-				    !core_kernel_text(rec->ip)) {
-					ftrace_free_rec(rec);
-				} else
-					ftrace_bug(failed, rec->ip);
-			}
+		/* ignore updates to this record's mcount site */
+		if (get_kprobe((void *)rec->ip)) {
+			freeze_record(rec);
+			continue;
+		} else {
+			unfreeze_record(rec);
 		}
-	}
+
+		failed = __ftrace_replace_code(rec, enable);
+		if (failed && (rec->flags & FTRACE_FL_CONVERTED)) {
+			rec->flags |= FTRACE_FL_FAILED;
+			if ((system_state == SYSTEM_BOOTING) ||
+			    !core_kernel_text(rec->ip)) {
+				ftrace_free_rec(rec);
+			} else
+				ftrace_bug(failed, rec->ip);
+		}
+	} while_for_each_ftrace_rec();
 }
 
 static int
@@ -956,22 +961,17 @@ static void ftrace_filter_reset(int enable)
 	struct ftrace_page *pg;
 	struct dyn_ftrace *rec;
 	unsigned long type = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
-	unsigned i;
 
 	/* should not be called from interrupt context */
 	spin_lock(&ftrace_lock);
 	if (enable)
 		ftrace_filtered = 0;
-	pg = ftrace_pages_start;
-	while (pg) {
-		for (i = 0; i < pg->index; i++) {
-			rec = &pg->records[i];
-			if (rec->flags & FTRACE_FL_FAILED)
-				continue;
-			rec->flags &= ~type;
-		}
-		pg = pg->next;
-	}
+	do_for_each_ftrace_rec(pg, rec) {
+		if (rec->flags & FTRACE_FL_FAILED)
+			continue;
+		rec->flags &= ~type;
+	} while_for_each_ftrace_rec();
+
 	spin_unlock(&ftrace_lock);
 }
 
@@ -1094,44 +1094,39 @@ ftrace_match(unsigned char *buff, int len, int enable)
 	spin_lock(&ftrace_lock);
 	if (enable)
 		ftrace_filtered = 1;
-	pg = ftrace_pages_start;
-	while (pg) {
-		for (i = 0; i < pg->index; i++) {
-			int matched = 0;
-			char *ptr;
-
-			rec = &pg->records[i];
-			if (rec->flags & FTRACE_FL_FAILED)
-				continue;
-			kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
-			switch (type) {
-			case MATCH_FULL:
-				if (strcmp(str, buff) == 0)
-					matched = 1;
-				break;
-			case MATCH_FRONT_ONLY:
-				if (memcmp(str, buff, match) == 0)
-					matched = 1;
-				break;
-			case MATCH_MIDDLE_ONLY:
-				if (strstr(str, search))
-					matched = 1;
-				break;
-			case MATCH_END_ONLY:
-				ptr = strstr(str, search);
-				if (ptr && (ptr[search_len] == 0))
-					matched = 1;
-				break;
-			}
-			if (matched) {
-				if (not)
-					rec->flags &= ~flag;
-				else
-					rec->flags |= flag;
-			}
+	do_for_each_ftrace_rec(pg, rec) {
+		int matched = 0;
+		char *ptr;
+
+		if (rec->flags & FTRACE_FL_FAILED)
+			continue;
+		kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
+		switch (type) {
+		case MATCH_FULL:
+			if (strcmp(str, buff) == 0)
+				matched = 1;
+			break;
+		case MATCH_FRONT_ONLY:
+			if (memcmp(str, buff, match) == 0)
+				matched = 1;
+			break;
+		case MATCH_MIDDLE_ONLY:
+			if (strstr(str, search))
+				matched = 1;
+			break;
+		case MATCH_END_ONLY:
+			ptr = strstr(str, search);
+			if (ptr && (ptr[search_len] == 0))
+				matched = 1;
+			break;
 		}
-		pg = pg->next;
-	}
+		if (matched) {
+			if (not)
+				rec->flags &= ~flag;
+			else
+				rec->flags |= flag;
+		}
+	} while_for_each_ftrace_rec();
 	spin_unlock(&ftrace_lock);
 }
 
@@ -1452,7 +1447,7 @@ ftrace_set_func(unsigned long *array, int idx, char *buffer)
 	struct dyn_ftrace *rec;
 	struct ftrace_page *pg;
 	int found = 0;
-	int i, j;
+	int j;
 
 	if (ftrace_disabled)
 		return -ENODEV;
@@ -1460,27 +1455,26 @@ ftrace_set_func(unsigned long *array, int idx, char *buffer)
 	/* should not be called from interrupt context */
 	spin_lock(&ftrace_lock);
 
-	for (pg = ftrace_pages_start; pg; pg = pg->next) {
-		for (i = 0; i < pg->index; i++) {
-			rec = &pg->records[i];
-
-			if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE))
-				continue;
-
-			kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
-			if (strcmp(str, buffer) == 0) {
-				found = 1;
-				for (j = 0; j < idx; j++)
-					if (array[j] == rec->ip) {
-						found = 0;
-						break;
-					}
-				if (found)
-					array[idx] = rec->ip;
-				break;
-			}
+	do_for_each_ftrace_rec(pg, rec) {
+
+		if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE))
+			continue;
+
+		kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
+		if (strcmp(str, buffer) == 0) {
+			/* Return 1 if we add it to the array */
+			found = 1;
+			for (j = 0; j < idx; j++)
+				if (array[j] == rec->ip) {
+					found = 0;
+					break;
+				}
+			if (found)
+				array[idx] = rec->ip;
+			goto out;
 		}
-	}
+	} while_for_each_ftrace_rec();
+ out:
 	spin_unlock(&ftrace_lock);
 
 	return found ? 0 : -EINVAL;
-- 
cgit v1.2.3


From 7f24b31b01a271b62346d9df084b029e48612163 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 13 Feb 2009 14:37:33 -0500
Subject: ftrace: rename ftrace_match to ftrace_match_records

Impact: clean up

ftrace_match is too generic of a name. What it really does is
search all records and matches the records with the given string,
and either sets or unsets the functions to be traced depending
on if the parameter 'enable' is set or not.

This allows us to make another function called ftrace_match that
can be used to test a single record.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ftrace.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index fed1ebc3a133..f397d7adb62e 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1054,7 +1054,7 @@ enum {
 };
 
 static void
-ftrace_match(unsigned char *buff, int len, int enable)
+ftrace_match_records(unsigned char *buff, int len, int enable)
 {
 	char str[KSYM_SYMBOL_LEN];
 	char *search = NULL;
@@ -1197,7 +1197,7 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
 	if (isspace(ch)) {
 		iter->filtered++;
 		iter->buffer[iter->buffer_idx] = 0;
-		ftrace_match(iter->buffer, iter->buffer_idx, enable);
+		ftrace_match_records(iter->buffer, iter->buffer_idx, enable);
 		iter->buffer_idx = 0;
 	} else
 		iter->flags |= FTRACE_ITER_CONT;
@@ -1236,7 +1236,7 @@ ftrace_set_regex(unsigned char *buf, int len, int reset, int enable)
 	if (reset)
 		ftrace_filter_reset(enable);
 	if (buf)
-		ftrace_match(buf, len, enable);
+		ftrace_match_records(buf, len, enable);
 	mutex_unlock(&ftrace_regex_lock);
 }
 
@@ -1286,7 +1286,7 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
 	if (iter->buffer_idx) {
 		iter->filtered++;
 		iter->buffer[iter->buffer_idx] = 0;
-		ftrace_match(iter->buffer, iter->buffer_idx, enable);
+		ftrace_match_records(iter->buffer, iter->buffer_idx, enable);
 	}
 
 	mutex_lock(&ftrace_sysctl_lock);
-- 
cgit v1.2.3


From 9f4801e30ad291e27284e873696da1ead92d68fa Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 13 Feb 2009 15:56:43 -0500
Subject: ftrace: break up ftrace_match_records into smaller components

Impact: clean up

ftrace_match_records does a lot of things that other features
can use. This patch breaks up ftrace_match_records and pulls
out ftrace_setup_glob and ftrace_match_record.

ftrace_setup_glob prepares a simple glob expression for use with
ftrace_match_record. ftrace_match_record compares a single record
with a glob type.

Breaking this up will allow for more features to run on individual
records.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ftrace.c | 115 ++++++++++++++++++++++++++++++++------------------
 1 file changed, 75 insertions(+), 40 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index f397d7adb62e..fcec31323a10 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1053,79 +1053,114 @@ enum {
 	MATCH_END_ONLY,
 };
 
-static void
-ftrace_match_records(unsigned char *buff, int len, int enable)
+/*
+ * (static function - no need for kernel doc)
+ *
+ * Pass in a buffer containing a glob and this function will
+ * set search to point to the search part of the buffer and
+ * return the type of search it is (see enum above).
+ * This does modify buff.
+ *
+ * Returns enum type.
+ *  search returns the pointer to use for comparison.
+ *  not returns 1 if buff started with a '!'
+ *     0 otherwise.
+ */
+static int
+ftrace_setup_glob(unsigned char *buff, int len, char **search, int *not)
 {
-	char str[KSYM_SYMBOL_LEN];
-	char *search = NULL;
-	struct ftrace_page *pg;
-	struct dyn_ftrace *rec;
 	int type = MATCH_FULL;
-	unsigned long flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
-	unsigned i, match = 0, search_len = 0;
-	int not = 0;
+	int i;
 
 	if (buff[0] == '!') {
-		not = 1;
+		*not = 1;
 		buff++;
 		len--;
-	}
+	} else
+		*not = 0;
+
+	*search = buff;
 
 	for (i = 0; i < len; i++) {
 		if (buff[i] == '*') {
 			if (!i) {
-				search = buff + i + 1;
+				*search = buff + 1;
 				type = MATCH_END_ONLY;
-				search_len = len - (i + 1);
 			} else {
-				if (type == MATCH_END_ONLY) {
+				if (type == MATCH_END_ONLY)
 					type = MATCH_MIDDLE_ONLY;
-				} else {
-					match = i;
+				else
 					type = MATCH_FRONT_ONLY;
-				}
 				buff[i] = 0;
 				break;
 			}
 		}
 	}
 
+	return type;
+}
+
+static int
+ftrace_match_record(struct dyn_ftrace *rec, char *regex, int len, int type)
+{
+	char str[KSYM_SYMBOL_LEN];
+	int matched = 0;
+	char *ptr;
+
+	kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
+	switch (type) {
+	case MATCH_FULL:
+		if (strcmp(str, regex) == 0)
+			matched = 1;
+		break;
+	case MATCH_FRONT_ONLY:
+		if (strncmp(str, regex, len) == 0)
+			matched = 1;
+		break;
+	case MATCH_MIDDLE_ONLY:
+		if (strstr(str, regex))
+			matched = 1;
+		break;
+	case MATCH_END_ONLY:
+		ptr = strstr(str, regex);
+		if (ptr && (ptr[len] == 0))
+			matched = 1;
+		break;
+	}
+
+	return matched;
+}
+
+static void ftrace_match_records(char *buff, int len, int enable)
+{
+	char *search;
+	struct ftrace_page *pg;
+	struct dyn_ftrace *rec;
+	int type;
+	unsigned long flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
+	unsigned search_len;
+	int not;
+
+	type = ftrace_setup_glob(buff, len, &search, &not);
+
+	search_len = strlen(search);
+
 	/* should not be called from interrupt context */
 	spin_lock(&ftrace_lock);
 	if (enable)
 		ftrace_filtered = 1;
 	do_for_each_ftrace_rec(pg, rec) {
-		int matched = 0;
-		char *ptr;
 
 		if (rec->flags & FTRACE_FL_FAILED)
 			continue;
-		kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
-		switch (type) {
-		case MATCH_FULL:
-			if (strcmp(str, buff) == 0)
-				matched = 1;
-			break;
-		case MATCH_FRONT_ONLY:
-			if (memcmp(str, buff, match) == 0)
-				matched = 1;
-			break;
-		case MATCH_MIDDLE_ONLY:
-			if (strstr(str, search))
-				matched = 1;
-			break;
-		case MATCH_END_ONLY:
-			ptr = strstr(str, search);
-			if (ptr && (ptr[search_len] == 0))
-				matched = 1;
-			break;
-		}
-		if (matched) {
+
+		if (ftrace_match_record(rec, search, search_len, type)) {
 			if (not)
 				rec->flags &= ~flag;
 			else
 				rec->flags |= flag;
 		}
+
 	} while_for_each_ftrace_rec();
 	spin_unlock(&ftrace_lock);
 }
-- 
cgit v1.2.3


From 64e7c440618998fd69eee6ab490b042d12248021 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 13 Feb 2009 17:08:48 -0500
Subject: ftrace: add module command function filter selection

This patch adds a "command" syntax to the function filtering files:

  /debugfs/tracing/set_ftrace_filter
  /debugfs/tracing/set_ftrace_notrace

Of the format:  <function>:<command>:<parameter>

The command is optional, and dependent on the command, so are
the parameters.

 echo do_fork > set_ftrace_filter

Will only trace 'do_fork'.

 echo 'sched_*' > set_ftrace_filter

Will only trace functions starting with the letters 'sched_'.

 echo '*:mod:ext3' > set_ftrace_filter

Will trace only the ext3 module functions.

 echo '*write*:mod:ext3' > set_ftrace_notrace

Will prevent the ext3 functions with the letters 'write' in
the name from being traced.

 echo '!*_allocate:mod:ext3' > set_ftrace_filter

Will remove the functions in ext3 that end with the letters
'_allocate' from the ftrace filter.

Although this patch implements the 'command' format, only the
'mod' command is supported. More commands to follow.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ftrace.c | 115 +++++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 109 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index fcec31323a10..9e60ae423af9 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1067,7 +1067,7 @@ enum {
  *     0 otherwise.
  */
 static int
-ftrace_setup_glob(unsigned char *buff, int len, char **search, int *not)
+ftrace_setup_glob(char *buff, int len, char **search, int *not)
 {
 	int type = MATCH_FULL;
 	int i;
@@ -1100,14 +1100,11 @@ ftrace_setup_glob(unsigned char *buff, int len, char **search, int *not)
 	return type;
 }
 
-static int
-ftrace_match_record(struct dyn_ftrace *rec, char *regex, int len, int type)
+static int ftrace_match(char *str, char *regex, int len, int type)
 {
-	char str[KSYM_SYMBOL_LEN];
 	int matched = 0;
 	char *ptr;
 
-	kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
 	switch (type) {
 	case MATCH_FULL:
 		if (strcmp(str, regex) == 0)
@@ -1131,6 +1128,15 @@ ftrace_match_record(struct dyn_ftrace *rec, char *regex, int len, int type)
 	return matched;
 }
 
+static int
+ftrace_match_record(struct dyn_ftrace *rec, char *regex, int len, int type)
+{
+	char str[KSYM_SYMBOL_LEN];
+
+	kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
+	return ftrace_match(str, regex, len, type);
+}
+
 static void ftrace_match_records(char *buff, int len, int enable)
 {
 	char *search;
@@ -1165,6 +1171,100 @@ static void ftrace_match_records(char *buff, int len, int enable)
 	spin_unlock(&ftrace_lock);
 }
 
+static int
+ftrace_match_module_record(struct dyn_ftrace *rec, char *mod,
+			   char *regex, int len, int type)
+{
+	char str[KSYM_SYMBOL_LEN];
+	char *modname;
+
+	kallsyms_lookup(rec->ip, NULL, NULL, &modname, str);
+
+	if (!modname || strcmp(modname, mod))
+		return 0;
+
+	/* blank search means to match all funcs in the mod */
+	if (len)
+		return ftrace_match(str, regex, len, type);
+	else
+		return 1;
+}
+
+static void ftrace_match_module_records(char *buff, char *mod, int enable)
+{
+	char *search = buff;
+	struct ftrace_page *pg;
+	struct dyn_ftrace *rec;
+	int type = MATCH_FULL;
+	unsigned long flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
+	unsigned search_len = 0;
+	int not = 0;
+
+	/* blank or '*' mean the same */
+	if (strcmp(buff, "*") == 0)
+		buff[0] = 0;
+
+	/* handle the case of 'dont filter this module' */
+	if (strcmp(buff, "!") == 0 || strcmp(buff, "!*") == 0) {
+		buff[0] = 0;
+		not = 1;
+	}
+
+	if (strlen(buff)) {
+		type = ftrace_setup_glob(buff, strlen(buff), &search, &not);
+		search_len = strlen(search);
+	}
+
+	/* should not be called from interrupt context */
+	spin_lock(&ftrace_lock);
+	if (enable)
+		ftrace_filtered = 1;
+
+	do_for_each_ftrace_rec(pg, rec) {
+
+		if (rec->flags & FTRACE_FL_FAILED)
+			continue;
+
+		if (ftrace_match_module_record(rec, mod,
+					       search, search_len, type)) {
+			if (not)
+				rec->flags &= ~flag;
+			else
+				rec->flags |= flag;
+		}
+
+	} while_for_each_ftrace_rec();
+	spin_unlock(&ftrace_lock);
+}
+
+static int ftrace_process_regex(char *buff, int len, int enable)
+{
+	char *func, *mod, *command, *next = buff;
+
+	func = strsep(&next, ":");
+
+	if (!next) {
+		ftrace_match_records(func, len, enable);
+		return 0;
+	}
+
+	/* command fonud */
+
+	command = strsep(&next, ":");
+
+	if (strcmp(command, "mod") == 0) {
+		/* only match modules */
+		if (!next)
+			return -EINVAL;
+
+		mod = strsep(&next, ":");
+		ftrace_match_module_records(func, mod, enable);
+		return 0;
+	}
+
+	return -EINVAL;
+}
+
 static ssize_t
 ftrace_regex_write(struct file *file, const char __user *ubuf,
 		   size_t cnt, loff_t *ppos, int enable)
@@ -1232,7 +1332,10 @@ ftrace_regex_write(struct file *file, const char __user *ubuf,
 	if (isspace(ch)) {
 		iter->filtered++;
 		iter->buffer[iter->buffer_idx] = 0;
-		ftrace_match_records(iter->buffer, iter->buffer_idx, enable);
+		ret = ftrace_process_regex(iter->buffer,
+					   iter->buffer_idx, enable);
+		if (ret)
+			goto out;
 		iter->buffer_idx = 0;
 	} else
 		iter->flags |= FTRACE_ITER_CONT;
-- 
cgit v1.2.3


From e68746a271eb3393a2183840be9e903caddf765b Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 13 Feb 2009 20:53:42 -0500
Subject: ftrace: enable filtering only when a function is filtered on

Impact: fix to prevent empty set_ftrace_filter and no ftrace output

The function filter is used to only trace a given set of functions.
The filter is enabled when a function name is echoed into the
set_ftrace_filter file. But if the name has a typo and the function
is not found, the filter is enabled, but no function is listed.

This makes a confusing situation where set_ftrace_filter is empty
but no functions ever get enabled for tracing.

For example:

 # cat /debug/tracing/set_ftrace_filter

  #### all functions enabled ####

 # echo bad_name > set_ftrace_filter
 # cat /debug/tracing/set_ftrace_filter

 # echo function > current_tracer
 # cat trace

  # tracer: nop
  #
  #           TASK-PID    CPU#    TIMESTAMP  FUNCTION
  #              | |       |          |         |

This patch changes that to only enable filtering if a function
is set to be filtered on. Now, the filter is not enabled if
a bad name is echoed into set_ftrace_filter.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ftrace.c | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 9e60ae423af9..340f88b68d9e 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1153,8 +1153,6 @@ static void ftrace_match_records(char *buff, int len, int enable)
 
 	/* should not be called from interrupt context */
 	spin_lock(&ftrace_lock);
-	if (enable)
-		ftrace_filtered = 1;
 	do_for_each_ftrace_rec(pg, rec) {
 
 		if (rec->flags & FTRACE_FL_FAILED)
@@ -1166,7 +1164,12 @@ static void ftrace_match_records(char *buff, int len, int enable)
 			else
 				rec->flags |= flag;
 		}
-
+		/*
+		 * Only enable filtering if we have a function that
+		 * is filtered on.
+		 */
+		if (enable && (rec->flags & FTRACE_FL_FILTER))
+			ftrace_filtered = 1;
 	} while_for_each_ftrace_rec();
 	spin_unlock(&ftrace_lock);
 }
@@ -1217,9 +1220,6 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable)
 
 	/* should not be called from interrupt context */
 	spin_lock(&ftrace_lock);
-	if (enable)
-		ftrace_filtered = 1;
-
 	do_for_each_ftrace_rec(pg, rec) {
 
 		if (rec->flags & FTRACE_FL_FAILED)
@@ -1232,6 +1232,8 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable)
 			else
 				rec->flags |= flag;
 		}
+		if (enable && (rec->flags & FTRACE_FL_FILTER))
+			ftrace_filtered = 1;
 
 	} while_for_each_ftrace_rec();
 	spin_unlock(&ftrace_lock);
-- 
cgit v1.2.3


From f6180773d90595650e11de0118bb112018290915 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Sat, 14 Feb 2009 00:40:25 -0500
Subject: ftrace: add command interface for function selection

Allow for other tracers to add their own commands for function
selection. This interface gives a trace the ability to name a
command for function selection. Right now it is pretty limited
in what it offers, but this is a building step for more features.

The :mod: command is converted to this interface and also serves
as a template for other implementations.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 include/linux/ftrace.h |  16 ++++++++
 kernel/trace/ftrace.c  | 106 ++++++++++++++++++++++++++++++++++++++++++++-----
 2 files changed, 111 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 106b7909d500..f0a0ecc63b5c 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -95,6 +95,13 @@ stack_trace_sysctl(struct ctl_table *table, int write,
 		   loff_t *ppos);
 #endif
 
+struct ftrace_func_command {
+	struct list_head	list;
+	char			*name;
+	int			(*func)(char *func, char *cmd,
+					char *params, int enable);
+};
+
 #ifdef CONFIG_DYNAMIC_FTRACE
 /* asm/ftrace.h must be defined for archs supporting dynamic ftrace */
 #include <asm/ftrace.h>
@@ -119,6 +126,9 @@ struct dyn_ftrace {
 int ftrace_force_update(void);
 void ftrace_set_filter(unsigned char *buf, int len, int reset);
 
+int register_ftrace_command(struct ftrace_func_command *cmd);
+int unregister_ftrace_command(struct ftrace_func_command *cmd);
+
 /* defined in arch */
 extern int ftrace_ip_converted(unsigned long ip);
 extern int ftrace_dyn_arch_init(void *data);
@@ -202,6 +212,12 @@ extern void ftrace_enable_daemon(void);
 # define ftrace_disable_daemon()		do { } while (0)
 # define ftrace_enable_daemon()			do { } while (0)
 static inline void ftrace_release(void *start, unsigned long size) { }
+static inline int register_ftrace_command(struct ftrace_func_command *cmd)
+{
+}
+static inline int unregister_ftrace_command(char *cmd_name)
+{
+}
 #endif /* CONFIG_DYNAMIC_FTRACE */
 
 /* totally disable ftrace - can not re-enable after this */
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 340f88b68d9e..45a44c402566 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1239,9 +1239,93 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable)
 	spin_unlock(&ftrace_lock);
 }
 
+/*
+ * We register the module command as a template to show others how
+ * to register the a command as well.
+ */
+
+static int
+ftrace_mod_callback(char *func, char *cmd, char *param, int enable)
+{
+	char *mod;
+
+	/*
+	 * cmd == 'mod' because we only registered this func
+	 * for the 'mod' ftrace_func_command.
+	 * But if you register one func with multiple commands,
+	 * you can tell which command was used by the cmd
+	 * parameter.
+	 */
+
+	/* we must have a module name */
+	if (!param)
+		return -EINVAL;
+
+	mod = strsep(&param, ":");
+	if (!strlen(mod))
+		return -EINVAL;
+
+	ftrace_match_module_records(func, mod, enable);
+	return 0;
+}
+
+static struct ftrace_func_command ftrace_mod_cmd = {
+	.name			= "mod",
+	.func			= ftrace_mod_callback,
+};
+
+static int __init ftrace_mod_cmd_init(void)
+{
+	return register_ftrace_command(&ftrace_mod_cmd);
+}
+device_initcall(ftrace_mod_cmd_init);
+
+static LIST_HEAD(ftrace_commands);
+static DEFINE_MUTEX(ftrace_cmd_mutex);
+
+int register_ftrace_command(struct ftrace_func_command *cmd)
+{
+	struct ftrace_func_command *p;
+	int ret = 0;
+
+	mutex_lock(&ftrace_cmd_mutex);
+	list_for_each_entry(p, &ftrace_commands, list) {
+		if (strcmp(cmd->name, p->name) == 0) {
+			ret = -EBUSY;
+			goto out_unlock;
+		}
+	}
+	list_add(&cmd->list, &ftrace_commands);
+ out_unlock:
+	mutex_unlock(&ftrace_cmd_mutex);
+
+	return ret;
+}
+
+int unregister_ftrace_command(struct ftrace_func_command *cmd)
+{
+	struct ftrace_func_command *p, *n;
+	int ret = -ENODEV;
+
+	mutex_lock(&ftrace_cmd_mutex);
+	list_for_each_entry_safe(p, n, &ftrace_commands, list) {
+		if (strcmp(cmd->name, p->name) == 0) {
+			ret = 0;
+			list_del_init(&p->list);
+			goto out_unlock;
+		}
+	}
+ out_unlock:
+	mutex_unlock(&ftrace_cmd_mutex);
+
+	return ret;
+}
+
 static int ftrace_process_regex(char *buff, int len, int enable)
 {
-	char *func, *mod, *command, *next = buff;
+	struct ftrace_func_command *p;
+	char *func, *command, *next = buff;
+	int ret = -EINVAL;
 
 	func = strsep(&next, ":");
 
@@ -1250,21 +1334,21 @@ static int ftrace_process_regex(char *buff, int len, int enable)
 		return 0;
 	}
 
-	/* command fonud */
+	/* command found */
 
 	command = strsep(&next, ":");
 
-	if (strcmp(command, "mod") == 0) {
-		/* only match modules */
-		if (!next)
-			return -EINVAL;
-
-		mod = strsep(&next, ":");
-		ftrace_match_module_records(func, mod, enable);
-		return 0;
+	mutex_lock(&ftrace_cmd_mutex);
+	list_for_each_entry(p, &ftrace_commands, list) {
+		if (strcmp(p->name, command) == 0) {
+			ret = p->func(func, command, next, enable);
+			goto out_unlock;
+		}
 	}
+ out_unlock:
+	mutex_unlock(&ftrace_cmd_mutex);
 
-	return -EINVAL;
+	return ret;
 }
 
 static ssize_t
-- 
cgit v1.2.3


From 52baf11922db7377b580dd5448a07f71c6a35611 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Sat, 14 Feb 2009 01:15:39 -0500
Subject: ftrace: convert ftrace_lock from a spinlock to mutex

Impact: clean up

The older versions of ftrace required doing the ftrace list
search under atomic context. Now all the calls are in non-atomic
context. There is no reason to keep the ftrace_lock as a spinlock.

This patch converts it to a mutex.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ftrace.c | 51 +++++++++++++++++++--------------------------------
 1 file changed, 19 insertions(+), 32 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 45a44c402566..4771732037ee 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -61,7 +61,7 @@ int function_trace_stop;
  */
 static int ftrace_disabled __read_mostly;
 
-static DEFINE_SPINLOCK(ftrace_lock);
+static DEFINE_MUTEX(ftrace_lock);
 static DEFINE_MUTEX(ftrace_sysctl_lock);
 static DEFINE_MUTEX(ftrace_start_lock);
 
@@ -134,8 +134,7 @@ static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip)
 
 static int __register_ftrace_function(struct ftrace_ops *ops)
 {
-	/* should not be called from interrupt context */
-	spin_lock(&ftrace_lock);
+	mutex_lock(&ftrace_lock);
 
 	ops->next = ftrace_list;
 	/*
@@ -172,7 +171,7 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
 #endif
 	}
 
-	spin_unlock(&ftrace_lock);
+	mutex_unlock(&ftrace_lock);
 
 	return 0;
 }
@@ -182,8 +181,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
 	struct ftrace_ops **p;
 	int ret = 0;
 
-	/* should not be called from interrupt context */
-	spin_lock(&ftrace_lock);
+	mutex_lock(&ftrace_lock);
 
 	/*
 	 * If we are removing the last function, then simply point
@@ -224,7 +222,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
 	}
 
  out:
-	spin_unlock(&ftrace_lock);
+	mutex_unlock(&ftrace_lock);
 
 	return ret;
 }
@@ -233,8 +231,7 @@ static void ftrace_update_pid_func(void)
 {
 	ftrace_func_t func;
 
-	/* should not be called from interrupt context */
-	spin_lock(&ftrace_lock);
+	mutex_lock(&ftrace_lock);
 
 	if (ftrace_trace_function == ftrace_stub)
 		goto out;
@@ -256,7 +253,7 @@ static void ftrace_update_pid_func(void)
 #endif
 
  out:
-	spin_unlock(&ftrace_lock);
+	mutex_unlock(&ftrace_lock);
 }
 
 #ifdef CONFIG_DYNAMIC_FTRACE
@@ -358,15 +355,12 @@ void ftrace_release(void *start, unsigned long size)
 	if (ftrace_disabled || !start)
 		return;
 
-	/* should not be called from interrupt context */
-	spin_lock(&ftrace_lock);
-
+	mutex_lock(&ftrace_lock);
 	do_for_each_ftrace_rec(pg, rec) {
 		if ((rec->ip >= s) && (rec->ip < e))
 			ftrace_free_rec(rec);
 	} while_for_each_ftrace_rec();
-
-	spin_unlock(&ftrace_lock);
+	mutex_unlock(&ftrace_lock);
 }
 
 static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
@@ -803,8 +797,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
 	if (iter->flags & FTRACE_ITER_PRINTALL)
 		return NULL;
 
-	/* should not be called from interrupt context */
-	spin_lock(&ftrace_lock);
+	mutex_lock(&ftrace_lock);
  retry:
 	if (iter->idx >= iter->pg->index) {
 		if (iter->pg->next) {
@@ -833,7 +826,7 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
 			goto retry;
 		}
 	}
-	spin_unlock(&ftrace_lock);
+	mutex_unlock(&ftrace_lock);
 
 	return rec;
 }
@@ -962,8 +955,7 @@ static void ftrace_filter_reset(int enable)
 	struct dyn_ftrace *rec;
 	unsigned long type = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
 
-	/* should not be called from interrupt context */
-	spin_lock(&ftrace_lock);
+	mutex_lock(&ftrace_lock);
 	if (enable)
 		ftrace_filtered = 0;
 	do_for_each_ftrace_rec(pg, rec) {
@@ -971,8 +963,7 @@ static void ftrace_filter_reset(int enable)
 			continue;
 		rec->flags &= ~type;
 	} while_for_each_ftrace_rec();
-
-	spin_unlock(&ftrace_lock);
+	mutex_unlock(&ftrace_lock);
 }
 
 static int
@@ -1151,8 +1142,7 @@ static void ftrace_match_records(char *buff, int len, int enable)
 
 	search_len = strlen(search);
 
-	/* should not be called from interrupt context */
-	spin_lock(&ftrace_lock);
+	mutex_lock(&ftrace_lock);
 	do_for_each_ftrace_rec(pg, rec) {
 
 		if (rec->flags & FTRACE_FL_FAILED)
@@ -1171,7 +1161,7 @@ static void ftrace_match_records(char *buff, int len, int enable)
 		if (enable && (rec->flags & FTRACE_FL_FILTER))
 			ftrace_filtered = 1;
 	} while_for_each_ftrace_rec();
-	spin_unlock(&ftrace_lock);
+	mutex_unlock(&ftrace_lock);
 }
 
 static int
@@ -1218,8 +1208,7 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable)
 		search_len = strlen(search);
 	}
 
-	/* should not be called from interrupt context */
-	spin_lock(&ftrace_lock);
+	mutex_lock(&ftrace_lock);
 	do_for_each_ftrace_rec(pg, rec) {
 
 		if (rec->flags & FTRACE_FL_FAILED)
@@ -1236,7 +1225,7 @@ static void ftrace_match_module_records(char *buff, char *mod, int enable)
 			ftrace_filtered = 1;
 
 	} while_for_each_ftrace_rec();
-	spin_unlock(&ftrace_lock);
+	mutex_unlock(&ftrace_lock);
 }
 
 /*
@@ -1676,9 +1665,7 @@ ftrace_set_func(unsigned long *array, int idx, char *buffer)
 	if (ftrace_disabled)
 		return -ENODEV;
 
-	/* should not be called from interrupt context */
-	spin_lock(&ftrace_lock);
-
+	mutex_lock(&ftrace_lock);
 	do_for_each_ftrace_rec(pg, rec) {
 
 		if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE))
@@ -1699,7 +1686,7 @@ ftrace_set_func(unsigned long *array, int idx, char *buffer)
 		}
 	} while_for_each_ftrace_rec();
  out:
-	spin_unlock(&ftrace_lock);
+	mutex_unlock(&ftrace_lock);
 
 	return found ? 0 : -EINVAL;
 }
-- 
cgit v1.2.3


From e6ea44e9b4c12325337cd1c06103cd515a1c02b2 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Sat, 14 Feb 2009 01:42:44 -0500
Subject: ftrace: consolidate mutexes

Impact: clean up

Now that ftrace_lock is a mutex, there is no reason to have three
different mutexes protecting similar data. All the mutex paths
are not in hot paths, so having a mutex to cover more data is
not a problem.

This patch removes the ftrace_sysctl_lock and ftrace_start_lock
and uses the ftrace_lock to protect the locations that were protected
by these locks. By doing so, this change also removes some of
the lock nesting that was taking place.

There are still more mutexes in ftrace.c that can probably be
consolidated, but they can be dealt with later. We need to be careful
about the way the locks are nested, and by consolidating, we can cause
a recursive deadlock.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ftrace.c | 68 ++++++++++++++++-----------------------------------
 1 file changed, 21 insertions(+), 47 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 4771732037ee..157d4f68b0e0 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -62,8 +62,6 @@ int function_trace_stop;
 static int ftrace_disabled __read_mostly;
 
 static DEFINE_MUTEX(ftrace_lock);
-static DEFINE_MUTEX(ftrace_sysctl_lock);
-static DEFINE_MUTEX(ftrace_start_lock);
 
 static struct ftrace_ops ftrace_list_end __read_mostly =
 {
@@ -134,8 +132,6 @@ static void ftrace_test_stop_func(unsigned long ip, unsigned long parent_ip)
 
 static int __register_ftrace_function(struct ftrace_ops *ops)
 {
-	mutex_lock(&ftrace_lock);
-
 	ops->next = ftrace_list;
 	/*
 	 * We are entering ops into the ftrace_list but another
@@ -171,17 +167,12 @@ static int __register_ftrace_function(struct ftrace_ops *ops)
 #endif
 	}
 
-	mutex_unlock(&ftrace_lock);
-
 	return 0;
 }
 
 static int __unregister_ftrace_function(struct ftrace_ops *ops)
 {
 	struct ftrace_ops **p;
-	int ret = 0;
-
-	mutex_lock(&ftrace_lock);
 
 	/*
 	 * If we are removing the last function, then simply point
@@ -190,17 +181,15 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
 	if (ftrace_list == ops && ops->next == &ftrace_list_end) {
 		ftrace_trace_function = ftrace_stub;
 		ftrace_list = &ftrace_list_end;
-		goto out;
+		return 0;
 	}
 
 	for (p = &ftrace_list; *p != &ftrace_list_end; p = &(*p)->next)
 		if (*p == ops)
 			break;
 
-	if (*p != ops) {
-		ret = -1;
-		goto out;
-	}
+	if (*p != ops)
+		return -1;
 
 	*p = (*p)->next;
 
@@ -221,10 +210,7 @@ static int __unregister_ftrace_function(struct ftrace_ops *ops)
 		}
 	}
 
- out:
-	mutex_unlock(&ftrace_lock);
-
-	return ret;
+	return 0;
 }
 
 static void ftrace_update_pid_func(void)
@@ -622,13 +608,10 @@ static void ftrace_startup(int command)
 	if (unlikely(ftrace_disabled))
 		return;
 
-	mutex_lock(&ftrace_start_lock);
 	ftrace_start_up++;
 	command |= FTRACE_ENABLE_CALLS;
 
 	ftrace_startup_enable(command);
-
-	mutex_unlock(&ftrace_start_lock);
 }
 
 static void ftrace_shutdown(int command)
@@ -636,7 +619,6 @@ static void ftrace_shutdown(int command)
 	if (unlikely(ftrace_disabled))
 		return;
 
-	mutex_lock(&ftrace_start_lock);
 	ftrace_start_up--;
 	if (!ftrace_start_up)
 		command |= FTRACE_DISABLE_CALLS;
@@ -647,11 +629,9 @@ static void ftrace_shutdown(int command)
 	}
 
 	if (!command || !ftrace_enabled)
-		goto out;
+		return;
 
 	ftrace_run_update_code(command);
- out:
-	mutex_unlock(&ftrace_start_lock);
 }
 
 static void ftrace_startup_sysctl(void)
@@ -661,7 +641,6 @@ static void ftrace_startup_sysctl(void)
 	if (unlikely(ftrace_disabled))
 		return;
 
-	mutex_lock(&ftrace_start_lock);
 	/* Force update next time */
 	saved_ftrace_func = NULL;
 	/* ftrace_start_up is true if we want ftrace running */
@@ -669,7 +648,6 @@ static void ftrace_startup_sysctl(void)
 		command |= FTRACE_ENABLE_CALLS;
 
 	ftrace_run_update_code(command);
-	mutex_unlock(&ftrace_start_lock);
 }
 
 static void ftrace_shutdown_sysctl(void)
@@ -679,13 +657,11 @@ static void ftrace_shutdown_sysctl(void)
 	if (unlikely(ftrace_disabled))
 		return;
 
-	mutex_lock(&ftrace_start_lock);
 	/* ftrace_start_up is true if ftrace is running */
 	if (ftrace_start_up)
 		command |= FTRACE_DISABLE_CALLS;
 
 	ftrace_run_update_code(command);
-	mutex_unlock(&ftrace_start_lock);
 }
 
 static cycle_t		ftrace_update_time;
@@ -1502,12 +1478,10 @@ ftrace_regex_release(struct inode *inode, struct file *file, int enable)
 		ftrace_match_records(iter->buffer, iter->buffer_idx, enable);
 	}
 
-	mutex_lock(&ftrace_sysctl_lock);
-	mutex_lock(&ftrace_start_lock);
+	mutex_lock(&ftrace_lock);
 	if (ftrace_start_up && ftrace_enabled)
 		ftrace_run_update_code(FTRACE_ENABLE_CALLS);
-	mutex_unlock(&ftrace_start_lock);
-	mutex_unlock(&ftrace_sysctl_lock);
+	mutex_unlock(&ftrace_lock);
 
 	kfree(iter);
 	mutex_unlock(&ftrace_regex_lock);
@@ -1824,7 +1798,7 @@ static int ftrace_convert_nops(struct module *mod,
 	unsigned long addr;
 	unsigned long flags;
 
-	mutex_lock(&ftrace_start_lock);
+	mutex_lock(&ftrace_lock);
 	p = start;
 	while (p < end) {
 		addr = ftrace_call_adjust(*p++);
@@ -1843,7 +1817,7 @@ static int ftrace_convert_nops(struct module *mod,
 	local_irq_save(flags);
 	ftrace_update_code(mod);
 	local_irq_restore(flags);
-	mutex_unlock(&ftrace_start_lock);
+	mutex_unlock(&ftrace_lock);
 
 	return 0;
 }
@@ -2016,7 +1990,7 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf,
 	if (ret < 0)
 		return ret;
 
-	mutex_lock(&ftrace_start_lock);
+	mutex_lock(&ftrace_lock);
 	if (val < 0) {
 		/* disable pid tracing */
 		if (!ftrace_pid_trace)
@@ -2055,7 +2029,7 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf,
 	ftrace_startup_enable(0);
 
  out:
-	mutex_unlock(&ftrace_start_lock);
+	mutex_unlock(&ftrace_lock);
 
 	return cnt;
 }
@@ -2118,12 +2092,12 @@ int register_ftrace_function(struct ftrace_ops *ops)
 	if (unlikely(ftrace_disabled))
 		return -1;
 
-	mutex_lock(&ftrace_sysctl_lock);
+	mutex_lock(&ftrace_lock);
 
 	ret = __register_ftrace_function(ops);
 	ftrace_startup(0);
 
-	mutex_unlock(&ftrace_sysctl_lock);
+	mutex_unlock(&ftrace_lock);
 	return ret;
 }
 
@@ -2137,10 +2111,10 @@ int unregister_ftrace_function(struct ftrace_ops *ops)
 {
 	int ret;
 
-	mutex_lock(&ftrace_sysctl_lock);
+	mutex_lock(&ftrace_lock);
 	ret = __unregister_ftrace_function(ops);
 	ftrace_shutdown(0);
-	mutex_unlock(&ftrace_sysctl_lock);
+	mutex_unlock(&ftrace_lock);
 
 	return ret;
 }
@@ -2155,7 +2129,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
 	if (unlikely(ftrace_disabled))
 		return -ENODEV;
 
-	mutex_lock(&ftrace_sysctl_lock);
+	mutex_lock(&ftrace_lock);
 
 	ret  = proc_dointvec(table, write, file, buffer, lenp, ppos);
 
@@ -2184,7 +2158,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write,
 	}
 
  out:
-	mutex_unlock(&ftrace_sysctl_lock);
+	mutex_unlock(&ftrace_lock);
 	return ret;
 }
 
@@ -2296,7 +2270,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
 {
 	int ret = 0;
 
-	mutex_lock(&ftrace_sysctl_lock);
+	mutex_lock(&ftrace_lock);
 
 	ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call;
 	register_pm_notifier(&ftrace_suspend_notifier);
@@ -2314,13 +2288,13 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
 	ftrace_startup(FTRACE_START_FUNC_RET);
 
 out:
-	mutex_unlock(&ftrace_sysctl_lock);
+	mutex_unlock(&ftrace_lock);
 	return ret;
 }
 
 void unregister_ftrace_graph(void)
 {
-	mutex_lock(&ftrace_sysctl_lock);
+	mutex_lock(&ftrace_lock);
 
 	atomic_dec(&ftrace_graph_active);
 	ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
@@ -2328,7 +2302,7 @@ void unregister_ftrace_graph(void)
 	ftrace_shutdown(FTRACE_STOP_FUNC_RET);
 	unregister_pm_notifier(&ftrace_suspend_notifier);
 
-	mutex_unlock(&ftrace_sysctl_lock);
+	mutex_unlock(&ftrace_lock);
 }
 
 /* Allocate a return stack for newly created task */
-- 
cgit v1.2.3


From 59df055f1991c9fc0c71a9230663c39188f6972f Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Sat, 14 Feb 2009 15:29:06 -0500
Subject: ftrace: trace different functions with a different tracer

Impact: new feature

Currently, the function tracer only gives you an ability to hook
a tracer to all functions being traced. The dynamic function trace
allows you to pick and choose which of those functions will be
traced, but all functions being traced will call all tracers that
registered with the function tracer.

This patch adds a new feature that allows a tracer to hook to specific
functions, even when all functions are being traced. It allows for
different functions to call different tracer hooks.

The way this is accomplished is by a special function that will hook
to the function tracer and will set up a hash table knowing which
tracer hook to call with which function. This is the most general
and easiest method to accomplish this. Later, an arch may choose
to supply their own method in changing the mcount call of a function
to call a different tracer. But that will be an exercise for the
future.

To register a function:

 struct ftrace_hook_ops {
	void			(*func)(unsigned long ip,
					unsigned long parent_ip,
					void **data);
	int			(*callback)(unsigned long ip, void **data);
	void			(*free)(void **data);
 };

 int register_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops,
				  void *data);

glob is a simple glob to search for the functions to hook.
ops is a pointer to the operations (listed below)
data is the default data to be passed to the hook functions when traced

ops:
 func is the hook function to call when the functions are traced
 callback is a callback function that is called when setting up the hash.
   That is, if the tracer needs to do something special for each
   function, that is being traced, and wants to give each function
   its own data. The address of the entry data is passed to this
   callback, so that the callback may wish to update the entry to
   whatever it would like.
 free is a callback for when the entry is freed. In case the tracer
   allocated any data, it is give the chance to free it.

To unregister we have three functions:

  void
  unregister_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops,
				void *data)

This will unregister all hooks that match glob, point to ops, and
have its data matching data. (note, if glob is NULL, blank or '*',
all functions will be tested).

  void
  unregister_ftrace_function_hook_func(char *glob,
				 struct ftrace_hook_ops *ops)

This will unregister all functions matching glob that has an entry
pointing to ops.

  void unregister_ftrace_function_hook_all(char *glob)

This simply unregisters all funcs.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 include/linux/ftrace.h |  18 ++++
 kernel/trace/ftrace.c  | 247 +++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 265 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index f0a0ecc63b5c..13918c4400ad 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -106,6 +106,24 @@ struct ftrace_func_command {
 /* asm/ftrace.h must be defined for archs supporting dynamic ftrace */
 #include <asm/ftrace.h>
 
+struct ftrace_hook_ops {
+	void			(*func)(unsigned long ip,
+					unsigned long parent_ip,
+					void **data);
+	int			(*callback)(unsigned long ip, void **data);
+	void			(*free)(void **data);
+};
+
+extern int
+register_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops,
+			      void *data);
+extern void
+unregister_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops,
+				void *data);
+extern void
+unregister_ftrace_function_hook_func(char *glob, struct ftrace_hook_ops *ops);
+extern void unregister_ftrace_function_hook_all(char *glob);
+
 enum {
 	FTRACE_FL_FREE		= (1 << 0),
 	FTRACE_FL_FAILED	= (1 << 1),
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 157d4f68b0e0..0b80e325f296 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -27,6 +27,7 @@
 #include <linux/sysctl.h>
 #include <linux/ctype.h>
 #include <linux/list.h>
+#include <linux/hash.h>
 
 #include <asm/ftrace.h>
 
@@ -1245,6 +1246,252 @@ static int __init ftrace_mod_cmd_init(void)
 }
 device_initcall(ftrace_mod_cmd_init);
 
+#define FTRACE_HASH_BITS 7
+#define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS)
+static struct hlist_head ftrace_func_hash[FTRACE_FUNC_HASHSIZE] __read_mostly;
+
+struct ftrace_func_hook {
+	struct hlist_node	node;
+	struct ftrace_hook_ops	*ops;
+	unsigned long		flags;
+	unsigned long		ip;
+	void			*data;
+	struct rcu_head		rcu;
+};
+
+static void
+function_trace_hook_call(unsigned long ip, unsigned long parent_ip)
+{
+	struct ftrace_func_hook *entry;
+	struct hlist_head *hhd;
+	struct hlist_node *n;
+	unsigned long key;
+	int resched;
+
+	key = hash_long(ip, FTRACE_HASH_BITS);
+
+	hhd = &ftrace_func_hash[key];
+
+	if (hlist_empty(hhd))
+		return;
+
+	/*
+	 * Disable preemption for these calls to prevent a RCU grace
+	 * period. This syncs the hash iteration and freeing of items
+	 * on the hash. rcu_read_lock is too dangerous here.
+	 */
+	resched = ftrace_preempt_disable();
+	hlist_for_each_entry_rcu(entry, n, hhd, node) {
+		if (entry->ip == ip)
+			entry->ops->func(ip, parent_ip, &entry->data);
+	}
+	ftrace_preempt_enable(resched);
+}
+
+static struct ftrace_ops trace_hook_ops __read_mostly =
+{
+	.func = function_trace_hook_call,
+};
+
+static int ftrace_hook_registered;
+
+static void __enable_ftrace_function_hook(void)
+{
+	int i;
+
+	if (ftrace_hook_registered)
+		return;
+
+	for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
+		struct hlist_head *hhd = &ftrace_func_hash[i];
+		if (hhd->first)
+			break;
+	}
+	/* Nothing registered? */
+	if (i == FTRACE_FUNC_HASHSIZE)
+		return;
+
+	__register_ftrace_function(&trace_hook_ops);
+	ftrace_startup(0);
+	ftrace_hook_registered = 1;
+}
+
+static void __disable_ftrace_function_hook(void)
+{
+	int i;
+
+	if (!ftrace_hook_registered)
+		return;
+
+	for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
+		struct hlist_head *hhd = &ftrace_func_hash[i];
+		if (hhd->first)
+			return;
+	}
+
+	/* no more funcs left */
+	__unregister_ftrace_function(&trace_hook_ops);
+	ftrace_shutdown(0);
+	ftrace_hook_registered = 0;
+}
+
+
+static void ftrace_free_entry_rcu(struct rcu_head *rhp)
+{
+	struct ftrace_func_hook *entry =
+		container_of(rhp, struct ftrace_func_hook, rcu);
+
+	if (entry->ops->free)
+		entry->ops->free(&entry->data);
+	kfree(entry);
+}
+
+
+int
+register_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops,
+			      void *data)
+{
+	struct ftrace_func_hook *entry;
+	struct ftrace_page *pg;
+	struct dyn_ftrace *rec;
+	unsigned long key;
+	int type, len, not;
+	int count = 0;
+	char *search;
+
+	type = ftrace_setup_glob(glob, strlen(glob), &search, &not);
+	len = strlen(search);
+
+	/* we do not support '!' for function hooks */
+	if (WARN_ON(not))
+		return -EINVAL;
+
+	mutex_lock(&ftrace_lock);
+	do_for_each_ftrace_rec(pg, rec) {
+
+		if (rec->flags & FTRACE_FL_FAILED)
+			continue;
+
+		if (!ftrace_match_record(rec, search, len, type))
+			continue;
+
+		entry = kmalloc(sizeof(*entry), GFP_KERNEL);
+		if (!entry) {
+			/* If we did not hook to any, then return error */
+			if (!count)
+				count = -ENOMEM;
+			goto out_unlock;
+		}
+
+		count++;
+
+		entry->data = data;
+
+		/*
+		 * The caller might want to do something special
+		 * for each function we find. We call the callback
+		 * to give the caller an opportunity to do so.
+		 */
+		if (ops->callback) {
+			if (ops->callback(rec->ip, &entry->data) < 0) {
+				/* caller does not like this func */
+				kfree(entry);
+				continue;
+			}
+		}
+
+		entry->ops = ops;
+		entry->ip = rec->ip;
+
+		key = hash_long(entry->ip, FTRACE_HASH_BITS);
+		hlist_add_head_rcu(&entry->node, &ftrace_func_hash[key]);
+
+	} while_for_each_ftrace_rec();
+	__enable_ftrace_function_hook();
+
+ out_unlock:
+	mutex_unlock(&ftrace_lock);
+
+	return count;
+}
+
+enum {
+	HOOK_TEST_FUNC		= 1,
+	HOOK_TEST_DATA		= 2
+};
+
+static void
+__unregister_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops,
+				  void *data, int flags)
+{
+	struct ftrace_func_hook *entry;
+	struct hlist_node *n, *tmp;
+	char str[KSYM_SYMBOL_LEN];
+	int type = MATCH_FULL;
+	int i, len = 0;
+	char *search;
+
+	if (glob && (strcmp(glob, "*") || !strlen(glob)))
+		glob = NULL;
+	else {
+		int not;
+
+		type = ftrace_setup_glob(glob, strlen(glob), &search, &not);
+		len = strlen(search);
+
+		/* we do not support '!' for function hooks */
+		if (WARN_ON(not))
+			return;
+	}
+
+	mutex_lock(&ftrace_lock);
+	for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
+		struct hlist_head *hhd = &ftrace_func_hash[i];
+
+		hlist_for_each_entry_safe(entry, n, tmp, hhd, node) {
+
+			/* break up if statements for readability */
+			if ((flags & HOOK_TEST_FUNC) && entry->ops != ops)
+				continue;
+
+			if ((flags & HOOK_TEST_DATA) && entry->data != data)
+				continue;
+
+			/* do this last, since it is the most expensive */
+			if (glob) {
+				kallsyms_lookup(entry->ip, NULL, NULL,
+						NULL, str);
+				if (!ftrace_match(str, glob, len, type))
+					continue;
+			}
+
+			hlist_del(&entry->node);
+			call_rcu(&entry->rcu, ftrace_free_entry_rcu);
+		}
+	}
+	__disable_ftrace_function_hook();
+	mutex_unlock(&ftrace_lock);
+}
+
+void
+unregister_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops,
+				void *data)
+{
+	__unregister_ftrace_function_hook(glob, ops, data,
+					  HOOK_TEST_FUNC | HOOK_TEST_DATA);
+}
+
+void
+unregister_ftrace_function_hook_func(char *glob, struct ftrace_hook_ops *ops)
+{
+	__unregister_ftrace_function_hook(glob, ops, NULL, HOOK_TEST_FUNC);
+}
+
+void unregister_ftrace_function_hook_all(char *glob)
+{
+	__unregister_ftrace_function_hook(glob, NULL, NULL, 0);
+}
+
 static LIST_HEAD(ftrace_commands);
 static DEFINE_MUTEX(ftrace_cmd_mutex);
 
-- 
cgit v1.2.3


From 988ae9d6b2bc3ebdc1a488490250a6812f85e9d4 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Sat, 14 Feb 2009 19:17:02 -0500
Subject: ring-buffer: add tracing_is_on to test if ring buffer is enabled

This patch adds the tracing_is_on() interface to tell if the ring
buffer is turned on or not.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 include/linux/ring_buffer.h | 2 ++
 kernel/trace/ring_buffer.c  | 9 +++++++++
 2 files changed, 11 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 8e6646a54acf..f5e793d69bd3 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -128,10 +128,12 @@ void ring_buffer_normalize_time_stamp(int cpu, u64 *ts);
 void tracing_on(void);
 void tracing_off(void);
 void tracing_off_permanent(void);
+int tracing_is_on(void);
 #else
 static inline void tracing_on(void) { }
 static inline void tracing_off(void) { }
 static inline void tracing_off_permanent(void) { }
+static inline int tracing_is_on(void) { return 0; }
 #endif
 
 void *ring_buffer_alloc_read_page(struct ring_buffer *buffer);
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 2b4626ce95d6..8f19f1aa42b0 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -98,6 +98,15 @@ void tracing_off_permanent(void)
 	set_bit(RB_BUFFERS_DISABLED_BIT, &ring_buffer_flags);
 }
 
+/**
+ * tracing_is_on - show state of ring buffers enabled
+ */
+int tracing_is_on(void)
+{
+	return ring_buffer_flags == RB_BUFFERS_ON;
+}
+EXPORT_SYMBOL_GPL(tracing_is_on);
+
 #include "trace.h"
 
 /* Up this if you want to test the TIME_EXTENTS and normalization */
-- 
cgit v1.2.3


From 23b4ff3aa479c9e3bb23cb6b2d0a97878399784a Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Sat, 14 Feb 2009 19:04:24 -0500
Subject: ftrace: add traceon traceoff commands to enable/disable the buffers

This patch adds the new function selection commands traceon and
traceoff. traceon sets the function to enable the ring buffers
while traceoff disables the ring buffers.  You can pass in the
number of times you want the command to be executed when the function
is hit. It will only execute if the state of the buffers are not
already in that state.

Example:

 # echo do_fork:traceon:4

Will enable the ring buffers if they are disabled every time it
hits do_fork, up to 4 times.

 # echo sys_close:traceoff

This will disable the ring buffers every time (unlimited) when
sys_close is called.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_functions.c | 135 +++++++++++++++++++++++++++++++++++++++++
 1 file changed, 135 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 36bf9568ccd9..5c95708b9dc3 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -9,6 +9,7 @@
  *  Copyright (C) 2004-2006 Ingo Molnar
  *  Copyright (C) 2004 William Lee Irwin III
  */
+#include <linux/ring_buffer.h>
 #include <linux/debugfs.h>
 #include <linux/uaccess.h>
 #include <linux/ftrace.h>
@@ -231,9 +232,143 @@ static struct tracer function_trace __read_mostly =
 #endif
 };
 
+#ifdef CONFIG_DYNAMIC_FTRACE
+static void
+ftrace_traceon(unsigned long ip, unsigned long parent_ip, void **data)
+{
+	long *count = (long *)data;
+
+	if (tracing_is_on())
+		return;
+
+	if (!*count)
+		return;
+
+	if (*count != -1)
+		(*count)--;
+
+	tracing_on();
+}
+
+static void
+ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data)
+{
+	long *count = (long *)data;
+
+	if (!tracing_is_on())
+		return;
+
+	if (!*count)
+		return;
+
+	if (*count != -1)
+		(*count)--;
+
+	tracing_off();
+}
+
+static struct ftrace_hook_ops traceon_hook_ops = {
+	.func			= ftrace_traceon,
+};
+
+static struct ftrace_hook_ops traceoff_hook_ops = {
+	.func			= ftrace_traceoff,
+};
+
+static int
+ftrace_trace_onoff_unreg(char *glob, char *cmd, char *param)
+{
+	struct ftrace_hook_ops *ops;
+
+	/* we register both traceon and traceoff to this callback */
+	if (strcmp(cmd, "traceon") == 0)
+		ops = &traceon_hook_ops;
+	else
+		ops = &traceoff_hook_ops;
+
+	unregister_ftrace_function_hook_func(glob, ops);
+
+	return 0;
+}
+
+static int
+ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable)
+{
+	struct ftrace_hook_ops *ops;
+	void *count = (void *)-1;
+	char *number;
+	int ret;
+
+	/* hash funcs only work with set_ftrace_filter */
+	if (!enable)
+		return -EINVAL;
+
+	if (glob[0] == '!')
+		return ftrace_trace_onoff_unreg(glob+1, cmd, param);
+
+	/* we register both traceon and traceoff to this callback */
+	if (strcmp(cmd, "traceon") == 0)
+		ops = &traceon_hook_ops;
+	else
+		ops = &traceoff_hook_ops;
+
+	if (!param)
+		goto out_reg;
+
+	number = strsep(&param, ":");
+
+	if (!strlen(number))
+		goto out_reg;
+
+	/*
+	 * We use the callback data field (which is a pointer)
+	 * as our counter.
+	 */
+	ret = strict_strtoul(number, 0, (unsigned long *)&count);
+	if (ret)
+		return ret;
+
+ out_reg:
+	ret = register_ftrace_function_hook(glob, ops, count);
+
+	return ret;
+}
+
+static struct ftrace_func_command ftrace_traceon_cmd = {
+	.name			= "traceon",
+	.func			= ftrace_trace_onoff_callback,
+};
+
+static struct ftrace_func_command ftrace_traceoff_cmd = {
+	.name			= "traceoff",
+	.func			= ftrace_trace_onoff_callback,
+};
+
+static int __init init_func_cmd_traceon(void)
+{
+	int ret;
+
+	ret = register_ftrace_command(&ftrace_traceoff_cmd);
+	if (ret)
+		return ret;
+
+	ret = register_ftrace_command(&ftrace_traceon_cmd);
+	if (ret)
+		unregister_ftrace_command(&ftrace_traceoff_cmd);
+	return ret;
+}
+#else
+static inline int init_func_cmd_traceon(void)
+{
+	return 0;
+}
+#endif /* CONFIG_DYNAMIC_FTRACE */
+
 static __init int init_function_trace(void)
 {
+	init_func_cmd_traceon();
 	return register_tracer(&function_trace);
 }
 
 device_initcall(init_function_trace);
+
-- 
cgit v1.2.3


From 8fc0c701c5b6c0c3e242758c3acef6f9047940a9 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 16 Feb 2009 15:28:00 -0500
Subject: ftrace: show selected functions in set_ftrace_filter

This patch adds output to show what functions have tracer hooks
attached to them.

  # echo 'sys_open:traceon:4' > /debug/tracing/set_ftrace_filter
  # cat set_ftrace_filter

 #### all functions enabled ####
 sys_open:ftrace_traceon:0000000000000004

  # echo 'do_fork:traceoff:' > set_ftrace_filter
  # cat set_ftrace_filter

 #### all functions enabled ####
 sys_open:ftrace_traceon:0000000000000002
 do_fork:ftrace_traceoff:ffffffffffffffff

Note the 4 changed to a 2. This is because The code was executed twice
since the traceoff was added. If a cat is done again:

 #### all functions enabled ####
 sys_open:ftrace_traceon
 do_fork:ftrace_traceoff:ffffffffffffffff

The number disappears. That is because it will not print a NULL.

Callbacks to allow the tracer to pretty print will be implemented soon.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ftrace.c | 123 ++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 103 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 0b80e325f296..1e058848cddb 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -45,14 +45,14 @@
 			ftrace_kill();		\
 	} while (0)
 
+/* hash bits for specific function selection */
+#define FTRACE_HASH_BITS 7
+#define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS)
+
 /* ftrace_enabled is a method to turn ftrace on or off */
 int ftrace_enabled __read_mostly;
 static int last_ftrace_enabled;
 
-/* set when tracing only a pid */
-struct pid *ftrace_pid_trace;
-static struct pid * const ftrace_swapper_pid = &init_struct_pid;
-
 /* Quick disabling of function tracer. */
 int function_trace_stop;
 
@@ -248,6 +248,21 @@ static void ftrace_update_pid_func(void)
 # error Dynamic ftrace depends on MCOUNT_RECORD
 #endif
 
+/* set when tracing only a pid */
+struct pid *ftrace_pid_trace;
+static struct pid * const ftrace_swapper_pid = &init_struct_pid;
+static struct hlist_head ftrace_func_hash[FTRACE_FUNC_HASHSIZE] __read_mostly;
+
+struct ftrace_func_hook {
+	struct hlist_node	node;
+	struct ftrace_hook_ops	*ops;
+	unsigned long		flags;
+	unsigned long		ip;
+	void			*data;
+	struct rcu_head		rcu;
+};
+
+
 enum {
 	FTRACE_ENABLE_CALLS		= (1 << 0),
 	FTRACE_DISABLE_CALLS		= (1 << 1),
@@ -750,12 +765,14 @@ enum {
 	FTRACE_ITER_NOTRACE	= (1 << 2),
 	FTRACE_ITER_FAILURES	= (1 << 3),
 	FTRACE_ITER_PRINTALL	= (1 << 4),
+	FTRACE_ITER_HASH	= (1 << 5),
 };
 
 #define FTRACE_BUFF_MAX (KSYM_SYMBOL_LEN+4) /* room for wildcards */
 
 struct ftrace_iterator {
 	struct ftrace_page	*pg;
+	int			hidx;
 	int			idx;
 	unsigned		flags;
 	unsigned char		buffer[FTRACE_BUFF_MAX+1];
@@ -763,18 +780,87 @@ struct ftrace_iterator {
 	unsigned		filtered;
 };
 
+static void *
+t_hash_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct ftrace_iterator *iter = m->private;
+	struct hlist_node *hnd = v;
+	struct hlist_head *hhd;
+
+	WARN_ON(!(iter->flags & FTRACE_ITER_HASH));
+
+	(*pos)++;
+
+ retry:
+	if (iter->hidx >= FTRACE_FUNC_HASHSIZE)
+		return NULL;
+
+	hhd = &ftrace_func_hash[iter->hidx];
+
+	if (hlist_empty(hhd)) {
+		iter->hidx++;
+		hnd = NULL;
+		goto retry;
+	}
+
+	if (!hnd)
+		hnd = hhd->first;
+	else {
+		hnd = hnd->next;
+		if (!hnd) {
+			iter->hidx++;
+			goto retry;
+		}
+	}
+
+	return hnd;
+}
+
+static void *t_hash_start(struct seq_file *m, loff_t *pos)
+{
+	struct ftrace_iterator *iter = m->private;
+	void *p = NULL;
+
+	iter->flags |= FTRACE_ITER_HASH;
+
+	return t_hash_next(m, p, pos);
+}
+
+static int t_hash_show(struct seq_file *m, void *v)
+{
+	struct ftrace_func_hook *rec;
+	struct hlist_node *hnd = v;
+	char str[KSYM_SYMBOL_LEN];
+
+	rec = hlist_entry(hnd, struct ftrace_func_hook, node);
+
+	kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
+	seq_printf(m, "%s:", str);
+
+	kallsyms_lookup((unsigned long)rec->ops->func, NULL, NULL, NULL, str);
+	seq_printf(m, "%s", str);
+
+	if (rec->data)
+		seq_printf(m, ":%p", rec->data);
+	seq_putc(m, '\n');
+
+	return 0;
+}
+
 static void *
 t_next(struct seq_file *m, void *v, loff_t *pos)
 {
 	struct ftrace_iterator *iter = m->private;
 	struct dyn_ftrace *rec = NULL;
 
+	if (iter->flags & FTRACE_ITER_HASH)
+		return t_hash_next(m, v, pos);
+
 	(*pos)++;
 
 	if (iter->flags & FTRACE_ITER_PRINTALL)
 		return NULL;
 
-	mutex_lock(&ftrace_lock);
  retry:
 	if (iter->idx >= iter->pg->index) {
 		if (iter->pg->next) {
@@ -803,7 +889,6 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
 			goto retry;
 		}
 	}
-	mutex_unlock(&ftrace_lock);
 
 	return rec;
 }
@@ -813,6 +898,7 @@ static void *t_start(struct seq_file *m, loff_t *pos)
 	struct ftrace_iterator *iter = m->private;
 	void *p = NULL;
 
+	mutex_lock(&ftrace_lock);
 	/*
 	 * For set_ftrace_filter reading, if we have the filter
 	 * off, we can short cut and just print out that all
@@ -820,12 +906,15 @@ static void *t_start(struct seq_file *m, loff_t *pos)
 	 */
 	if (iter->flags & FTRACE_ITER_FILTER && !ftrace_filtered) {
 		if (*pos > 0)
-			return NULL;
+			return t_hash_start(m, pos);
 		iter->flags |= FTRACE_ITER_PRINTALL;
 		(*pos)++;
 		return iter;
 	}
 
+	if (iter->flags & FTRACE_ITER_HASH)
+		return t_hash_start(m, pos);
+
 	if (*pos > 0) {
 		if (iter->idx < 0)
 			return p;
@@ -835,11 +924,15 @@ static void *t_start(struct seq_file *m, loff_t *pos)
 
 	p = t_next(m, p, pos);
 
+	if (!p)
+		return t_hash_start(m, pos);
+
 	return p;
 }
 
 static void t_stop(struct seq_file *m, void *p)
 {
+	mutex_unlock(&ftrace_lock);
 }
 
 static int t_show(struct seq_file *m, void *v)
@@ -848,6 +941,9 @@ static int t_show(struct seq_file *m, void *v)
 	struct dyn_ftrace *rec = v;
 	char str[KSYM_SYMBOL_LEN];
 
+	if (iter->flags & FTRACE_ITER_HASH)
+		return t_hash_show(m, v);
+
 	if (iter->flags & FTRACE_ITER_PRINTALL) {
 		seq_printf(m, "#### all functions enabled ####\n");
 		return 0;
@@ -1246,19 +1342,6 @@ static int __init ftrace_mod_cmd_init(void)
 }
 device_initcall(ftrace_mod_cmd_init);
 
-#define FTRACE_HASH_BITS 7
-#define FTRACE_FUNC_HASHSIZE (1 << FTRACE_HASH_BITS)
-static struct hlist_head ftrace_func_hash[FTRACE_FUNC_HASHSIZE] __read_mostly;
-
-struct ftrace_func_hook {
-	struct hlist_node	node;
-	struct ftrace_hook_ops	*ops;
-	unsigned long		flags;
-	unsigned long		ip;
-	void			*data;
-	struct rcu_head		rcu;
-};
-
 static void
 function_trace_hook_call(unsigned long ip, unsigned long parent_ip)
 {
-- 
cgit v1.2.3


From 809dcf29ce4e1723709910878e050bd187617e0e Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 16 Feb 2009 23:06:01 -0500
Subject: ftrace: add pretty print to selected fuction traces

This patch adds a call back for the tracers that have hooks to
selected functions. This allows the tracer to show better output
in the set_ftrace_filter file.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 include/linux/ftrace.h | 6 ++++++
 kernel/trace/ftrace.c  | 3 +++
 2 files changed, 9 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 13918c4400ad..b331e216d8a1 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -106,12 +106,18 @@ struct ftrace_func_command {
 /* asm/ftrace.h must be defined for archs supporting dynamic ftrace */
 #include <asm/ftrace.h>
 
+struct seq_file;
+
 struct ftrace_hook_ops {
 	void			(*func)(unsigned long ip,
 					unsigned long parent_ip,
 					void **data);
 	int			(*callback)(unsigned long ip, void **data);
 	void			(*free)(void **data);
+	int			(*print)(struct seq_file *m,
+					 unsigned long ip,
+					 struct ftrace_hook_ops *ops,
+					 void *data);
 };
 
 extern int
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 1e058848cddb..6533c1d20155 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -834,6 +834,9 @@ static int t_hash_show(struct seq_file *m, void *v)
 
 	rec = hlist_entry(hnd, struct ftrace_func_hook, node);
 
+	if (rec->ops->print)
+		return rec->ops->print(m, rec->ip, rec->ops, rec->data);
+
 	kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
 	seq_printf(m, "%s:", str);
 
-- 
cgit v1.2.3


From e110e3d1eaa0f9628918be67ddd32e8ad65a2871 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 16 Feb 2009 23:38:13 -0500
Subject: ftrace: add pretty print function for traceon and traceoff hooks

This patch adds a pretty print version of traceon and traceoff
output for set_ftrace_filter.

  # echo 'sys_open:traceon:4' > set_ftrace_filter
  # cat set_ftrace_filter

 #### all functions enabled ####
 sys_open:traceon:count=4

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_functions.c | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 5c95708b9dc3..f520aa419dff 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -267,14 +267,42 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data)
 	tracing_off();
 }
 
+static int
+ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
+			 struct ftrace_hook_ops *ops, void *data);
+
 static struct ftrace_hook_ops traceon_hook_ops = {
 	.func			= ftrace_traceon,
+	.print			= ftrace_trace_onoff_print,
 };
 
 static struct ftrace_hook_ops traceoff_hook_ops = {
 	.func			= ftrace_traceoff,
+	.print			= ftrace_trace_onoff_print,
 };
 
+static int
+ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
+			 struct ftrace_hook_ops *ops, void *data)
+{
+	char str[KSYM_SYMBOL_LEN];
+	long count = (long)data;
+
+	kallsyms_lookup(ip, NULL, NULL, NULL, str);
+	seq_printf(m, "%s:", str);
+
+	if (ops == &traceon_hook_ops)
+		seq_printf(m, "traceon");
+	else
+		seq_printf(m, "traceoff");
+
+	if (count != -1)
+		seq_printf(m, ":count=%ld", count);
+	seq_putc(m, '\n');
+
+	return 0;
+}
+
 static int
 ftrace_trace_onoff_unreg(char *glob, char *cmd, char *param)
 {
-- 
cgit v1.2.3


From 73d3fd96e77745742f3750b7b19ee42204adc210 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 17 Feb 2009 11:48:18 +0100
Subject: ftrace: fix !CONFIG_DYNAMIC_FTRACE ftrace_swapper_pid definition

Impact: build fix

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 6533c1d20155..4e6c87ecf1bf 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -243,14 +243,16 @@ static void ftrace_update_pid_func(void)
 	mutex_unlock(&ftrace_lock);
 }
 
+/* set when tracing only a pid */
+struct pid *ftrace_pid_trace;
+static struct pid * const ftrace_swapper_pid = &init_struct_pid;
+
 #ifdef CONFIG_DYNAMIC_FTRACE
+
 #ifndef CONFIG_FTRACE_MCOUNT_RECORD
 # error Dynamic ftrace depends on MCOUNT_RECORD
 #endif
 
-/* set when tracing only a pid */
-struct pid *ftrace_pid_trace;
-static struct pid * const ftrace_swapper_pid = &init_struct_pid;
 static struct hlist_head ftrace_func_hash[FTRACE_FUNC_HASHSIZE] __read_mostly;
 
 struct ftrace_func_hook {
-- 
cgit v1.2.3


From 6a24a244cd3a02d5b290293c32fcf2c6e92b4235 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 17 Feb 2009 11:20:26 -0500
Subject: ftrace: clean up coding style

Ingo Molnar pointed out some coding style issues with the recent ftrace
updates. This patch cleans them up.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ftrace.c          | 24 +++++++++++++-----------
 kernel/trace/trace_functions.c |  1 -
 2 files changed, 13 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 4e6c87ecf1bf..af9d95c0e4de 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -460,8 +460,8 @@ static void ftrace_bug(int failed, unsigned long ip)
 static int
 __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
 {
-	unsigned long ip, fl;
 	unsigned long ftrace_addr;
+	unsigned long ip, fl;
 
 	ftrace_addr = (unsigned long)FTRACE_ADDR;
 
@@ -530,9 +530,9 @@ __ftrace_replace_code(struct dyn_ftrace *rec, int enable)
 
 static void ftrace_replace_code(int enable)
 {
-	int failed;
 	struct dyn_ftrace *rec;
 	struct ftrace_page *pg;
+	int failed;
 
 	do_for_each_ftrace_rec(pg, rec) {
 		/*
@@ -1208,14 +1208,15 @@ ftrace_match_record(struct dyn_ftrace *rec, char *regex, int len, int type)
 
 static void ftrace_match_records(char *buff, int len, int enable)
 {
-	char *search;
+	unsigned int search_len;
 	struct ftrace_page *pg;
 	struct dyn_ftrace *rec;
+	unsigned long flag;
+	char *search;
 	int type;
-	unsigned long flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
-	unsigned search_len;
 	int not;
 
+	flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
 	type = ftrace_setup_glob(buff, len, &search, &not);
 
 	search_len = strlen(search);
@@ -1263,14 +1264,16 @@ ftrace_match_module_record(struct dyn_ftrace *rec, char *mod,
 
 static void ftrace_match_module_records(char *buff, char *mod, int enable)
 {
-	char *search = buff;
+	unsigned search_len = 0;
 	struct ftrace_page *pg;
 	struct dyn_ftrace *rec;
 	int type = MATCH_FULL;
-	unsigned long flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
-	unsigned search_len = 0;
+	char *search = buff;
+	unsigned long flag;
 	int not = 0;
 
+	flag = enable ? FTRACE_FL_FILTER : FTRACE_FL_NOTRACE;
+
 	/* blank or '*' mean the same */
 	if (strcmp(buff, "*") == 0)
 		buff[0] = 0;
@@ -1442,8 +1445,8 @@ register_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops,
 	struct ftrace_func_hook *entry;
 	struct ftrace_page *pg;
 	struct dyn_ftrace *rec;
-	unsigned long key;
 	int type, len, not;
+	unsigned long key;
 	int count = 0;
 	char *search;
 
@@ -1623,8 +1626,8 @@ int unregister_ftrace_command(struct ftrace_func_command *cmd)
 
 static int ftrace_process_regex(char *buff, int len, int enable)
 {
-	struct ftrace_func_command *p;
 	char *func, *command, *next = buff;
+	struct ftrace_func_command *p;
 	int ret = -EINVAL;
 
 	func = strsep(&next, ":");
@@ -2392,7 +2395,6 @@ static __init int ftrace_init_debugfs(void)
 			   "'set_ftrace_pid' entry\n");
 	return 0;
 }
-
 fs_initcall(ftrace_init_debugfs);
 
 /**
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index f520aa419dff..021a574c5988 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -397,6 +397,5 @@ static __init int init_function_trace(void)
 	init_func_cmd_traceon();
 	return register_tracer(&function_trace);
 }
-
 device_initcall(init_function_trace);
 
-- 
cgit v1.2.3


From b6887d7916e44c1d8913084fb6aa5004d9473f1a Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 17 Feb 2009 12:32:04 -0500
Subject: ftrace: rename _hook to _probe

Impact: clean up

Ingo Molnar did not like the _hook naming convention used by the
select function tracer. Luis Claudio R. Goncalves suggested using
the "_probe" extension. This patch implements the change of
calling the functions and variables "_hook" and replacing them
with "_probe".

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 include/linux/ftrace.h         | 12 +++----
 kernel/trace/ftrace.c          | 78 +++++++++++++++++++++---------------------
 kernel/trace/trace_functions.c | 26 +++++++-------
 3 files changed, 58 insertions(+), 58 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 63281228ce3e..9d224c43e634 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -108,7 +108,7 @@ struct ftrace_func_command {
 
 struct seq_file;
 
-struct ftrace_hook_ops {
+struct ftrace_probe_ops {
 	void			(*func)(unsigned long ip,
 					unsigned long parent_ip,
 					void **data);
@@ -116,19 +116,19 @@ struct ftrace_hook_ops {
 	void			(*free)(void **data);
 	int			(*print)(struct seq_file *m,
 					 unsigned long ip,
-					 struct ftrace_hook_ops *ops,
+					 struct ftrace_probe_ops *ops,
 					 void *data);
 };
 
 extern int
-register_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops,
+register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
 			      void *data);
 extern void
-unregister_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops,
+unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
 				void *data);
 extern void
-unregister_ftrace_function_hook_func(char *glob, struct ftrace_hook_ops *ops);
-extern void unregister_ftrace_function_hook_all(char *glob);
+unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops);
+extern void unregister_ftrace_function_probe_all(char *glob);
 
 enum {
 	FTRACE_FL_FREE		= (1 << 0),
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index af9d95c0e4de..330a059f6ed7 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -255,9 +255,9 @@ static struct pid * const ftrace_swapper_pid = &init_struct_pid;
 
 static struct hlist_head ftrace_func_hash[FTRACE_FUNC_HASHSIZE] __read_mostly;
 
-struct ftrace_func_hook {
+struct ftrace_func_probe {
 	struct hlist_node	node;
-	struct ftrace_hook_ops	*ops;
+	struct ftrace_probe_ops	*ops;
 	unsigned long		flags;
 	unsigned long		ip;
 	void			*data;
@@ -830,11 +830,11 @@ static void *t_hash_start(struct seq_file *m, loff_t *pos)
 
 static int t_hash_show(struct seq_file *m, void *v)
 {
-	struct ftrace_func_hook *rec;
+	struct ftrace_func_probe *rec;
 	struct hlist_node *hnd = v;
 	char str[KSYM_SYMBOL_LEN];
 
-	rec = hlist_entry(hnd, struct ftrace_func_hook, node);
+	rec = hlist_entry(hnd, struct ftrace_func_probe, node);
 
 	if (rec->ops->print)
 		return rec->ops->print(m, rec->ip, rec->ops, rec->data);
@@ -1351,9 +1351,9 @@ static int __init ftrace_mod_cmd_init(void)
 device_initcall(ftrace_mod_cmd_init);
 
 static void
-function_trace_hook_call(unsigned long ip, unsigned long parent_ip)
+function_trace_probe_call(unsigned long ip, unsigned long parent_ip)
 {
-	struct ftrace_func_hook *entry;
+	struct ftrace_func_probe *entry;
 	struct hlist_head *hhd;
 	struct hlist_node *n;
 	unsigned long key;
@@ -1379,18 +1379,18 @@ function_trace_hook_call(unsigned long ip, unsigned long parent_ip)
 	ftrace_preempt_enable(resched);
 }
 
-static struct ftrace_ops trace_hook_ops __read_mostly =
+static struct ftrace_ops trace_probe_ops __read_mostly =
 {
-	.func = function_trace_hook_call,
+	.func = function_trace_probe_call,
 };
 
-static int ftrace_hook_registered;
+static int ftrace_probe_registered;
 
-static void __enable_ftrace_function_hook(void)
+static void __enable_ftrace_function_probe(void)
 {
 	int i;
 
-	if (ftrace_hook_registered)
+	if (ftrace_probe_registered)
 		return;
 
 	for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
@@ -1402,16 +1402,16 @@ static void __enable_ftrace_function_hook(void)
 	if (i == FTRACE_FUNC_HASHSIZE)
 		return;
 
-	__register_ftrace_function(&trace_hook_ops);
+	__register_ftrace_function(&trace_probe_ops);
 	ftrace_startup(0);
-	ftrace_hook_registered = 1;
+	ftrace_probe_registered = 1;
 }
 
-static void __disable_ftrace_function_hook(void)
+static void __disable_ftrace_function_probe(void)
 {
 	int i;
 
-	if (!ftrace_hook_registered)
+	if (!ftrace_probe_registered)
 		return;
 
 	for (i = 0; i < FTRACE_FUNC_HASHSIZE; i++) {
@@ -1421,16 +1421,16 @@ static void __disable_ftrace_function_hook(void)
 	}
 
 	/* no more funcs left */
-	__unregister_ftrace_function(&trace_hook_ops);
+	__unregister_ftrace_function(&trace_probe_ops);
 	ftrace_shutdown(0);
-	ftrace_hook_registered = 0;
+	ftrace_probe_registered = 0;
 }
 
 
 static void ftrace_free_entry_rcu(struct rcu_head *rhp)
 {
-	struct ftrace_func_hook *entry =
-		container_of(rhp, struct ftrace_func_hook, rcu);
+	struct ftrace_func_probe *entry =
+		container_of(rhp, struct ftrace_func_probe, rcu);
 
 	if (entry->ops->free)
 		entry->ops->free(&entry->data);
@@ -1439,10 +1439,10 @@ static void ftrace_free_entry_rcu(struct rcu_head *rhp)
 
 
 int
-register_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops,
+register_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
 			      void *data)
 {
-	struct ftrace_func_hook *entry;
+	struct ftrace_func_probe *entry;
 	struct ftrace_page *pg;
 	struct dyn_ftrace *rec;
 	int type, len, not;
@@ -1453,7 +1453,7 @@ register_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops,
 	type = ftrace_setup_glob(glob, strlen(glob), &search, &not);
 	len = strlen(search);
 
-	/* we do not support '!' for function hooks */
+	/* we do not support '!' for function probes */
 	if (WARN_ON(not))
 		return -EINVAL;
 
@@ -1468,7 +1468,7 @@ register_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops,
 
 		entry = kmalloc(sizeof(*entry), GFP_KERNEL);
 		if (!entry) {
-			/* If we did not hook to any, then return error */
+			/* If we did not process any, then return error */
 			if (!count)
 				count = -ENOMEM;
 			goto out_unlock;
@@ -1498,7 +1498,7 @@ register_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops,
 		hlist_add_head_rcu(&entry->node, &ftrace_func_hash[key]);
 
 	} while_for_each_ftrace_rec();
-	__enable_ftrace_function_hook();
+	__enable_ftrace_function_probe();
 
  out_unlock:
 	mutex_unlock(&ftrace_lock);
@@ -1507,15 +1507,15 @@ register_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops,
 }
 
 enum {
-	HOOK_TEST_FUNC		= 1,
-	HOOK_TEST_DATA		= 2
+	PROBE_TEST_FUNC		= 1,
+	PROBE_TEST_DATA		= 2
 };
 
 static void
-__unregister_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops,
+__unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
 				  void *data, int flags)
 {
-	struct ftrace_func_hook *entry;
+	struct ftrace_func_probe *entry;
 	struct hlist_node *n, *tmp;
 	char str[KSYM_SYMBOL_LEN];
 	int type = MATCH_FULL;
@@ -1530,7 +1530,7 @@ __unregister_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops,
 		type = ftrace_setup_glob(glob, strlen(glob), &search, &not);
 		len = strlen(search);
 
-		/* we do not support '!' for function hooks */
+		/* we do not support '!' for function probes */
 		if (WARN_ON(not))
 			return;
 	}
@@ -1542,10 +1542,10 @@ __unregister_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops,
 		hlist_for_each_entry_safe(entry, n, tmp, hhd, node) {
 
 			/* break up if statements for readability */
-			if ((flags & HOOK_TEST_FUNC) && entry->ops != ops)
+			if ((flags & PROBE_TEST_FUNC) && entry->ops != ops)
 				continue;
 
-			if ((flags & HOOK_TEST_DATA) && entry->data != data)
+			if ((flags & PROBE_TEST_DATA) && entry->data != data)
 				continue;
 
 			/* do this last, since it is the most expensive */
@@ -1560,27 +1560,27 @@ __unregister_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops,
 			call_rcu(&entry->rcu, ftrace_free_entry_rcu);
 		}
 	}
-	__disable_ftrace_function_hook();
+	__disable_ftrace_function_probe();
 	mutex_unlock(&ftrace_lock);
 }
 
 void
-unregister_ftrace_function_hook(char *glob, struct ftrace_hook_ops *ops,
+unregister_ftrace_function_probe(char *glob, struct ftrace_probe_ops *ops,
 				void *data)
 {
-	__unregister_ftrace_function_hook(glob, ops, data,
-					  HOOK_TEST_FUNC | HOOK_TEST_DATA);
+	__unregister_ftrace_function_probe(glob, ops, data,
+					  PROBE_TEST_FUNC | PROBE_TEST_DATA);
 }
 
 void
-unregister_ftrace_function_hook_func(char *glob, struct ftrace_hook_ops *ops)
+unregister_ftrace_function_probe_func(char *glob, struct ftrace_probe_ops *ops)
 {
-	__unregister_ftrace_function_hook(glob, ops, NULL, HOOK_TEST_FUNC);
+	__unregister_ftrace_function_probe(glob, ops, NULL, PROBE_TEST_FUNC);
 }
 
-void unregister_ftrace_function_hook_all(char *glob)
+void unregister_ftrace_function_probe_all(char *glob)
 {
-	__unregister_ftrace_function_hook(glob, NULL, NULL, 0);
+	__unregister_ftrace_function_probe(glob, NULL, NULL, 0);
 }
 
 static LIST_HEAD(ftrace_commands);
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 021a574c5988..6ea73ed03bfa 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -269,21 +269,21 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip, void **data)
 
 static int
 ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
-			 struct ftrace_hook_ops *ops, void *data);
+			 struct ftrace_probe_ops *ops, void *data);
 
-static struct ftrace_hook_ops traceon_hook_ops = {
+static struct ftrace_probe_ops traceon_probe_ops = {
 	.func			= ftrace_traceon,
 	.print			= ftrace_trace_onoff_print,
 };
 
-static struct ftrace_hook_ops traceoff_hook_ops = {
+static struct ftrace_probe_ops traceoff_probe_ops = {
 	.func			= ftrace_traceoff,
 	.print			= ftrace_trace_onoff_print,
 };
 
 static int
 ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
-			 struct ftrace_hook_ops *ops, void *data)
+			 struct ftrace_probe_ops *ops, void *data)
 {
 	char str[KSYM_SYMBOL_LEN];
 	long count = (long)data;
@@ -291,7 +291,7 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
 	kallsyms_lookup(ip, NULL, NULL, NULL, str);
 	seq_printf(m, "%s:", str);
 
-	if (ops == &traceon_hook_ops)
+	if (ops == &traceon_probe_ops)
 		seq_printf(m, "traceon");
 	else
 		seq_printf(m, "traceoff");
@@ -306,15 +306,15 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
 static int
 ftrace_trace_onoff_unreg(char *glob, char *cmd, char *param)
 {
-	struct ftrace_hook_ops *ops;
+	struct ftrace_probe_ops *ops;
 
 	/* we register both traceon and traceoff to this callback */
 	if (strcmp(cmd, "traceon") == 0)
-		ops = &traceon_hook_ops;
+		ops = &traceon_probe_ops;
 	else
-		ops = &traceoff_hook_ops;
+		ops = &traceoff_probe_ops;
 
-	unregister_ftrace_function_hook_func(glob, ops);
+	unregister_ftrace_function_probe_func(glob, ops);
 
 	return 0;
 }
@@ -322,7 +322,7 @@ ftrace_trace_onoff_unreg(char *glob, char *cmd, char *param)
 static int
 ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable)
 {
-	struct ftrace_hook_ops *ops;
+	struct ftrace_probe_ops *ops;
 	void *count = (void *)-1;
 	char *number;
 	int ret;
@@ -336,9 +336,9 @@ ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable)
 
 	/* we register both traceon and traceoff to this callback */
 	if (strcmp(cmd, "traceon") == 0)
-		ops = &traceon_hook_ops;
+		ops = &traceon_probe_ops;
 	else
-		ops = &traceoff_hook_ops;
+		ops = &traceoff_probe_ops;
 
 	if (!param)
 		goto out_reg;
@@ -357,7 +357,7 @@ ftrace_trace_onoff_callback(char *glob, char *cmd, char *param, int enable)
 		return ret;
 
  out_reg:
-	ret = register_ftrace_function_hook(glob, ops, count);
+	ret = register_ftrace_function_probe(glob, ops, count);
 
 	return ret;
 }
-- 
cgit v1.2.3


From af513098452b8887d7c0e15a39d7cb74479501bd Mon Sep 17 00:00:00 2001
From: Wenji Huang <wenji.huang@oracle.com>
Date: Tue, 17 Feb 2009 01:07:28 -0500
Subject: tracing: use the more proper parameter

Pass tsk to tracing_record_cmdline instead of current.

Signed-off-by: Wenji Huang <wenji.huang@oracle.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 95f99a7abf2f..dc61e82faad9 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -336,7 +336,7 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
 	data->rt_priority = tsk->rt_priority;
 
 	/* record this tasks comm */
-	tracing_record_cmdline(current);
+	tracing_record_cmdline(tsk);
 }
 
 static void
-- 
cgit v1.2.3


From d2ef7c2f0f9ab48c25eafc0ebad0df5f7930420b Mon Sep 17 00:00:00 2001
From: Wenji Huang <wenji.huang@oracle.com>
Date: Tue, 17 Feb 2009 01:09:47 -0500
Subject: tracing: fix the return value of trace selftest

This patch is to fix the return value of trace_selftest_startup_sysprof
and trace_selftest_startup_branch on failure.

Signed-off-by: Wenji Huang <wenji.huang@oracle.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_selftest.c | 12 +++++++++++-
 1 file changed, 11 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 0c9aa1457e51..c72e749bcbef 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -622,7 +622,7 @@ trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr)
 	ret = tracer_init(trace, tr);
 	if (ret) {
 		warn_failed_init_tracer(trace, ret);
-		return 0;
+		return ret;
 	}
 
 	/* Sleep for a 1/10 of a second */
@@ -634,6 +634,11 @@ trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr)
 	trace->reset(tr);
 	tracing_start();
 
+	if (!ret && !count) {
+		printk(KERN_CONT ".. no entries found ..");
+		ret = -1;
+	}
+
 	return ret;
 }
 #endif /* CONFIG_SYSPROF_TRACER */
@@ -661,6 +666,11 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr)
 	trace->reset(tr);
 	tracing_start();
 
+	if (!ret && !count) {
+		printk(KERN_CONT ".. no entries found ..");
+		ret = -1;
+	}
+
 	return ret;
 }
 #endif /* CONFIG_BRANCH_TRACER */
-- 
cgit v1.2.3


From 73d8b8bc4f24a97a406d09c8268ac019f4ac661e Mon Sep 17 00:00:00 2001
From: Wenji Huang <wenji.huang@oracle.com>
Date: Tue, 17 Feb 2009 01:10:02 -0500
Subject: tracing: fix typing mistake in hint message and comments

Impact: cleanup

Fix incorrect hint message in code and typos in comments.

Signed-off-by: Wenji Huang <wenji.huang@oracle.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_irqsoff.c      | 2 +-
 kernel/trace/trace_sched_switch.c | 2 +-
 kernel/trace/trace_sched_wakeup.c | 2 +-
 kernel/trace/trace_selftest.c     | 4 ++--
 kernel/trace/trace_stat.c         | 2 +-
 kernel/trace/trace_sysprof.c      | 2 +-
 6 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index c6b442d88de8..9e5ebd844158 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -1,5 +1,5 @@
 /*
- * trace irqs off criticall timings
+ * trace irqs off critical timings
  *
  * Copyright (C) 2007-2008 Steven Rostedt <srostedt@redhat.com>
  * Copyright (C) 2008 Ingo Molnar <mingo@redhat.com>
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 30e14fe85896..82fbb5a2df89 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -93,7 +93,7 @@ static int tracing_sched_register(void)
 	ret = register_trace_sched_switch(probe_sched_switch);
 	if (ret) {
 		pr_info("sched trace: Couldn't activate tracepoint"
-			" probe to kernel_sched_schedule\n");
+			" probe to kernel_sched_switch\n");
 		goto fail_deprobe_wake_new;
 	}
 
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 96d716485898..276c51aaf314 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -284,7 +284,7 @@ static void start_wakeup_tracer(struct trace_array *tr)
 	ret = register_trace_sched_switch(probe_wakeup_sched_switch);
 	if (ret) {
 		pr_info("sched trace: Couldn't activate tracepoint"
-			" probe to kernel_sched_schedule\n");
+			" probe to kernel_sched_switch\n");
 		goto fail_deprobe_wake_new;
 	}
 
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index c72e749bcbef..01415f4edaa5 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -107,9 +107,9 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
 	func();
 
 	/*
-	 * Some archs *cough*PowerPC*cough* add charachters to the
+	 * Some archs *cough*PowerPC*cough* add characters to the
 	 * start of the function names. We simply put a '*' to
-	 * accomodate them.
+	 * accommodate them.
 	 */
 	func_name = "*" STR(DYN_FTRACE_TEST_NAME);
 
diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index eae9cef39291..39310e3434ee 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -30,7 +30,7 @@ struct tracer_stat_session {
 	struct dentry		*file;
 };
 
-/* All of the sessions currently in use. Each stat file embeed one session */
+/* All of the sessions currently in use. Each stat file embed one session */
 static LIST_HEAD(all_stat_sessions);
 static DEFINE_MUTEX(all_stat_sessions_mutex);
 
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index 7c9a2d82a7d8..c771af4e8f1a 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -327,5 +327,5 @@ void init_tracer_sysprof_debugfs(struct dentry *d_tracer)
 			d_tracer, NULL, &sysprof_sample_fops);
 	if (entry)
 		return;
-	pr_warning("Could not create debugfs 'dyn_ftrace_total_info' entry\n");
+	pr_warning("Could not create debugfs 'sysprof_sample_period' entry\n");
 }
-- 
cgit v1.2.3


From 35ebf1caa4854ad5ba25f3a72967acc064147994 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 17 Feb 2009 13:12:12 -0500
Subject: ftrace: show unlimited when traceon or traceoff has no counter

Impact: clean up

The traceon and traceoff function probes are confusing to developers
to what happens when a counter is not specified. This should help
clear things up.

 # echo "*:traceoff" > set_ftrace_filter
 # cat /debug/tracing/set_ftrace_filter

  #### all functions enabled ####
  do_fork:traceoff:unlimited

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_functions.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 6ea73ed03bfa..4c113a8c466f 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -296,7 +296,9 @@ ftrace_trace_onoff_print(struct seq_file *m, unsigned long ip,
 	else
 		seq_printf(m, "traceoff");
 
-	if (count != -1)
+	if (count == -1)
+		seq_printf(m, ":unlimited\n");
+	else
 		seq_printf(m, ":count=%ld", count);
 	seq_putc(m, '\n');
 
-- 
cgit v1.2.3


From 5b058bcde961bf28678a70e44c079107313543b6 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 17 Feb 2009 18:35:34 +0100
Subject: tracing/function-graph-tracer: trace the idle tasks

When the function graph tracer is activated, it iterates over the task_list
to allocate a stack to store the return addresses.

But the per cpu idle tasks are not iterated by using
do_each_thread / while_each_thread.

So we have to iterate on them manually.

This fixes somes weirdness in the traces and many losses of traces.
Examples on two cpus:

 0)   Xorg-4287    |   2.906 us    |              }
 0)   Xorg-4287    |   3.965 us    |            }
 0)   Xorg-4287    |   5.302 us    |          }
 ------------------------------------------
 0)   Xorg-4287    =>    <idle>-0
 ------------------------------------------

 0)    <idle>-0    |   2.861 us    |                        }
 0)    <idle>-0    |   0.526 us    |                        set_normalized_timespec();
 0)    <idle>-0    |   7.201 us    |                      }
 0)    <idle>-0    |   8.214 us    |                    }
 0)    <idle>-0    |               |                    clockevents_program_event() {
 0)    <idle>-0    |               |                      lapic_next_event() {
 0)    <idle>-0    |   0.510 us    |                        native_apic_mem_write();
 0)    <idle>-0    |   1.546 us    |                      }
 0)    <idle>-0    |   2.583 us    |                    }
 0)    <idle>-0    | + 12.435 us   |                  }
 0)    <idle>-0    | + 13.470 us   |                }
 0)    <idle>-0    |   0.608 us    |                _spin_unlock_irqrestore();
 0)    <idle>-0    | + 23.270 us   |              }
 0)    <idle>-0    | + 24.336 us   |            }
 0)    <idle>-0    | + 25.417 us   |          }
 0)    <idle>-0    |   0.593 us    |          _spin_unlock();
 0)    <idle>-0    | + 41.869 us   |        }
 0)    <idle>-0    | + 42.906 us   |      }
 0)    <idle>-0    | + 95.035 us   |    }
 0)    <idle>-0    |   0.540 us    |    menu_reflect();
 0)    <idle>-0    | ! 100.404 us  |  }
 0)    <idle>-0    |   0.564 us    |  mce_idle_callback();
 0)    <idle>-0    |               |  enter_idle() {
 0)    <idle>-0    |   0.526 us    |    mce_idle_callback();
 0)    <idle>-0    |   1.757 us    |  }
 0)    <idle>-0    |               |  cpuidle_idle_call() {
 0)    <idle>-0    |               |    menu_select() {
 0)    <idle>-0    |   0.525 us    |      pm_qos_requirement();
 0)    <idle>-0    |   0.518 us    |      tick_nohz_get_sleep_length();
 0)    <idle>-0    |   2.621 us    |    }
[...]
 1)    <idle>-0    |   0.518 us    |              touch_softlockup_watchdog();
 1)    <idle>-0    | + 14.355 us   |            }
 1)    <idle>-0    | + 22.840 us   |          }
 1)    <idle>-0    | + 25.949 us   |        }
 1)    <idle>-0    |               |        handle_irq() {
 1)    <idle>-0    |   0.511 us    |          irq_to_desc();
 1)    <idle>-0    |               |          handle_edge_irq() {
 1)    <idle>-0    |   0.638 us    |            _spin_lock();
 1)    <idle>-0    |               |            ack_apic_edge() {
 1)    <idle>-0    |   0.510 us    |              irq_to_desc();
 1)    <idle>-0    |               |              move_native_irq() {
 1)    <idle>-0    |   0.510 us    |                irq_to_desc();
 1)    <idle>-0    |   1.532 us    |              }
 1)    <idle>-0    |   0.511 us    |              native_apic_mem_write();
 ------------------------------------------
 1)    <idle>-0    =>    cat-5073
 ------------------------------------------

 1)    cat-5073    |   3.731 us    |                    }
 1)    cat-5073    |               |                    run_local_timers() {
 1)    cat-5073    |   0.533 us    |                      hrtimer_run_queues();
 1)    cat-5073    |               |                      raise_softirq() {
 1)    cat-5073    |               |                        __raise_softirq_irqoff() {
 1)    cat-5073    |               |                          /* nr: 1 */
 1)    cat-5073    |   2.718 us    |                        }
 1)    cat-5073    |   3.814 us    |                      }

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 9a236ffe2aa4..fdf913dfc7e8 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2033,7 +2033,7 @@ free:
 static int start_graph_tracing(void)
 {
 	struct ftrace_ret_stack **ret_stack_list;
-	int ret;
+	int ret, cpu;
 
 	ret_stack_list = kmalloc(FTRACE_RETSTACK_ALLOC_SIZE *
 				sizeof(struct ftrace_ret_stack *),
@@ -2042,6 +2042,10 @@ static int start_graph_tracing(void)
 	if (!ret_stack_list)
 		return -ENOMEM;
 
+	/* The cpu_boot init_task->ret_stack will never be freed */
+	for_each_online_cpu(cpu)
+		ftrace_graph_init_task(idle_task(cpu));
+
 	do {
 		ret = alloc_retstack_tasklist(ret_stack_list);
 	} while (ret == -EAGAIN);
-- 
cgit v1.2.3


From 8316e38100c70cd1443ac90074eccdd033aa218d Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 17 Feb 2009 20:28:29 +0100
Subject: irq: further clean up the free_irq() code flow

Linus noticed that the 'pp' variable can be eliminated
altogether, and the loop can be cleaned up further.

Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/irq/manage.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 7a954b860c07..de5a765e88ab 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -575,7 +575,7 @@ int setup_irq(unsigned int irq, struct irqaction *act)
 void free_irq(unsigned int irq, void *dev_id)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
-	struct irqaction *action, **p, **pp;
+	struct irqaction *action, **p;
 	unsigned long flags;
 
 	WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq);
@@ -592,7 +592,6 @@ void free_irq(unsigned int irq, void *dev_id)
 	p = &desc->action;
 	for (;;) {
 		action = *p;
-		pp = p;
 
 		if (!action) {
 			WARN(1, "Trying to free already-free IRQ %d\n", irq);
@@ -601,15 +600,13 @@ void free_irq(unsigned int irq, void *dev_id)
 			return;
 		}
 
+		if (action->dev_id == dev_id)
+			break;
 		p = &action->next;
-		if (action->dev_id != dev_id)
-			continue;
-
-		break;
 	}
 
 	/* Found it - now remove it from the list of entries: */
-	*pp = action->next;
+	*p = action->next;
 
 	/* Currently used only by UML, might disappear one day: */
 #ifdef CONFIG_IRQ_RELEASE_METHOD
-- 
cgit v1.2.3


From f17c75453b2d195eba0a90d9f16a3ba88c85b3b4 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Tue, 17 Feb 2009 20:43:37 +0100
Subject: irq: name 'p' variables a bit better

'p' stands for pointer - make it clear in setup_irq() and free_irq()
what kind of pointer it is.

Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/irq/manage.c | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index de5a765e88ab..c589305210d7 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -399,7 +399,7 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
 static int
 __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 {
-	struct irqaction *old, **p;
+	struct irqaction *old, **old_ptr;
 	const char *old_name = NULL;
 	unsigned long flags;
 	int shared = 0;
@@ -431,8 +431,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 	 * The following block of code has to be executed atomically
 	 */
 	spin_lock_irqsave(&desc->lock, flags);
-	p = &desc->action;
-	old = *p;
+	old_ptr = &desc->action;
+	old = *old_ptr;
 	if (old) {
 		/*
 		 * Can't share interrupts unless both agree to and are
@@ -455,8 +455,8 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 
 		/* add new interrupt at end of irq queue */
 		do {
-			p = &old->next;
-			old = *p;
+			old_ptr = &old->next;
+			old = *old_ptr;
 		} while (old);
 		shared = 1;
 	}
@@ -507,7 +507,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 				(int)(new->flags & IRQF_TRIGGER_MASK));
 	}
 
-	*p = new;
+	*old_ptr = new;
 
 	/* Reset broken irq detection when installing new handler */
 	desc->irq_count = 0;
@@ -575,7 +575,7 @@ int setup_irq(unsigned int irq, struct irqaction *act)
 void free_irq(unsigned int irq, void *dev_id)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
-	struct irqaction *action, **p;
+	struct irqaction *action, **action_ptr;
 	unsigned long flags;
 
 	WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq);
@@ -589,9 +589,9 @@ void free_irq(unsigned int irq, void *dev_id)
 	 * There can be multiple actions per IRQ descriptor, find the right
 	 * one based on the dev_id:
 	 */
-	p = &desc->action;
+	action_ptr = &desc->action;
 	for (;;) {
-		action = *p;
+		action = *action_ptr;
 
 		if (!action) {
 			WARN(1, "Trying to free already-free IRQ %d\n", irq);
@@ -602,11 +602,11 @@ void free_irq(unsigned int irq, void *dev_id)
 
 		if (action->dev_id == dev_id)
 			break;
-		p = &action->next;
+		action_ptr = &action->next;
 	}
 
 	/* Found it - now remove it from the list of entries: */
-	*p = action->next;
+	*action_ptr = action->next;
 
 	/* Currently used only by UML, might disappear one day: */
 #ifdef CONFIG_IRQ_RELEASE_METHOD
-- 
cgit v1.2.3


From 6eaaa5d57e76c454479833fc8594cd7c3b75c789 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 11 Feb 2009 02:25:00 +0100
Subject: tracing/core: use appropriate waiting on trace_pipe

Impact: api and pipe waiting change

Currently, the waiting used in tracing_read_pipe() is done through a
100 msecs schedule_timeout() loop which periodically check if there
are traces on the buffer.

This can cause small latencies for programs which are reading the incoming
events.

This patch makes the reader waiting for the trace_wait waitqueue except
for few tracers such as the sched and functions tracers which might be
already hold the runqueue lock while waking up the reader.

This is performed through a new callback wait_pipe() on struct tracer.
If none is implemented on a specific tracer, the default waiting for
trace_wait queue is attached.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c                 | 62 ++++++++++++++++++++++++------------
 kernel/trace/trace.h                 | 25 +++++++++++++--
 kernel/trace/trace_functions.c       |  1 +
 kernel/trace/trace_functions_graph.c |  1 +
 kernel/trace/trace_sched_switch.c    |  1 +
 kernel/trace/trace_sched_wakeup.c    |  1 +
 6 files changed, 67 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index dc61e82faad9..881a94474d79 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -499,6 +499,9 @@ __acquires(kernel_lock)
 	else
 		if (!type->flags->opts)
 			type->flags->opts = dummy_tracer_opt;
+	if (!type->wait_pipe)
+		type->wait_pipe = default_wait_pipe;
+
 
 #ifdef CONFIG_FTRACE_STARTUP_TEST
 	if (type->selftest && !tracing_selftest_disabled) {
@@ -1064,7 +1067,10 @@ tracing_sched_wakeup_trace(struct trace_array *tr,
 	entry->next_prio		= wakee->prio;
 	entry->next_state		= wakee->state;
 	entry->next_cpu			= task_cpu(wakee);
-	trace_buffer_unlock_commit(tr, event, flags, pc);
+
+	ring_buffer_unlock_commit(tr->buffer, event);
+	ftrace_trace_stack(tr, flags, 6, pc);
+	ftrace_trace_userstack(tr, flags, pc);
 }
 
 void
@@ -2392,6 +2398,38 @@ tracing_poll_pipe(struct file *filp, poll_table *poll_table)
 	}
 }
 
+
+void default_wait_pipe(struct trace_iterator *iter)
+{
+	DEFINE_WAIT(wait);
+
+	prepare_to_wait(&trace_wait, &wait, TASK_INTERRUPTIBLE);
+
+	if (trace_empty(iter))
+		schedule();
+
+	finish_wait(&trace_wait, &wait);
+}
+
+/*
+ * This is a make-shift waitqueue.
+ * A tracer might use this callback on some rare cases:
+ *
+ *  1) the current tracer might hold the runqueue lock when it wakes up
+ *     a reader, hence a deadlock (sched, function, and function graph tracers)
+ *  2) the function tracers, trace all functions, we don't want
+ *     the overhead of calling wake_up and friends
+ *     (and tracing them too)
+ *
+ *     Anyway, this is really very primitive wakeup.
+ */
+void poll_wait_pipe(struct trace_iterator *iter)
+{
+	set_current_state(TASK_INTERRUPTIBLE);
+	/* sleep for 100 msecs, and try again. */
+	schedule_timeout(HZ / 10);
+}
+
 /* Must be called with trace_types_lock mutex held. */
 static int tracing_wait_pipe(struct file *filp)
 {
@@ -2403,30 +2441,14 @@ static int tracing_wait_pipe(struct file *filp)
 			return -EAGAIN;
 		}
 
-		/*
-		 * This is a make-shift waitqueue. The reason we don't use
-		 * an actual wait queue is because:
-		 *  1) we only ever have one waiter
-		 *  2) the tracing, traces all functions, we don't want
-		 *     the overhead of calling wake_up and friends
-		 *     (and tracing them too)
-		 *     Anyway, this is really very primitive wakeup.
-		 */
-		set_current_state(TASK_INTERRUPTIBLE);
-		iter->tr->waiter = current;
-
 		mutex_unlock(&trace_types_lock);
 
-		/* sleep for 100 msecs, and try again. */
-		schedule_timeout(HZ/10);
+		iter->trace->wait_pipe(iter);
 
 		mutex_lock(&trace_types_lock);
 
-		iter->tr->waiter = NULL;
-
-		if (signal_pending(current)) {
+		if (signal_pending(current))
 			return -EINTR;
-		}
 
 		if (iter->trace != current_trace)
 			return 0;
@@ -2442,8 +2464,6 @@ static int tracing_wait_pipe(struct file *filp)
 		 */
 		if (!tracer_enabled && iter->pos)
 			break;
-
-		continue;
 	}
 
 	return 1;
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index dbff0207b213..eed732c151fc 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -337,18 +337,34 @@ struct tracer_flags {
 #define TRACER_OPT(s, b)	.name = #s, .bit = b
 
 
-/*
- * A specific tracer, represented by methods that operate on a trace array:
+/**
+ * struct tracer - a specific tracer and its callbacks to interact with debugfs
+ * @name: the name chosen to select it on the available_tracers file
+ * @init: called when one switches to this tracer (echo name > current_tracer)
+ * @reset: called when one switches to another tracer
+ * @start: called when tracing is unpaused (echo 1 > tracing_enabled)
+ * @stop: called when tracing is paused (echo 0 > tracing_enabled)
+ * @open: called when the trace file is opened
+ * @pipe_open: called when the trace_pipe file is opened
+ * @wait_pipe: override how the user waits for traces on trace_pipe
+ * @close: called when the trace file is released
+ * @read: override the default read callback on trace_pipe
+ * @splice_read: override the default splice_read callback on trace_pipe
+ * @selftest: selftest to run on boot (see trace_selftest.c)
+ * @print_headers: override the first lines that describe your columns
+ * @print_line: callback that prints a trace
+ * @set_flag: signals one of your private flags changed (trace_options file)
+ * @flags: your private flags
  */
 struct tracer {
 	const char		*name;
-	/* Your tracer should raise a warning if init fails */
 	int			(*init)(struct trace_array *tr);
 	void			(*reset)(struct trace_array *tr);
 	void			(*start)(struct trace_array *tr);
 	void			(*stop)(struct trace_array *tr);
 	void			(*open)(struct trace_iterator *iter);
 	void			(*pipe_open)(struct trace_iterator *iter);
+	void			(*wait_pipe)(struct trace_iterator *iter);
 	void			(*close)(struct trace_iterator *iter);
 	ssize_t			(*read)(struct trace_iterator *iter,
 					struct file *filp, char __user *ubuf,
@@ -432,6 +448,9 @@ void tracing_generic_entry_update(struct trace_entry *entry,
 				  unsigned long flags,
 				  int pc);
 
+void default_wait_pipe(struct trace_iterator *iter);
+void poll_wait_pipe(struct trace_iterator *iter);
+
 void ftrace(struct trace_array *tr,
 			    struct trace_array_cpu *data,
 			    unsigned long ip,
diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c
index 4c113a8c466f..c9a0b7df44ff 100644
--- a/kernel/trace/trace_functions.c
+++ b/kernel/trace/trace_functions.c
@@ -225,6 +225,7 @@ static struct tracer function_trace __read_mostly =
 	.init		= function_trace_init,
 	.reset		= function_trace_reset,
 	.start		= function_trace_start,
+	.wait_pipe	= poll_wait_pipe,
 	.flags		= &func_flags,
 	.set_flag	= func_set_flag,
 #ifdef CONFIG_FTRACE_SELFTEST
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 519a0cab1530..0ff5cb661900 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -757,6 +757,7 @@ static struct tracer graph_trace __read_mostly = {
 	.name	     	= "function_graph",
 	.open		= graph_trace_open,
 	.close		= graph_trace_close,
+	.wait_pipe	= poll_wait_pipe,
 	.init	     	= graph_trace_init,
 	.reset	     	= graph_trace_reset,
 	.print_line	= print_graph_function,
diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 82fbb5a2df89..77132c2cf3d9 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -221,6 +221,7 @@ static struct tracer sched_switch_trace __read_mostly =
 	.reset		= sched_switch_trace_reset,
 	.start		= sched_switch_trace_start,
 	.stop		= sched_switch_trace_stop,
+	.wait_pipe	= poll_wait_pipe,
 #ifdef CONFIG_FTRACE_SELFTEST
 	.selftest    = trace_selftest_startup_sched_switch,
 #endif
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 276c51aaf314..db55f7aaa640 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -380,6 +380,7 @@ static struct tracer wakeup_rt_tracer __read_mostly =
 	.reset		= wakeup_tracer_reset,
 	.start		= wakeup_tracer_start,
 	.stop		= wakeup_tracer_stop,
+	.wait_pipe	= poll_wait_pipe,
 	.print_max	= 1,
 #ifdef CONFIG_FTRACE_SELFTEST
 	.selftest    = trace_selftest_startup_wakeup,
-- 
cgit v1.2.3


From fa7c7f6e11f70d62505074a8b30a776236850dec Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 11 Feb 2009 02:51:30 +0100
Subject: tracing/core: remove unused parameter in tracing_fill_pipe_page()

Impact: cleanup

The struct page *pages parameter is unused.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 881a94474d79..e1f3b99a2e52 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2571,8 +2571,7 @@ static struct pipe_buf_operations tracing_pipe_buf_ops = {
 };
 
 static size_t
-tracing_fill_pipe_page(struct page *pages, size_t rem,
-			struct trace_iterator *iter)
+tracing_fill_pipe_page(size_t rem, struct trace_iterator *iter)
 {
 	size_t count;
 	int ret;
@@ -2649,7 +2648,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
 		if (!pages[i])
 			break;
 
-		rem = tracing_fill_pipe_page(pages[i], rem, iter);
+		rem = tracing_fill_pipe_page(rem, iter);
 
 		/* Copy the data into the page, so we can start over. */
 		ret = trace_seq_to_buffer(&iter->seq,
-- 
cgit v1.2.3


From 93dbb393503d53cd226e5e1f0088fe8f4dbaa2b8 Mon Sep 17 00:00:00 2001
From: Jens Axboe <jens.axboe@oracle.com>
Date: Mon, 16 Feb 2009 10:25:40 +0100
Subject: block: fix bad definition of BIO_RW_SYNC

We can't OR shift values, so get rid of BIO_RW_SYNC and use BIO_RW_SYNCIO
and BIO_RW_UNPLUG explicitly. This brings back the behaviour from before
213d9417fec62ef4c3675621b9364a667954d4dd.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
---
 block/blktrace.c             | 2 +-
 drivers/md/dm-io.c           | 2 +-
 drivers/md/dm-kcopyd.c       | 2 +-
 drivers/md/md.c              | 4 ++--
 include/linux/bio.h          | 2 --
 include/linux/blktrace_api.h | 1 +
 include/linux/fs.h           | 6 +++---
 kernel/power/swap.c          | 5 +++--
 mm/page_io.c                 | 2 +-
 9 files changed, 13 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/block/blktrace.c b/block/blktrace.c
index 39cc3bfe56e4..7cf9d1ff45a0 100644
--- a/block/blktrace.c
+++ b/block/blktrace.c
@@ -142,7 +142,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 
 	what |= ddir_act[rw & WRITE];
 	what |= MASK_TC_BIT(rw, BARRIER);
-	what |= MASK_TC_BIT(rw, SYNC);
+	what |= MASK_TC_BIT(rw, SYNCIO);
 	what |= MASK_TC_BIT(rw, AHEAD);
 	what |= MASK_TC_BIT(rw, META);
 	what |= MASK_TC_BIT(rw, DISCARD);
diff --git a/drivers/md/dm-io.c b/drivers/md/dm-io.c
index a34338567a2a..f14813be4eff 100644
--- a/drivers/md/dm-io.c
+++ b/drivers/md/dm-io.c
@@ -328,7 +328,7 @@ static void dispatch_io(int rw, unsigned int num_regions,
 	struct dpages old_pages = *dp;
 
 	if (sync)
-		rw |= (1 << BIO_RW_SYNC);
+		rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
 
 	/*
 	 * For multiple regions we need to be careful to rewind
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index 3073618269ea..0a225da21272 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -344,7 +344,7 @@ static int run_io_job(struct kcopyd_job *job)
 {
 	int r;
 	struct dm_io_request io_req = {
-		.bi_rw = job->rw | (1 << BIO_RW_SYNC),
+		.bi_rw = job->rw | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG),
 		.mem.type = DM_IO_PAGE_LIST,
 		.mem.ptr.pl = job->pages,
 		.mem.offset = job->offset,
diff --git a/drivers/md/md.c b/drivers/md/md.c
index 4495104f6c9f..03b4cd0a6344 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -474,7 +474,7 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
 	 * causes ENOTSUPP, we allocate a spare bio...
 	 */
 	struct bio *bio = bio_alloc(GFP_NOIO, 1);
-	int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNC);
+	int rw = (1<<BIO_RW) | (1<<BIO_RW_SYNCIO) | (1<<BIO_RW_UNPLUG);
 
 	bio->bi_bdev = rdev->bdev;
 	bio->bi_sector = sector;
@@ -531,7 +531,7 @@ int sync_page_io(struct block_device *bdev, sector_t sector, int size,
 	struct completion event;
 	int ret;
 
-	rw |= (1 << BIO_RW_SYNC);
+	rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
 
 	bio->bi_bdev = bdev;
 	bio->bi_sector = sector;
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 2aa283ab062b..1b16108a5417 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -171,8 +171,6 @@ struct bio {
 #define BIO_RW_FAILFAST_TRANSPORT	8
 #define BIO_RW_FAILFAST_DRIVER		9
 
-#define BIO_RW_SYNC	(BIO_RW_SYNCIO | BIO_RW_UNPLUG)
-
 #define bio_rw_flagged(bio, flag)	((bio)->bi_rw & (1 << (flag)))
 
 /*
diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h
index 25379cba2370..6e915878e88c 100644
--- a/include/linux/blktrace_api.h
+++ b/include/linux/blktrace_api.h
@@ -15,6 +15,7 @@ enum blktrace_cat {
 	BLK_TC_WRITE	= 1 << 1,	/* writes */
 	BLK_TC_BARRIER	= 1 << 2,	/* barrier */
 	BLK_TC_SYNC	= 1 << 3,	/* sync IO */
+	BLK_TC_SYNCIO	= BLK_TC_SYNC,
 	BLK_TC_QUEUE	= 1 << 4,	/* queueing/merging */
 	BLK_TC_REQUEUE	= 1 << 5,	/* requeueing */
 	BLK_TC_ISSUE	= 1 << 6,	/* issue */
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 6022f44043f2..67857dc16365 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -87,10 +87,10 @@ struct inodes_stat_t {
 #define WRITE 1
 #define READA 2		/* read-ahead  - don't block if no resources */
 #define SWRITE 3	/* for ll_rw_block() - wait for buffer lock */
-#define READ_SYNC	(READ | (1 << BIO_RW_SYNC))
+#define READ_SYNC	(READ | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG))
 #define READ_META	(READ | (1 << BIO_RW_META))
-#define WRITE_SYNC	(WRITE | (1 << BIO_RW_SYNC))
-#define SWRITE_SYNC	(SWRITE | (1 << BIO_RW_SYNC))
+#define WRITE_SYNC	(WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG))
+#define SWRITE_SYNC	(SWRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG))
 #define WRITE_BARRIER	(WRITE | (1 << BIO_RW_BARRIER))
 #define DISCARD_NOBARRIER (1 << BIO_RW_DISCARD)
 #define DISCARD_BARRIER ((1 << BIO_RW_DISCARD) | (1 << BIO_RW_BARRIER))
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 6da14358537c..505f319e489c 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -60,6 +60,7 @@ static struct block_device *resume_bdev;
 static int submit(int rw, pgoff_t page_off, struct page *page,
 			struct bio **bio_chain)
 {
+	const int bio_rw = rw | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
 	struct bio *bio;
 
 	bio = bio_alloc(__GFP_WAIT | __GFP_HIGH, 1);
@@ -80,7 +81,7 @@ static int submit(int rw, pgoff_t page_off, struct page *page,
 	bio_get(bio);
 
 	if (bio_chain == NULL) {
-		submit_bio(rw | (1 << BIO_RW_SYNC), bio);
+		submit_bio(bio_rw, bio);
 		wait_on_page_locked(page);
 		if (rw == READ)
 			bio_set_pages_dirty(bio);
@@ -90,7 +91,7 @@ static int submit(int rw, pgoff_t page_off, struct page *page,
 			get_page(page);	/* These pages are freed later */
 		bio->bi_private = *bio_chain;
 		*bio_chain = bio;
-		submit_bio(rw | (1 << BIO_RW_SYNC), bio);
+		submit_bio(bio_rw, bio);
 	}
 	return 0;
 }
diff --git a/mm/page_io.c b/mm/page_io.c
index dc6ce0afbded..3023c475e041 100644
--- a/mm/page_io.c
+++ b/mm/page_io.c
@@ -111,7 +111,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
 		goto out;
 	}
 	if (wbc->sync_mode == WB_SYNC_ALL)
-		rw |= (1 << BIO_RW_SYNC);
+		rw |= (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG);
 	count_vm_event(PSWPOUT);
 	set_page_writeback(page);
 	unlock_page(page);
-- 
cgit v1.2.3


From 74019224ac34b044b44a31dd89a54e3477db4896 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 18 Feb 2009 12:23:29 +0100
Subject: timers: add mod_timer_pending()

Impact: new timer API

Based on an idea from Martin Josefsson with the help of
Patrick McHardy and Stephen Hemminger:

introduce the mod_timer_pending() API which is a mod_timer()
offspring that is an invariant on already removed timers.

(regular mod_timer() re-activates non-pending timers.)

This is useful for the networking code in that it can
allow unserialized mod_timer_pending() timer-forwarding
calls, but a single del_timer*() will stop the timer
from being reactivated again.

Also while at it:

- optimize the regular mod_timer() path some more, the
  timer-stat and a debug check was needlessly duplicated
  in __mod_timer().

- make the exports come straight after the function, as
  most other exports in timer.c already did.

- eliminate __mod_timer() as an external API, change the
  users to mod_timer().

The regular mod_timer() code path is not impacted
significantly, due to inlining optimizations and due to
the simplifications.

Based-on-patch-from: Stephen Hemminger <shemminger@vyatta.com>
Acked-by: Stephen Hemminger <shemminger@vyatta.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Patrick McHardy <kaber@trash.net>
Cc: netdev@vger.kernel.org
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/powerpc/platforms/cell/spufs/sched.c  |   2 +-
 drivers/infiniband/hw/ipath/ipath_driver.c |   6 +-
 include/linux/timer.h                      |  22 +-----
 kernel/relay.c                             |   2 +-
 kernel/timer.c                             | 110 +++++++++++++++++++----------
 5 files changed, 80 insertions(+), 62 deletions(-)

(limited to 'kernel')

diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c
index 6a0ad196aeb3..f085369301b1 100644
--- a/arch/powerpc/platforms/cell/spufs/sched.c
+++ b/arch/powerpc/platforms/cell/spufs/sched.c
@@ -508,7 +508,7 @@ static void __spu_add_to_rq(struct spu_context *ctx)
 		list_add_tail(&ctx->rq, &spu_prio->runq[ctx->prio]);
 		set_bit(ctx->prio, spu_prio->bitmap);
 		if (!spu_prio->nr_waiting++)
-			__mod_timer(&spusched_timer, jiffies + SPUSCHED_TICK);
+			mod_timer(&spusched_timer, jiffies + SPUSCHED_TICK);
 	}
 }
 
diff --git a/drivers/infiniband/hw/ipath/ipath_driver.c b/drivers/infiniband/hw/ipath/ipath_driver.c
index 69c0ce321b4e..cb9daa6ac029 100644
--- a/drivers/infiniband/hw/ipath/ipath_driver.c
+++ b/drivers/infiniband/hw/ipath/ipath_driver.c
@@ -2715,7 +2715,7 @@ static void ipath_hol_signal_up(struct ipath_devdata *dd)
  * to prevent HoL blocking, then start the HoL timer that
  * periodically continues, then stop procs, so they can detect
  * link down if they want, and do something about it.
- * Timer may already be running, so use __mod_timer, not add_timer.
+ * Timer may already be running, so use mod_timer, not add_timer.
  */
 void ipath_hol_down(struct ipath_devdata *dd)
 {
@@ -2724,7 +2724,7 @@ void ipath_hol_down(struct ipath_devdata *dd)
 	dd->ipath_hol_next = IPATH_HOL_DOWNCONT;
 	dd->ipath_hol_timer.expires = jiffies +
 		msecs_to_jiffies(ipath_hol_timeout_ms);
-	__mod_timer(&dd->ipath_hol_timer, dd->ipath_hol_timer.expires);
+	mod_timer(&dd->ipath_hol_timer, dd->ipath_hol_timer.expires);
 }
 
 /*
@@ -2763,7 +2763,7 @@ void ipath_hol_event(unsigned long opaque)
 	else {
 		dd->ipath_hol_timer.expires = jiffies +
 			msecs_to_jiffies(ipath_hol_timeout_ms);
-		__mod_timer(&dd->ipath_hol_timer,
+		mod_timer(&dd->ipath_hol_timer,
 			dd->ipath_hol_timer.expires);
 	}
 }
diff --git a/include/linux/timer.h b/include/linux/timer.h
index daf9685b861c..e2d662e3416e 100644
--- a/include/linux/timer.h
+++ b/include/linux/timer.h
@@ -86,8 +86,8 @@ static inline int timer_pending(const struct timer_list * timer)
 
 extern void add_timer_on(struct timer_list *timer, int cpu);
 extern int del_timer(struct timer_list * timer);
-extern int __mod_timer(struct timer_list *timer, unsigned long expires);
 extern int mod_timer(struct timer_list *timer, unsigned long expires);
+extern int mod_timer_pending(struct timer_list *timer, unsigned long expires);
 
 /*
  * The jiffies value which is added to now, when there is no timer
@@ -146,25 +146,7 @@ static inline void timer_stats_timer_clear_start_info(struct timer_list *timer)
 }
 #endif
 
-/**
- * add_timer - start a timer
- * @timer: the timer to be added
- *
- * The kernel will do a ->function(->data) callback from the
- * timer interrupt at the ->expires point in the future. The
- * current time is 'jiffies'.
- *
- * The timer's ->expires, ->function (and if the handler uses it, ->data)
- * fields must be set prior calling this function.
- *
- * Timers with an ->expires field in the past will be executed in the next
- * timer tick.
- */
-static inline void add_timer(struct timer_list *timer)
-{
-	BUG_ON(timer_pending(timer));
-	__mod_timer(timer, timer->expires);
-}
+extern void add_timer(struct timer_list *timer);
 
 #ifdef CONFIG_SMP
   extern int try_to_del_timer_sync(struct timer_list *timer);
diff --git a/kernel/relay.c b/kernel/relay.c
index 9d79b7854fa6..8f2179c8056f 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -750,7 +750,7 @@ size_t relay_switch_subbuf(struct rchan_buf *buf, size_t length)
 			 * from the scheduler (trying to re-grab
 			 * rq->lock), so defer it.
 			 */
-			__mod_timer(&buf->timer, jiffies + 1);
+			mod_timer(&buf->timer, jiffies + 1);
 	}
 
 	old = buf->data;
diff --git a/kernel/timer.c b/kernel/timer.c
index 13dd64fe143d..9b77fc9a9ac8 100644
--- a/kernel/timer.c
+++ b/kernel/timer.c
@@ -589,11 +589,14 @@ static struct tvec_base *lock_timer_base(struct timer_list *timer,
 	}
 }
 
-int __mod_timer(struct timer_list *timer, unsigned long expires)
+static inline int
+__mod_timer(struct timer_list *timer, unsigned long expires, bool pending_only)
 {
 	struct tvec_base *base, *new_base;
 	unsigned long flags;
-	int ret = 0;
+	int ret;
+
+	ret = 0;
 
 	timer_stats_timer_set_start_info(timer);
 	BUG_ON(!timer->function);
@@ -603,6 +606,9 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
 	if (timer_pending(timer)) {
 		detach_timer(timer, 0);
 		ret = 1;
+	} else {
+		if (pending_only)
+			goto out_unlock;
 	}
 
 	debug_timer_activate(timer);
@@ -629,42 +635,28 @@ int __mod_timer(struct timer_list *timer, unsigned long expires)
 
 	timer->expires = expires;
 	internal_add_timer(base, timer);
+
+out_unlock:
 	spin_unlock_irqrestore(&base->lock, flags);
 
 	return ret;
 }
 
-EXPORT_SYMBOL(__mod_timer);
-
 /**
- * add_timer_on - start a timer on a particular CPU
- * @timer: the timer to be added
- * @cpu: the CPU to start it on
+ * mod_timer_pending - modify a pending timer's timeout
+ * @timer: the pending timer to be modified
+ * @expires: new timeout in jiffies
  *
- * This is not very scalable on SMP. Double adds are not possible.
+ * mod_timer_pending() is the same for pending timers as mod_timer(),
+ * but will not re-activate and modify already deleted timers.
+ *
+ * It is useful for unserialized use of timers.
  */
-void add_timer_on(struct timer_list *timer, int cpu)
+int mod_timer_pending(struct timer_list *timer, unsigned long expires)
 {
-	struct tvec_base *base = per_cpu(tvec_bases, cpu);
-	unsigned long flags;
-
-	timer_stats_timer_set_start_info(timer);
-	BUG_ON(timer_pending(timer) || !timer->function);
-	spin_lock_irqsave(&base->lock, flags);
-	timer_set_base(timer, base);
-	debug_timer_activate(timer);
-	internal_add_timer(base, timer);
-	/*
-	 * Check whether the other CPU is idle and needs to be
-	 * triggered to reevaluate the timer wheel when nohz is
-	 * active. We are protected against the other CPU fiddling
-	 * with the timer by holding the timer base lock. This also
-	 * makes sure that a CPU on the way to idle can not evaluate
-	 * the timer wheel.
-	 */
-	wake_up_idle_cpu(cpu);
-	spin_unlock_irqrestore(&base->lock, flags);
+	return __mod_timer(timer, expires, true);
 }
+EXPORT_SYMBOL(mod_timer_pending);
 
 /**
  * mod_timer - modify a timer's timeout
@@ -688,9 +680,6 @@ void add_timer_on(struct timer_list *timer, int cpu)
  */
 int mod_timer(struct timer_list *timer, unsigned long expires)
 {
-	BUG_ON(!timer->function);
-
-	timer_stats_timer_set_start_info(timer);
 	/*
 	 * This is a common optimization triggered by the
 	 * networking code - if the timer is re-modified
@@ -699,11 +688,61 @@ int mod_timer(struct timer_list *timer, unsigned long expires)
 	if (timer->expires == expires && timer_pending(timer))
 		return 1;
 
-	return __mod_timer(timer, expires);
+	return __mod_timer(timer, expires, false);
 }
-
 EXPORT_SYMBOL(mod_timer);
 
+/**
+ * add_timer - start a timer
+ * @timer: the timer to be added
+ *
+ * The kernel will do a ->function(->data) callback from the
+ * timer interrupt at the ->expires point in the future. The
+ * current time is 'jiffies'.
+ *
+ * The timer's ->expires, ->function (and if the handler uses it, ->data)
+ * fields must be set prior calling this function.
+ *
+ * Timers with an ->expires field in the past will be executed in the next
+ * timer tick.
+ */
+void add_timer(struct timer_list *timer)
+{
+	BUG_ON(timer_pending(timer));
+	mod_timer(timer, timer->expires);
+}
+EXPORT_SYMBOL(add_timer);
+
+/**
+ * add_timer_on - start a timer on a particular CPU
+ * @timer: the timer to be added
+ * @cpu: the CPU to start it on
+ *
+ * This is not very scalable on SMP. Double adds are not possible.
+ */
+void add_timer_on(struct timer_list *timer, int cpu)
+{
+	struct tvec_base *base = per_cpu(tvec_bases, cpu);
+	unsigned long flags;
+
+	timer_stats_timer_set_start_info(timer);
+	BUG_ON(timer_pending(timer) || !timer->function);
+	spin_lock_irqsave(&base->lock, flags);
+	timer_set_base(timer, base);
+	debug_timer_activate(timer);
+	internal_add_timer(base, timer);
+	/*
+	 * Check whether the other CPU is idle and needs to be
+	 * triggered to reevaluate the timer wheel when nohz is
+	 * active. We are protected against the other CPU fiddling
+	 * with the timer by holding the timer base lock. This also
+	 * makes sure that a CPU on the way to idle can not evaluate
+	 * the timer wheel.
+	 */
+	wake_up_idle_cpu(cpu);
+	spin_unlock_irqrestore(&base->lock, flags);
+}
+
 /**
  * del_timer - deactive a timer.
  * @timer: the timer to be deactivated
@@ -733,7 +772,6 @@ int del_timer(struct timer_list *timer)
 
 	return ret;
 }
-
 EXPORT_SYMBOL(del_timer);
 
 #ifdef CONFIG_SMP
@@ -767,7 +805,6 @@ out:
 
 	return ret;
 }
-
 EXPORT_SYMBOL(try_to_del_timer_sync);
 
 /**
@@ -796,7 +833,6 @@ int del_timer_sync(struct timer_list *timer)
 		cpu_relax();
 	}
 }
-
 EXPORT_SYMBOL(del_timer_sync);
 #endif
 
@@ -1268,7 +1304,7 @@ signed long __sched schedule_timeout(signed long timeout)
 	expire = timeout + jiffies;
 
 	setup_timer_on_stack(&timer, process_timeout, (unsigned long)current);
-	__mod_timer(&timer, expire);
+	__mod_timer(&timer, expire, false);
 	schedule();
 	del_singleshot_timer_sync(&timer);
 
-- 
cgit v1.2.3


From 712406a6bf59ebf4a00358bb59a4a2a1b2953d90 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 9 Feb 2009 10:54:03 -0800
Subject: tracing/function-graph-tracer: make arch generic push pop functions

There is nothing really arch specific of the push and pop functions
used by the function graph tracer. This patch moves them to generic
code.

Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 arch/x86/include/asm/ftrace.h        | 25 ------------
 arch/x86/kernel/dumpstack.c          |  1 +
 arch/x86/kernel/ftrace.c             | 75 +-----------------------------------
 include/linux/ftrace.h               | 24 ++++++++++++
 kernel/trace/trace_functions_graph.c | 75 ++++++++++++++++++++++++++++++++++++
 5 files changed, 101 insertions(+), 99 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/include/asm/ftrace.h b/arch/x86/include/asm/ftrace.h
index b55b4a7fbefd..db24c2278be0 100644
--- a/arch/x86/include/asm/ftrace.h
+++ b/arch/x86/include/asm/ftrace.h
@@ -55,29 +55,4 @@ struct dyn_arch_ftrace {
 #endif /* __ASSEMBLY__ */
 #endif /* CONFIG_FUNCTION_TRACER */
 
-#ifdef CONFIG_FUNCTION_GRAPH_TRACER
-
-#ifndef __ASSEMBLY__
-
-/*
- * Stack of return addresses for functions
- * of a thread.
- * Used in struct thread_info
- */
-struct ftrace_ret_stack {
-	unsigned long ret;
-	unsigned long func;
-	unsigned long long calltime;
-};
-
-/*
- * Primary handler of a function return.
- * It relays on ftrace_return_to_handler.
- * Defined in entry_32/64.S
- */
-extern void return_to_handler(void);
-
-#endif /* __ASSEMBLY__ */
-#endif /* CONFIG_FUNCTION_GRAPH_TRACER */
-
 #endif /* _ASM_X86_FTRACE_H */
diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c
index 6b1f6f6f8661..c0852291b623 100644
--- a/arch/x86/kernel/dumpstack.c
+++ b/arch/x86/kernel/dumpstack.c
@@ -10,6 +10,7 @@
 #include <linux/kdebug.h>
 #include <linux/module.h>
 #include <linux/ptrace.h>
+#include <linux/ftrace.h>
 #include <linux/kexec.h>
 #include <linux/bug.h>
 #include <linux/nmi.h>
diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 231bdd3c5b1c..76f7141e0f91 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -389,79 +389,6 @@ void ftrace_nmi_exit(void)
 
 #endif /* !CONFIG_DYNAMIC_FTRACE */
 
-/* Add a function return address to the trace stack on thread info.*/
-static int push_return_trace(unsigned long ret, unsigned long long time,
-				unsigned long func, int *depth)
-{
-	int index;
-
-	if (!current->ret_stack)
-		return -EBUSY;
-
-	/* The return trace stack is full */
-	if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) {
-		atomic_inc(&current->trace_overrun);
-		return -EBUSY;
-	}
-
-	index = ++current->curr_ret_stack;
-	barrier();
-	current->ret_stack[index].ret = ret;
-	current->ret_stack[index].func = func;
-	current->ret_stack[index].calltime = time;
-	*depth = index;
-
-	return 0;
-}
-
-/* Retrieve a function return address to the trace stack on thread info.*/
-static void pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)
-{
-	int index;
-
-	index = current->curr_ret_stack;
-
-	if (unlikely(index < 0)) {
-		ftrace_graph_stop();
-		WARN_ON(1);
-		/* Might as well panic, otherwise we have no where to go */
-		*ret = (unsigned long)panic;
-		return;
-	}
-
-	*ret = current->ret_stack[index].ret;
-	trace->func = current->ret_stack[index].func;
-	trace->calltime = current->ret_stack[index].calltime;
-	trace->overrun = atomic_read(&current->trace_overrun);
-	trace->depth = index;
-	barrier();
-	current->curr_ret_stack--;
-
-}
-
-/*
- * Send the trace to the ring-buffer.
- * @return the original return address.
- */
-unsigned long ftrace_return_to_handler(void)
-{
-	struct ftrace_graph_ret trace;
-	unsigned long ret;
-
-	pop_return_trace(&trace, &ret);
-	trace.rettime = cpu_clock(raw_smp_processor_id());
-	ftrace_graph_return(&trace);
-
-	if (unlikely(!ret)) {
-		ftrace_graph_stop();
-		WARN_ON(1);
-		/* Might as well panic. What else to do? */
-		ret = (unsigned long)panic;
-	}
-
-	return ret;
-}
-
 /*
  * Hook the return address and push it in the stack of return addrs
  * in current thread info.
@@ -521,7 +448,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
 
 	calltime = cpu_clock(raw_smp_processor_id());
 
-	if (push_return_trace(old, calltime,
+	if (ftrace_push_return_trace(old, calltime,
 				self_addr, &trace.depth) == -EBUSY) {
 		*parent = old;
 		return;
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 677432b9cb7e..a7f8134c594e 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -379,6 +379,30 @@ struct ftrace_graph_ret {
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
 
+/*
+ * Stack of return addresses for functions
+ * of a thread.
+ * Used in struct thread_info
+ */
+struct ftrace_ret_stack {
+	unsigned long ret;
+	unsigned long func;
+	unsigned long long calltime;
+};
+
+/*
+ * Primary handler of a function return.
+ * It relays on ftrace_return_to_handler.
+ * Defined in entry_32/64.S
+ */
+extern void return_to_handler(void);
+
+extern int
+ftrace_push_return_trace(unsigned long ret, unsigned long long time,
+			 unsigned long func, int *depth);
+extern void
+ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret);
+
 /*
  * Sometimes we don't want to trace a function with the function
  * graph tracer but we want them to keep traced by the usual function
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 930c08e5b38e..dce71a5b51bc 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -42,6 +42,81 @@ static struct tracer_flags tracer_flags = {
 /* pid on the last trace processed */
 static pid_t last_pid[NR_CPUS] = { [0 ... NR_CPUS-1] = -1 };
 
+/* Add a function return address to the trace stack on thread info.*/
+int
+ftrace_push_return_trace(unsigned long ret, unsigned long long time,
+			 unsigned long func, int *depth)
+{
+	int index;
+
+	if (!current->ret_stack)
+		return -EBUSY;
+
+	/* The return trace stack is full */
+	if (current->curr_ret_stack == FTRACE_RETFUNC_DEPTH - 1) {
+		atomic_inc(&current->trace_overrun);
+		return -EBUSY;
+	}
+
+	index = ++current->curr_ret_stack;
+	barrier();
+	current->ret_stack[index].ret = ret;
+	current->ret_stack[index].func = func;
+	current->ret_stack[index].calltime = time;
+	*depth = index;
+
+	return 0;
+}
+
+/* Retrieve a function return address to the trace stack on thread info.*/
+void
+ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret)
+{
+	int index;
+
+	index = current->curr_ret_stack;
+
+	if (unlikely(index < 0)) {
+		ftrace_graph_stop();
+		WARN_ON(1);
+		/* Might as well panic, otherwise we have no where to go */
+		*ret = (unsigned long)panic;
+		return;
+	}
+
+	*ret = current->ret_stack[index].ret;
+	trace->func = current->ret_stack[index].func;
+	trace->calltime = current->ret_stack[index].calltime;
+	trace->overrun = atomic_read(&current->trace_overrun);
+	trace->depth = index;
+	barrier();
+	current->curr_ret_stack--;
+
+}
+
+/*
+ * Send the trace to the ring-buffer.
+ * @return the original return address.
+ */
+unsigned long ftrace_return_to_handler(void)
+{
+	struct ftrace_graph_ret trace;
+	unsigned long ret;
+
+	ftrace_pop_return_trace(&trace, &ret);
+	trace.rettime = cpu_clock(raw_smp_processor_id());
+	ftrace_graph_return(&trace);
+
+	if (unlikely(!ret)) {
+		ftrace_graph_stop();
+		WARN_ON(1);
+		/* Might as well panic. What else to do? */
+		ret = (unsigned long)panic;
+	}
+
+	return ret;
+}
+
 static int graph_trace_init(struct trace_array *tr)
 {
 	int cpu, ret;
-- 
cgit v1.2.3


From 67e055d144c5b2acdc1c63811fde031263bf92c5 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 18 Feb 2009 14:48:20 -0800
Subject: cgroups: fix possible use after free

In cgroup_kill_sb(), root is freed before sb is detached from the list, so
another sget() may find this sb and call cgroup_test_super(), which will
access the root that has been freed.

Reported-by: Al Viro <viro@ZenIV.linux.org.uk>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index e14db9c089b9..9edb5c4b79b4 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1122,8 +1122,8 @@ static void cgroup_kill_sb(struct super_block *sb) {
 
 	mutex_unlock(&cgroup_mutex);
 
-	kfree(root);
 	kill_litter_super(sb);
+	kfree(root);
 }
 
 static struct file_system_type cgroup_fs_type = {
-- 
cgit v1.2.3


From 42f5e039c3f6512271636928ddc4e7f7a0371672 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Wed, 18 Feb 2009 14:48:21 -0800
Subject: pm: fix build for CONFIG_PM unset

Compilation of kprobes.c with CONFIG_PM unset is broken due to some broken
config dependncies.  Fix that.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Reported-by: Ingo Molnar <mingo@elte.hu>
Tested-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Len Brown <lenb@kernel.org>
Acked-by: Pavel Machek <pavel@suse.cz>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/Makefile       | 1 +
 kernel/power/Makefile | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index 170a9213c1b6..e4791b3ba55d 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -51,6 +51,7 @@ obj-$(CONFIG_UID16) += uid16.o
 obj-$(CONFIG_MODULES) += module.o
 obj-$(CONFIG_KALLSYMS) += kallsyms.o
 obj-$(CONFIG_PM) += power/
+obj-$(CONFIG_FREEZER) += power/
 obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
 obj-$(CONFIG_KEXEC) += kexec.o
 obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index d7a10167a25b..720ea4f781bd 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -3,7 +3,7 @@ ifeq ($(CONFIG_PM_DEBUG),y)
 EXTRA_CFLAGS	+=	-DDEBUG
 endif
 
-obj-y				:= main.o
+obj-$(CONFIG_PM)		+= main.o
 obj-$(CONFIG_PM_SLEEP)		+= console.o
 obj-$(CONFIG_FREEZER)		+= process.o
 obj-$(CONFIG_HIBERNATION)	+= swsusp.o disk.o snapshot.o swap.o user.o
-- 
cgit v1.2.3


From 0c5119c1e655e0719a69601b1049acdd5ec1c125 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 18 Feb 2009 18:33:57 -0500
Subject: tracing: disable tracing while testing ring buffer

Impact: fix to prevent hard lockup on self tests

If one of the tracers are broken and is constantly filling the ring
buffer while the test of the ring buffer is running, it will hang
the box. The reason is that the test is a consumer that will not
stop till the ring buffer is empty. But if the tracer is broken and
is constantly producing input to the buffer, this test will never
end. The result is a lockup of the box.

This happened when KALLSYMS was not defined and the dynamic ftrace
test constantly filled the ring buffer, because the filter failed
and all functions were being traced. Something was being called
that constantly filled the buffer.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_selftest.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 88c8eb70f54a..a7e0ef662f9f 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -57,11 +57,20 @@ static int trace_test_buffer(struct trace_array *tr, unsigned long *count)
 
 	cnt = ring_buffer_entries(tr->buffer);
 
+	/*
+	 * The trace_test_buffer_cpu runs a while loop to consume all data.
+	 * If the calling tracer is broken, and is constantly filling
+	 * the buffer, this will run forever, and hard lock the box.
+	 * We disable the ring buffer while we do this test to prevent
+	 * a hard lock up.
+	 */
+	tracing_off();
 	for_each_possible_cpu(cpu) {
 		ret = trace_test_buffer_cpu(tr, cpu);
 		if (ret)
 			break;
 	}
+	tracing_on();
 	__raw_spin_unlock(&ftrace_max_lock);
 	local_irq_restore(flags);
 
-- 
cgit v1.2.3


From 4d7a077c0c7bfdba04cf0aa0b79053cf4ebaacf8 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 18 Feb 2009 22:06:18 -0500
Subject: tracing: have function trace select kallsyms

Impact: fix output of function tracer to be useful

The function tracer is pretty useless if KALLSYMS is not configured.
Unless you are good at reading hex values, the function tracer should
select the KALLSYMS configuration.

Also, the dynamic function tracer will fail its self test if KALLSYMS
is not selected.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/Kconfig | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 58a93fbd68aa..34e707e5ab87 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -52,6 +52,7 @@ config FUNCTION_TRACER
 	depends on HAVE_FUNCTION_TRACER
 	depends on DEBUG_KERNEL
 	select FRAME_POINTER
+	select KALLSYMS
 	select TRACING
 	select CONTEXT_SWITCH_TRACER
 	help
@@ -238,6 +239,7 @@ config STACK_TRACER
 	depends on DEBUG_KERNEL
 	select FUNCTION_TRACER
 	select STACKTRACE
+	select KALLSYMS
 	help
 	  This special tracer records the maximum stack footprint of the
 	  kernel and displays it in debugfs/tracing/stack_trace.
-- 
cgit v1.2.3


From 4b3e3d228429c75d398f1aa24532e468d3220c49 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 18 Feb 2009 22:50:01 -0500
Subject: tracing: limit the number of loops the ring buffer self test can make

Impact: prevent deadlock if ring buffer gets corrupted

This patch adds a paranoid check to make sure the ring buffer consumer
does not go into an infinite loop. Since the ring buffer has been set
to read only, the consumer should not loop for more than the ring buffer
size. A check is added to make sure the consumer does not loop more than
the ring buffer size.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_selftest.c | 10 ++++++++++
 1 file changed, 10 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index a7e0ef662f9f..bc8e80a86bca 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -23,10 +23,20 @@ static int trace_test_buffer_cpu(struct trace_array *tr, int cpu)
 {
 	struct ring_buffer_event *event;
 	struct trace_entry *entry;
+	unsigned int loops = 0;
 
 	while ((event = ring_buffer_consume(tr->buffer, cpu, NULL))) {
 		entry = ring_buffer_event_data(event);
 
+		/*
+		 * The ring buffer is a size of trace_buf_size, if
+		 * we loop more than the size, there's something wrong
+		 * with the ring buffer.
+		 */
+		if (loops++ > trace_buf_size) {
+			printk(KERN_CONT ".. bad ring buffer ");
+			goto failed;
+		}
 		if (!trace_valid_entry(entry)) {
 			printk(KERN_CONT ".. invalid entry %d ",
 				entry->type);
-- 
cgit v1.2.3


From fdcedf7b75808dd72c3cc0b931be11b04d75c60a Mon Sep 17 00:00:00 2001
From: john stultz <johnstul@us.ibm.com>
Date: Wed, 18 Feb 2009 16:02:22 -0800
Subject: time: apply NTP frequency/tick changes immediately

Since the GENERIC_TIME changes landed, the adjtimex behavior changed
for struct timex.tick and .freq changed. When the tick or freq value
is set, we adjust the tick_length_base in ntp_update_frequency().
However, this new value doesn't get applied to tick_length until the
next second (via second_overflow).

This means some applications that do quick time tweaking do not see the
requested change made as quickly as expected.

I've run a few tests with this change, and ntpd still functions fine.

Signed-off-by: John Stultz <johnstul@us.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/time/ntp.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index f5f793d92415..e1fa3689a903 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -51,6 +51,7 @@ static long ntp_tick_adj;
 
 static void ntp_update_frequency(void)
 {
+	u64 old_tick_length_base = tick_length_base;
 	u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ)
 				<< NTP_SCALE_SHIFT;
 	second_length += (s64)ntp_tick_adj << NTP_SCALE_SHIFT;
@@ -60,6 +61,12 @@ static void ntp_update_frequency(void)
 
 	tick_nsec = div_u64(second_length, HZ) >> NTP_SCALE_SHIFT;
 	tick_length_base = div_u64(tick_length_base, NTP_INTERVAL_FREQ);
+
+	/*
+	 * Don't wait for the next second_overflow, apply
+	 * the change to the tick length immediately
+	 */
+	tick_length += tick_length_base - old_tick_length_base;
 }
 
 static void ntp_update_offset(long offset)
-- 
cgit v1.2.3


From d1f9cbd78841f1a797c77e9117e4882f932c2ef6 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 18 Feb 2009 04:25:25 +0100
Subject: tracing/function-graph-tracer: fix traces weirdness while absolute
 time printing

Impact: trace output cleanup/reordering

When an interrupt occurs and and the abstime option is selected:

  echo funcgraph-abstime > /debug/tracing/trace_options

then we observe broken traces:

30581.025422 |   0)   Xorg-4291    |   0.503 us    |      idle_cpu();
30581.025424 |   0)   Xorg-4291    |   2.576 us    |    }
30581.025424 |   0)   Xorg-4291    | + 75.771 us   |  }
 0)   Xorg-4291    |   <========== |
30581.025425 |   0)   Xorg-4291    |               |  schedule() {
30581.025426 |   0)   Xorg-4291    |               |    __schedule() {
30581.025426 |   0)   Xorg-4291    |   0.705 us    |      _spin_lock_irq();

With this patch, the interrupts output better adapts
to absolute time printing:

  414.856543 |   1)   Xorg-4279    |   8.816 us    |                        }
  414.856544 |   1)   Xorg-4279    |   0.525 us    |                        rcu_irq_exit();
  414.856545 |   1)   Xorg-4279    |   0.526 us    |                        idle_cpu();
  414.856546 |   1)   Xorg-4279    | + 12.157 us   |                      }
  414.856549 |   1)   Xorg-4279    | ! 104.114 us  |                    }
  414.856549 |   1)   Xorg-4279    |   <========== |
  414.856549 |   1)   Xorg-4279    | ! 107.944 us  |                  }
  414.856550 |   1)   Xorg-4279    | ! 137.010 us  |                }
  414.856551 |   1)   Xorg-4279    |   0.624 us    |                _read_unlock();
  414.856552 |   1)   Xorg-4279    | ! 140.930 us  |              }
  414.856552 |   1)   Xorg-4279    | ! 166.159 us  |            }

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_functions_graph.c | 50 +++++++++++++++++++++---------------
 1 file changed, 29 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 6c7738e4f98b..8f4004a00b4e 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -351,16 +351,35 @@ print_graph_overhead(unsigned long long duration, struct trace_seq *s)
 	return trace_seq_printf(s, "  ");
 }
 
+static int print_graph_abs_time(u64 t, struct trace_seq *s)
+{
+	unsigned long usecs_rem;
+
+	usecs_rem = do_div(t, NSEC_PER_SEC);
+	usecs_rem /= 1000;
+
+	return trace_seq_printf(s, "%5lu.%06lu |  ",
+			(unsigned long)t, usecs_rem);
+}
+
 static enum print_line_t
-print_graph_irq(struct trace_seq *s, unsigned long addr,
+print_graph_irq(struct trace_iterator *iter, unsigned long addr,
 		enum trace_type type, int cpu, pid_t pid)
 {
 	int ret;
+	struct trace_seq *s = &iter->seq;
 
 	if (addr < (unsigned long)__irqentry_text_start ||
 		addr >= (unsigned long)__irqentry_text_end)
 		return TRACE_TYPE_UNHANDLED;
 
+	/* Absolute time */
+	if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) {
+		ret = print_graph_abs_time(iter->ts, s);
+		if (!ret)
+			return TRACE_TYPE_PARTIAL_LINE;
+	}
+
 	/* Cpu */
 	if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
 		ret = print_graph_cpu(s, cpu);
@@ -446,17 +465,6 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s)
 
 }
 
-static int print_graph_abs_time(u64 t, struct trace_seq *s)
-{
-	unsigned long usecs_rem;
-
-	usecs_rem = do_div(t, 1000000000);
-	usecs_rem /= 1000;
-
-	return trace_seq_printf(s, "%5lu.%06lu |  ",
-			(unsigned long)t, usecs_rem);
-}
-
 /* Case of a leaf function on its call entry */
 static enum print_line_t
 print_graph_entry_leaf(struct trace_iterator *iter,
@@ -561,7 +569,7 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
 		return TRACE_TYPE_PARTIAL_LINE;
 
 	/* Interrupt */
-	ret = print_graph_irq(s, call->func, TRACE_GRAPH_ENT, cpu, ent->pid);
+	ret = print_graph_irq(iter, call->func, TRACE_GRAPH_ENT, cpu, ent->pid);
 	if (ret == TRACE_TYPE_PARTIAL_LINE)
 		return TRACE_TYPE_PARTIAL_LINE;
 
@@ -581,7 +589,7 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
 
 	/* Proc */
 	if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
-		ret = print_graph_proc(s, ent->pid);
+		ret = print_graph_proc(s, pid);
 		if (ret == TRACE_TYPE_PARTIAL_LINE)
 			return TRACE_TYPE_PARTIAL_LINE;
 
@@ -605,11 +613,11 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
 	int i;
 	int ret;
 	int cpu = iter->cpu;
-	pid_t *last_pid = iter->private;
+	pid_t *last_pid = iter->private, pid = ent->pid;
 	unsigned long long duration = trace->rettime - trace->calltime;
 
 	/* Pid */
-	if (verif_pid(s, ent->pid, cpu, last_pid) == TRACE_TYPE_PARTIAL_LINE)
+	if (verif_pid(s, pid, cpu, last_pid) == TRACE_TYPE_PARTIAL_LINE)
 		return TRACE_TYPE_PARTIAL_LINE;
 
 	/* Absolute time */
@@ -668,7 +676,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
 			return TRACE_TYPE_PARTIAL_LINE;
 	}
 
-	ret = print_graph_irq(s, trace->func, TRACE_GRAPH_RET, cpu, ent->pid);
+	ret = print_graph_irq(iter, trace->func, TRACE_GRAPH_RET, cpu, pid);
 	if (ret == TRACE_TYPE_PARTIAL_LINE)
 		return TRACE_TYPE_PARTIAL_LINE;
 
@@ -684,6 +692,10 @@ print_graph_comment(struct print_entry *trace, struct trace_seq *s,
 	int cpu = iter->cpu;
 	pid_t *last_pid = iter->private;
 
+	/* Pid */
+	if (verif_pid(s, ent->pid, cpu, last_pid) == TRACE_TYPE_PARTIAL_LINE)
+		return TRACE_TYPE_PARTIAL_LINE;
+
 	/* Absolute time */
 	if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) {
 		ret = print_graph_abs_time(iter->ts, s);
@@ -691,10 +703,6 @@ print_graph_comment(struct print_entry *trace, struct trace_seq *s,
 			return TRACE_TYPE_PARTIAL_LINE;
 	}
 
-	/* Pid */
-	if (verif_pid(s, ent->pid, cpu, last_pid) == TRACE_TYPE_PARTIAL_LINE)
-		return TRACE_TYPE_PARTIAL_LINE;
-
 	/* Cpu */
 	if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
 		ret = print_graph_cpu(s, cpu);
-- 
cgit v1.2.3


From 00a8bf859331e349713274825e6fbf20bf2ac15a Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 19 Feb 2009 13:01:37 +0100
Subject: tracing/function-graph-tracer: fix merge

Merge artifact: pid got changed to ent->pid meanwhile.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_functions_graph.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 8f4004a00b4e..c009553a8e81 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -589,7 +589,7 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
 
 	/* Proc */
 	if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
-		ret = print_graph_proc(s, pid);
+		ret = print_graph_proc(s, ent->pid);
 		if (ret == TRACE_TYPE_PARTIAL_LINE)
 			return TRACE_TYPE_PARTIAL_LINE;
 
-- 
cgit v1.2.3


From 6b588c18f8dacfa6d7957c33c5ff832096e752d3 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 20 Feb 2009 16:29:07 +0900
Subject: module: reorder module pcpu related functions

Impact: cleanup

Move percpu_modinit() upwards.  This is to ease further changes.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 kernel/module.c | 33 ++++++++++++++++++---------------
 1 file changed, 18 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index ba22484a987e..52b3497b8748 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -480,21 +480,6 @@ static void percpu_modfree(void *freeme)
 	}
 }
 
-static unsigned int find_pcpusec(Elf_Ehdr *hdr,
-				 Elf_Shdr *sechdrs,
-				 const char *secstrings)
-{
-	return find_sec(hdr, sechdrs, secstrings, ".data.percpu");
-}
-
-static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size)
-{
-	int cpu;
-
-	for_each_possible_cpu(cpu)
-		memcpy(pcpudest + per_cpu_offset(cpu), from, size);
-}
-
 static int percpu_modinit(void)
 {
 	pcpu_num_used = 2;
@@ -513,7 +498,24 @@ static int percpu_modinit(void)
 	return 0;
 }
 __initcall(percpu_modinit);
+
+static unsigned int find_pcpusec(Elf_Ehdr *hdr,
+				 Elf_Shdr *sechdrs,
+				 const char *secstrings)
+{
+	return find_sec(hdr, sechdrs, secstrings, ".data.percpu");
+}
+
+static void percpu_modcopy(void *pcpudest, const void *from, unsigned long size)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu)
+		memcpy(pcpudest + per_cpu_offset(cpu), from, size);
+}
+
 #else /* ... !CONFIG_SMP */
+
 static inline void *percpu_modalloc(unsigned long size, unsigned long align,
 				    const char *name)
 {
@@ -535,6 +537,7 @@ static inline void percpu_modcopy(void *pcpudst, const void *src,
 	/* pcpusec should be 0, and size of that section should be 0. */
 	BUG_ON(size != 0);
 }
+
 #endif /* CONFIG_SMP */
 
 #define MODINFO_ATTR(field)	\
-- 
cgit v1.2.3


From b36128c830a8f5bd7d4981f5b0b69950f5928ee6 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 20 Feb 2009 16:29:08 +0900
Subject: alloc_percpu: change percpu_ptr to per_cpu_ptr

Impact: cleanup

There are two allocated per-cpu accessor macros with almost identical
spelling.  The original and far more popular is per_cpu_ptr (44
files), so change over the other 4 files.

tj: kill percpu_ptr() and update UP too

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: mingo@redhat.com
Cc: lenb@kernel.org
Cc: cpufreq@vger.kernel.org
Signed-off-by: Tejun Heo <tj@kernel.org>
---
 arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c |  2 +-
 drivers/acpi/processor_perflib.c           |  4 ++--
 include/linux/percpu.h                     | 23 +++++++++++------------
 kernel/sched.c                             |  6 +++---
 kernel/stop_machine.c                      |  2 +-
 5 files changed, 18 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 4b1c319d30c3..22590cf688ae 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -601,7 +601,7 @@ static int acpi_cpufreq_cpu_init(struct cpufreq_policy *policy)
 	if (!data)
 		return -ENOMEM;
 
-	data->acpi_data = percpu_ptr(acpi_perf_data, cpu);
+	data->acpi_data = per_cpu_ptr(acpi_perf_data, cpu);
 	per_cpu(drv_data, cpu) = data;
 
 	if (cpu_has(c, X86_FEATURE_CONSTANT_TSC))
diff --git a/drivers/acpi/processor_perflib.c b/drivers/acpi/processor_perflib.c
index 9cc769b587ff..68fd3d292799 100644
--- a/drivers/acpi/processor_perflib.c
+++ b/drivers/acpi/processor_perflib.c
@@ -516,12 +516,12 @@ int acpi_processor_preregister_performance(
 			continue;
 		}
 
-		if (!performance || !percpu_ptr(performance, i)) {
+		if (!performance || !per_cpu_ptr(performance, i)) {
 			retval = -EINVAL;
 			continue;
 		}
 
-		pr->performance = percpu_ptr(performance, i);
+		pr->performance = per_cpu_ptr(performance, i);
 		cpumask_set_cpu(i, pr->performance->shared_cpu_map);
 		if (acpi_processor_get_psd(pr)) {
 			retval = -EINVAL;
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index 3577ffd90d45..c80cfe1260ec 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -81,23 +81,13 @@ struct percpu_data {
 };
 
 #define __percpu_disguise(pdata) (struct percpu_data *)~(unsigned long)(pdata)
-/* 
- * Use this to get to a cpu's version of the per-cpu object dynamically
- * allocated. Non-atomic access to the current CPU's version should
- * probably be combined with get_cpu()/put_cpu().
- */ 
-#define percpu_ptr(ptr, cpu)                              \
-({                                                        \
-        struct percpu_data *__p = __percpu_disguise(ptr); \
-        (__typeof__(ptr))__p->ptrs[(cpu)];	          \
-})
 
 extern void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask);
 extern void percpu_free(void *__pdata);
 
 #else /* CONFIG_SMP */
 
-#define percpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); })
+#define per_cpu_ptr(ptr, cpu) ({ (void)(cpu); (ptr); })
 
 static __always_inline void *__percpu_alloc_mask(size_t size, gfp_t gfp, cpumask_t *mask)
 {
@@ -122,6 +112,15 @@ static inline void percpu_free(void *__pdata)
 						  cpu_possible_map)
 #define alloc_percpu(type)	(type *)__alloc_percpu(sizeof(type))
 #define free_percpu(ptr)	percpu_free((ptr))
-#define per_cpu_ptr(ptr, cpu)	percpu_ptr((ptr), (cpu))
+/*
+ * Use this to get to a cpu's version of the per-cpu object dynamically
+ * allocated. Non-atomic access to the current CPU's version should
+ * probably be combined with get_cpu()/put_cpu().
+ */
+#define per_cpu_ptr(ptr, cpu)						\
+({									\
+        struct percpu_data *__p = __percpu_disguise(ptr);		\
+        (__typeof__(ptr))__p->ptrs[(cpu)];				\
+})
 
 #endif /* __LINUX_PERCPU_H */
diff --git a/kernel/sched.c b/kernel/sched.c
index fc17fd91ab57..9d30ac956328 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -9472,7 +9472,7 @@ cpuacct_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
 
 static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
 {
-	u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+	u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
 	u64 data;
 
 #ifndef CONFIG_64BIT
@@ -9491,7 +9491,7 @@ static u64 cpuacct_cpuusage_read(struct cpuacct *ca, int cpu)
 
 static void cpuacct_cpuusage_write(struct cpuacct *ca, int cpu, u64 val)
 {
-	u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+	u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
 
 #ifndef CONFIG_64BIT
 	/*
@@ -9587,7 +9587,7 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
 	ca = task_ca(tsk);
 
 	for (; ca; ca = ca->parent) {
-		u64 *cpuusage = percpu_ptr(ca->cpuusage, cpu);
+		u64 *cpuusage = per_cpu_ptr(ca->cpuusage, cpu);
 		*cpuusage += cputime;
 	}
 }
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 0cd415ee62a2..74541ca49536 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -170,7 +170,7 @@ int __stop_machine(int (*fn)(void *), void *data, const struct cpumask *cpus)
 	 * doesn't hit this CPU until we're ready. */
 	get_cpu();
 	for_each_online_cpu(i) {
-		sm_work = percpu_ptr(stop_machine_work, i);
+		sm_work = per_cpu_ptr(stop_machine_work, i);
 		INIT_WORK(sm_work, stop_cpu);
 		queue_work_on(i, stop_machine_wq, sm_work);
 	}
-- 
cgit v1.2.3


From fbf59bc9d74d1fb30b8e0630743aff2806eafcea Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 20 Feb 2009 16:29:08 +0900
Subject: percpu: implement new dynamic percpu allocator

Impact: new scalable dynamic percpu allocator which allows dynamic
        percpu areas to be accessed the same way as static ones

Implement scalable dynamic percpu allocator which can be used for both
static and dynamic percpu areas.  This will allow static and dynamic
areas to share faster direct access methods.  This feature is optional
and enabled only when CONFIG_HAVE_DYNAMIC_PER_CPU_AREA is defined by
arch.  Please read comment on top of mm/percpu.c for details.

Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
---
 include/linux/percpu.h |  22 +-
 kernel/module.c        |  31 ++
 mm/Makefile            |   4 +
 mm/percpu.c            | 890 +++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 943 insertions(+), 4 deletions(-)
 create mode 100644 mm/percpu.c

(limited to 'kernel')

diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index d99e24ae1811..18080995ff3e 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -76,23 +76,37 @@
 
 #ifdef CONFIG_SMP
 
-struct percpu_data {
-	void *ptrs[1];
-};
+#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
 
-#define __percpu_disguise(pdata) (struct percpu_data *)~(unsigned long)(pdata)
+extern void *pcpu_base_addr;
 
+typedef void (*pcpu_populate_pte_fn_t)(unsigned long addr);
+
+extern size_t __init pcpu_setup_static(pcpu_populate_pte_fn_t populate_pte_fn,
+				       struct page **pages, size_t cpu_size);
 /*
  * Use this to get to a cpu's version of the per-cpu object
  * dynamically allocated. Non-atomic access to the current CPU's
  * version should probably be combined with get_cpu()/put_cpu().
  */
+#define per_cpu_ptr(ptr, cpu)	SHIFT_PERCPU_PTR((ptr), per_cpu_offset((cpu)))
+
+#else /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
+
+struct percpu_data {
+	void *ptrs[1];
+};
+
+#define __percpu_disguise(pdata) (struct percpu_data *)~(unsigned long)(pdata)
+
 #define per_cpu_ptr(ptr, cpu)						\
 ({									\
         struct percpu_data *__p = __percpu_disguise(ptr);		\
         (__typeof__(ptr))__p->ptrs[(cpu)];				\
 })
 
+#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
+
 extern void *__alloc_percpu(size_t size, size_t align);
 extern void free_percpu(void *__pdata);
 
diff --git a/kernel/module.c b/kernel/module.c
index 52b3497b8748..1f0657ae555b 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -51,6 +51,7 @@
 #include <linux/tracepoint.h>
 #include <linux/ftrace.h>
 #include <linux/async.h>
+#include <linux/percpu.h>
 
 #if 0
 #define DEBUGP printk
@@ -366,6 +367,34 @@ static struct module *find_module(const char *name)
 }
 
 #ifdef CONFIG_SMP
+
+#ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
+
+static void *percpu_modalloc(unsigned long size, unsigned long align,
+			     const char *name)
+{
+	void *ptr;
+
+	if (align > PAGE_SIZE) {
+		printk(KERN_WARNING "%s: per-cpu alignment %li > %li\n",
+		       name, align, PAGE_SIZE);
+		align = PAGE_SIZE;
+	}
+
+	ptr = __alloc_percpu(size, align);
+	if (!ptr)
+		printk(KERN_WARNING
+		       "Could not allocate %lu bytes percpu data\n", size);
+	return ptr;
+}
+
+static void percpu_modfree(void *freeme)
+{
+	free_percpu(freeme);
+}
+
+#else /* ... !CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
+
 /* Number of blocks used and allocated. */
 static unsigned int pcpu_num_used, pcpu_num_allocated;
 /* Size of each block.  -ve means used. */
@@ -499,6 +528,8 @@ static int percpu_modinit(void)
 }
 __initcall(percpu_modinit);
 
+#endif /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
+
 static unsigned int find_pcpusec(Elf_Ehdr *hdr,
 				 Elf_Shdr *sechdrs,
 				 const char *secstrings)
diff --git a/mm/Makefile b/mm/Makefile
index 72255be57f89..818569b68f46 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -30,6 +30,10 @@ obj-$(CONFIG_FAILSLAB) += failslab.o
 obj-$(CONFIG_MEMORY_HOTPLUG) += memory_hotplug.o
 obj-$(CONFIG_FS_XIP) += filemap_xip.o
 obj-$(CONFIG_MIGRATION) += migrate.o
+ifdef CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
+obj-$(CONFIG_SMP) += percpu.o
+else
 obj-$(CONFIG_SMP) += allocpercpu.o
+endif
 obj-$(CONFIG_QUICKLIST) += quicklist.o
 obj-$(CONFIG_CGROUP_MEM_RES_CTLR) += memcontrol.o page_cgroup.o
diff --git a/mm/percpu.c b/mm/percpu.c
new file mode 100644
index 000000000000..4617d97e877c
--- /dev/null
+++ b/mm/percpu.c
@@ -0,0 +1,890 @@
+/*
+ * linux/mm/percpu.c - percpu memory allocator
+ *
+ * Copyright (C) 2009		SUSE Linux Products GmbH
+ * Copyright (C) 2009		Tejun Heo <tj@kernel.org>
+ *
+ * This file is released under the GPLv2.
+ *
+ * This is percpu allocator which can handle both static and dynamic
+ * areas.  Percpu areas are allocated in chunks in vmalloc area.  Each
+ * chunk is consisted of num_possible_cpus() units and the first chunk
+ * is used for static percpu variables in the kernel image (special
+ * boot time alloc/init handling necessary as these areas need to be
+ * brought up before allocation services are running).  Unit grows as
+ * necessary and all units grow or shrink in unison.  When a chunk is
+ * filled up, another chunk is allocated.  ie. in vmalloc area
+ *
+ *  c0                           c1                         c2
+ *  -------------------          -------------------        ------------
+ * | u0 | u1 | u2 | u3 |        | u0 | u1 | u2 | u3 |      | u0 | u1 | u
+ *  -------------------  ......  -------------------  ....  ------------
+ *
+ * Allocation is done in offset-size areas of single unit space.  Ie,
+ * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0,
+ * c1:u1, c1:u2 and c1:u3.  Percpu access can be done by configuring
+ * percpu base registers UNIT_SIZE apart.
+ *
+ * There are usually many small percpu allocations many of them as
+ * small as 4 bytes.  The allocator organizes chunks into lists
+ * according to free size and tries to allocate from the fullest one.
+ * Each chunk keeps the maximum contiguous area size hint which is
+ * guaranteed to be eqaul to or larger than the maximum contiguous
+ * area in the chunk.  This helps the allocator not to iterate the
+ * chunk maps unnecessarily.
+ *
+ * Allocation state in each chunk is kept using an array of integers
+ * on chunk->map.  A positive value in the map represents a free
+ * region and negative allocated.  Allocation inside a chunk is done
+ * by scanning this map sequentially and serving the first matching
+ * entry.  This is mostly copied from the percpu_modalloc() allocator.
+ * Chunks are also linked into a rb tree to ease address to chunk
+ * mapping during free.
+ *
+ * To use this allocator, arch code should do the followings.
+ *
+ * - define CONFIG_HAVE_DYNAMIC_PER_CPU_AREA
+ *
+ * - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
+ *   regular address to percpu pointer and back
+ *
+ * - use pcpu_setup_static() during percpu area initialization to
+ *   setup kernel static percpu area
+ */
+
+#include <linux/bitmap.h>
+#include <linux/bootmem.h>
+#include <linux/list.h>
+#include <linux/mm.h>
+#include <linux/module.h>
+#include <linux/mutex.h>
+#include <linux/percpu.h>
+#include <linux/pfn.h>
+#include <linux/rbtree.h>
+#include <linux/slab.h>
+#include <linux/vmalloc.h>
+
+#include <asm/cacheflush.h>
+#include <asm/tlbflush.h>
+
+#define PCPU_MIN_UNIT_PAGES_SHIFT	4	/* also max alloc size */
+#define PCPU_SLOT_BASE_SHIFT		5	/* 1-31 shares the same slot */
+#define PCPU_DFL_MAP_ALLOC		16	/* start a map with 16 ents */
+
+struct pcpu_chunk {
+	struct list_head	list;		/* linked to pcpu_slot lists */
+	struct rb_node		rb_node;	/* key is chunk->vm->addr */
+	int			free_size;	/* free bytes in the chunk */
+	int			contig_hint;	/* max contiguous size hint */
+	struct vm_struct	*vm;		/* mapped vmalloc region */
+	int			map_used;	/* # of map entries used */
+	int			map_alloc;	/* # of map entries allocated */
+	int			*map;		/* allocation map */
+	struct page		*page[];	/* #cpus * UNIT_PAGES */
+};
+
+static int pcpu_unit_pages_shift;
+static int pcpu_unit_pages;
+static int pcpu_unit_shift;
+static int pcpu_unit_size;
+static int pcpu_chunk_size;
+static int pcpu_nr_slots;
+static size_t pcpu_chunk_struct_size;
+
+/* the address of the first chunk which starts with the kernel static area */
+void *pcpu_base_addr;
+EXPORT_SYMBOL_GPL(pcpu_base_addr);
+
+/* the size of kernel static area */
+static int pcpu_static_size;
+
+/*
+ * One mutex to rule them all.
+ *
+ * The following mutex is grabbed in the outermost public alloc/free
+ * interface functions and released only when the operation is
+ * complete.  As such, every function in this file other than the
+ * outermost functions are called under pcpu_mutex.
+ *
+ * It can easily be switched to use spinlock such that only the area
+ * allocation and page population commit are protected with it doing
+ * actual [de]allocation without holding any lock.  However, given
+ * what this allocator does, I think it's better to let them run
+ * sequentially.
+ */
+static DEFINE_MUTEX(pcpu_mutex);
+
+static struct list_head *pcpu_slot;		/* chunk list slots */
+static struct rb_root pcpu_addr_root = RB_ROOT;	/* chunks by address */
+
+static int pcpu_size_to_slot(int size)
+{
+	int highbit = fls(size);
+	return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1);
+}
+
+static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
+{
+	if (chunk->free_size < sizeof(int) || chunk->contig_hint < sizeof(int))
+		return 0;
+
+	return pcpu_size_to_slot(chunk->free_size);
+}
+
+static int pcpu_page_idx(unsigned int cpu, int page_idx)
+{
+	return (cpu << pcpu_unit_pages_shift) + page_idx;
+}
+
+static struct page **pcpu_chunk_pagep(struct pcpu_chunk *chunk,
+				      unsigned int cpu, int page_idx)
+{
+	return &chunk->page[pcpu_page_idx(cpu, page_idx)];
+}
+
+static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
+				     unsigned int cpu, int page_idx)
+{
+	return (unsigned long)chunk->vm->addr +
+		(pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT);
+}
+
+static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk,
+				     int page_idx)
+{
+	return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL;
+}
+
+/**
+ * pcpu_realloc - versatile realloc
+ * @p: the current pointer (can be NULL for new allocations)
+ * @size: the current size (can be 0 for new allocations)
+ * @new_size: the wanted new size (can be 0 for free)
+ *
+ * More robust realloc which can be used to allocate, resize or free a
+ * memory area of arbitrary size.  If the needed size goes over
+ * PAGE_SIZE, kernel VM is used.
+ *
+ * RETURNS:
+ * The new pointer on success, NULL on failure.
+ */
+static void *pcpu_realloc(void *p, size_t size, size_t new_size)
+{
+	void *new;
+
+	if (new_size <= PAGE_SIZE)
+		new = kmalloc(new_size, GFP_KERNEL);
+	else
+		new = vmalloc(new_size);
+	if (new_size && !new)
+		return NULL;
+
+	memcpy(new, p, min(size, new_size));
+	if (new_size > size)
+		memset(new + size, 0, new_size - size);
+
+	if (size <= PAGE_SIZE)
+		kfree(p);
+	else
+		vfree(p);
+
+	return new;
+}
+
+/**
+ * pcpu_chunk_relocate - put chunk in the appropriate chunk slot
+ * @chunk: chunk of interest
+ * @oslot: the previous slot it was on
+ *
+ * This function is called after an allocation or free changed @chunk.
+ * New slot according to the changed state is determined and @chunk is
+ * moved to the slot.
+ */
+static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
+{
+	int nslot = pcpu_chunk_slot(chunk);
+
+	if (oslot != nslot) {
+		if (oslot < nslot)
+			list_move(&chunk->list, &pcpu_slot[nslot]);
+		else
+			list_move_tail(&chunk->list, &pcpu_slot[nslot]);
+	}
+}
+
+static struct rb_node **pcpu_chunk_rb_search(void *addr,
+					     struct rb_node **parentp)
+{
+	struct rb_node **p = &pcpu_addr_root.rb_node;
+	struct rb_node *parent = NULL;
+	struct pcpu_chunk *chunk;
+
+	while (*p) {
+		parent = *p;
+		chunk = rb_entry(parent, struct pcpu_chunk, rb_node);
+
+		if (addr < chunk->vm->addr)
+			p = &(*p)->rb_left;
+		else if (addr > chunk->vm->addr)
+			p = &(*p)->rb_right;
+		else
+			break;
+	}
+
+	if (parentp)
+		*parentp = parent;
+	return p;
+}
+
+/**
+ * pcpu_chunk_addr_search - search for chunk containing specified address
+ * @addr: address to search for
+ *
+ * Look for chunk which might contain @addr.  More specifically, it
+ * searchs for the chunk with the highest start address which isn't
+ * beyond @addr.
+ *
+ * RETURNS:
+ * The address of the found chunk.
+ */
+static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
+{
+	struct rb_node *n, *parent;
+	struct pcpu_chunk *chunk;
+
+	n = *pcpu_chunk_rb_search(addr, &parent);
+	if (!n) {
+		/* no exactly matching chunk, the parent is the closest */
+		n = parent;
+		BUG_ON(!n);
+	}
+	chunk = rb_entry(n, struct pcpu_chunk, rb_node);
+
+	if (addr < chunk->vm->addr) {
+		/* the parent was the next one, look for the previous one */
+		n = rb_prev(n);
+		BUG_ON(!n);
+		chunk = rb_entry(n, struct pcpu_chunk, rb_node);
+	}
+
+	return chunk;
+}
+
+/**
+ * pcpu_chunk_addr_insert - insert chunk into address rb tree
+ * @new: chunk to insert
+ *
+ * Insert @new into address rb tree.
+ */
+static void pcpu_chunk_addr_insert(struct pcpu_chunk *new)
+{
+	struct rb_node **p, *parent;
+
+	p = pcpu_chunk_rb_search(new->vm->addr, &parent);
+	BUG_ON(*p);
+	rb_link_node(&new->rb_node, parent, p);
+	rb_insert_color(&new->rb_node, &pcpu_addr_root);
+}
+
+/**
+ * pcpu_split_block - split a map block
+ * @chunk: chunk of interest
+ * @i: index of map block to split
+ * @head: head size (can be 0)
+ * @tail: tail size (can be 0)
+ *
+ * Split the @i'th map block into two or three blocks.  If @head is
+ * non-zero, @head bytes block is inserted before block @i moving it
+ * to @i+1 and reducing its size by @head bytes.
+ *
+ * If @tail is non-zero, the target block, which can be @i or @i+1
+ * depending on @head, is reduced by @tail bytes and @tail byte block
+ * is inserted after the target block.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+static int pcpu_split_block(struct pcpu_chunk *chunk, int i, int head, int tail)
+{
+	int nr_extra = !!head + !!tail;
+	int target = chunk->map_used + nr_extra;
+
+	/* reallocation required? */
+	if (chunk->map_alloc < target) {
+		int new_alloc = chunk->map_alloc;
+		int *new;
+
+		while (new_alloc < target)
+			new_alloc *= 2;
+
+		new = pcpu_realloc(chunk->map,
+				   chunk->map_alloc * sizeof(new[0]),
+				   new_alloc * sizeof(new[0]));
+		if (!new)
+			return -ENOMEM;
+
+		chunk->map_alloc = new_alloc;
+		chunk->map = new;
+	}
+
+	/* insert a new subblock */
+	memmove(&chunk->map[i + nr_extra], &chunk->map[i],
+		sizeof(chunk->map[0]) * (chunk->map_used - i));
+	chunk->map_used += nr_extra;
+
+	if (head) {
+		chunk->map[i + 1] = chunk->map[i] - head;
+		chunk->map[i++] = head;
+	}
+	if (tail) {
+		chunk->map[i++] -= tail;
+		chunk->map[i] = tail;
+	}
+	return 0;
+}
+
+/**
+ * pcpu_alloc_area - allocate area from a pcpu_chunk
+ * @chunk: chunk of interest
+ * @size: wanted size
+ * @align: wanted align
+ *
+ * Try to allocate @size bytes area aligned at @align from @chunk.
+ * Note that this function only allocates the offset.  It doesn't
+ * populate or map the area.
+ *
+ * RETURNS:
+ * Allocated offset in @chunk on success, -errno on failure.
+ */
+static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
+{
+	int oslot = pcpu_chunk_slot(chunk);
+	int max_contig = 0;
+	int i, off;
+
+	/*
+	 * The static chunk initially doesn't have map attached
+	 * because kmalloc wasn't available during init.  Give it one.
+	 */
+	if (unlikely(!chunk->map)) {
+		chunk->map = pcpu_realloc(NULL, 0,
+				PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
+		if (!chunk->map)
+			return -ENOMEM;
+
+		chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
+		chunk->map[chunk->map_used++] = -pcpu_static_size;
+		if (chunk->free_size)
+			chunk->map[chunk->map_used++] = chunk->free_size;
+	}
+
+	for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) {
+		bool is_last = i + 1 == chunk->map_used;
+		int head, tail;
+
+		/* extra for alignment requirement */
+		head = ALIGN(off, align) - off;
+		BUG_ON(i == 0 && head != 0);
+
+		if (chunk->map[i] < 0)
+			continue;
+		if (chunk->map[i] < head + size) {
+			max_contig = max(chunk->map[i], max_contig);
+			continue;
+		}
+
+		/*
+		 * If head is small or the previous block is free,
+		 * merge'em.  Note that 'small' is defined as smaller
+		 * than sizeof(int), which is very small but isn't too
+		 * uncommon for percpu allocations.
+		 */
+		if (head && (head < sizeof(int) || chunk->map[i - 1] > 0)) {
+			if (chunk->map[i - 1] > 0)
+				chunk->map[i - 1] += head;
+			else {
+				chunk->map[i - 1] -= head;
+				chunk->free_size -= head;
+			}
+			chunk->map[i] -= head;
+			off += head;
+			head = 0;
+		}
+
+		/* if tail is small, just keep it around */
+		tail = chunk->map[i] - head - size;
+		if (tail < sizeof(int))
+			tail = 0;
+
+		/* split if warranted */
+		if (head || tail) {
+			if (pcpu_split_block(chunk, i, head, tail))
+				return -ENOMEM;
+			if (head) {
+				i++;
+				off += head;
+				max_contig = max(chunk->map[i - 1], max_contig);
+			}
+			if (tail)
+				max_contig = max(chunk->map[i + 1], max_contig);
+		}
+
+		/* update hint and mark allocated */
+		if (is_last)
+			chunk->contig_hint = max_contig; /* fully scanned */
+		else
+			chunk->contig_hint = max(chunk->contig_hint,
+						 max_contig);
+
+		chunk->free_size -= chunk->map[i];
+		chunk->map[i] = -chunk->map[i];
+
+		pcpu_chunk_relocate(chunk, oslot);
+		return off;
+	}
+
+	chunk->contig_hint = max_contig;	/* fully scanned */
+	pcpu_chunk_relocate(chunk, oslot);
+
+	/*
+	 * Tell the upper layer that this chunk has no area left.
+	 * Note that this is not an error condition but a notification
+	 * to upper layer that it needs to look at other chunks.
+	 * -ENOSPC is chosen as it isn't used in memory subsystem and
+	 * matches the meaning in a way.
+	 */
+	return -ENOSPC;
+}
+
+/**
+ * pcpu_free_area - free area to a pcpu_chunk
+ * @chunk: chunk of interest
+ * @freeme: offset of area to free
+ *
+ * Free area starting from @freeme to @chunk.  Note that this function
+ * only modifies the allocation map.  It doesn't depopulate or unmap
+ * the area.
+ */
+static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
+{
+	int oslot = pcpu_chunk_slot(chunk);
+	int i, off;
+
+	for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++]))
+		if (off == freeme)
+			break;
+	BUG_ON(off != freeme);
+	BUG_ON(chunk->map[i] > 0);
+
+	chunk->map[i] = -chunk->map[i];
+	chunk->free_size += chunk->map[i];
+
+	/* merge with previous? */
+	if (i > 0 && chunk->map[i - 1] >= 0) {
+		chunk->map[i - 1] += chunk->map[i];
+		chunk->map_used--;
+		memmove(&chunk->map[i], &chunk->map[i + 1],
+			(chunk->map_used - i) * sizeof(chunk->map[0]));
+		i--;
+	}
+	/* merge with next? */
+	if (i + 1 < chunk->map_used && chunk->map[i + 1] >= 0) {
+		chunk->map[i] += chunk->map[i + 1];
+		chunk->map_used--;
+		memmove(&chunk->map[i + 1], &chunk->map[i + 2],
+			(chunk->map_used - (i + 1)) * sizeof(chunk->map[0]));
+	}
+
+	chunk->contig_hint = max(chunk->map[i], chunk->contig_hint);
+	pcpu_chunk_relocate(chunk, oslot);
+}
+
+/**
+ * pcpu_unmap - unmap pages out of a pcpu_chunk
+ * @chunk: chunk of interest
+ * @page_start: page index of the first page to unmap
+ * @page_end: page index of the last page to unmap + 1
+ * @flush: whether to flush cache and tlb or not
+ *
+ * For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
+ * If @flush is true, vcache is flushed before unmapping and tlb
+ * after.
+ */
+static void pcpu_unmap(struct pcpu_chunk *chunk, int page_start, int page_end,
+		       bool flush)
+{
+	unsigned int last = num_possible_cpus() - 1;
+	unsigned int cpu;
+
+	/*
+	 * Each flushing trial can be very expensive, issue flush on
+	 * the whole region at once rather than doing it for each cpu.
+	 * This could be an overkill but is more scalable.
+	 */
+	if (flush)
+		flush_cache_vunmap(pcpu_chunk_addr(chunk, 0, page_start),
+				   pcpu_chunk_addr(chunk, last, page_end));
+
+	for_each_possible_cpu(cpu)
+		unmap_kernel_range_noflush(
+				pcpu_chunk_addr(chunk, cpu, page_start),
+				(page_end - page_start) << PAGE_SHIFT);
+
+	/* ditto as flush_cache_vunmap() */
+	if (flush)
+		flush_tlb_kernel_range(pcpu_chunk_addr(chunk, 0, page_start),
+				       pcpu_chunk_addr(chunk, last, page_end));
+}
+
+/**
+ * pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
+ * @chunk: chunk to depopulate
+ * @off: offset to the area to depopulate
+ * @size: size of the area to depopulate
+ * @flush: whether to flush cache and tlb or not
+ *
+ * For each cpu, depopulate and unmap pages [@page_start,@page_end)
+ * from @chunk.  If @flush is true, vcache is flushed before unmapping
+ * and tlb after.
+ */
+static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, size_t off,
+				  size_t size, bool flush)
+{
+	int page_start = PFN_DOWN(off);
+	int page_end = PFN_UP(off + size);
+	int unmap_start = -1;
+	int uninitialized_var(unmap_end);
+	unsigned int cpu;
+	int i;
+
+	for (i = page_start; i < page_end; i++) {
+		for_each_possible_cpu(cpu) {
+			struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);
+
+			if (!*pagep)
+				continue;
+
+			__free_page(*pagep);
+
+			/*
+			 * If it's partial depopulation, it might get
+			 * populated or depopulated again.  Mark the
+			 * page gone.
+			 */
+			*pagep = NULL;
+
+			unmap_start = unmap_start < 0 ? i : unmap_start;
+			unmap_end = i + 1;
+		}
+	}
+
+	if (unmap_start >= 0)
+		pcpu_unmap(chunk, unmap_start, unmap_end, flush);
+}
+
+/**
+ * pcpu_map - map pages into a pcpu_chunk
+ * @chunk: chunk of interest
+ * @page_start: page index of the first page to map
+ * @page_end: page index of the last page to map + 1
+ *
+ * For each cpu, map pages [@page_start,@page_end) into @chunk.
+ * vcache is flushed afterwards.
+ */
+static int pcpu_map(struct pcpu_chunk *chunk, int page_start, int page_end)
+{
+	unsigned int last = num_possible_cpus() - 1;
+	unsigned int cpu;
+	int err;
+
+	for_each_possible_cpu(cpu) {
+		err = map_kernel_range_noflush(
+				pcpu_chunk_addr(chunk, cpu, page_start),
+				(page_end - page_start) << PAGE_SHIFT,
+				PAGE_KERNEL,
+				pcpu_chunk_pagep(chunk, cpu, page_start));
+		if (err < 0)
+			return err;
+	}
+
+	/* flush at once, please read comments in pcpu_unmap() */
+	flush_cache_vmap(pcpu_chunk_addr(chunk, 0, page_start),
+			 pcpu_chunk_addr(chunk, last, page_end));
+	return 0;
+}
+
+/**
+ * pcpu_populate_chunk - populate and map an area of a pcpu_chunk
+ * @chunk: chunk of interest
+ * @off: offset to the area to populate
+ * @size: size of the area to populate
+ *
+ * For each cpu, populate and map pages [@page_start,@page_end) into
+ * @chunk.  The area is cleared on return.
+ */
+static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
+{
+	const gfp_t alloc_mask = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
+	int page_start = PFN_DOWN(off);
+	int page_end = PFN_UP(off + size);
+	int map_start = -1;
+	int map_end;
+	unsigned int cpu;
+	int i;
+
+	for (i = page_start; i < page_end; i++) {
+		if (pcpu_chunk_page_occupied(chunk, i)) {
+			if (map_start >= 0) {
+				if (pcpu_map(chunk, map_start, map_end))
+					goto err;
+				map_start = -1;
+			}
+			continue;
+		}
+
+		map_start = map_start < 0 ? i : map_start;
+		map_end = i + 1;
+
+		for_each_possible_cpu(cpu) {
+			struct page **pagep = pcpu_chunk_pagep(chunk, cpu, i);
+
+			*pagep = alloc_pages_node(cpu_to_node(cpu),
+						  alloc_mask, 0);
+			if (!*pagep)
+				goto err;
+		}
+	}
+
+	if (map_start >= 0 && pcpu_map(chunk, map_start, map_end))
+		goto err;
+
+	for_each_possible_cpu(cpu)
+		memset(chunk->vm->addr + (cpu << pcpu_unit_shift) + off, 0,
+		       size);
+
+	return 0;
+err:
+	/* likely under heavy memory pressure, give memory back */
+	pcpu_depopulate_chunk(chunk, off, size, true);
+	return -ENOMEM;
+}
+
+static void free_pcpu_chunk(struct pcpu_chunk *chunk)
+{
+	if (!chunk)
+		return;
+	if (chunk->vm)
+		free_vm_area(chunk->vm);
+	pcpu_realloc(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]), 0);
+	kfree(chunk);
+}
+
+static struct pcpu_chunk *alloc_pcpu_chunk(void)
+{
+	struct pcpu_chunk *chunk;
+
+	chunk = kzalloc(pcpu_chunk_struct_size, GFP_KERNEL);
+	if (!chunk)
+		return NULL;
+
+	chunk->map = pcpu_realloc(NULL, 0,
+				  PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
+	chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
+	chunk->map[chunk->map_used++] = pcpu_unit_size;
+
+	chunk->vm = get_vm_area(pcpu_chunk_size, GFP_KERNEL);
+	if (!chunk->vm) {
+		free_pcpu_chunk(chunk);
+		return NULL;
+	}
+
+	INIT_LIST_HEAD(&chunk->list);
+	chunk->free_size = pcpu_unit_size;
+	chunk->contig_hint = pcpu_unit_size;
+
+	return chunk;
+}
+
+/**
+ * __alloc_percpu - allocate percpu area
+ * @size: size of area to allocate
+ * @align: alignment of area (max PAGE_SIZE)
+ *
+ * Allocate percpu area of @size bytes aligned at @align.  Might
+ * sleep.  Might trigger writeouts.
+ *
+ * RETURNS:
+ * Percpu pointer to the allocated area on success, NULL on failure.
+ */
+void *__alloc_percpu(size_t size, size_t align)
+{
+	void *ptr = NULL;
+	struct pcpu_chunk *chunk;
+	int slot, off;
+
+	if (unlikely(!size || size > PAGE_SIZE << PCPU_MIN_UNIT_PAGES_SHIFT ||
+		     align > PAGE_SIZE)) {
+		WARN(true, "illegal size (%zu) or align (%zu) for "
+		     "percpu allocation\n", size, align);
+		return NULL;
+	}
+
+	mutex_lock(&pcpu_mutex);
+
+	/* allocate area */
+	for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
+		list_for_each_entry(chunk, &pcpu_slot[slot], list) {
+			if (size > chunk->contig_hint)
+				continue;
+			off = pcpu_alloc_area(chunk, size, align);
+			if (off >= 0)
+				goto area_found;
+			if (off != -ENOSPC)
+				goto out_unlock;
+		}
+	}
+
+	/* hmmm... no space left, create a new chunk */
+	chunk = alloc_pcpu_chunk();
+	if (!chunk)
+		goto out_unlock;
+	pcpu_chunk_relocate(chunk, -1);
+	pcpu_chunk_addr_insert(chunk);
+
+	off = pcpu_alloc_area(chunk, size, align);
+	if (off < 0)
+		goto out_unlock;
+
+area_found:
+	/* populate, map and clear the area */
+	if (pcpu_populate_chunk(chunk, off, size)) {
+		pcpu_free_area(chunk, off);
+		goto out_unlock;
+	}
+
+	ptr = __addr_to_pcpu_ptr(chunk->vm->addr + off);
+out_unlock:
+	mutex_unlock(&pcpu_mutex);
+	return ptr;
+}
+EXPORT_SYMBOL_GPL(__alloc_percpu);
+
+static void pcpu_kill_chunk(struct pcpu_chunk *chunk)
+{
+	pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size, false);
+	list_del(&chunk->list);
+	rb_erase(&chunk->rb_node, &pcpu_addr_root);
+	free_pcpu_chunk(chunk);
+}
+
+/**
+ * free_percpu - free percpu area
+ * @ptr: pointer to area to free
+ *
+ * Free percpu area @ptr.  Might sleep.
+ */
+void free_percpu(void *ptr)
+{
+	void *addr = __pcpu_ptr_to_addr(ptr);
+	struct pcpu_chunk *chunk;
+	int off;
+
+	if (!ptr)
+		return;
+
+	mutex_lock(&pcpu_mutex);
+
+	chunk = pcpu_chunk_addr_search(addr);
+	off = addr - chunk->vm->addr;
+
+	pcpu_free_area(chunk, off);
+
+	/* the chunk became fully free, kill one if there are other free ones */
+	if (chunk->free_size == pcpu_unit_size) {
+		struct pcpu_chunk *pos;
+
+		list_for_each_entry(pos,
+				    &pcpu_slot[pcpu_chunk_slot(chunk)], list)
+			if (pos != chunk) {
+				pcpu_kill_chunk(pos);
+				break;
+			}
+	}
+
+	mutex_unlock(&pcpu_mutex);
+}
+EXPORT_SYMBOL_GPL(free_percpu);
+
+/**
+ * pcpu_setup_static - initialize kernel static percpu area
+ * @populate_pte_fn: callback to allocate pagetable
+ * @pages: num_possible_cpus() * PFN_UP(cpu_size) pages
+ *
+ * Initialize kernel static percpu area.  The caller should allocate
+ * all the necessary pages and pass them in @pages.
+ * @populate_pte_fn() is called on each page to be used for percpu
+ * mapping and is responsible for making sure all the necessary page
+ * tables for the page is allocated.
+ *
+ * RETURNS:
+ * The determined pcpu_unit_size which can be used to initialize
+ * percpu access.
+ */
+size_t __init pcpu_setup_static(pcpu_populate_pte_fn_t populate_pte_fn,
+				struct page **pages, size_t cpu_size)
+{
+	static struct vm_struct static_vm;
+	struct pcpu_chunk *static_chunk;
+	int nr_cpu_pages = DIV_ROUND_UP(cpu_size, PAGE_SIZE);
+	unsigned int cpu;
+	int err, i;
+
+	pcpu_unit_pages_shift = max_t(int, PCPU_MIN_UNIT_PAGES_SHIFT,
+				      order_base_2(cpu_size) - PAGE_SHIFT);
+
+	pcpu_static_size = cpu_size;
+	pcpu_unit_pages = 1 << pcpu_unit_pages_shift;
+	pcpu_unit_shift = PAGE_SHIFT + pcpu_unit_pages_shift;
+	pcpu_unit_size = 1 << pcpu_unit_shift;
+	pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size;
+	pcpu_nr_slots = pcpu_size_to_slot(pcpu_unit_size) + 1;
+	pcpu_chunk_struct_size = sizeof(struct pcpu_chunk)
+		+ (1 << pcpu_unit_pages_shift) * sizeof(struct page *);
+
+	/* allocate chunk slots */
+	pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0]));
+	for (i = 0; i < pcpu_nr_slots; i++)
+		INIT_LIST_HEAD(&pcpu_slot[i]);
+
+	/* init and register vm area */
+	static_vm.flags = VM_ALLOC;
+	static_vm.size = pcpu_chunk_size;
+	vm_area_register_early(&static_vm);
+
+	/* init static_chunk */
+	static_chunk = alloc_bootmem(pcpu_chunk_struct_size);
+	INIT_LIST_HEAD(&static_chunk->list);
+	static_chunk->vm = &static_vm;
+	static_chunk->free_size = pcpu_unit_size - pcpu_static_size;
+	static_chunk->contig_hint = static_chunk->free_size;
+
+	/* assign pages and map them */
+	for_each_possible_cpu(cpu) {
+		for (i = 0; i < nr_cpu_pages; i++) {
+			*pcpu_chunk_pagep(static_chunk, cpu, i) = *pages++;
+			populate_pte_fn(pcpu_chunk_addr(static_chunk, cpu, i));
+		}
+	}
+
+	err = pcpu_map(static_chunk, 0, nr_cpu_pages);
+	if (err)
+		panic("failed to setup static percpu area, err=%d\n", err);
+
+	/* link static_chunk in */
+	pcpu_chunk_relocate(static_chunk, -1);
+	pcpu_chunk_addr_insert(static_chunk);
+
+	/* we're done */
+	pcpu_base_addr = (void *)pcpu_chunk_addr(static_chunk, 0, 0);
+	return pcpu_unit_size;
+}
-- 
cgit v1.2.3


From f9349a8f978929a0c71d2c42ae299f7d462c239d Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 19 Feb 2009 21:13:12 +0100
Subject: tracing/function-graph-tracer: make set_graph_function file support
 ftrace regex

Impact: trace only functions matching a pattern

The set_graph_function file let one to trace only one or several
chosen functions and follow all their code flow.

Currently, only a constant function name is allowed so this patch
allows the ftrace_regex functions:

- matches all functions that end with "name":
  echo *name > set_graph_function

- matches all functions that begin with "name":
  echo name* > set_graph_function

- matches all functions that contains "name":
  echo *name* > set_graph_function

Example:

echo mutex* > set_graph_function

 0)               |  mutex_lock_nested() {
 0)   0.563 us    |    __might_sleep();
 0)   2.072 us    |  }
 0)               |  mutex_unlock() {
 0)   1.036 us    |    __mutex_unlock_slowpath();
 0)   2.433 us    |  }
 0)               |  mutex_unlock() {
 0)   0.691 us    |    __mutex_unlock_slowpath();
 0)   1.787 us    |  }
 0)               |  mutex_lock_interruptible_nested() {
 0)   0.548 us    |    __might_sleep();
 0)   1.945 us    |  }

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 56 ++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 38 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 7dd5a2bef9cd..cf59f4c54745 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1895,6 +1895,10 @@ static void *g_start(struct seq_file *m, loff_t *pos)
 
 	mutex_lock(&graph_lock);
 
+	/* Nothing, tell g_show to print all functions are enabled */
+	if (!ftrace_graph_count && !*pos)
+		return (void *)1;
+
 	p = g_next(m, p, pos);
 
 	return p;
@@ -1913,6 +1917,11 @@ static int g_show(struct seq_file *m, void *v)
 	if (!ptr)
 		return 0;
 
+	if (ptr == (unsigned long *)1) {
+		seq_printf(m, "#### all functions enabled ####\n");
+		return 0;
+	}
+
 	kallsyms_lookup(*ptr, NULL, NULL, NULL, str);
 
 	seq_printf(m, "%s\n", str);
@@ -1966,38 +1975,51 @@ ftrace_graph_read(struct file *file, char __user *ubuf,
 }
 
 static int
-ftrace_set_func(unsigned long *array, int idx, char *buffer)
+ftrace_set_func(unsigned long *array, int *idx, char *buffer)
 {
-	char str[KSYM_SYMBOL_LEN];
 	struct dyn_ftrace *rec;
 	struct ftrace_page *pg;
+	int search_len;
 	int found = 0;
-	int j;
+	int type, not;
+	char *search;
+	bool exists;
+	int i;
 
 	if (ftrace_disabled)
 		return -ENODEV;
 
+	/* decode regex */
+	type = ftrace_setup_glob(buffer, strlen(buffer), &search, &not);
+	if (not)
+		return -EINVAL;
+
+	search_len = strlen(search);
+
 	mutex_lock(&ftrace_lock);
 	do_for_each_ftrace_rec(pg, rec) {
 
+		if (*idx >= FTRACE_GRAPH_MAX_FUNCS)
+			break;
+
 		if (rec->flags & (FTRACE_FL_FAILED | FTRACE_FL_FREE))
 			continue;
 
-		kallsyms_lookup(rec->ip, NULL, NULL, NULL, str);
-		if (strcmp(str, buffer) == 0) {
-			/* Return 1 if we add it to the array */
-			found = 1;
-			for (j = 0; j < idx; j++)
-				if (array[j] == rec->ip) {
-					found = 0;
+		if (ftrace_match_record(rec, search, search_len, type)) {
+			/* ensure it is not already in the array */
+			exists = false;
+			for (i = 0; i < *idx; i++)
+				if (array[i] == rec->ip) {
+					exists = true;
 					break;
 				}
-			if (found)
-				array[idx] = rec->ip;
-			goto out;
+			if (!exists) {
+				array[(*idx)++] = rec->ip;
+				found = 1;
+			}
 		}
 	} while_for_each_ftrace_rec();
- out:
+
 	mutex_unlock(&ftrace_lock);
 
 	return found ? 0 : -EINVAL;
@@ -2066,13 +2088,11 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
 	}
 	buffer[index] = 0;
 
-	/* we allow only one at a time */
-	ret = ftrace_set_func(array, ftrace_graph_count, buffer);
+	/* we allow only one expression at a time */
+	ret = ftrace_set_func(array, &ftrace_graph_count, buffer);
 	if (ret)
 		goto out;
 
-	ftrace_graph_count++;
-
 	file->f_pos += read;
 
 	ret = read;
-- 
cgit v1.2.3


From 000ab691172db3921efa3cb7f17fc79235a1de7f Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 17 Feb 2009 13:35:06 -0500
Subject: ftrace: allow archs to preform pre and post process for code
 modification

This patch creates the weak functions: ftrace_arch_code_modify_prepare
and ftrace_arch_code_modify_post_process that are called before and
after the stop machine is called to modify the kernel text.

If the arch needs to do pre or post processing, it only needs to define
these functions.

[ Update: Ingo Molnar suggested using the name ftrace_arch_code_modify_*
          over using ftrace_arch_modify_* ]

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 include/linux/ftrace.h |  3 +++
 kernel/trace/ftrace.c  | 28 ++++++++++++++++++++++++++++
 2 files changed, 31 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 677432b9cb7e..fdb2a89ae543 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -99,6 +99,9 @@ stack_trace_sysctl(struct ctl_table *table, int write,
 /* asm/ftrace.h must be defined for archs supporting dynamic ftrace */
 #include <asm/ftrace.h>
 
+int ftrace_arch_code_modify_prepare(void);
+int ftrace_arch_code_modify_post_process(void);
+
 enum {
 	FTRACE_FL_FREE		= (1 << 0),
 	FTRACE_FL_FAILED	= (1 << 1),
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index fdf913dfc7e8..72316d9647bd 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -585,6 +585,24 @@ ftrace_code_disable(struct module *mod, struct dyn_ftrace *rec)
 	return 1;
 }
 
+/*
+ * archs can override this function if they must do something
+ * before the modifying code is performed.
+ */
+int __weak ftrace_arch_code_modify_prepare(void)
+{
+	return 0;
+}
+
+/*
+ * archs can override this function if they must do something
+ * after the modifying code is performed.
+ */
+int __weak ftrace_arch_code_modify_post_process(void)
+{
+	return 0;
+}
+
 static int __ftrace_modify_code(void *data)
 {
 	int *command = data;
@@ -607,7 +625,17 @@ static int __ftrace_modify_code(void *data)
 
 static void ftrace_run_update_code(int command)
 {
+	int ret;
+
+	ret = ftrace_arch_code_modify_prepare();
+	FTRACE_WARN_ON(ret);
+	if (ret)
+		return;
+
 	stop_machine(__ftrace_modify_code, &command, NULL);
+
+	ret = ftrace_arch_code_modify_post_process();
+	FTRACE_WARN_ON(ret);
 }
 
 static ftrace_func_t saved_ftrace_func;
-- 
cgit v1.2.3


From 4377245aa93b65b6597e4b7bb460fb9abc48b56b Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 19 Feb 2009 13:41:27 -0500
Subject: ftrace: break out modify loop immediately on detection of error

Impact: added precaution on failure detection

Break out of the modifying loop as soon as a failure is detected.
This is just an added precaution found by code review and was not
found by any bug chasing.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ftrace.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 72316d9647bd..11ad796ca049 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -561,8 +561,11 @@ static void ftrace_replace_code(int enable)
 				if ((system_state == SYSTEM_BOOTING) ||
 				    !core_kernel_text(rec->ip)) {
 					ftrace_free_rec(rec);
-				} else
+				} else {
 					ftrace_bug(failed, rec->ip);
+					/* Stop processing */
+					return;
+				}
 			}
 		}
 	}
-- 
cgit v1.2.3


From eed3ee08292d821282169708e5e8e89a0d0a0c63 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sat, 14 Feb 2009 02:00:19 +0100
Subject: PM/resume: wait for device probing to finish

the resume code does not currently wait for device probing to finish.
Even without async function calls this is dicey and not correct,
but with async function calls during the boot sequence this is going
to get hit more...

This patch adds the synchronization using the newly introduced helper.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Cc: Len Brown <lenb@kernel.org>
Acked-by: Greg KH <gregkh@suse.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/power/disk.c | 11 +++++++++++
 1 file changed, 11 insertions(+)

(limited to 'kernel')

diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 432ee575c9ee..7b40e94b1d42 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -594,6 +594,12 @@ static int software_resume(void)
 	int error;
 	unsigned int flags;
 
+	/*
+	 * If the user said "noresume".. bail out early.
+	 */
+	if (noresume)
+		return 0;
+
 	/*
 	 * name_to_dev_t() below takes a sysfs buffer mutex when sysfs
 	 * is configured into the kernel. Since the regular hibernate
@@ -610,6 +616,11 @@ static int software_resume(void)
 			mutex_unlock(&pm_mutex);
 			return -ENOENT;
 		}
+		/*
+		 * Some device discovery might still be in progress; we need
+		 * to wait for this to finish.
+		 */
+		wait_for_device_probe();
 		swsusp_resume_device = name_to_dev_t(resume_file);
 		pr_debug("PM: Resume from partition %s\n", resume_file);
 	} else {
-- 
cgit v1.2.3


From 09664fda48c5dd63277f1f42888ca9d5dca6037a Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Sat, 14 Feb 2009 02:02:16 +0100
Subject: PM: fix build for CONFIG_PM unset

Compilation of kprobes.c with CONFIG_PM unset is broken due to some broken
config dependncies.  Fix that.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Cc: Len Brown <lenb@kernel.org>
Cc: Greg KH <gregkh@suse.de>
Reported-by: Ingo Molnar <mingo@elte.hu>
Tested-by: Masami Hiramatsu <mhiramat@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/Makefile       | 1 +
 kernel/power/Makefile | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/Makefile b/kernel/Makefile
index 170a9213c1b6..e4791b3ba55d 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -51,6 +51,7 @@ obj-$(CONFIG_UID16) += uid16.o
 obj-$(CONFIG_MODULES) += module.o
 obj-$(CONFIG_KALLSYMS) += kallsyms.o
 obj-$(CONFIG_PM) += power/
+obj-$(CONFIG_FREEZER) += power/
 obj-$(CONFIG_BSD_PROCESS_ACCT) += acct.o
 obj-$(CONFIG_KEXEC) += kexec.o
 obj-$(CONFIG_BACKTRACE_SELF_TEST) += backtracetest.o
diff --git a/kernel/power/Makefile b/kernel/power/Makefile
index d7a10167a25b..720ea4f781bd 100644
--- a/kernel/power/Makefile
+++ b/kernel/power/Makefile
@@ -3,7 +3,7 @@ ifeq ($(CONFIG_PM_DEBUG),y)
 EXTRA_CFLAGS	+=	-DDEBUG
 endif
 
-obj-y				:= main.o
+obj-$(CONFIG_PM)		+= main.o
 obj-$(CONFIG_PM_SLEEP)		+= console.o
 obj-$(CONFIG_FREEZER)		+= process.o
 obj-$(CONFIG_HIBERNATION)	+= swsusp.o disk.o snapshot.o swap.o user.o
-- 
cgit v1.2.3


From ebae2604f2c3693717d9dc687c84578f0526480c Mon Sep 17 00:00:00 2001
From: Andrey Borzenkov <arvidjaar@mail.ru>
Date: Sat, 14 Feb 2009 02:05:14 +0100
Subject: PM: Fix pm_notifiers during user mode hibernation

Snapshot device is opened with O_RDONLY during suspend and O_WRONLY durig
resume.  Make sure we also call notifiers with correct parameter telling
them what we are really doing.

Signed-off-by: Andrey Borzenkov <arvidjaar@mail.ru>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Cc: Len Brown <lenb@kernel.org>
Cc: Greg KH <gregkh@suse.de>
Acked-by: Pavel Machek <pavel@ucw.cz>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/power/user.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/user.c b/kernel/power/user.c
index 005b93d839ba..6c85359364f2 100644
--- a/kernel/power/user.c
+++ b/kernel/power/user.c
@@ -95,15 +95,15 @@ static int snapshot_open(struct inode *inode, struct file *filp)
 		data->swap = swsusp_resume_device ?
 			swap_type_of(swsusp_resume_device, 0, NULL) : -1;
 		data->mode = O_RDONLY;
-		error = pm_notifier_call_chain(PM_RESTORE_PREPARE);
+		error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);
 		if (error)
-			pm_notifier_call_chain(PM_POST_RESTORE);
+			pm_notifier_call_chain(PM_POST_HIBERNATION);
 	} else {
 		data->swap = -1;
 		data->mode = O_WRONLY;
-		error = pm_notifier_call_chain(PM_HIBERNATION_PREPARE);
+		error = pm_notifier_call_chain(PM_RESTORE_PREPARE);
 		if (error)
-			pm_notifier_call_chain(PM_POST_HIBERNATION);
+			pm_notifier_call_chain(PM_POST_RESTORE);
 	}
 	if (error)
 		atomic_inc(&snapshot_device_available);
-- 
cgit v1.2.3


From b090f9fa53d51c8a33370071de9e391919ee1fa7 Mon Sep 17 00:00:00 2001
From: Arve Hjønnevåg <arve@android.com>
Date: Sat, 14 Feb 2009 02:06:17 +0100
Subject: PM: Wait for console in resume
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Avoids later waking up to a blinking cursor if the device woke up and
returned to sleep before the console switch happened.

Signed-off-by: Brian Swetland <swetland@google.com>
Signed-off-by: Arve Hjønnevåg <arve@android.com>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Cc: Len Brown <lenb@kernel.org>
Cc: Greg KH <gregkh@suse.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/power/console.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'kernel')

diff --git a/kernel/power/console.c b/kernel/power/console.c
index b8628be2a465..a3961b205de7 100644
--- a/kernel/power/console.c
+++ b/kernel/power/console.c
@@ -78,6 +78,12 @@ void pm_restore_console(void)
 	}
 	set_console(orig_fgconsole);
 	release_console_sem();
+
+	if (vt_waitactive(orig_fgconsole)) {
+		pr_debug("Resume: Can't switch VCs.");
+		return;
+	}
+
 	kmsg_redirect = orig_kmsg;
 }
 #endif
-- 
cgit v1.2.3


From 403f307576396f3362fbb65af190885b6036c72c Mon Sep 17 00:00:00 2001
From: Arve Hjønnevåg <arve@android.com>
Date: Sat, 14 Feb 2009 02:07:24 +0100
Subject: PM: Fix suspend_console and resume_console to use only one semaphore
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This fixes a race where a thread acquires the console while the
console is suspended, and the console is resumed before this
thread releases it. In this case, the secondary console
semaphore would be left locked, and the primary semaphore would
be released twice. This in turn would cause the console switch
on suspend or resume to hang forever.

Note that suspend_console does not actually lock the console
for clients that use acquire_console_sem, it only locks it for
clients that use try_acquire_console_sem. If we change
suspend_console to fully lock the console, then the kernel
may deadlock on suspend. One client of try_acquire_console_sem
is acquire_console_semaphore_for_printk, which uses it to
prevent printk from using the console while it is suspended.

Signed-off-by: Arve Hjønnevåg <arve@android.com>
Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Cc: Len Brown <lenb@kernel.org>
Cc: Greg KH <gregkh@suse.de>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/printk.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index 69188f226a93..e3602d0755b0 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -73,7 +73,6 @@ EXPORT_SYMBOL(oops_in_progress);
  * driver system.
  */
 static DECLARE_MUTEX(console_sem);
-static DECLARE_MUTEX(secondary_console_sem);
 struct console *console_drivers;
 EXPORT_SYMBOL_GPL(console_drivers);
 
@@ -891,12 +890,14 @@ void suspend_console(void)
 	printk("Suspending console(s) (use no_console_suspend to debug)\n");
 	acquire_console_sem();
 	console_suspended = 1;
+	up(&console_sem);
 }
 
 void resume_console(void)
 {
 	if (!console_suspend_enabled)
 		return;
+	down(&console_sem);
 	console_suspended = 0;
 	release_console_sem();
 }
@@ -912,11 +913,9 @@ void resume_console(void)
 void acquire_console_sem(void)
 {
 	BUG_ON(in_interrupt());
-	if (console_suspended) {
-		down(&secondary_console_sem);
-		return;
-	}
 	down(&console_sem);
+	if (console_suspended)
+		return;
 	console_locked = 1;
 	console_may_schedule = 1;
 }
@@ -926,6 +925,10 @@ int try_acquire_console_sem(void)
 {
 	if (down_trylock(&console_sem))
 		return -1;
+	if (console_suspended) {
+		up(&console_sem);
+		return -1;
+	}
 	console_locked = 1;
 	console_may_schedule = 0;
 	return 0;
@@ -979,7 +982,7 @@ void release_console_sem(void)
 	unsigned wake_klogd = 0;
 
 	if (console_suspended) {
-		up(&secondary_console_sem);
+		up(&console_sem);
 		return;
 	}
 
-- 
cgit v1.2.3


From 770824bdc421ff58a64db608294323571c949f4c Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Sun, 22 Feb 2009 18:38:50 +0100
Subject: PM: Split up sysdev_[suspend|resume] from device_power_[down|up]

Move the sysdev_suspend/resume from the callee to the callers, with
no real change in semantics, so that we can rework the disabling of
interrupts during suspend/hibernation.

This is based on an earlier patch from Linus.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/apm_32.c  |  4 ++++
 drivers/base/base.h       |  2 --
 drivers/base/power/main.c |  3 ---
 drivers/xen/manage.c      |  8 ++++++++
 include/linux/pm.h        |  2 ++
 kernel/kexec.c            |  7 +++++++
 kernel/power/disk.c       | 11 +++++++++++
 kernel/power/main.c       |  8 ++++++--
 8 files changed, 38 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 98807bb095ad..266ec6c18b6c 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -1192,6 +1192,7 @@ static int suspend(int vetoable)
 	device_suspend(PMSG_SUSPEND);
 	local_irq_disable();
 	device_power_down(PMSG_SUSPEND);
+	sysdev_suspend(PMSG_SUSPEND);
 
 	local_irq_enable();
 
@@ -1208,6 +1209,7 @@ static int suspend(int vetoable)
 	if (err != APM_SUCCESS)
 		apm_error("suspend", err);
 	err = (err == APM_SUCCESS) ? 0 : -EIO;
+	sysdev_resume();
 	device_power_up(PMSG_RESUME);
 	local_irq_enable();
 	device_resume(PMSG_RESUME);
@@ -1228,6 +1230,7 @@ static void standby(void)
 
 	local_irq_disable();
 	device_power_down(PMSG_SUSPEND);
+	sysdev_suspend(PMSG_SUSPEND);
 	local_irq_enable();
 
 	err = set_system_power_state(APM_STATE_STANDBY);
@@ -1235,6 +1238,7 @@ static void standby(void)
 		apm_error("standby", err);
 
 	local_irq_disable();
+	sysdev_resume();
 	device_power_up(PMSG_RESUME);
 	local_irq_enable();
 }
diff --git a/drivers/base/base.h b/drivers/base/base.h
index 0a5f055dffba..9f50f1b545dc 100644
--- a/drivers/base/base.h
+++ b/drivers/base/base.h
@@ -88,8 +88,6 @@ extern void driver_detach(struct device_driver *drv);
 extern int driver_probe_device(struct device_driver *drv, struct device *dev);
 
 extern void sysdev_shutdown(void);
-extern int sysdev_suspend(pm_message_t state);
-extern int sysdev_resume(void);
 
 extern char *make_class_name(const char *name, struct kobject *kobj);
 
diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index 670c9d6c1407..2d14f4ae6c01 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -333,7 +333,6 @@ static void dpm_power_up(pm_message_t state)
  */
 void device_power_up(pm_message_t state)
 {
-	sysdev_resume();
 	dpm_power_up(state);
 }
 EXPORT_SYMBOL_GPL(device_power_up);
@@ -577,8 +576,6 @@ int device_power_down(pm_message_t state)
 		}
 		dev->power.status = DPM_OFF_IRQ;
 	}
-	if (!error)
-		error = sysdev_suspend(state);
 	if (error)
 		dpm_power_up(resume_event(state));
 	return error;
diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index 9b91617b9582..56892a142ee2 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -45,6 +45,13 @@ static int xen_suspend(void *data)
 		       err);
 		return err;
 	}
+	err = sysdev_suspend(PMSG_SUSPEND);
+	if (err) {
+		printk(KERN_ERR "xen_suspend: sysdev_suspend failed: %d\n",
+			err);
+		device_power_up(PMSG_RESUME);
+		return err;
+	}
 
 	xen_mm_pin_all();
 	gnttab_suspend();
@@ -61,6 +68,7 @@ static int xen_suspend(void *data)
 	gnttab_resume();
 	xen_mm_unpin_all();
 
+	sysdev_resume();
 	device_power_up(PMSG_RESUME);
 
 	if (!*cancelled) {
diff --git a/include/linux/pm.h b/include/linux/pm.h
index de2e0a8f6728..24ba5f67b3a3 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -381,10 +381,12 @@ struct dev_pm_info {
 
 #ifdef CONFIG_PM_SLEEP
 extern void device_pm_lock(void);
+extern int sysdev_resume(void);
 extern void device_power_up(pm_message_t state);
 extern void device_resume(pm_message_t state);
 
 extern void device_pm_unlock(void);
+extern int sysdev_suspend(pm_message_t state);
 extern int device_power_down(pm_message_t state);
 extern int device_suspend(pm_message_t state);
 extern int device_prepare_suspend(pm_message_t state);
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 8a6d7b08864e..483899578259 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1465,6 +1465,11 @@ int kernel_kexec(void)
 		error = device_power_down(PMSG_FREEZE);
 		if (error)
 			goto Enable_irqs;
+
+		/* Suspend system devices */
+		error = sysdev_suspend(PMSG_FREEZE);
+		if (error)
+			goto Power_up_devices;
 	} else
 #endif
 	{
@@ -1477,6 +1482,8 @@ int kernel_kexec(void)
 
 #ifdef CONFIG_KEXEC_JUMP
 	if (kexec_image->preserve_context) {
+		sysdev_resume();
+ Power_up_devices:
 		device_power_up(PMSG_RESTORE);
  Enable_irqs:
 		local_irq_enable();
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 7b40e94b1d42..4a4a206b1979 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -227,6 +227,12 @@ static int create_image(int platform_mode)
 			"aborting hibernation\n");
 		goto Enable_irqs;
 	}
+	sysdev_suspend(PMSG_FREEZE);
+	if (error) {
+		printk(KERN_ERR "PM: Some devices failed to power down, "
+			"aborting hibernation\n");
+		goto Power_up_devices;
+	}
 
 	if (hibernation_test(TEST_CORE))
 		goto Power_up;
@@ -242,9 +248,11 @@ static int create_image(int platform_mode)
 	if (!in_suspend)
 		platform_leave(platform_mode);
  Power_up:
+	sysdev_resume();
 	/* NOTE:  device_power_up() is just a resume() for devices
 	 * that suspended with irqs off ... no overall powerup.
 	 */
+ Power_up_devices:
 	device_power_up(in_suspend ?
 		(error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
  Enable_irqs:
@@ -335,6 +343,7 @@ static int resume_target_kernel(void)
 			"aborting resume\n");
 		goto Enable_irqs;
 	}
+	sysdev_suspend(PMSG_QUIESCE);
 	/* We'll ignore saved state, but this gets preempt count (etc) right */
 	save_processor_state();
 	error = restore_highmem();
@@ -357,6 +366,7 @@ static int resume_target_kernel(void)
 	swsusp_free();
 	restore_processor_state();
 	touch_softlockup_watchdog();
+	sysdev_resume();
 	device_power_up(PMSG_RECOVER);
  Enable_irqs:
 	local_irq_enable();
@@ -440,6 +450,7 @@ int hibernation_platform_enter(void)
 	local_irq_disable();
 	error = device_power_down(PMSG_HIBERNATE);
 	if (!error) {
+		sysdev_suspend(PMSG_HIBERNATE);
 		hibernation_ops->enter();
 		/* We should never get here */
 		while (1);
diff --git a/kernel/power/main.c b/kernel/power/main.c
index b4d219016b6c..c9632f841f64 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -298,8 +298,12 @@ static int suspend_enter(suspend_state_t state)
 		goto Done;
 	}
 
-	if (!suspend_test(TEST_CORE))
-		error = suspend_ops->enter(state);
+	error = sysdev_suspend(PMSG_SUSPEND);
+	if (!error) {
+		if (!suspend_test(TEST_CORE))
+			error = suspend_ops->enter(state);
+		sysdev_resume();
+	}
 
 	device_power_up(PMSG_RESUME);
  Done:
-- 
cgit v1.2.3


From 5e01cb695d29619dd551bac7d6aa4ef1dc8ebc95 Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Tue, 24 Feb 2009 13:55:18 +0100
Subject: x86, ftrace: fix section mismatch in hw-branch-tracer

Fix an invalid memory reference problem when cpu hotplug support is
disabled and the hw-branch-tracer is set as current tracer.

Initializing the tracer calls bts_trace_init() which has already
been freed at this time.

Reported-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_hw_branches.c | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index 3561aace075c..3335e807144b 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -127,20 +127,18 @@ static struct notifier_block bts_hotcpu_notifier __cpuinitdata = {
 	.notifier_call = bts_hotcpu_handler
 };
 
-static int __cpuinit bts_trace_init(struct trace_array *tr)
+static int bts_trace_init(struct trace_array *tr)
 {
 	hw_branch_trace = tr;
 
-	register_hotcpu_notifier(&bts_hotcpu_notifier);
 	bts_trace_start(tr);
 
 	return 0;
 }
 
-static void __cpuinit bts_trace_reset(struct trace_array *tr)
+static void bts_trace_reset(struct trace_array *tr)
 {
 	bts_trace_stop(tr);
-	unregister_hotcpu_notifier(&bts_hotcpu_notifier);
 }
 
 static void bts_trace_print_header(struct seq_file *m)
@@ -299,6 +297,7 @@ struct tracer bts_tracer __read_mostly =
 
 __init static int init_bts_trace(void)
 {
+	register_hotcpu_notifier(&bts_hotcpu_notifier);
 	return register_tracer(&bts_tracer);
 }
 device_initcall(init_bts_trace);
-- 
cgit v1.2.3


From b77e38aa240c3bd9c55c98b9f7c81541e042eae5 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 24 Feb 2009 10:21:36 -0500
Subject: tracing: add event trace infrastructure

This patch creates the event tracing infrastructure of ftrace.
It will create the files:

 /debug/tracing/available_events
 /debug/tracing/set_event

The available_events will list the trace points that have been
registered with the event tracer.

set_events will allow the user to enable or disable an event hook.

example:

 # echo sched_wakeup > /debug/tracing/set_event

Will enable the sched_wakeup event (if it is registered).

 # echo "!sched_wakeup" >> /debug/tracing/set_event

Will disable the sched_wakeup event (and only that event).

 # echo > /debug/tracing/set_event

Will disable all events (notice the '>')

 # cat /debug/tracing/available_events > /debug/tracing/set_event

Will enable all registered event hooks.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 include/asm-generic/vmlinux.lds.h |  11 +-
 kernel/trace/Kconfig              |   9 ++
 kernel/trace/Makefile             |   1 +
 kernel/trace/trace_events.c       | 280 ++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace_events.h       |  52 +++++++
 5 files changed, 352 insertions(+), 1 deletion(-)
 create mode 100644 kernel/trace/trace_events.c
 create mode 100644 kernel/trace/trace_events.h

(limited to 'kernel')

diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index c61fab1dd2f8..0add6b28c366 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -61,6 +61,14 @@
 #define BRANCH_PROFILE()
 #endif
 
+#ifdef CONFIG_EVENT_TRACER
+#define FTRACE_EVENTS()	VMLINUX_SYMBOL(__start_ftrace_events) = .;	\
+			*(_ftrace_events)				\
+			VMLINUX_SYMBOL(__stop_ftrace_events) = .;
+#else
+#define FTRACE_EVENTS()
+#endif
+
 /* .data section */
 #define DATA_DATA							\
 	*(.data)							\
@@ -81,7 +89,8 @@
 	*(__tracepoints)						\
 	VMLINUX_SYMBOL(__stop___tracepoints) = .;			\
 	LIKELY_PROFILE()		       				\
-	BRANCH_PROFILE()
+	BRANCH_PROFILE()						\
+	FTRACE_EVENTS()
 
 #define RO_DATA(align)							\
 	. = ALIGN((align));						\
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 07877f4b5233..999c6a2485df 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -159,6 +159,15 @@ config CONTEXT_SWITCH_TRACER
 	  This tracer gets called from the context switch and records
 	  all switching of tasks.
 
+config EVENT_TRACER
+	bool "Trace various events in the kernel"
+	depends on DEBUG_KERNEL
+	select TRACING
+	help
+	  This tracer hooks to various trace points in the kernel
+	  allowing the user to pick and choose which trace point they
+	  want to trace.
+
 config BOOT_TRACER
 	bool "Trace boot initcalls"
 	depends on DEBUG_KERNEL
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 627090bc262d..c7363568b1cf 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -38,5 +38,6 @@ obj-$(CONFIG_POWER_TRACER) += trace_power.o
 obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
 obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
 obj-$(CONFIG_BLK_DEV_IO_TRACE)	+= blktrace.o
+obj-$(CONFIG_EVENT_TRACER) += trace_events.o
 
 libftrace-y := ftrace.o
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
new file mode 100644
index 000000000000..05bc80ec8d2c
--- /dev/null
+++ b/kernel/trace/trace_events.c
@@ -0,0 +1,280 @@
+/*
+ * event tracer
+ *
+ * Copyright (C) 2008 Red Hat Inc, Steven Rostedt <srostedt@redhat.com>
+ *
+ */
+
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/ctype.h>
+
+#include "trace_events.h"
+
+void event_trace_printk(unsigned long ip, const char *fmt, ...)
+{
+	va_list ap;
+
+	va_start(ap, fmt);
+	tracing_record_cmdline(current);
+	trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap);
+	va_end(ap);
+}
+
+static void ftrace_clear_events(void)
+{
+	struct ftrace_event_call *call = (void *)__start_ftrace_events;
+
+
+	while ((unsigned long)call < (unsigned long)__stop_ftrace_events) {
+
+		if (call->enabled) {
+			call->enabled = 0;
+			call->unregfunc();
+		}
+		call++;
+	}
+}
+
+static int ftrace_set_clr_event(char *buf, int set)
+{
+	struct ftrace_event_call *call = (void *)__start_ftrace_events;
+
+
+	while ((unsigned long)call < (unsigned long)__stop_ftrace_events) {
+
+		if (strcmp(buf, call->name) != 0) {
+			call++;
+			continue;
+		}
+
+		if (set) {
+			/* Already set? */
+			if (call->enabled)
+				return 0;
+			call->enabled = 1;
+			call->regfunc();
+		} else {
+			/* Already cleared? */
+			if (!call->enabled)
+				return 0;
+			call->enabled = 0;
+			call->unregfunc();
+		}
+		return 0;
+	}
+	return -EINVAL;
+}
+
+/* 128 should be much more than enough */
+#define EVENT_BUF_SIZE		127
+
+static ssize_t
+ftrace_event_write(struct file *file, const char __user *ubuf,
+		   size_t cnt, loff_t *ppos)
+{
+	size_t read = 0;
+	int i, set = 1;
+	ssize_t ret;
+	char *buf;
+	char ch;
+
+	if (!cnt || cnt < 0)
+		return 0;
+
+	ret = get_user(ch, ubuf++);
+	if (ret)
+		return ret;
+	read++;
+	cnt--;
+
+	/* skip white space */
+	while (cnt && isspace(ch)) {
+		ret = get_user(ch, ubuf++);
+		if (ret)
+			return ret;
+		read++;
+		cnt--;
+	}
+
+	/* Only white space found? */
+	if (isspace(ch)) {
+		file->f_pos += read;
+		ret = read;
+		return ret;
+	}
+
+	buf = kmalloc(EVENT_BUF_SIZE+1, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	if (cnt > EVENT_BUF_SIZE)
+		cnt = EVENT_BUF_SIZE;
+
+	i = 0;
+	while (cnt && !isspace(ch)) {
+		if (!i && ch == '!')
+			set = 0;
+		else
+			buf[i++] = ch;
+
+		ret = get_user(ch, ubuf++);
+		if (ret)
+			goto out_free;
+		read++;
+		cnt--;
+	}
+	buf[i] = 0;
+
+	file->f_pos += read;
+
+	ret = ftrace_set_clr_event(buf, set);
+	if (ret)
+		goto out_free;
+
+	ret = read;
+
+ out_free:
+	kfree(buf);
+
+	return ret;
+}
+
+static void *
+t_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct ftrace_event_call *call = m->private;
+	struct ftrace_event_call *next = call;
+
+	(*pos)++;
+
+	if ((unsigned long)call >= (unsigned long)__stop_ftrace_events)
+		return NULL;
+
+	m->private = ++next;
+
+	return call;
+}
+
+static void *t_start(struct seq_file *m, loff_t *pos)
+{
+	return t_next(m, NULL, pos);
+}
+
+static void *
+s_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	struct ftrace_event_call *call = m->private;
+	struct ftrace_event_call *next;
+
+	(*pos)++;
+
+ retry:
+	if ((unsigned long)call >= (unsigned long)__stop_ftrace_events)
+		return NULL;
+
+	if (!call->enabled) {
+		call++;
+		goto retry;
+	}
+
+	next = call;
+	m->private = ++next;
+
+	return call;
+}
+
+static void *s_start(struct seq_file *m, loff_t *pos)
+{
+	return s_next(m, NULL, pos);
+}
+
+static int t_show(struct seq_file *m, void *v)
+{
+	struct ftrace_event_call *call = v;
+
+	seq_printf(m, "%s\n", call->name);
+
+	return 0;
+}
+
+static void t_stop(struct seq_file *m, void *p)
+{
+}
+
+static int
+ftrace_event_seq_open(struct inode *inode, struct file *file)
+{
+	int ret;
+	const struct seq_operations *seq_ops;
+
+	if ((file->f_mode & FMODE_WRITE) &&
+	    !(file->f_flags & O_APPEND))
+		ftrace_clear_events();
+
+	seq_ops = inode->i_private;
+	ret = seq_open(file, seq_ops);
+	if (!ret) {
+		struct seq_file *m = file->private_data;
+
+		m->private = __start_ftrace_events;
+	}
+	return ret;
+}
+
+static const struct seq_operations show_event_seq_ops = {
+	.start = t_start,
+	.next = t_next,
+	.show = t_show,
+	.stop = t_stop,
+};
+
+static const struct seq_operations show_set_event_seq_ops = {
+	.start = s_start,
+	.next = s_next,
+	.show = t_show,
+	.stop = t_stop,
+};
+
+static const struct file_operations ftrace_avail_fops = {
+	.open = ftrace_event_seq_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = seq_release,
+};
+
+static const struct file_operations ftrace_set_event_fops = {
+	.open = ftrace_event_seq_open,
+	.read = seq_read,
+	.write = ftrace_event_write,
+	.llseek = seq_lseek,
+	.release = seq_release,
+};
+
+static __init int event_trace_init(void)
+{
+	struct dentry *d_tracer;
+	struct dentry *entry;
+
+	d_tracer = tracing_init_dentry();
+	if (!d_tracer)
+		return 0;
+
+	entry = debugfs_create_file("available_events", 0444, d_tracer,
+				    (void *)&show_event_seq_ops,
+				    &ftrace_avail_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'available_events' entry\n");
+
+	entry = debugfs_create_file("set_event", 0644, d_tracer,
+				    (void *)&show_set_event_seq_ops,
+				    &ftrace_set_event_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'set_event' entry\n");
+
+	return 0;
+}
+fs_initcall(event_trace_init);
diff --git a/kernel/trace/trace_events.h b/kernel/trace/trace_events.h
new file mode 100644
index 000000000000..39342f86db27
--- /dev/null
+++ b/kernel/trace/trace_events.h
@@ -0,0 +1,52 @@
+#ifndef _LINUX_KERNEL_TRACE_EVENTS_H
+#define _LINUX_KERNEL_TRACE_EVENTS_H
+
+#include <linux/ftrace.h>
+#include "trace.h"
+
+struct ftrace_event_call {
+	char		*name;
+	int		enabled;
+	int		(*regfunc)(void);
+	void		(*unregfunc)(void);
+};
+
+
+#undef TPFMT
+#define TPFMT(fmt, args...)	fmt "\n", ##args
+
+#undef DEFINE_TRACE_FMT
+#define DEFINE_TRACE_FMT(call, proto, args, fmt)			\
+static void ftrace_event_##call(proto)					\
+{									\
+	event_trace_printk(_RET_IP_, "(" #call ") " fmt);		\
+}									\
+									\
+static int ftrace_reg_event_##call(void)				\
+{									\
+	int ret;							\
+									\
+	ret = register_trace_##call(ftrace_event_##call);		\
+	if (!ret)							\
+		pr_info("event trace: Could not activate trace point "	\
+			"probe to " #call);				\
+	return ret;							\
+}									\
+									\
+static void ftrace_unreg_event_##call(void)				\
+{									\
+	unregister_trace_##call(ftrace_event_##call);			\
+}									\
+									\
+static struct ftrace_event_call __used					\
+__attribute__((section("_ftrace_events"))) event_##call = {		\
+	.name 			= #call,				\
+	.regfunc		= ftrace_reg_event_##call,		\
+	.unregfunc		= ftrace_unreg_event_##call,		\
+}
+
+void event_trace_printk(unsigned long ip, const char *fmt, ...);
+extern unsigned long __start_ftrace_events[];
+extern unsigned long __stop_ftrace_events[];
+
+#endif /* _LINUX_KERNEL_TRACE_EVENTS_H */
-- 
cgit v1.2.3


From f3fe8e4a38fd19dbb3f8ffb1826aa840ae304a65 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 24 Feb 2009 10:22:57 -0500
Subject: tracing: add schedule events to event trace

This patch changes the trace/sched.h to use the DECLARE_TRACE_FMT
such that they are automatically registered with the event tracer.

And it also adds the tracing sched headers to kernel/trace/events.c

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 include/trace/sched.h             | 49 +-------------------------
 include/trace/sched_event_types.h | 72 +++++++++++++++++++++++++++++++++++++++
 kernel/trace/Makefile             |  1 +
 kernel/trace/events.c             | 13 +++++++
 4 files changed, 87 insertions(+), 48 deletions(-)
 create mode 100644 include/trace/sched_event_types.h
 create mode 100644 kernel/trace/events.c

(limited to 'kernel')

diff --git a/include/trace/sched.h b/include/trace/sched.h
index 0d81098ee9fc..4e372a1a29bf 100644
--- a/include/trace/sched.h
+++ b/include/trace/sched.h
@@ -4,53 +4,6 @@
 #include <linux/sched.h>
 #include <linux/tracepoint.h>
 
-DECLARE_TRACE(sched_kthread_stop,
-	TPPROTO(struct task_struct *t),
-		TPARGS(t));
-
-DECLARE_TRACE(sched_kthread_stop_ret,
-	TPPROTO(int ret),
-		TPARGS(ret));
-
-DECLARE_TRACE(sched_wait_task,
-	TPPROTO(struct rq *rq, struct task_struct *p),
-		TPARGS(rq, p));
-
-DECLARE_TRACE(sched_wakeup,
-	TPPROTO(struct rq *rq, struct task_struct *p, int success),
-		TPARGS(rq, p, success));
-
-DECLARE_TRACE(sched_wakeup_new,
-	TPPROTO(struct rq *rq, struct task_struct *p, int success),
-		TPARGS(rq, p, success));
-
-DECLARE_TRACE(sched_switch,
-	TPPROTO(struct rq *rq, struct task_struct *prev,
-		struct task_struct *next),
-		TPARGS(rq, prev, next));
-
-DECLARE_TRACE(sched_migrate_task,
-	TPPROTO(struct task_struct *p, int orig_cpu, int dest_cpu),
-		TPARGS(p, orig_cpu, dest_cpu));
-
-DECLARE_TRACE(sched_process_free,
-	TPPROTO(struct task_struct *p),
-		TPARGS(p));
-
-DECLARE_TRACE(sched_process_exit,
-	TPPROTO(struct task_struct *p),
-		TPARGS(p));
-
-DECLARE_TRACE(sched_process_wait,
-	TPPROTO(struct pid *pid),
-		TPARGS(pid));
-
-DECLARE_TRACE(sched_process_fork,
-	TPPROTO(struct task_struct *parent, struct task_struct *child),
-		TPARGS(parent, child));
-
-DECLARE_TRACE(sched_signal_send,
-	TPPROTO(int sig, struct task_struct *p),
-		TPARGS(sig, p));
+#include <trace/sched_event_types.h>
 
 #endif
diff --git a/include/trace/sched_event_types.h b/include/trace/sched_event_types.h
new file mode 100644
index 000000000000..a4f662940f4e
--- /dev/null
+++ b/include/trace/sched_event_types.h
@@ -0,0 +1,72 @@
+
+/* use <trace/sched.h> instead */
+#ifndef DEFINE_TRACE_FMT
+# error Do not include this file directly.
+# error Unless you know what you are doing.
+#endif
+
+DEFINE_TRACE_FMT(sched_kthread_stop,
+	TPPROTO(struct task_struct *t),
+	TPARGS(t),
+	TPFMT("task %s:%d", t->comm, t->pid));
+
+DEFINE_TRACE_FMT(sched_kthread_stop_ret,
+	TPPROTO(int ret),
+	TPARGS(ret),
+	TPFMT("ret=%d", ret));
+
+DEFINE_TRACE_FMT(sched_wait_task,
+	TPPROTO(struct rq *rq, struct task_struct *p),
+	TPARGS(rq, p),
+	TPFMT("task %s:%d", p->comm, p->pid));
+
+DEFINE_TRACE_FMT(sched_wakeup,
+	TPPROTO(struct rq *rq, struct task_struct *p, int success),
+	TPARGS(rq, p, success),
+	TPFMT("task %s:%d %s",
+	      p->comm, p->pid, success?"succeeded":"failed"));
+
+DEFINE_TRACE_FMT(sched_wakeup_new,
+	TPPROTO(struct rq *rq, struct task_struct *p, int success),
+	TPARGS(rq, p, success),
+	TPFMT("task %s:%d",
+	      p->comm, p->pid, success?"succeeded":"failed"));
+
+DEFINE_TRACE_FMT(sched_switch,
+	TPPROTO(struct rq *rq, struct task_struct *prev,
+		struct task_struct *next),
+	TPARGS(rq, prev, next),
+	TPFMT("task %s:%d ==> %s:%d",
+	      prev->comm, prev->pid, next->comm, next->pid));
+
+DEFINE_TRACE_FMT(sched_migrate_task,
+	TPPROTO(struct task_struct *p, int orig_cpu, int dest_cpu),
+	TPARGS(p, orig_cpu, dest_cpu),
+	TPFMT("task %s:%d from: %d  to: %d",
+	      p->comm, p->pid, orig_cpu, dest_cpu));
+
+DEFINE_TRACE_FMT(sched_process_free,
+	TPPROTO(struct task_struct *p),
+	TPARGS(p),
+	TPFMT("task %s:%d", p->comm, p->pid));
+
+DEFINE_TRACE_FMT(sched_process_exit,
+	TPPROTO(struct task_struct *p),
+	TPARGS(p),
+	TPFMT("task %s:%d", p->comm, p->pid));
+
+DEFINE_TRACE_FMT(sched_process_wait,
+	TPPROTO(struct pid *pid),
+	TPARGS(pid),
+	TPFMT("pid %d", pid));
+
+DEFINE_TRACE_FMT(sched_process_fork,
+	TPPROTO(struct task_struct *parent, struct task_struct *child),
+	TPARGS(parent, child),
+	TPFMT("parent %s:%d  child %s:%d",
+	      parent->comm, parent->pid, child->comm, child->pid));
+
+DEFINE_TRACE_FMT(sched_signal_send,
+	TPPROTO(int sig, struct task_struct *p),
+	TPARGS(sig, p),
+	TPFMT("sig: %d   task %s:%d", sig, p->comm, p->pid));
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index c7363568b1cf..664b6c0dc75a 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -39,5 +39,6 @@ obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
 obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
 obj-$(CONFIG_BLK_DEV_IO_TRACE)	+= blktrace.o
 obj-$(CONFIG_EVENT_TRACER) += trace_events.o
+obj-$(CONFIG_EVENT_TRACER) += events.o
 
 libftrace-y := ftrace.o
diff --git a/kernel/trace/events.c b/kernel/trace/events.c
new file mode 100644
index 000000000000..38c89eef99ee
--- /dev/null
+++ b/kernel/trace/events.c
@@ -0,0 +1,13 @@
+/*
+ * This is the place to register all trace points as events.
+ * Include the trace/<type>.h at the top.
+ * Include the trace/<type>_event_types.h at the bottom.
+ */
+
+/* trace/<type>.h here */
+#include <trace/sched.h>
+
+#include "trace_events.h"
+
+/* trace/<type>_event_types.h here */
+#include <trace/sched_event_types.h>
-- 
cgit v1.2.3


From 1473e4417c79f12d91ef91a469699bfa911f510f Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 24 Feb 2009 14:15:08 -0500
Subject: tracing: make event directory structure

This patch adds the directory /debug/tracing/events/ that will contain
all the registered trace points.

 # ls /debug/tracing/events/
sched_kthread_stop      sched_process_fork  sched_switch
sched_kthread_stop_ret  sched_process_free  sched_wait_task
sched_migrate_task      sched_process_wait  sched_wakeup
sched_process_exit      sched_signal_send   sched_wakeup_new

 # ls /debug/tracing/events/sched_switch/
enable

 # cat /debug/tracing/events/sched_switch/enable
1

 # cat /debug/tracing/set_event
sched_switch

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_events.c | 137 ++++++++++++++++++++++++++++++++++++++++++--
 kernel/trace/trace_events.h |   7 ++-
 2 files changed, 137 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 05bc80ec8d2c..3bcb9df93342 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -12,6 +12,11 @@
 
 #include "trace_events.h"
 
+#define events_for_each(event)						\
+	for (event = __start_ftrace_events;				\
+	     (unsigned long)event < (unsigned long)__stop_ftrace_events; \
+	     event++)
+
 void event_trace_printk(unsigned long ip, const char *fmt, ...)
 {
 	va_list ap;
@@ -39,15 +44,16 @@ static void ftrace_clear_events(void)
 
 static int ftrace_set_clr_event(char *buf, int set)
 {
-	struct ftrace_event_call *call = (void *)__start_ftrace_events;
+	struct ftrace_event_call *call = __start_ftrace_events;
 
 
-	while ((unsigned long)call < (unsigned long)__stop_ftrace_events) {
+	events_for_each(call) {
 
-		if (strcmp(buf, call->name) != 0) {
-			call++;
+		if (!call->name)
+			continue;
+
+		if (strcmp(buf, call->name) != 0)
 			continue;
-		}
 
 		if (set) {
 			/* Already set? */
@@ -223,6 +229,67 @@ ftrace_event_seq_open(struct inode *inode, struct file *file)
 	return ret;
 }
 
+static ssize_t
+event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
+		  loff_t *ppos)
+{
+	struct ftrace_event_call *call = filp->private_data;
+	char *buf;
+
+	if (call->enabled)
+		buf = "1\n";
+	else
+		buf = "0\n";
+
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2);
+}
+
+static ssize_t
+event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
+		   loff_t *ppos)
+{
+	struct ftrace_event_call *call = filp->private_data;
+	char buf[64];
+	unsigned long val;
+	int ret;
+
+	if (cnt >= sizeof(buf))
+		return -EINVAL;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+
+	buf[cnt] = 0;
+
+	ret = strict_strtoul(buf, 10, &val);
+	if (ret < 0)
+		return ret;
+
+	switch (val) {
+	case 0:
+		if (!call->enabled)
+			break;
+
+		call->enabled = 0;
+		call->unregfunc();
+		break;
+	case 1:
+		if (call->enabled)
+			break;
+
+		call->enabled = 1;
+		call->regfunc();
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	*ppos += cnt;
+
+	return cnt;
+}
+
 static const struct seq_operations show_event_seq_ops = {
 	.start = t_start,
 	.next = t_next,
@@ -252,10 +319,59 @@ static const struct file_operations ftrace_set_event_fops = {
 	.release = seq_release,
 };
 
+static const struct file_operations ftrace_enable_fops = {
+	.open = tracing_open_generic,
+	.read = event_enable_read,
+	.write = event_enable_write,
+};
+
+static struct dentry *event_trace_events_dir(void)
+{
+	static struct dentry *d_tracer;
+	static struct dentry *d_events;
+
+	if (d_events)
+		return d_events;
+
+	d_tracer = tracing_init_dentry();
+	if (!d_tracer)
+		return NULL;
+
+	d_events = debugfs_create_dir("events", d_tracer);
+	if (!d_events)
+		pr_warning("Could not create debugfs "
+			   "'events' directory\n");
+
+	return d_events;
+}
+
+static int
+event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
+{
+	struct dentry *entry;
+
+	call->dir = debugfs_create_dir(call->name, d_events);
+	if (!call->dir) {
+		pr_warning("Could not create debugfs "
+			   "'%s' directory\n", call->name);
+		return -1;
+	}
+
+	entry = debugfs_create_file("enable", 0644, call->dir, call,
+				    &ftrace_enable_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'%s/enable' entry\n", call->name);
+
+	return 0;
+}
+
 static __init int event_trace_init(void)
 {
+	struct ftrace_event_call *call = __start_ftrace_events;
 	struct dentry *d_tracer;
 	struct dentry *entry;
+	struct dentry *d_events;
 
 	d_tracer = tracing_init_dentry();
 	if (!d_tracer)
@@ -275,6 +391,17 @@ static __init int event_trace_init(void)
 		pr_warning("Could not create debugfs "
 			   "'set_event' entry\n");
 
+	d_events = event_trace_events_dir();
+	if (!d_events)
+		return 0;
+
+	events_for_each(call) {
+		/* The linker may leave blanks */
+		if (!call->name)
+			continue;
+		event_create_dir(call, d_events);
+	}
+
 	return 0;
 }
 fs_initcall(event_trace_init);
diff --git a/kernel/trace/trace_events.h b/kernel/trace/trace_events.h
index 39342f86db27..cb8455b3ac9a 100644
--- a/kernel/trace/trace_events.h
+++ b/kernel/trace/trace_events.h
@@ -1,11 +1,13 @@
 #ifndef _LINUX_KERNEL_TRACE_EVENTS_H
 #define _LINUX_KERNEL_TRACE_EVENTS_H
 
+#include <linux/debugfs.h>
 #include <linux/ftrace.h>
 #include "trace.h"
 
 struct ftrace_event_call {
 	char		*name;
+	struct dentry	*dir;
 	int		enabled;
 	int		(*regfunc)(void);
 	void		(*unregfunc)(void);
@@ -39,6 +41,7 @@ static void ftrace_unreg_event_##call(void)				\
 }									\
 									\
 static struct ftrace_event_call __used					\
+__attribute__((__aligned__(4)))						\
 __attribute__((section("_ftrace_events"))) event_##call = {		\
 	.name 			= #call,				\
 	.regfunc		= ftrace_reg_event_##call,		\
@@ -46,7 +49,7 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
 }
 
 void event_trace_printk(unsigned long ip, const char *fmt, ...);
-extern unsigned long __start_ftrace_events[];
-extern unsigned long __stop_ftrace_events[];
+extern struct ftrace_event_call __start_ftrace_events[];
+extern struct ftrace_event_call __stop_ftrace_events[];
 
 #endif /* _LINUX_KERNEL_TRACE_EVENTS_H */
-- 
cgit v1.2.3


From 2d542cf34264ac92e9e7ac55c0b096b066d569d2 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 25 Feb 2009 08:40:09 +0100
Subject: tracing/hw-branch-tracing: convert bts-tracer mutex to a spinlock

Impact: fix CPU hotplug lockup

bts_hotcpu_handler() is called with irqs disabled, so using mutex_lock()
is a no-no.

All the BTS codepaths here are atomic (they do not schedule), so using
a spinlock is the right solution.

Cc: Markus Metzger <markus.t.metzger@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_hw_branches.c | 57 ++++++++++++++++++++--------------------
 1 file changed, 28 insertions(+), 29 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c
index 3335e807144b..7bfdf4c2347f 100644
--- a/kernel/trace/trace_hw_branches.c
+++ b/kernel/trace/trace_hw_branches.c
@@ -3,17 +3,15 @@
  *
  * Copyright (C) 2008-2009 Intel Corporation.
  * Markus Metzger <markus.t.metzger@gmail.com>, 2008-2009
- *
  */
-
-#include <linux/module.h>
-#include <linux/fs.h>
+#include <linux/spinlock.h>
+#include <linux/kallsyms.h>
 #include <linux/debugfs.h>
 #include <linux/ftrace.h>
-#include <linux/kallsyms.h>
-#include <linux/mutex.h>
+#include <linux/module.h>
 #include <linux/cpu.h>
 #include <linux/smp.h>
+#include <linux/fs.h>
 
 #include <asm/ds.h>
 
@@ -23,16 +21,17 @@
 
 #define SIZEOF_BTS (1 << 13)
 
-/* The tracer mutex protects the below per-cpu tracer array.
-   It needs to be held to:
-   - start tracing on all cpus
-   - stop tracing on all cpus
-   - start tracing on a single hotplug cpu
-   - stop tracing on a single hotplug cpu
-   - read the trace from all cpus
-   - read the trace from a single cpu
-*/
-static DEFINE_MUTEX(bts_tracer_mutex);
+/*
+ * The tracer lock protects the below per-cpu tracer array.
+ * It needs to be held to:
+ * - start tracing on all cpus
+ * - stop tracing on all cpus
+ * - start tracing on a single hotplug cpu
+ * - stop tracing on a single hotplug cpu
+ * - read the trace from all cpus
+ * - read the trace from a single cpu
+ */
+static DEFINE_SPINLOCK(bts_tracer_lock);
 static DEFINE_PER_CPU(struct bts_tracer *, tracer);
 static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer);
 
@@ -47,7 +46,7 @@ static struct trace_array *hw_branch_trace __read_mostly;
  * Start tracing on the current cpu.
  * The argument is ignored.
  *
- * pre: bts_tracer_mutex must be locked.
+ * pre: bts_tracer_lock must be locked.
  */
 static void bts_trace_start_cpu(void *arg)
 {
@@ -66,19 +65,19 @@ static void bts_trace_start_cpu(void *arg)
 
 static void bts_trace_start(struct trace_array *tr)
 {
-	mutex_lock(&bts_tracer_mutex);
+	spin_lock(&bts_tracer_lock);
 
 	on_each_cpu(bts_trace_start_cpu, NULL, 1);
 	trace_hw_branches_enabled = 1;
 
-	mutex_unlock(&bts_tracer_mutex);
+	spin_unlock(&bts_tracer_lock);
 }
 
 /*
  * Stop tracing on the current cpu.
  * The argument is ignored.
  *
- * pre: bts_tracer_mutex must be locked.
+ * pre: bts_tracer_lock must be locked.
  */
 static void bts_trace_stop_cpu(void *arg)
 {
@@ -90,12 +89,12 @@ static void bts_trace_stop_cpu(void *arg)
 
 static void bts_trace_stop(struct trace_array *tr)
 {
-	mutex_lock(&bts_tracer_mutex);
+	spin_lock(&bts_tracer_lock);
 
 	trace_hw_branches_enabled = 0;
 	on_each_cpu(bts_trace_stop_cpu, NULL, 1);
 
-	mutex_unlock(&bts_tracer_mutex);
+	spin_unlock(&bts_tracer_lock);
 }
 
 static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb,
@@ -103,7 +102,7 @@ static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb,
 {
 	unsigned int cpu = (unsigned long)hcpu;
 
-	mutex_lock(&bts_tracer_mutex);
+	spin_lock(&bts_tracer_lock);
 
 	if (!trace_hw_branches_enabled)
 		goto out;
@@ -119,7 +118,7 @@ static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb,
 	}
 
  out:
-	mutex_unlock(&bts_tracer_mutex);
+	spin_unlock(&bts_tracer_lock);
 	return NOTIFY_DONE;
 }
 
@@ -225,7 +224,7 @@ static void trace_bts_at(const struct bts_trace *trace, void *at)
 /*
  * Collect the trace on the current cpu and write it into the ftrace buffer.
  *
- * pre: bts_tracer_mutex must be locked
+ * pre: bts_tracer_lock must be locked
  */
 static void trace_bts_cpu(void *arg)
 {
@@ -261,11 +260,11 @@ out:
 
 static void trace_bts_prepare(struct trace_iterator *iter)
 {
-	mutex_lock(&bts_tracer_mutex);
+	spin_lock(&bts_tracer_lock);
 
 	on_each_cpu(trace_bts_cpu, iter->tr, 1);
 
-	mutex_unlock(&bts_tracer_mutex);
+	spin_unlock(&bts_tracer_lock);
 }
 
 static void trace_bts_close(struct trace_iterator *iter)
@@ -275,11 +274,11 @@ static void trace_bts_close(struct trace_iterator *iter)
 
 void trace_hw_branch_oops(void)
 {
-	mutex_lock(&bts_tracer_mutex);
+	spin_lock(&bts_tracer_lock);
 
 	trace_bts_cpu(hw_branch_trace);
 
-	mutex_unlock(&bts_tracer_mutex);
+	spin_unlock(&bts_tracer_lock);
 }
 
 struct tracer bts_tracer __read_mostly =
-- 
cgit v1.2.3


From 886b5b73d71e4027d7dc6c14f5f7ab102201ea6b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 25 Feb 2009 11:03:44 +0100
Subject: tracing: remove /debug/tracing/latency_trace

Impact: remove old debug/tracing API

/debug/tracing/latency_trace is an old legacy format we kept from
the old latency tracer. Remove the file for now. If there's any
useful bit missing then we'll propagate any useful output bits into
the /debug/tracing/trace output.

Reported-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 5 -----
 1 file changed, 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index e1f3b99a2e52..11ba100f9a9e 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2938,11 +2938,6 @@ static __init int tracer_init_debugfs(void)
 	if (!entry)
 		pr_warning("Could not create debugfs 'tracing_cpumask' entry\n");
 
-	entry = debugfs_create_file("latency_trace", 0444, d_tracer,
-				    &global_trace, &tracing_lt_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs 'latency_trace' entry\n");
-
 	entry = debugfs_create_file("trace", 0444, d_tracer,
 				    &global_trace, &tracing_fops);
 	if (!entry)
-- 
cgit v1.2.3


From 15d0d3b3371227f846b9f644547fde081c7e1c0c Mon Sep 17 00:00:00 2001
From: Nick Piggin <npiggin@suse.de>
Date: Wed, 25 Feb 2009 06:22:45 +0100
Subject: generic IPI: simplify barriers and locking

Simplify the barriers in generic remote function call interrupt
code.

Firstly, just unconditionally take the lock and check the list
in the generic_call_function_single_interrupt IPI handler. As
we've just taken an IPI here, the chances are fairly high that
there will be work on the list for us, so do the locking
unconditionally. This removes the tricky lockless list_empty
check and dubious barriers. The change looks bigger than it is
because it is just removing an outer loop.

Secondly, clarify architecture specific IPI locking rules.
Generic code has no tools to impose any sane ordering on IPIs if
they go outside normal cache coherency, ergo the arch code must
make them appear to obey cache coherency as a "memory operation"
to initiate an IPI, and a "memory operation" to receive one.
This way at least they can be reasoned about in generic code,
and smp_mb used to provide ordering.

The combination of these two changes means that explict barriers
can be taken out of queue handling for the single case -- shared
data is explicitly locked, and ipi ordering must conform to
that, so no barriers needed. An extra barrier is needed in the
many handler, so as to ensure we load the list element after the
IPI is received.

Does any architecture actually *need* these barriers? For the
initiator I could see it, but for the handler I would be
surprised. So the other thing we could do for simplicity is just
to require that, rather than just matching with cache coherency,
we just require a full barrier before generating an IPI, and
after receiving an IPI. In which case, the smp_mb()s can go
away. But just for now, we'll be on the safe side and use the
barriers (they're in the slow case anyway).

Signed-off-by: Nick Piggin <npiggin@suse.de>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: linux-arch@vger.kernel.org
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/smp.c | 83 ++++++++++++++++++++++++++++++++----------------------------
 1 file changed, 44 insertions(+), 39 deletions(-)

(limited to 'kernel')

diff --git a/kernel/smp.c b/kernel/smp.c
index bbedbb7efe32..6ecf4b9895d4 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -74,9 +74,16 @@ static void generic_exec_single(int cpu, struct call_single_data *data)
 	spin_unlock_irqrestore(&dst->lock, flags);
 
 	/*
-	 * Make the list addition visible before sending the ipi.
+	 * The list addition should be visible before sending the IPI
+	 * handler locks the list to pull the entry off it because of
+	 * normal cache coherency rules implied by spinlocks.
+	 *
+	 * If IPIs can go out of order to the cache coherency protocol
+	 * in an architecture, sufficient synchronisation should be added
+	 * to arch code to make it appear to obey cache coherency WRT
+	 * locking and barrier primitives. Generic code isn't really equipped
+	 * to do the right thing...
 	 */
-	smp_mb();
 
 	if (ipi)
 		arch_send_call_function_single_ipi(cpu);
@@ -103,6 +110,14 @@ void generic_smp_call_function_interrupt(void)
 	struct call_function_data *data;
 	int cpu = get_cpu();
 
+	/*
+	 * Ensure entry is visible on call_function_queue after we have
+	 * entered the IPI. See comment in smp_call_function_many.
+	 * If we don't have this, then we may miss an entry on the list
+	 * and never get another IPI to process it.
+	 */
+	smp_mb();
+
 	/*
 	 * It's ok to use list_for_each_rcu() here even though we may delete
 	 * 'pos', since list_del_rcu() doesn't clear ->next
@@ -154,49 +169,37 @@ void generic_smp_call_function_single_interrupt(void)
 {
 	struct call_single_queue *q = &__get_cpu_var(call_single_queue);
 	LIST_HEAD(list);
+	unsigned int data_flags;
 
-	/*
-	 * Need to see other stores to list head for checking whether
-	 * list is empty without holding q->lock
-	 */
-	smp_read_barrier_depends();
-	while (!list_empty(&q->list)) {
-		unsigned int data_flags;
-
-		spin_lock(&q->lock);
-		list_replace_init(&q->list, &list);
-		spin_unlock(&q->lock);
+	spin_lock(&q->lock);
+	list_replace_init(&q->list, &list);
+	spin_unlock(&q->lock);
 
-		while (!list_empty(&list)) {
-			struct call_single_data *data;
+	while (!list_empty(&list)) {
+		struct call_single_data *data;
 
-			data = list_entry(list.next, struct call_single_data,
-						list);
-			list_del(&data->list);
+		data = list_entry(list.next, struct call_single_data,
+					list);
+		list_del(&data->list);
 
-			/*
-			 * 'data' can be invalid after this call if
-			 * flags == 0 (when called through
-			 * generic_exec_single(), so save them away before
-			 * making the call.
-			 */
-			data_flags = data->flags;
-
-			data->func(data->info);
-
-			if (data_flags & CSD_FLAG_WAIT) {
-				smp_wmb();
-				data->flags &= ~CSD_FLAG_WAIT;
-			} else if (data_flags & CSD_FLAG_LOCK) {
-				smp_wmb();
-				data->flags &= ~CSD_FLAG_LOCK;
-			} else if (data_flags & CSD_FLAG_ALLOC)
-				kfree(data);
-		}
 		/*
-		 * See comment on outer loop
+		 * 'data' can be invalid after this call if
+		 * flags == 0 (when called through
+		 * generic_exec_single(), so save them away before
+		 * making the call.
 		 */
-		smp_read_barrier_depends();
+		data_flags = data->flags;
+
+		data->func(data->info);
+
+		if (data_flags & CSD_FLAG_WAIT) {
+			smp_wmb();
+			data->flags &= ~CSD_FLAG_WAIT;
+		} else if (data_flags & CSD_FLAG_LOCK) {
+			smp_wmb();
+			data->flags &= ~CSD_FLAG_LOCK;
+		} else if (data_flags & CSD_FLAG_ALLOC)
+			kfree(data);
 	}
 }
 
@@ -375,6 +378,8 @@ void smp_call_function_many(const struct cpumask *mask,
 
 	/*
 	 * Make the list addition visible before sending the ipi.
+	 * (IPIs must obey or appear to obey normal Linux cache coherency
+	 * rules -- see comment in generic_exec_single).
 	 */
 	smp_mb();
 
-- 
cgit v1.2.3


From b04cc6b1f6398b0e0b60d37e27ce51b4899672ec Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 25 Feb 2009 03:22:28 +0100
Subject: tracing/core: introduce per cpu tracing files

Impact: split up tracing output per cpu

Currently, on the tracing debugfs directory, three files are
available to the user to let him extracting the trace output:

- trace is an iterator through the ring-buffer. It's a reader
  but not a consumer It doesn't block when no more traces are
  available.

- trace pretty similar to the former, except that it adds more
  informations such as prempt count, irq flag, ...

- trace_pipe is a reader and a consumer, it will also block
  waiting for traces if necessary (heh, yes it's a pipe).

The traces coming from different cpus are curretly mixed up
inside these files. Sometimes it messes up the informations,
sometimes it's useful, depending on what does the tracer
capture.

The tracing_cpumask file is useful to filter the output and
select only the traces captured a custom defined set of cpus.
But still it is not enough powerful to extract at the same time
one trace buffer per cpu.

So this patch creates a new directory: /debug/tracing/per_cpu/.

Inside this directory, you will now find one trace_pipe file and
one trace file per cpu.

Which means if you have two cpus, you will have:

 trace0
 trace1
 trace_pipe0
 trace_pipe1

And of course, reading these files will have the same effect
than with the usual tracing files, except that you will only see
the traces from the given cpu.

The original all-in-one cpu trace file are still available on
their original place.

Until now, only one consumer was allowed on trace_pipe to avoid
racy consuming on the ring-buffer. Now the approach changed a
bit, you can have only one consumer per cpu.

Which means you are allowed to read concurrently trace_pipe0 and
trace_pipe1 But you can't have two readers on trace_pipe0 or
trace_pipe1.

Following the same logic, if there is one reader on the common
trace_pipe, you can not have at the same time another reader on
trace_pipe0 or in trace_pipe1. Because in trace_pipe is already
a consumer in all cpu buffers in essence.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 168 +++++++++++++++++++++++++++++++++++++++++++--------
 kernel/trace/trace.h |   3 +
 2 files changed, 147 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 11ba100f9a9e..aa58b7bc847c 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -98,6 +98,9 @@ static inline void ftrace_enable_cpu(void)
 
 static cpumask_var_t __read_mostly	tracing_buffer_mask;
 
+/* Define which cpu buffers are currently read in trace_pipe */
+static cpumask_var_t			tracing_reader_cpumask;
+
 #define for_each_tracing_cpu(cpu)	\
 	for_each_cpu(cpu, tracing_buffer_mask)
 
@@ -1195,10 +1198,25 @@ __find_next_entry(struct trace_iterator *iter, int *ent_cpu, u64 *ent_ts)
 {
 	struct ring_buffer *buffer = iter->tr->buffer;
 	struct trace_entry *ent, *next = NULL;
+	int cpu_file = iter->cpu_file;
 	u64 next_ts = 0, ts;
 	int next_cpu = -1;
 	int cpu;
 
+	/*
+	 * If we are in a per_cpu trace file, don't bother by iterating over
+	 * all cpu and peek directly.
+	 */
+	if (cpu_file > TRACE_PIPE_ALL_CPU) {
+		if (ring_buffer_empty_cpu(buffer, cpu_file))
+			return NULL;
+		ent = peek_next_entry(iter, cpu_file, ent_ts);
+		if (ent_cpu)
+			*ent_cpu = cpu_file;
+
+		return ent;
+	}
+
 	for_each_tracing_cpu(cpu) {
 
 		if (ring_buffer_empty_cpu(buffer, cpu))
@@ -1279,6 +1297,7 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos)
 static void *s_start(struct seq_file *m, loff_t *pos)
 {
 	struct trace_iterator *iter = m->private;
+	int cpu_file = iter->cpu_file;
 	void *p = NULL;
 	loff_t l = 0;
 	int cpu;
@@ -1299,9 +1318,12 @@ static void *s_start(struct seq_file *m, loff_t *pos)
 
 		ftrace_disable_cpu();
 
-		for_each_tracing_cpu(cpu) {
-			ring_buffer_iter_reset(iter->buffer_iter[cpu]);
-		}
+		if (cpu_file == TRACE_PIPE_ALL_CPU) {
+			for_each_tracing_cpu(cpu)
+				ring_buffer_iter_reset(iter->buffer_iter[cpu]);
+		} else
+			ring_buffer_iter_reset(iter->buffer_iter[cpu_file]);
+
 
 		ftrace_enable_cpu();
 
@@ -1653,6 +1675,7 @@ static struct seq_operations tracer_seq_ops = {
 static struct trace_iterator *
 __tracing_open(struct inode *inode, struct file *file, int *ret)
 {
+	long cpu_file = (long) inode->i_private;
 	struct trace_iterator *iter;
 	struct seq_file *m;
 	int cpu;
@@ -1672,9 +1695,10 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
 	if (current_trace && current_trace->print_max)
 		iter->tr = &max_tr;
 	else
-		iter->tr = inode->i_private;
+		iter->tr = &global_trace;
 	iter->trace = current_trace;
 	iter->pos = -1;
+	iter->cpu_file = cpu_file;
 
 	/* Notify the tracer early; before we stop tracing. */
 	if (iter->trace && iter->trace->open)
@@ -1684,14 +1708,22 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
 	if (ring_buffer_overruns(iter->tr->buffer))
 		iter->iter_flags |= TRACE_FILE_ANNOTATE;
 
+	if (iter->cpu_file == TRACE_PIPE_ALL_CPU) {
+		for_each_tracing_cpu(cpu) {
 
-	for_each_tracing_cpu(cpu) {
+			iter->buffer_iter[cpu] =
+				ring_buffer_read_start(iter->tr->buffer, cpu);
 
+			if (!iter->buffer_iter[cpu])
+				goto fail_buffer;
+		}
+	} else {
+		cpu = iter->cpu_file;
 		iter->buffer_iter[cpu] =
-			ring_buffer_read_start(iter->tr->buffer, cpu);
+				ring_buffer_read_start(iter->tr->buffer, cpu);
 
 		if (!iter->buffer_iter[cpu])
-			goto fail_buffer;
+			goto fail;
 	}
 
 	/* TODO stop tracer */
@@ -1715,6 +1747,7 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
 		if (iter->buffer_iter[cpu])
 			ring_buffer_read_finish(iter->buffer_iter[cpu]);
 	}
+fail:
 	mutex_unlock(&trace_types_lock);
 	kfree(iter);
 
@@ -2325,54 +2358,77 @@ tracing_max_lat_write(struct file *filp, const char __user *ubuf,
 	return cnt;
 }
 
-static atomic_t tracing_reader;
-
 static int tracing_open_pipe(struct inode *inode, struct file *filp)
 {
+	long cpu_file = (long) inode->i_private;
 	struct trace_iterator *iter;
+	int ret = 0;
 
 	if (tracing_disabled)
 		return -ENODEV;
 
-	/* We only allow for reader of the pipe */
-	if (atomic_inc_return(&tracing_reader) != 1) {
-		atomic_dec(&tracing_reader);
-		return -EBUSY;
+	mutex_lock(&trace_types_lock);
+
+	/* We only allow one reader per cpu */
+	if (cpu_file == TRACE_PIPE_ALL_CPU) {
+		if (!cpumask_empty(tracing_reader_cpumask)) {
+			ret = -EBUSY;
+			goto out;
+		}
+		cpumask_setall(tracing_reader_cpumask);
+	} else {
+		if (!cpumask_test_cpu(cpu_file, tracing_reader_cpumask))
+			cpumask_set_cpu(cpu_file, tracing_reader_cpumask);
+		else {
+			ret = -EBUSY;
+			goto out;
+		}
 	}
 
 	/* create a buffer to store the information to pass to userspace */
 	iter = kzalloc(sizeof(*iter), GFP_KERNEL);
-	if (!iter)
-		return -ENOMEM;
+	if (!iter) {
+		ret = -ENOMEM;
+		goto out;
+	}
 
 	if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) {
 		kfree(iter);
-		return -ENOMEM;
+		ret = -ENOMEM;
+		goto out;
 	}
 
-	mutex_lock(&trace_types_lock);
-
 	/* trace pipe does not show start of buffer */
 	cpumask_setall(iter->started);
 
+	iter->cpu_file = cpu_file;
 	iter->tr = &global_trace;
 	iter->trace = current_trace;
 	filp->private_data = iter;
 
 	if (iter->trace->pipe_open)
 		iter->trace->pipe_open(iter);
-	mutex_unlock(&trace_types_lock);
 
-	return 0;
+out:
+	mutex_unlock(&trace_types_lock);
+	return ret;
 }
 
 static int tracing_release_pipe(struct inode *inode, struct file *file)
 {
 	struct trace_iterator *iter = file->private_data;
 
+	mutex_lock(&trace_types_lock);
+
+	if (iter->cpu_file == TRACE_PIPE_ALL_CPU)
+		cpumask_clear(tracing_reader_cpumask);
+	else
+		cpumask_clear_cpu(iter->cpu_file, tracing_reader_cpumask);
+
+	mutex_unlock(&trace_types_lock);
+
 	free_cpumask_var(iter->started);
 	kfree(iter);
-	atomic_dec(&tracing_reader);
 
 	return 0;
 }
@@ -2911,6 +2967,59 @@ struct dentry *tracing_init_dentry(void)
 	return d_tracer;
 }
 
+static struct dentry *d_percpu;
+
+struct dentry *tracing_dentry_percpu(void)
+{
+	static int once;
+	struct dentry *d_tracer;
+
+	if (d_percpu)
+		return d_percpu;
+
+	d_tracer = tracing_init_dentry();
+
+	if (!d_tracer)
+		return NULL;
+
+	d_percpu = debugfs_create_dir("per_cpu", d_tracer);
+
+	if (!d_percpu && !once) {
+		once = 1;
+		pr_warning("Could not create debugfs directory 'per_cpu'\n");
+		return NULL;
+	}
+
+	return d_percpu;
+}
+
+static void tracing_init_debugfs_percpu(long cpu)
+{
+	struct dentry *d_percpu = tracing_dentry_percpu();
+	struct dentry *entry;
+	/* strlen(trace_pipe) + MAX(log10(cpu)) + '\0' */
+	char filename[17];
+
+	if (cpu > 999 || cpu < 0)
+		return;
+
+	/* per cpu trace_pipe */
+	sprintf(filename, "trace_pipe%ld", cpu);
+
+	entry = debugfs_create_file(filename, 0444, d_percpu,
+				(void *) cpu, &tracing_pipe_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs '%s' entry\n", filename);
+
+	/* per cpu trace */
+	sprintf(filename, "trace%ld", cpu);
+
+	entry = debugfs_create_file(filename, 0444, d_percpu,
+				(void *) cpu, &tracing_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs '%s' entry\n", filename);
+}
+
 #ifdef CONFIG_FTRACE_SELFTEST
 /* Let selftest have access to static functions in this file */
 #include "trace_selftest.c"
@@ -2920,6 +3029,7 @@ static __init int tracer_init_debugfs(void)
 {
 	struct dentry *d_tracer;
 	struct dentry *entry;
+	int cpu;
 
 	d_tracer = tracing_init_dentry();
 
@@ -2939,7 +3049,7 @@ static __init int tracer_init_debugfs(void)
 		pr_warning("Could not create debugfs 'tracing_cpumask' entry\n");
 
 	entry = debugfs_create_file("trace", 0444, d_tracer,
-				    &global_trace, &tracing_fops);
+				 (void *) TRACE_PIPE_ALL_CPU, &tracing_fops);
 	if (!entry)
 		pr_warning("Could not create debugfs 'trace' entry\n");
 
@@ -2970,8 +3080,8 @@ static __init int tracer_init_debugfs(void)
 	if (!entry)
 		pr_warning("Could not create debugfs 'README' entry\n");
 
-	entry = debugfs_create_file("trace_pipe", 0644, d_tracer,
-				    NULL, &tracing_pipe_fops);
+	entry = debugfs_create_file("trace_pipe", 0444, d_tracer,
+			(void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops);
 	if (!entry)
 		pr_warning("Could not create debugfs "
 			   "'trace_pipe' entry\n");
@@ -2999,6 +3109,10 @@ static __init int tracer_init_debugfs(void)
 #ifdef CONFIG_SYSPROF_TRACER
 	init_tracer_sysprof_debugfs(d_tracer);
 #endif
+
+	for_each_tracing_cpu(cpu)
+		tracing_init_debugfs_percpu(cpu);
+
 	return 0;
 }
 
@@ -3222,8 +3336,12 @@ __init static int tracer_alloc_buffers(void)
 	if (!alloc_cpumask_var(&tracing_cpumask, GFP_KERNEL))
 		goto out_free_buffer_mask;
 
+	if (!alloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL))
+		goto out_free_tracing_cpumask;
+
 	cpumask_copy(tracing_buffer_mask, cpu_possible_mask);
 	cpumask_copy(tracing_cpumask, cpu_all_mask);
+	cpumask_clear(tracing_reader_cpumask);
 
 	/* TODO: make the number of buffers hot pluggable with CPUS */
 	global_trace.buffer = ring_buffer_alloc(trace_buf_size,
@@ -3272,6 +3390,8 @@ __init static int tracer_alloc_buffers(void)
 	ret = 0;
 
 out_free_cpumask:
+	free_cpumask_var(tracing_reader_cpumask);
+out_free_tracing_cpumask:
 	free_cpumask_var(tracing_cpumask);
 out_free_buffer_mask:
 	free_cpumask_var(tracing_buffer_mask);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index eed732c151fc..508235a39da6 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -395,6 +395,8 @@ struct trace_seq {
 	unsigned int		readpos;
 };
 
+#define TRACE_PIPE_ALL_CPU	-1
+
 /*
  * Trace iterator - used by printout routines who present trace
  * results to users and which routines might sleep, etc:
@@ -404,6 +406,7 @@ struct trace_iterator {
 	struct tracer		*trace;
 	void			*private;
 	struct ring_buffer_iter	*buffer_iter[NR_CPUS];
+	int			cpu_file;
 
 	/* The below is zeroed out in pipe_read */
 	struct trace_seq	seq;
-- 
cgit v1.2.3


From d7350c3f45694104e820041969c8185c5f99e57c Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 25 Feb 2009 06:13:16 +0100
Subject: tracing/core: make the read callbacks reentrants

Now that several per-cpu files can be read or spliced at the
same, we want the read/splice callbacks for tracing files to be
reentrants.

Until now, a single global mutex (trace_types_lock) serialized
the access to tracing_read_pipe(), tracing_splice_read_pipe(),
and the seq helpers.

Ie: it means that if a user tries to read trace_pipe0 and
trace_pipe1 at the same time, the access to the function
tracing_read_pipe() is contended and one reader must wait for
the other to finish its read call.

The trace_type_lock mutex is mostly here to serialize the access
to the global current tracer (current_trace), which can be
changed concurrently. Although the iter struct keeps a private
pointer to this tracer, its callbacks can be changed by another
function.

The method used here is to not keep anymore private reference to
the tracer inside the iterator but to make a copy of it inside
the iterator. Then it checks on subsequents read calls if the
tracer has changed. This is not costly because the current
tracer is not expected to be changed often, so we use a branch
prediction for that.

Moreover, we add a private mutex to the iterator (there is one
iterator per file descriptor) to serialize the accesses in case
of multiple consumers per file descriptor (which would be a
silly idea from the user). Note that this is not to protect the
ring buffer, since the ring buffer already serializes the
readers accesses. This is to prevent from traces weirdness in
case of concurrent consumers. But these mutexes can be dropped
anyway, that would not result in any crash. Just tell me what
you think about it.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 101 ++++++++++++++++++++++++++++++++++++++++++---------
 kernel/trace/trace.h |   3 +-
 2 files changed, 85 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index aa58b7bc847c..d8d899f3bd6f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1294,20 +1294,32 @@ static void *s_next(struct seq_file *m, void *v, loff_t *pos)
 	return ent;
 }
 
+/*
+ * No necessary locking here. The worst thing which can
+ * happen is loosing events consumed at the same time
+ * by a trace_pipe reader.
+ * Other than that, we don't risk to crash the ring buffer
+ * because it serializes the readers.
+ *
+ * The current tracer is copied to avoid a global locking
+ * all around.
+ */
 static void *s_start(struct seq_file *m, loff_t *pos)
 {
 	struct trace_iterator *iter = m->private;
+	static struct tracer *old_tracer;
 	int cpu_file = iter->cpu_file;
 	void *p = NULL;
 	loff_t l = 0;
 	int cpu;
 
+	/* copy the tracer to avoid using a global lock all around */
 	mutex_lock(&trace_types_lock);
-
-	if (!current_trace || current_trace != iter->trace) {
-		mutex_unlock(&trace_types_lock);
-		return NULL;
+	if (unlikely(old_tracer != current_trace && current_trace)) {
+		old_tracer = current_trace;
+		*iter->trace = *current_trace;
 	}
+	mutex_unlock(&trace_types_lock);
 
 	atomic_inc(&trace_record_cmdline_disabled);
 
@@ -1341,7 +1353,6 @@ static void *s_start(struct seq_file *m, loff_t *pos)
 static void s_stop(struct seq_file *m, void *p)
 {
 	atomic_dec(&trace_record_cmdline_disabled);
-	mutex_unlock(&trace_types_lock);
 }
 
 static void print_lat_help_header(struct seq_file *m)
@@ -1691,13 +1702,25 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
 		goto out;
 	}
 
+	/*
+	 * We make a copy of the current tracer to avoid concurrent
+	 * changes on it while we are reading.
+	 */
 	mutex_lock(&trace_types_lock);
+	iter->trace = kzalloc(sizeof(*iter->trace), GFP_KERNEL);
+	if (!iter->trace) {
+		*ret = -ENOMEM;
+		goto fail;
+	}
+	if (current_trace)
+		*iter->trace = *current_trace;
+
 	if (current_trace && current_trace->print_max)
 		iter->tr = &max_tr;
 	else
 		iter->tr = &global_trace;
-	iter->trace = current_trace;
 	iter->pos = -1;
+	mutex_init(&iter->mutex);
 	iter->cpu_file = cpu_file;
 
 	/* Notify the tracer early; before we stop tracing. */
@@ -1747,8 +1770,9 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
 		if (iter->buffer_iter[cpu])
 			ring_buffer_read_finish(iter->buffer_iter[cpu]);
 	}
-fail:
+ fail:
 	mutex_unlock(&trace_types_lock);
+	kfree(iter->trace);
 	kfree(iter);
 
 	return ERR_PTR(-ENOMEM);
@@ -1783,6 +1807,8 @@ static int tracing_release(struct inode *inode, struct file *file)
 	mutex_unlock(&trace_types_lock);
 
 	seq_release(inode, file);
+	mutex_destroy(&iter->mutex);
+	kfree(iter->trace);
 	kfree(iter);
 	return 0;
 }
@@ -2392,10 +2418,21 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
 		goto out;
 	}
 
+	/*
+	 * We make a copy of the current tracer to avoid concurrent
+	 * changes on it while we are reading.
+	 */
+	iter->trace = kmalloc(sizeof(*iter->trace), GFP_KERNEL);
+	if (!iter->trace) {
+		ret = -ENOMEM;
+		goto fail;
+	}
+	if (current_trace)
+		*iter->trace = *current_trace;
+
 	if (!alloc_cpumask_var(&iter->started, GFP_KERNEL)) {
-		kfree(iter);
 		ret = -ENOMEM;
-		goto out;
+		goto fail;
 	}
 
 	/* trace pipe does not show start of buffer */
@@ -2403,7 +2440,7 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
 
 	iter->cpu_file = cpu_file;
 	iter->tr = &global_trace;
-	iter->trace = current_trace;
+	mutex_init(&iter->mutex);
 	filp->private_data = iter;
 
 	if (iter->trace->pipe_open)
@@ -2412,6 +2449,12 @@ static int tracing_open_pipe(struct inode *inode, struct file *filp)
 out:
 	mutex_unlock(&trace_types_lock);
 	return ret;
+
+fail:
+	kfree(iter->trace);
+	kfree(iter);
+	mutex_unlock(&trace_types_lock);
+	return ret;
 }
 
 static int tracing_release_pipe(struct inode *inode, struct file *file)
@@ -2428,6 +2471,8 @@ static int tracing_release_pipe(struct inode *inode, struct file *file)
 	mutex_unlock(&trace_types_lock);
 
 	free_cpumask_var(iter->started);
+	mutex_destroy(&iter->mutex);
+	kfree(iter->trace);
 	kfree(iter);
 
 	return 0;
@@ -2497,18 +2542,15 @@ static int tracing_wait_pipe(struct file *filp)
 			return -EAGAIN;
 		}
 
-		mutex_unlock(&trace_types_lock);
+		mutex_unlock(&iter->mutex);
 
 		iter->trace->wait_pipe(iter);
 
-		mutex_lock(&trace_types_lock);
+		mutex_lock(&iter->mutex);
 
 		if (signal_pending(current))
 			return -EINTR;
 
-		if (iter->trace != current_trace)
-			return 0;
-
 		/*
 		 * We block until we read something and tracing is disabled.
 		 * We still block if tracing is disabled, but we have never
@@ -2533,6 +2575,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
 		  size_t cnt, loff_t *ppos)
 {
 	struct trace_iterator *iter = filp->private_data;
+	static struct tracer *old_tracer;
 	ssize_t sret;
 
 	/* return any leftover data */
@@ -2542,7 +2585,20 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
 
 	trace_seq_reset(&iter->seq);
 
+	/* copy the tracer to avoid using a global lock all around */
 	mutex_lock(&trace_types_lock);
+	if (unlikely(old_tracer != current_trace && current_trace)) {
+		old_tracer = current_trace;
+		*iter->trace = *current_trace;
+	}
+	mutex_unlock(&trace_types_lock);
+
+	/*
+	 * Avoid more than one consumer on a single file descriptor
+	 * This is just a matter of traces coherency, the ring buffer itself
+	 * is protected.
+	 */
+	mutex_lock(&iter->mutex);
 	if (iter->trace->read) {
 		sret = iter->trace->read(iter, filp, ubuf, cnt, ppos);
 		if (sret)
@@ -2599,7 +2655,7 @@ waitagain:
 		goto waitagain;
 
 out:
-	mutex_unlock(&trace_types_lock);
+	mutex_unlock(&iter->mutex);
 
 	return sret;
 }
@@ -2676,11 +2732,20 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
 		.ops		= &tracing_pipe_buf_ops,
 		.spd_release	= tracing_spd_release_pipe,
 	};
+	static struct tracer *old_tracer;
 	ssize_t ret;
 	size_t rem;
 	unsigned int i;
 
+	/* copy the tracer to avoid using a global lock all around */
 	mutex_lock(&trace_types_lock);
+	if (unlikely(old_tracer != current_trace && current_trace)) {
+		old_tracer = current_trace;
+		*iter->trace = *current_trace;
+	}
+	mutex_unlock(&trace_types_lock);
+
+	mutex_lock(&iter->mutex);
 
 	if (iter->trace->splice_read) {
 		ret = iter->trace->splice_read(iter, filp,
@@ -2720,14 +2785,14 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
 		trace_seq_reset(&iter->seq);
 	}
 
-	mutex_unlock(&trace_types_lock);
+	mutex_unlock(&iter->mutex);
 
 	spd.nr_pages = i;
 
 	return splice_to_pipe(pipe, &spd);
 
 out_err:
-	mutex_unlock(&trace_types_lock);
+	mutex_unlock(&iter->mutex);
 
 	return ret;
 }
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 508235a39da6..632191770aac 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -405,8 +405,9 @@ struct trace_iterator {
 	struct trace_array	*tr;
 	struct tracer		*trace;
 	void			*private;
-	struct ring_buffer_iter	*buffer_iter[NR_CPUS];
 	int			cpu_file;
+	struct mutex		mutex;
+	struct ring_buffer_iter	*buffer_iter[NR_CPUS];
 
 	/* The below is zeroed out in pipe_read */
 	struct trace_seq	seq;
-- 
cgit v1.2.3


From 8969a5ede0f9e17da4b943712429aef2c9bcd82b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 25 Feb 2009 13:59:47 +0100
Subject: generic-ipi: remove kmalloc()

Remove the use of kmalloc() from the smp_call_function_*()
calls.

Steven's generic-ipi patch (d7240b98: generic-ipi: use per cpu
data for single cpu ipi calls) started the discussion on the use
of kmalloc() in this code and fixed the
smp_call_function_single(.wait=0) fallback case.

In this patch we complete this by also providing means for the
_many() call, which fully removes the need for kmalloc() in this
code.

The problem with the _many() call is that other cpus might still
be observing our entry when we're done with it. It solved this
by dynamically allocating data elements and RCU-freeing it.

We solve it by using a single per-cpu entry which provides
static storage and solves one half of the problem (avoiding
referencing freed data).

The other half, ensuring the queue iteration it still possible,
is done by placing re-used entries at the head of the list. This
means that if someone was still iterating that entry when it got
moved, he will now re-visit the entries on the list he had
already seen, but avoids skipping over entries like would have
happened had we placed the new entry at the end.

Furthermore, visiting entries twice is not a problem, since we
remove our cpu from the entry's cpumask once its called.

Many thanks to Oleg for his suggestions and him poking holes in
my earlier attempts.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/smp.c | 264 +++++++++++++++++++++++++++++++++++++----------------------
 1 file changed, 166 insertions(+), 98 deletions(-)

(limited to 'kernel')

diff --git a/kernel/smp.c b/kernel/smp.c
index 6ecf4b9895d4..7a0ce25829dc 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -10,23 +10,28 @@
 #include <linux/rcupdate.h>
 #include <linux/rculist.h>
 #include <linux/smp.h>
+#include <linux/cpu.h>
 
 static DEFINE_PER_CPU(struct call_single_queue, call_single_queue);
-static LIST_HEAD(call_function_queue);
-__cacheline_aligned_in_smp DEFINE_SPINLOCK(call_function_lock);
+
+static struct {
+	struct list_head	queue;
+	spinlock_t		lock;
+} call_function __cacheline_aligned_in_smp = {
+	.queue = LIST_HEAD_INIT(call_function.queue),
+	.lock  = __SPIN_LOCK_UNLOCKED(call_function.lock),
+};
 
 enum {
 	CSD_FLAG_WAIT		= 0x01,
-	CSD_FLAG_ALLOC		= 0x02,
-	CSD_FLAG_LOCK		= 0x04,
+	CSD_FLAG_LOCK		= 0x02,
 };
 
 struct call_function_data {
 	struct call_single_data csd;
 	spinlock_t lock;
 	unsigned int refs;
-	struct rcu_head rcu_head;
-	unsigned long cpumask_bits[];
+	cpumask_var_t cpumask;
 };
 
 struct call_single_queue {
@@ -34,8 +39,45 @@ struct call_single_queue {
 	spinlock_t lock;
 };
 
+static DEFINE_PER_CPU(struct call_function_data, cfd_data) = {
+	.lock = __SPIN_LOCK_UNLOCKED(cfd_data.lock),
+};
+
+static int
+hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
+{
+	long cpu = (long)hcpu;
+	struct call_function_data *cfd = &per_cpu(cfd_data, cpu);
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
+		if (!alloc_cpumask_var_node(&cfd->cpumask, GFP_KERNEL,
+				cpu_to_node(cpu)))
+			return NOTIFY_BAD;
+		break;
+
+#ifdef CONFIG_CPU_HOTPLUG
+	case CPU_UP_CANCELED:
+	case CPU_UP_CANCELED_FROZEN:
+
+	case CPU_DEAD:
+	case CPU_DEAD_FROZEN:
+		free_cpumask_var(cfd->cpumask);
+		break;
+#endif
+	};
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block __cpuinitdata hotplug_cfd_notifier = {
+	.notifier_call = hotplug_cfd,
+};
+
 static int __cpuinit init_call_single_data(void)
 {
+	void *cpu = (void *)(long)smp_processor_id();
 	int i;
 
 	for_each_possible_cpu(i) {
@@ -44,18 +86,69 @@ static int __cpuinit init_call_single_data(void)
 		spin_lock_init(&q->lock);
 		INIT_LIST_HEAD(&q->list);
 	}
+
+	hotplug_cfd(&hotplug_cfd_notifier, CPU_UP_PREPARE, cpu);
+	register_cpu_notifier(&hotplug_cfd_notifier);
+
 	return 0;
 }
 early_initcall(init_call_single_data);
 
-static void csd_flag_wait(struct call_single_data *data)
+/*
+ * csd_wait/csd_complete are used for synchronous ipi calls
+ */
+static void csd_wait_prepare(struct call_single_data *data)
 {
-	/* Wait for response */
-	do {
-		if (!(data->flags & CSD_FLAG_WAIT))
-			break;
+	data->flags |= CSD_FLAG_WAIT;
+}
+
+static void csd_complete(struct call_single_data *data)
+{
+	if (data->flags & CSD_FLAG_WAIT) {
+		/*
+		 * ensure we're all done before saying we are
+		 */
+		smp_mb();
+		data->flags &= ~CSD_FLAG_WAIT;
+	}
+}
+
+static void csd_wait(struct call_single_data *data)
+{
+	while (data->flags & CSD_FLAG_WAIT)
 		cpu_relax();
-	} while (1);
+}
+
+/*
+ * csd_lock/csd_unlock used to serialize access to per-cpu csd resources
+ *
+ * For non-synchronous ipi calls the csd can still be in use by the previous
+ * function call. For multi-cpu calls its even more interesting as we'll have
+ * to ensure no other cpu is observing our csd.
+ */
+static void csd_lock(struct call_single_data *data)
+{
+	while (data->flags & CSD_FLAG_LOCK)
+		cpu_relax();
+	data->flags = CSD_FLAG_LOCK;
+
+	/*
+	 * prevent CPU from reordering the above assignment to ->flags
+	 * with any subsequent assignments to other fields of the
+	 * specified call_single_data structure.
+	 */
+
+	smp_mb();
+}
+
+static void csd_unlock(struct call_single_data *data)
+{
+	WARN_ON(!(data->flags & CSD_FLAG_LOCK));
+	/*
+	 * ensure we're all done before releasing data
+	 */
+	smp_mb();
+	data->flags &= ~CSD_FLAG_LOCK;
 }
 
 /*
@@ -89,16 +182,7 @@ static void generic_exec_single(int cpu, struct call_single_data *data)
 		arch_send_call_function_single_ipi(cpu);
 
 	if (wait)
-		csd_flag_wait(data);
-}
-
-static void rcu_free_call_data(struct rcu_head *head)
-{
-	struct call_function_data *data;
-
-	data = container_of(head, struct call_function_data, rcu_head);
-
-	kfree(data);
+		csd_wait(data);
 }
 
 /*
@@ -122,41 +206,35 @@ void generic_smp_call_function_interrupt(void)
 	 * It's ok to use list_for_each_rcu() here even though we may delete
 	 * 'pos', since list_del_rcu() doesn't clear ->next
 	 */
-	rcu_read_lock();
-	list_for_each_entry_rcu(data, &call_function_queue, csd.list) {
+	list_for_each_entry_rcu(data, &call_function.queue, csd.list) {
 		int refs;
 
-		if (!cpumask_test_cpu(cpu, to_cpumask(data->cpumask_bits)))
+		spin_lock(&data->lock);
+		if (!cpumask_test_cpu(cpu, data->cpumask)) {
+			spin_unlock(&data->lock);
 			continue;
+		}
+		cpumask_clear_cpu(cpu, data->cpumask);
+		spin_unlock(&data->lock);
 
 		data->csd.func(data->csd.info);
 
 		spin_lock(&data->lock);
-		cpumask_clear_cpu(cpu, to_cpumask(data->cpumask_bits));
 		WARN_ON(data->refs == 0);
-		data->refs--;
-		refs = data->refs;
+		refs = --data->refs;
+		if (!refs) {
+			spin_lock(&call_function.lock);
+			list_del_rcu(&data->csd.list);
+			spin_unlock(&call_function.lock);
+		}
 		spin_unlock(&data->lock);
 
 		if (refs)
 			continue;
 
-		spin_lock(&call_function_lock);
-		list_del_rcu(&data->csd.list);
-		spin_unlock(&call_function_lock);
-
-		if (data->csd.flags & CSD_FLAG_WAIT) {
-			/*
-			 * serialize stores to data with the flag clear
-			 * and wakeup
-			 */
-			smp_wmb();
-			data->csd.flags &= ~CSD_FLAG_WAIT;
-		}
-		if (data->csd.flags & CSD_FLAG_ALLOC)
-			call_rcu(&data->rcu_head, rcu_free_call_data);
+		csd_complete(&data->csd);
+		csd_unlock(&data->csd);
 	}
-	rcu_read_unlock();
 
 	put_cpu();
 }
@@ -192,14 +270,14 @@ void generic_smp_call_function_single_interrupt(void)
 
 		data->func(data->info);
 
-		if (data_flags & CSD_FLAG_WAIT) {
-			smp_wmb();
-			data->flags &= ~CSD_FLAG_WAIT;
-		} else if (data_flags & CSD_FLAG_LOCK) {
-			smp_wmb();
-			data->flags &= ~CSD_FLAG_LOCK;
-		} else if (data_flags & CSD_FLAG_ALLOC)
-			kfree(data);
+		if (data_flags & CSD_FLAG_WAIT)
+			csd_complete(data);
+
+		/*
+		 * Unlocked CSDs are valid through generic_exec_single()
+		 */
+		if (data_flags & CSD_FLAG_LOCK)
+			csd_unlock(data);
 	}
 }
 
@@ -218,7 +296,9 @@ static DEFINE_PER_CPU(struct call_single_data, csd_data);
 int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
 			     int wait)
 {
-	struct call_single_data d;
+	struct call_single_data d = {
+		.flags = 0,
+	};
 	unsigned long flags;
 	/* prevent preemption and reschedule on another processor,
 	   as well as CPU removal */
@@ -239,13 +319,11 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
 			/*
 			 * We are calling a function on a single CPU
 			 * and we are not going to wait for it to finish.
-			 * We first try to allocate the data, but if we
-			 * fail, we fall back to use a per cpu data to pass
-			 * the information to that CPU. Since all callers
-			 * of this code will use the same data, we must
-			 * synchronize the callers to prevent a new caller
-			 * from corrupting the data before the callee
-			 * can access it.
+			 * We use a per cpu data to pass the information to
+			 * that CPU. Since all callers of this code will
+			 * use the same data, we must synchronize the
+			 * callers to prevent a new caller from corrupting
+			 * the data before the callee can access it.
 			 *
 			 * The CSD_FLAG_LOCK is used to let us know when
 			 * the IPI handler is done with the data.
@@ -255,18 +333,11 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
 			 * will make sure the callee is done with the
 			 * data before a new caller will use it.
 			 */
-			data = kmalloc(sizeof(*data), GFP_ATOMIC);
-			if (data)
-				data->flags = CSD_FLAG_ALLOC;
-			else {
-				data = &per_cpu(csd_data, me);
-				while (data->flags & CSD_FLAG_LOCK)
-					cpu_relax();
-				data->flags = CSD_FLAG_LOCK;
-			}
+			data = &__get_cpu_var(csd_data);
+			csd_lock(data);
 		} else {
 			data = &d;
-			data->flags = CSD_FLAG_WAIT;
+			csd_wait_prepare(data);
 		}
 
 		data->func = func;
@@ -326,14 +397,14 @@ void smp_call_function_many(const struct cpumask *mask,
 {
 	struct call_function_data *data;
 	unsigned long flags;
-	int cpu, next_cpu;
+	int cpu, next_cpu, me = smp_processor_id();
 
 	/* Can deadlock when called with interrupts disabled */
 	WARN_ON(irqs_disabled());
 
 	/* So, what's a CPU they want?  Ignoring this one. */
 	cpu = cpumask_first_and(mask, cpu_online_mask);
-	if (cpu == smp_processor_id())
+	if (cpu == me)
 		cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
 	/* No online cpus?  We're done. */
 	if (cpu >= nr_cpu_ids)
@@ -341,7 +412,7 @@ void smp_call_function_many(const struct cpumask *mask,
 
 	/* Do we have another CPU which isn't us? */
 	next_cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
-	if (next_cpu == smp_processor_id())
+	if (next_cpu == me)
 		next_cpu = cpumask_next_and(next_cpu, mask, cpu_online_mask);
 
 	/* Fastpath: do that cpu by itself. */
@@ -350,31 +421,28 @@ void smp_call_function_many(const struct cpumask *mask,
 		return;
 	}
 
-	data = kmalloc(sizeof(*data) + cpumask_size(), GFP_ATOMIC);
-	if (unlikely(!data)) {
-		/* Slow path. */
-		for_each_online_cpu(cpu) {
-			if (cpu == smp_processor_id())
-				continue;
-			if (cpumask_test_cpu(cpu, mask))
-				smp_call_function_single(cpu, func, info, wait);
-		}
-		return;
-	}
+	data = &__get_cpu_var(cfd_data);
+	csd_lock(&data->csd);
 
-	spin_lock_init(&data->lock);
-	data->csd.flags = CSD_FLAG_ALLOC;
+	spin_lock_irqsave(&data->lock, flags);
 	if (wait)
-		data->csd.flags |= CSD_FLAG_WAIT;
+		csd_wait_prepare(&data->csd);
+
 	data->csd.func = func;
 	data->csd.info = info;
-	cpumask_and(to_cpumask(data->cpumask_bits), mask, cpu_online_mask);
-	cpumask_clear_cpu(smp_processor_id(), to_cpumask(data->cpumask_bits));
-	data->refs = cpumask_weight(to_cpumask(data->cpumask_bits));
+	cpumask_and(data->cpumask, mask, cpu_online_mask);
+	cpumask_clear_cpu(me, data->cpumask);
+	data->refs = cpumask_weight(data->cpumask);
 
-	spin_lock_irqsave(&call_function_lock, flags);
-	list_add_tail_rcu(&data->csd.list, &call_function_queue);
-	spin_unlock_irqrestore(&call_function_lock, flags);
+	spin_lock(&call_function.lock);
+	/*
+	 * Place entry at the _HEAD_ of the list, so that any cpu still
+	 * observing the entry in generic_smp_call_function_interrupt() will
+	 * not miss any other list entries.
+	 */
+	list_add_rcu(&data->csd.list, &call_function.queue);
+	spin_unlock(&call_function.lock);
+	spin_unlock_irqrestore(&data->lock, flags);
 
 	/*
 	 * Make the list addition visible before sending the ipi.
@@ -384,11 +452,11 @@ void smp_call_function_many(const struct cpumask *mask,
 	smp_mb();
 
 	/* Send a message to all CPUs in the map */
-	arch_send_call_function_ipi_mask(to_cpumask(data->cpumask_bits));
+	arch_send_call_function_ipi_mask(data->cpumask);
 
 	/* optionally wait for the CPUs to complete */
 	if (wait)
-		csd_flag_wait(&data->csd);
+		csd_wait(&data->csd);
 }
 EXPORT_SYMBOL(smp_call_function_many);
 
@@ -418,20 +486,20 @@ EXPORT_SYMBOL(smp_call_function);
 
 void ipi_call_lock(void)
 {
-	spin_lock(&call_function_lock);
+	spin_lock(&call_function.lock);
 }
 
 void ipi_call_unlock(void)
 {
-	spin_unlock(&call_function_lock);
+	spin_unlock(&call_function.lock);
 }
 
 void ipi_call_lock_irq(void)
 {
-	spin_lock_irq(&call_function_lock);
+	spin_lock_irq(&call_function.lock);
 }
 
 void ipi_call_unlock_irq(void)
 {
-	spin_unlock_irq(&call_function_lock);
+	spin_unlock_irq(&call_function.lock);
 }
-- 
cgit v1.2.3


From 6e2756376c706e4da3454a272947983f92e80a7e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 25 Feb 2009 13:59:48 +0100
Subject: generic-ipi: remove CSD_FLAG_WAIT

Oleg noticed that we don't strictly need CSD_FLAG_WAIT, rework
the code so that we can use CSD_FLAG_LOCK for both purposes.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 block/blk-softirq.c |  2 +-
 include/linux/smp.h |  3 +-
 kernel/sched.c      |  2 +-
 kernel/smp.c        | 90 ++++++++++++++---------------------------------------
 kernel/softirq.c    |  2 +-
 5 files changed, 28 insertions(+), 71 deletions(-)

(limited to 'kernel')

diff --git a/block/blk-softirq.c b/block/blk-softirq.c
index ce0efc6b26dc..ee9c21602228 100644
--- a/block/blk-softirq.c
+++ b/block/blk-softirq.c
@@ -64,7 +64,7 @@ static int raise_blk_irq(int cpu, struct request *rq)
 		data->info = rq;
 		data->flags = 0;
 
-		__smp_call_function_single(cpu, data);
+		__smp_call_function_single(cpu, data, 0);
 		return 0;
 	}
 
diff --git a/include/linux/smp.h b/include/linux/smp.h
index 715196b09d67..00866d7fdf34 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -82,7 +82,8 @@ smp_call_function_mask(cpumask_t mask, void(*func)(void *info), void *info,
 	return 0;
 }
 
-void __smp_call_function_single(int cpuid, struct call_single_data *data);
+void __smp_call_function_single(int cpuid, struct call_single_data *data,
+				int wait);
 
 /*
  * Generic and arch helpers
diff --git a/kernel/sched.c b/kernel/sched.c
index 410eec404133..d4c2749a2998 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1093,7 +1093,7 @@ static void hrtick_start(struct rq *rq, u64 delay)
 	if (rq == this_rq()) {
 		hrtimer_restart(timer);
 	} else if (!rq->hrtick_csd_pending) {
-		__smp_call_function_single(cpu_of(rq), &rq->hrtick_csd);
+		__smp_call_function_single(cpu_of(rq), &rq->hrtick_csd, 0);
 		rq->hrtick_csd_pending = 1;
 	}
 }
diff --git a/kernel/smp.c b/kernel/smp.c
index 7a0ce25829dc..f5308258891a 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -23,8 +23,7 @@ static struct {
 };
 
 enum {
-	CSD_FLAG_WAIT		= 0x01,
-	CSD_FLAG_LOCK		= 0x02,
+	CSD_FLAG_LOCK		= 0x01,
 };
 
 struct call_function_data {
@@ -94,31 +93,6 @@ static int __cpuinit init_call_single_data(void)
 }
 early_initcall(init_call_single_data);
 
-/*
- * csd_wait/csd_complete are used for synchronous ipi calls
- */
-static void csd_wait_prepare(struct call_single_data *data)
-{
-	data->flags |= CSD_FLAG_WAIT;
-}
-
-static void csd_complete(struct call_single_data *data)
-{
-	if (data->flags & CSD_FLAG_WAIT) {
-		/*
-		 * ensure we're all done before saying we are
-		 */
-		smp_mb();
-		data->flags &= ~CSD_FLAG_WAIT;
-	}
-}
-
-static void csd_wait(struct call_single_data *data)
-{
-	while (data->flags & CSD_FLAG_WAIT)
-		cpu_relax();
-}
-
 /*
  * csd_lock/csd_unlock used to serialize access to per-cpu csd resources
  *
@@ -126,10 +100,15 @@ static void csd_wait(struct call_single_data *data)
  * function call. For multi-cpu calls its even more interesting as we'll have
  * to ensure no other cpu is observing our csd.
  */
-static void csd_lock(struct call_single_data *data)
+static void csd_lock_wait(struct call_single_data *data)
 {
 	while (data->flags & CSD_FLAG_LOCK)
 		cpu_relax();
+}
+
+static void csd_lock(struct call_single_data *data)
+{
+	csd_lock_wait(data);
 	data->flags = CSD_FLAG_LOCK;
 
 	/*
@@ -155,11 +134,12 @@ static void csd_unlock(struct call_single_data *data)
  * Insert a previously allocated call_single_data element for execution
  * on the given CPU. data must already have ->func, ->info, and ->flags set.
  */
-static void generic_exec_single(int cpu, struct call_single_data *data)
+static
+void generic_exec_single(int cpu, struct call_single_data *data, int wait)
 {
 	struct call_single_queue *dst = &per_cpu(call_single_queue, cpu);
-	int wait = data->flags & CSD_FLAG_WAIT, ipi;
 	unsigned long flags;
+	int ipi;
 
 	spin_lock_irqsave(&dst->lock, flags);
 	ipi = list_empty(&dst->list);
@@ -182,7 +162,7 @@ static void generic_exec_single(int cpu, struct call_single_data *data)
 		arch_send_call_function_single_ipi(cpu);
 
 	if (wait)
-		csd_wait(data);
+		csd_lock_wait(data);
 }
 
 /*
@@ -232,7 +212,6 @@ void generic_smp_call_function_interrupt(void)
 		if (refs)
 			continue;
 
-		csd_complete(&data->csd);
 		csd_unlock(&data->csd);
 	}
 
@@ -270,9 +249,6 @@ void generic_smp_call_function_single_interrupt(void)
 
 		data->func(data->info);
 
-		if (data_flags & CSD_FLAG_WAIT)
-			csd_complete(data);
-
 		/*
 		 * Unlocked CSDs are valid through generic_exec_single()
 		 */
@@ -313,36 +289,16 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
 		func(info);
 		local_irq_restore(flags);
 	} else if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) {
-		struct call_single_data *data;
+		struct call_single_data *data = &d;
 
-		if (!wait) {
-			/*
-			 * We are calling a function on a single CPU
-			 * and we are not going to wait for it to finish.
-			 * We use a per cpu data to pass the information to
-			 * that CPU. Since all callers of this code will
-			 * use the same data, we must synchronize the
-			 * callers to prevent a new caller from corrupting
-			 * the data before the callee can access it.
-			 *
-			 * The CSD_FLAG_LOCK is used to let us know when
-			 * the IPI handler is done with the data.
-			 * The first caller will set it, and the callee
-			 * will clear it. The next caller must wait for
-			 * it to clear before we set it again. This
-			 * will make sure the callee is done with the
-			 * data before a new caller will use it.
-			 */
+		if (!wait)
 			data = &__get_cpu_var(csd_data);
-			csd_lock(data);
-		} else {
-			data = &d;
-			csd_wait_prepare(data);
-		}
+
+		csd_lock(data);
 
 		data->func = func;
 		data->info = info;
-		generic_exec_single(cpu, data);
+		generic_exec_single(cpu, data, wait);
 	} else {
 		err = -ENXIO;	/* CPU not online */
 	}
@@ -362,12 +318,15 @@ EXPORT_SYMBOL(smp_call_function_single);
  * instance.
  *
  */
-void __smp_call_function_single(int cpu, struct call_single_data *data)
+void __smp_call_function_single(int cpu, struct call_single_data *data,
+				int wait)
 {
+	csd_lock(data);
+
 	/* Can deadlock when called with interrupts disabled */
-	WARN_ON((data->flags & CSD_FLAG_WAIT) && irqs_disabled());
+	WARN_ON(wait && irqs_disabled());
 
-	generic_exec_single(cpu, data);
+	generic_exec_single(cpu, data, wait);
 }
 
 /* FIXME: Shim for archs using old arch_send_call_function_ipi API. */
@@ -425,9 +384,6 @@ void smp_call_function_many(const struct cpumask *mask,
 	csd_lock(&data->csd);
 
 	spin_lock_irqsave(&data->lock, flags);
-	if (wait)
-		csd_wait_prepare(&data->csd);
-
 	data->csd.func = func;
 	data->csd.info = info;
 	cpumask_and(data->cpumask, mask, cpu_online_mask);
@@ -456,7 +412,7 @@ void smp_call_function_many(const struct cpumask *mask,
 
 	/* optionally wait for the CPUs to complete */
 	if (wait)
-		csd_wait(&data->csd);
+		csd_lock_wait(&data->csd);
 }
 EXPORT_SYMBOL(smp_call_function_many);
 
diff --git a/kernel/softirq.c b/kernel/softirq.c
index bdbe9de9cd8d..48c3d5d627a8 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -496,7 +496,7 @@ static int __try_remote_softirq(struct call_single_data *cp, int cpu, int softir
 		cp->flags = 0;
 		cp->priv = softirq;
 
-		__smp_call_function_single(cpu, cp);
+		__smp_call_function_single(cpu, cp, 0);
 		return 0;
 	}
 	return 1;
-- 
cgit v1.2.3


From 0b13fda1e0936b3d64c4c407f183d33fa6bd2ad4 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 25 Feb 2009 16:52:11 +0100
Subject: generic-ipi: cleanups

Andrew pointed out that there's some small amount of
style rot in kernel/smp.c.

Clean it up.

Reported-by: Andrew Morton <akpm@linux-foundation.org>
Cc: Nick Piggin <npiggin@suse.de>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/smp.c | 162 +++++++++++++++++++++++++++++++----------------------------
 1 file changed, 86 insertions(+), 76 deletions(-)

(limited to 'kernel')

diff --git a/kernel/smp.c b/kernel/smp.c
index f5308258891a..7ad2262d2eca 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -2,13 +2,12 @@
  * Generic helpers for smp ipi calls
  *
  * (C) Jens Axboe <jens.axboe@oracle.com> 2008
- *
  */
-#include <linux/init.h>
-#include <linux/module.h>
-#include <linux/percpu.h>
 #include <linux/rcupdate.h>
 #include <linux/rculist.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/init.h>
 #include <linux/smp.h>
 #include <linux/cpu.h>
 
@@ -17,29 +16,30 @@ static DEFINE_PER_CPU(struct call_single_queue, call_single_queue);
 static struct {
 	struct list_head	queue;
 	spinlock_t		lock;
-} call_function __cacheline_aligned_in_smp = {
-	.queue = LIST_HEAD_INIT(call_function.queue),
-	.lock  = __SPIN_LOCK_UNLOCKED(call_function.lock),
-};
+} call_function __cacheline_aligned_in_smp =
+	{
+		.queue		= LIST_HEAD_INIT(call_function.queue),
+		.lock		= __SPIN_LOCK_UNLOCKED(call_function.lock),
+	};
 
 enum {
 	CSD_FLAG_LOCK		= 0x01,
 };
 
 struct call_function_data {
-	struct call_single_data csd;
-	spinlock_t lock;
-	unsigned int refs;
-	cpumask_var_t cpumask;
+	struct call_single_data	csd;
+	spinlock_t		lock;
+	unsigned int		refs;
+	cpumask_var_t		cpumask;
 };
 
 struct call_single_queue {
-	struct list_head list;
-	spinlock_t lock;
+	struct list_head	list;
+	spinlock_t		lock;
 };
 
 static DEFINE_PER_CPU(struct call_function_data, cfd_data) = {
-	.lock = __SPIN_LOCK_UNLOCKED(cfd_data.lock),
+	.lock			= __SPIN_LOCK_UNLOCKED(cfd_data.lock),
 };
 
 static int
@@ -71,7 +71,7 @@ hotplug_cfd(struct notifier_block *nfb, unsigned long action, void *hcpu)
 }
 
 static struct notifier_block __cpuinitdata hotplug_cfd_notifier = {
-	.notifier_call = hotplug_cfd,
+	.notifier_call		= hotplug_cfd,
 };
 
 static int __cpuinit init_call_single_data(void)
@@ -96,9 +96,9 @@ early_initcall(init_call_single_data);
 /*
  * csd_lock/csd_unlock used to serialize access to per-cpu csd resources
  *
- * For non-synchronous ipi calls the csd can still be in use by the previous
- * function call. For multi-cpu calls its even more interesting as we'll have
- * to ensure no other cpu is observing our csd.
+ * For non-synchronous ipi calls the csd can still be in use by the
+ * previous function call. For multi-cpu calls its even more interesting
+ * as we'll have to ensure no other cpu is observing our csd.
  */
 static void csd_lock_wait(struct call_single_data *data)
 {
@@ -112,27 +112,29 @@ static void csd_lock(struct call_single_data *data)
 	data->flags = CSD_FLAG_LOCK;
 
 	/*
-	 * prevent CPU from reordering the above assignment to ->flags
-	 * with any subsequent assignments to other fields of the
-	 * specified call_single_data structure.
+	 * prevent CPU from reordering the above assignment
+	 * to ->flags with any subsequent assignments to other
+	 * fields of the specified call_single_data structure:
 	 */
-
 	smp_mb();
 }
 
 static void csd_unlock(struct call_single_data *data)
 {
 	WARN_ON(!(data->flags & CSD_FLAG_LOCK));
+
 	/*
-	 * ensure we're all done before releasing data
+	 * ensure we're all done before releasing data:
 	 */
 	smp_mb();
+
 	data->flags &= ~CSD_FLAG_LOCK;
 }
 
 /*
- * Insert a previously allocated call_single_data element for execution
- * on the given CPU. data must already have ->func, ->info, and ->flags set.
+ * Insert a previously allocated call_single_data element
+ * for execution on the given CPU. data must already have
+ * ->func, ->info, and ->flags set.
  */
 static
 void generic_exec_single(int cpu, struct call_single_data *data, int wait)
@@ -154,10 +156,9 @@ void generic_exec_single(int cpu, struct call_single_data *data, int wait)
 	 * If IPIs can go out of order to the cache coherency protocol
 	 * in an architecture, sufficient synchronisation should be added
 	 * to arch code to make it appear to obey cache coherency WRT
-	 * locking and barrier primitives. Generic code isn't really equipped
-	 * to do the right thing...
+	 * locking and barrier primitives. Generic code isn't really
+	 * equipped to do the right thing...
 	 */
-
 	if (ipi)
 		arch_send_call_function_single_ipi(cpu);
 
@@ -183,8 +184,8 @@ void generic_smp_call_function_interrupt(void)
 	smp_mb();
 
 	/*
-	 * It's ok to use list_for_each_rcu() here even though we may delete
-	 * 'pos', since list_del_rcu() doesn't clear ->next
+	 * It's ok to use list_for_each_rcu() here even though we may
+	 * delete 'pos', since list_del_rcu() doesn't clear ->next
 	 */
 	list_for_each_entry_rcu(data, &call_function.queue, csd.list) {
 		int refs;
@@ -219,14 +220,14 @@ void generic_smp_call_function_interrupt(void)
 }
 
 /*
- * Invoked by arch to handle an IPI for call function single. Must be called
- * from the arch with interrupts disabled.
+ * Invoked by arch to handle an IPI for call function single. Must be
+ * called from the arch with interrupts disabled.
  */
 void generic_smp_call_function_single_interrupt(void)
 {
 	struct call_single_queue *q = &__get_cpu_var(call_single_queue);
-	LIST_HEAD(list);
 	unsigned int data_flags;
+	LIST_HEAD(list);
 
 	spin_lock(&q->lock);
 	list_replace_init(&q->list, &list);
@@ -235,22 +236,20 @@ void generic_smp_call_function_single_interrupt(void)
 	while (!list_empty(&list)) {
 		struct call_single_data *data;
 
-		data = list_entry(list.next, struct call_single_data,
-					list);
+		data = list_entry(list.next, struct call_single_data, list);
 		list_del(&data->list);
 
 		/*
-		 * 'data' can be invalid after this call if
-		 * flags == 0 (when called through
-		 * generic_exec_single(), so save them away before
-		 * making the call.
+		 * 'data' can be invalid after this call if flags == 0
+		 * (when called through generic_exec_single()),
+		 * so save them away before making the call:
 		 */
 		data_flags = data->flags;
 
 		data->func(data->info);
 
 		/*
-		 * Unlocked CSDs are valid through generic_exec_single()
+		 * Unlocked CSDs are valid through generic_exec_single():
 		 */
 		if (data_flags & CSD_FLAG_LOCK)
 			csd_unlock(data);
@@ -276,34 +275,41 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
 		.flags = 0,
 	};
 	unsigned long flags;
-	/* prevent preemption and reschedule on another processor,
-	   as well as CPU removal */
-	int me = get_cpu();
+	int this_cpu;
 	int err = 0;
 
+	/*
+	 * prevent preemption and reschedule on another processor,
+	 * as well as CPU removal
+	 */
+	this_cpu = get_cpu();
+
 	/* Can deadlock when called with interrupts disabled */
 	WARN_ON(irqs_disabled());
 
-	if (cpu == me) {
+	if (cpu == this_cpu) {
 		local_irq_save(flags);
 		func(info);
 		local_irq_restore(flags);
-	} else if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) {
-		struct call_single_data *data = &d;
+	} else {
+		if ((unsigned)cpu < nr_cpu_ids && cpu_online(cpu)) {
+			struct call_single_data *data = &d;
 
-		if (!wait)
-			data = &__get_cpu_var(csd_data);
+			if (!wait)
+				data = &__get_cpu_var(csd_data);
 
-		csd_lock(data);
+			csd_lock(data);
 
-		data->func = func;
-		data->info = info;
-		generic_exec_single(cpu, data, wait);
-	} else {
-		err = -ENXIO;	/* CPU not online */
+			data->func = func;
+			data->info = info;
+			generic_exec_single(cpu, data, wait);
+		} else {
+			err = -ENXIO;	/* CPU not online */
+		}
 	}
 
 	put_cpu();
+
 	return err;
 }
 EXPORT_SYMBOL(smp_call_function_single);
@@ -313,10 +319,9 @@ EXPORT_SYMBOL(smp_call_function_single);
  * @cpu: The CPU to run on.
  * @data: Pre-allocated and setup data structure
  *
- * Like smp_call_function_single(), but allow caller to pass in a pre-allocated
- * data structure. Useful for embedding @data inside other structures, for
- * instance.
- *
+ * Like smp_call_function_single(), but allow caller to pass in a
+ * pre-allocated data structure. Useful for embedding @data inside
+ * other structures, for instance.
  */
 void __smp_call_function_single(int cpu, struct call_single_data *data,
 				int wait)
@@ -329,10 +334,11 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
 	generic_exec_single(cpu, data, wait);
 }
 
-/* FIXME: Shim for archs using old arch_send_call_function_ipi API. */
+/* Deprecated: shim for archs using old arch_send_call_function_ipi API. */
+
 #ifndef arch_send_call_function_ipi_mask
-#define arch_send_call_function_ipi_mask(maskp) \
-	arch_send_call_function_ipi(*(maskp))
+# define arch_send_call_function_ipi_mask(maskp) \
+	 arch_send_call_function_ipi(*(maskp))
 #endif
 
 /**
@@ -340,7 +346,8 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
  * @mask: The set of cpus to run on (only runs on online subset).
  * @func: The function to run. This must be fast and non-blocking.
  * @info: An arbitrary pointer to pass to the function.
- * @wait: If true, wait (atomically) until function has completed on other CPUs.
+ * @wait: If true, wait (atomically) until function has completed
+ *        on other CPUs.
  *
  * If @wait is true, then returns once @func has returned. Note that @wait
  * will be implicitly turned on in case of allocation failures, since
@@ -351,27 +358,27 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
  * must be disabled when calling this function.
  */
 void smp_call_function_many(const struct cpumask *mask,
-			    void (*func)(void *), void *info,
-			    bool wait)
+			    void (*func)(void *), void *info, bool wait)
 {
 	struct call_function_data *data;
 	unsigned long flags;
-	int cpu, next_cpu, me = smp_processor_id();
+	int cpu, next_cpu, this_cpu = smp_processor_id();
 
 	/* Can deadlock when called with interrupts disabled */
 	WARN_ON(irqs_disabled());
 
-	/* So, what's a CPU they want?  Ignoring this one. */
+	/* So, what's a CPU they want? Ignoring this one. */
 	cpu = cpumask_first_and(mask, cpu_online_mask);
-	if (cpu == me)
+	if (cpu == this_cpu)
 		cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
+
 	/* No online cpus?  We're done. */
 	if (cpu >= nr_cpu_ids)
 		return;
 
 	/* Do we have another CPU which isn't us? */
 	next_cpu = cpumask_next_and(cpu, mask, cpu_online_mask);
-	if (next_cpu == me)
+	if (next_cpu == this_cpu)
 		next_cpu = cpumask_next_and(next_cpu, mask, cpu_online_mask);
 
 	/* Fastpath: do that cpu by itself. */
@@ -387,30 +394,31 @@ void smp_call_function_many(const struct cpumask *mask,
 	data->csd.func = func;
 	data->csd.info = info;
 	cpumask_and(data->cpumask, mask, cpu_online_mask);
-	cpumask_clear_cpu(me, data->cpumask);
+	cpumask_clear_cpu(this_cpu, data->cpumask);
 	data->refs = cpumask_weight(data->cpumask);
 
 	spin_lock(&call_function.lock);
 	/*
 	 * Place entry at the _HEAD_ of the list, so that any cpu still
-	 * observing the entry in generic_smp_call_function_interrupt() will
-	 * not miss any other list entries.
+	 * observing the entry in generic_smp_call_function_interrupt()
+	 * will not miss any other list entries:
 	 */
 	list_add_rcu(&data->csd.list, &call_function.queue);
 	spin_unlock(&call_function.lock);
+
 	spin_unlock_irqrestore(&data->lock, flags);
 
 	/*
 	 * Make the list addition visible before sending the ipi.
-	 * (IPIs must obey or appear to obey normal Linux cache coherency
-	 * rules -- see comment in generic_exec_single).
+	 * (IPIs must obey or appear to obey normal Linux cache
+	 * coherency rules -- see comment in generic_exec_single).
 	 */
 	smp_mb();
 
 	/* Send a message to all CPUs in the map */
 	arch_send_call_function_ipi_mask(data->cpumask);
 
-	/* optionally wait for the CPUs to complete */
+	/* Optionally wait for the CPUs to complete */
 	if (wait)
 		csd_lock_wait(&data->csd);
 }
@@ -420,7 +428,8 @@ EXPORT_SYMBOL(smp_call_function_many);
  * smp_call_function(): Run a function on all other CPUs.
  * @func: The function to run. This must be fast and non-blocking.
  * @info: An arbitrary pointer to pass to the function.
- * @wait: If true, wait (atomically) until function has completed on other CPUs.
+ * @wait: If true, wait (atomically) until function has completed
+ *        on other CPUs.
  *
  * Returns 0.
  *
@@ -436,6 +445,7 @@ int smp_call_function(void (*func)(void *), void *info, int wait)
 	preempt_disable();
 	smp_call_function_many(cpu_online_mask, func, info, wait);
 	preempt_enable();
+
 	return 0;
 }
 EXPORT_SYMBOL(smp_call_function);
-- 
cgit v1.2.3


From 53bbfa9e9437e70b322368e82c723112d690e304 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 20 Feb 2008 07:58:42 +0100
Subject: time: ntp: clean up kernel/time/ntp.c

Impact: cleanup, no functionality changed

Make this file a bit more readable by applying a consistent coding style.

No code changed:

kernel/time/ntp.o:

   text	   data	    bss	    dec	    hex	filename
   2552	    170	    168	   2890	    b4a	ntp.o.before
   2552	    170	    168	   2890	    b4a	ntp.o.after

md5:
   eae1275df0b7d6290c13f6f6f8f05c8c  ntp.o.before.asm
   eae1275df0b7d6290c13f6f6f8f05c8c  ntp.o.after.asm

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/time/ntp.c | 129 ++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 81 insertions(+), 48 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index e1fa3689a903..3479ec48e604 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -1,53 +1,81 @@
 /*
- * linux/kernel/time/ntp.c
- *
  * NTP state machine interfaces and logic.
  *
  * This code was mainly moved from kernel/timer.c and kernel/time.c
  * Please see those files for relevant copyright info and historical
  * changelogs.
  */
-
-#include <linux/mm.h>
-#include <linux/time.h>
-#include <linux/timex.h>
-#include <linux/jiffies.h>
-#include <linux/hrtimer.h>
 #include <linux/capability.h>
-#include <linux/math64.h>
 #include <linux/clocksource.h>
 #include <linux/workqueue.h>
-#include <asm/timex.h>
+#include <linux/hrtimer.h>
+#include <linux/jiffies.h>
+#include <linux/math64.h>
+#include <linux/timex.h>
+#include <linux/time.h>
+#include <linux/mm.h>
 
 /*
- * Timekeeping variables
+ * NTP timekeeping variables:
  */
-unsigned long tick_usec = TICK_USEC; 		/* USER_HZ period (usec) */
-unsigned long tick_nsec;			/* ACTHZ period (nsec) */
-u64 tick_length;
-static u64 tick_length_base;
 
-static struct hrtimer leap_timer;
+/* USER_HZ period (usecs): */
+unsigned long			tick_usec = TICK_USEC;
+
+/* ACTHZ period (nsecs): */
+unsigned long			tick_nsec;
 
-#define MAX_TICKADJ		500		/* microsecs */
-#define MAX_TICKADJ_SCALED	(((u64)(MAX_TICKADJ * NSEC_PER_USEC) << \
-				  NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ)
+u64				tick_length;
+static u64			tick_length_base;
+
+static struct hrtimer		leap_timer;
+
+#define MAX_TICKADJ		500		/* usecs */
+#define MAX_TICKADJ_SCALED \
+  (((u64)(MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ)
 
 /*
  * phase-lock loop variables
  */
-/* TIME_ERROR prevents overwriting the CMOS clock */
-static int time_state = TIME_OK;	/* clock synchronization status	*/
-int time_status = STA_UNSYNC;		/* clock status bits		*/
-static long time_tai;			/* TAI offset (s)		*/
-static s64 time_offset;			/* time adjustment (ns)		*/
-static long time_constant = 2;		/* pll time constant		*/
-long time_maxerror = NTP_PHASE_LIMIT;	/* maximum error (us)		*/
-long time_esterror = NTP_PHASE_LIMIT;	/* estimated error (us)		*/
-static s64 time_freq;			/* frequency offset (scaled ns/s)*/
-static long time_reftime;		/* time at last adjustment (s)	*/
-long time_adjust;
-static long ntp_tick_adj;
+
+/*
+ * clock synchronization status
+ *
+ * (TIME_ERROR prevents overwriting the CMOS clock)
+ */
+static int			time_state = TIME_OK;
+
+/* clock status bits:							*/
+int				time_status = STA_UNSYNC;
+
+/* TAI offset (secs):							*/
+static long			time_tai;
+
+/* time adjustment (nsecs):						*/
+static s64			time_offset;
+
+/* pll time constant:							*/
+static long			time_constant = 2;
+
+/* maximum error (usecs):						*/
+long				time_maxerror = NTP_PHASE_LIMIT;
+
+/* estimated error (usecs):						*/
+long				time_esterror = NTP_PHASE_LIMIT;
+
+/* frequency offset (scaled nsecs/secs):				*/
+static s64			time_freq;
+
+/* time at last adjustment (secs):					*/
+static long			time_reftime;
+
+long				time_adjust;
+
+static long			ntp_tick_adj;
+
+/*
+ * NTP methods:
+ */
 
 static void ntp_update_frequency(void)
 {
@@ -118,15 +146,15 @@ static void ntp_update_offset(long offset)
  */
 void ntp_clear(void)
 {
-	time_adjust = 0;		/* stop active adjtime() */
-	time_status |= STA_UNSYNC;
-	time_maxerror = NTP_PHASE_LIMIT;
-	time_esterror = NTP_PHASE_LIMIT;
+	time_adjust	= 0;		/* stop active adjtime() */
+	time_status	|= STA_UNSYNC;
+	time_maxerror	= NTP_PHASE_LIMIT;
+	time_esterror	= NTP_PHASE_LIMIT;
 
 	ntp_update_frequency();
 
-	tick_length = tick_length_base;
-	time_offset = 0;
+	tick_length	= tick_length_base;
+	time_offset	= 0;
 }
 
 /*
@@ -147,8 +175,8 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
 		xtime.tv_sec--;
 		wall_to_monotonic.tv_sec++;
 		time_state = TIME_OOP;
-		printk(KERN_NOTICE "Clock: "
-		       "inserting leap second 23:59:60 UTC\n");
+		printk(KERN_NOTICE
+			"Clock: inserting leap second 23:59:60 UTC\n");
 		hrtimer_add_expires_ns(&leap_timer, NSEC_PER_SEC);
 		res = HRTIMER_RESTART;
 		break;
@@ -157,8 +185,8 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
 		time_tai--;
 		wall_to_monotonic.tv_sec--;
 		time_state = TIME_WAIT;
-		printk(KERN_NOTICE "Clock: "
-		       "deleting leap second 23:59:59 UTC\n");
+		printk(KERN_NOTICE
+			"Clock: deleting leap second 23:59:59 UTC\n");
 		break;
 	case TIME_OOP:
 		time_tai++;
@@ -199,10 +227,10 @@ void second_overflow(void)
 	 * Compute the phase adjustment for the next second. The offset is
 	 * reduced by a fixed factor times the time constant.
 	 */
-	tick_length = tick_length_base;
-	time_adj = shift_right(time_offset, SHIFT_PLL + time_constant);
-	time_offset -= time_adj;
-	tick_length += time_adj;
+	tick_length	= tick_length_base;
+	time_adj	= shift_right(time_offset, SHIFT_PLL + time_constant);
+	time_offset	-= time_adj;
+	tick_length	+= time_adj;
 
 	if (unlikely(time_adjust)) {
 		if (time_adjust > MAX_TICKADJ) {
@@ -240,12 +268,13 @@ static void sync_cmos_clock(struct work_struct *work)
 	 * This code is run on a timer.  If the clock is set, that timer
 	 * may not expire at the correct time.  Thus, we adjust...
 	 */
-	if (!ntp_synced())
+	if (!ntp_synced()) {
 		/*
 		 * Not synced, exit, do not restart a timer (if one is
 		 * running, let it run out).
 		 */
 		return;
+	}
 
 	getnstimeofday(&now);
 	if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2)
@@ -277,7 +306,8 @@ static void notify_cmos_timer(void)
 static inline void notify_cmos_timer(void) { }
 #endif
 
-/* adjtimex mainly allows reading (and writing, if superuser) of
+/*
+ * adjtimex mainly allows reading (and writing, if superuser) of
  * kernel time-keeping variables. used by xntpd.
  */
 int do_adjtimex(struct timex *txc)
@@ -298,7 +328,10 @@ int do_adjtimex(struct timex *txc)
 		 if (txc->modes && !capable(CAP_SYS_TIME))
 			return -EPERM;
 
-		/* if the quartz is off by more than 10% something is VERY wrong! */
+		/*
+		 * if the quartz is off by more than 10% then
+		 * something is VERY wrong!
+		 */
 		if (txc->modes & ADJ_TICK &&
 		    (txc->tick <  900000/USER_HZ ||
 		     txc->tick > 1100000/USER_HZ))
-- 
cgit v1.2.3


From 3c972c2444dcb7088999c32b8c5a7ab3b8a6c0b6 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 22 Feb 2009 12:06:57 +0100
Subject: time: ntp: simplify the second_overflow() code flow

Impact: cleanup, no functionality changed

Instead of a hierarchy of conditions, transform them to clean
gradual conditions and return's.

This makes the flow easier to read and makes the purpose of
the function easier to understand.

kernel/time/ntp.o:

   text	   data	    bss	    dec	    hex	filename
   2552	    170	    168	   2890	    b4a	ntp.o.before
   2552	    170	    168	   2890	    b4a	ntp.o.after

md5:
   eae1275df0b7d6290c13f6f6f8f05c8c  ntp.o.before.asm
   eae1275df0b7d6290c13f6f6f8f05c8c  ntp.o.after.asm

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/time/ntp.c | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 3479ec48e604..1fa6615b317a 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -232,19 +232,24 @@ void second_overflow(void)
 	time_offset	-= time_adj;
 	tick_length	+= time_adj;
 
-	if (unlikely(time_adjust)) {
-		if (time_adjust > MAX_TICKADJ) {
-			time_adjust -= MAX_TICKADJ;
-			tick_length += MAX_TICKADJ_SCALED;
-		} else if (time_adjust < -MAX_TICKADJ) {
-			time_adjust += MAX_TICKADJ;
-			tick_length -= MAX_TICKADJ_SCALED;
-		} else {
-			tick_length += (s64)(time_adjust * NSEC_PER_USEC /
-					NTP_INTERVAL_FREQ) << NTP_SCALE_SHIFT;
-			time_adjust = 0;
-		}
+	if (!time_adjust)
+		return;
+
+	if (time_adjust > MAX_TICKADJ) {
+		time_adjust -= MAX_TICKADJ;
+		tick_length += MAX_TICKADJ_SCALED;
+		return;
 	}
+
+	if (time_adjust < -MAX_TICKADJ) {
+		time_adjust += MAX_TICKADJ;
+		tick_length -= MAX_TICKADJ_SCALED;
+		return;
+	}
+
+	tick_length += (s64)(time_adjust * NSEC_PER_USEC / NTP_INTERVAL_FREQ)
+							 << NTP_SCALE_SHIFT;
+	time_adjust = 0;
 }
 
 #ifdef CONFIG_GENERIC_CMOS_UPDATE
-- 
cgit v1.2.3


From bbd1267690bb6940d0722dd33e929442c0409c01 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 22 Feb 2009 12:11:11 +0100
Subject: time: ntp: simplify the MAX_TICKADJ_SCALED definition

Impact: cleanup, no functionality changed

There's an ugly u64 typecase in the MAX_TICKADJ_SCALED definition,
this can be eliminated by making the MAX_TICKADJ constant's type
64-bit (signed).

kernel/time/ntp.o:

   text	   data	    bss	    dec	    hex	filename
   2504	    114	    136	   2754	    ac2	ntp.o.before
   2504	    114	    136	   2754	    ac2	ntp.o.after

md5:
   41f3009debc9b397d7394dd77d912f0a  ntp.o.before.asm
   41f3009debc9b397d7394dd77d912f0a  ntp.o.after.asm

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/time/ntp.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 1fa6615b317a..2b758c935c65 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -30,9 +30,9 @@ static u64			tick_length_base;
 
 static struct hrtimer		leap_timer;
 
-#define MAX_TICKADJ		500		/* usecs */
+#define MAX_TICKADJ		500LL		/* usecs */
 #define MAX_TICKADJ_SCALED \
-  (((u64)(MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ)
+	(((MAX_TICKADJ * NSEC_PER_USEC) << NTP_SCALE_SHIFT) / NTP_INTERVAL_FREQ)
 
 /*
  * phase-lock loop variables
-- 
cgit v1.2.3


From 9ce616aaefcb9309cb9c49a36310ebda6061b98b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 22 Feb 2009 12:42:59 +0100
Subject: time: ntp: clean up ntp_update_frequency()

Impact: cleanup, no functionality changed

Prepare a refactoring of ntp_update_frequency().

kernel/time/ntp.o:

   text	   data	    bss	    dec	    hex	filename
   2504	    114	    136	   2754	    ac2	ntp.o.before
   2504	    114	    136	   2754	    ac2	ntp.o.after

md5:
   41f3009debc9b397d7394dd77d912f0a  ntp.o.before.asm
   41f3009debc9b397d7394dd77d912f0a  ntp.o.after.asm

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/time/ntp.c | 27 ++++++++++++++++++---------
 1 file changed, 18 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 2b758c935c65..7d281d9fbe30 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -77,24 +77,33 @@ static long			ntp_tick_adj;
  * NTP methods:
  */
 
+/*
+ * Update (tick_length, tick_length_base, tick_nsec), based
+ * on (tick_usec, ntp_tick_adj, time_freq):
+ */
 static void ntp_update_frequency(void)
 {
-	u64 old_tick_length_base = tick_length_base;
-	u64 second_length = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ)
-				<< NTP_SCALE_SHIFT;
-	second_length += (s64)ntp_tick_adj << NTP_SCALE_SHIFT;
-	second_length += time_freq;
+	u64 prev_base;
+	u64 second_length;
+
+	prev_base = tick_length_base;
+
+	second_length		 = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ)
+						<< NTP_SCALE_SHIFT;
+
+	second_length		+= (s64)ntp_tick_adj << NTP_SCALE_SHIFT;
+	second_length		+= time_freq;
 
-	tick_length_base = second_length;
+	tick_length_base	 = second_length;
 
-	tick_nsec = div_u64(second_length, HZ) >> NTP_SCALE_SHIFT;
-	tick_length_base = div_u64(tick_length_base, NTP_INTERVAL_FREQ);
+	tick_nsec		 = div_u64(second_length, HZ) >> NTP_SCALE_SHIFT;
+	tick_length_base	 = div_u64(tick_length_base, NTP_INTERVAL_FREQ);
 
 	/*
 	 * Don't wait for the next second_overflow, apply
 	 * the change to the tick length immediately
 	 */
-	tick_length += tick_length_base - old_tick_length_base;
+	tick_length		+= tick_length_base - prev_base;
 }
 
 static void ntp_update_offset(long offset)
-- 
cgit v1.2.3


From bc26c31d446bc9c24cd6f7003777a05fe268ae48 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 22 Feb 2009 12:17:36 +0100
Subject: time: ntp: refactor up ntp_update_frequency()

Impact: cleanup, no functionality changed

Change ntp_update_frequency() from a hard to follow code
flow that uses global variables as temporaries, to a clean
input+output flow.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/time/ntp.c | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 7d281d9fbe30..f1abad738579 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -83,10 +83,8 @@ static long			ntp_tick_adj;
  */
 static void ntp_update_frequency(void)
 {
-	u64 prev_base;
 	u64 second_length;
-
-	prev_base = tick_length_base;
+	u64 new_base;
 
 	second_length		 = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ)
 						<< NTP_SCALE_SHIFT;
@@ -94,16 +92,15 @@ static void ntp_update_frequency(void)
 	second_length		+= (s64)ntp_tick_adj << NTP_SCALE_SHIFT;
 	second_length		+= time_freq;
 
-	tick_length_base	 = second_length;
-
 	tick_nsec		 = div_u64(second_length, HZ) >> NTP_SCALE_SHIFT;
-	tick_length_base	 = div_u64(tick_length_base, NTP_INTERVAL_FREQ);
+	new_base		 = div_u64(second_length, NTP_INTERVAL_FREQ);
 
 	/*
 	 * Don't wait for the next second_overflow, apply
-	 * the change to the tick length immediately
+	 * the change to the tick length immediately:
 	 */
-	tick_length		+= tick_length_base - prev_base;
+	tick_length		+= new_base - tick_length_base;
+	tick_length_base	 = new_base;
 }
 
 static void ntp_update_offset(long offset)
-- 
cgit v1.2.3


From f939890b6687e05c42361655fb6610fa08f5a601 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 22 Feb 2009 12:57:49 +0100
Subject: time: ntp: refactor and clean up ntp_update_offset()

Impact: cleanup, no functionality changed

- introduce the ntp_update_offset_fll() helper
- clean up the flow and variable naming

kernel/time/ntp.o:

   text	   data	    bss	    dec	    hex	filename
   2504	    114	    136	   2754	    ac2	ntp.o.before
   2504	    114	    136	   2754	    ac2	ntp.o.after

md5:
   01f7b8e1a5472a3056f9e4ae84d46315  ntp.o.before.asm
   01f7b8e1a5472a3056f9e4ae84d46315  ntp.o.after.asm

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/time/ntp.c | 44 ++++++++++++++++++++++++++++++--------------
 1 file changed, 30 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index f1abad738579..ee437e1445d1 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -103,10 +103,27 @@ static void ntp_update_frequency(void)
 	tick_length_base	 = new_base;
 }
 
+static inline s64 ntp_update_offset_fll(s64 freq_adj, s64 offset64, long secs)
+{
+	time_status &= ~STA_MODE;
+
+	if (secs < MINSEC)
+		return freq_adj;
+
+	if (!(time_status & STA_FLL) && (secs <= MAXSEC))
+		return freq_adj;
+
+	freq_adj += div_s64(offset64 << (NTP_SCALE_SHIFT - SHIFT_FLL), secs);
+	time_status |= STA_MODE;
+
+	return freq_adj;
+}
+
 static void ntp_update_offset(long offset)
 {
-	long mtemp;
 	s64 freq_adj;
+	s64 offset64;
+	long secs;
 
 	if (!(time_status & STA_PLL))
 		return;
@@ -127,22 +144,21 @@ static void ntp_update_offset(long offset)
 	 */
 	if (time_status & STA_FREQHOLD || time_reftime == 0)
 		time_reftime = xtime.tv_sec;
-	mtemp = xtime.tv_sec - time_reftime;
+
+	secs = xtime.tv_sec - time_reftime;
 	time_reftime = xtime.tv_sec;
 
-	freq_adj = (s64)offset * mtemp;
-	freq_adj <<= NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant);
-	time_status &= ~STA_MODE;
-	if (mtemp >= MINSEC && (time_status & STA_FLL || mtemp > MAXSEC)) {
-		freq_adj += div_s64((s64)offset << (NTP_SCALE_SHIFT - SHIFT_FLL),
-				    mtemp);
-		time_status |= STA_MODE;
-	}
-	freq_adj += time_freq;
-	freq_adj = min(freq_adj, MAXFREQ_SCALED);
-	time_freq = max(freq_adj, -MAXFREQ_SCALED);
+	offset64    = offset;
+	freq_adj    = (offset64 * secs) <<
+			(NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant));
+
+	freq_adj    = ntp_update_offset_fll(freq_adj, offset64, secs);
+
+	freq_adj    = min(freq_adj + time_freq, MAXFREQ_SCALED);
+
+	time_freq   = max(freq_adj, -MAXFREQ_SCALED);
 
-	time_offset = div_s64((s64)offset << NTP_SCALE_SHIFT, NTP_INTERVAL_FREQ);
+	time_offset = div_s64(offset64 << NTP_SCALE_SHIFT, NTP_INTERVAL_FREQ);
 }
 
 /**
-- 
cgit v1.2.3


From 478b7aab1682246a3d1e76e27a0aecb2f0013379 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 22 Feb 2009 13:22:23 +0100
Subject: time: ntp: simplify ntp_update_offset_fll()

Impact: cleanup, no functionality changed

Change ntp_update_offset_fll() to delta logic instead of
absolute value logic. This eliminates 'freq_adj' from the
function.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/time/ntp.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index ee437e1445d1..5202dde2f0af 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -103,20 +103,19 @@ static void ntp_update_frequency(void)
 	tick_length_base	 = new_base;
 }
 
-static inline s64 ntp_update_offset_fll(s64 freq_adj, s64 offset64, long secs)
+static inline s64 ntp_update_offset_fll(s64 offset64, long secs)
 {
 	time_status &= ~STA_MODE;
 
 	if (secs < MINSEC)
-		return freq_adj;
+		return 0;
 
 	if (!(time_status & STA_FLL) && (secs <= MAXSEC))
-		return freq_adj;
+		return 0;
 
-	freq_adj += div_s64(offset64 << (NTP_SCALE_SHIFT - SHIFT_FLL), secs);
 	time_status |= STA_MODE;
 
-	return freq_adj;
+	return div_s64(offset64 << (NTP_SCALE_SHIFT - SHIFT_FLL), secs);
 }
 
 static void ntp_update_offset(long offset)
@@ -152,7 +151,7 @@ static void ntp_update_offset(long offset)
 	freq_adj    = (offset64 * secs) <<
 			(NTP_SCALE_SHIFT - 2 * (SHIFT_PLL + 2 + time_constant));
 
-	freq_adj    = ntp_update_offset_fll(freq_adj, offset64, secs);
+	freq_adj    += ntp_update_offset_fll(offset64, secs);
 
 	freq_adj    = min(freq_adj + time_freq, MAXFREQ_SCALED);
 
-- 
cgit v1.2.3


From c7986acba211e8285e14c9603fb89e6f4ea0b9f8 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 22 Feb 2009 13:29:09 +0100
Subject: time: ntp: micro-optimize ntp_update_offset()

Impact: cleanup, no functionality changed

The time_reftime update in ntp_update_offset() to xtime.tv_sec
is a convoluted way of saying that we want to freeze the frequency
and want the 'secs' delta to be 0. Also make this branch unlikely.

This shaves off 8 bytes from the code size:

   text	   data	    bss	    dec	    hex	filename
   2504	    114	    136	   2754	    ac2	ntp.o.before
   2496	    114	    136	   2746	    aba	ntp.o.after

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/time/ntp.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 5202dde2f0af..580a35028693 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -141,10 +141,10 @@ static void ntp_update_offset(long offset)
 	 * Select how the frequency is to be controlled
 	 * and in which mode (PLL or FLL).
 	 */
-	if (time_status & STA_FREQHOLD || time_reftime == 0)
-		time_reftime = xtime.tv_sec;
-
 	secs = xtime.tv_sec - time_reftime;
+	if (unlikely(time_status & STA_FREQHOLD || time_reftime == 0))
+		secs = 0;
+
 	time_reftime = xtime.tv_sec;
 
 	offset64    = offset;
-- 
cgit v1.2.3


From 10dd31a7a17254d6ba793305fc590455393e610e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 22 Feb 2009 13:38:40 +0100
Subject: time: ntp: fix bug in ntp_update_offset() & do_adjtimex()

Impact: change (fix) the way the NTP PLL seconds offset is initialized/tracked

Fix a bug and do a micro-optimization:

When PLL is enabled we do not reset time_reftime. If the PLL
was off for a long time (for example after bootup), this is
arguably the wrong thing to do.

We already had a hack for the common boot-time case in
ntp_update_offset(), in form of:

	if (unlikely(time_status & STA_FREQHOLD || time_reftime == 0))
 		secs = 0;

But the update delta should be reset later on too - not just when
the PLL is enabled for the first time after bootup.

So do it on !STA_PLL -> STA_PLL transitions.

This changes behavior, as previously if ntpd was disabled for
a long time and we restarted it, we'd run from that last update,
with a very large delta.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/time/ntp.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 580a35028693..fc08eb10ced4 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -142,7 +142,7 @@ static void ntp_update_offset(long offset)
 	 * and in which mode (PLL or FLL).
 	 */
 	secs = xtime.tv_sec - time_reftime;
-	if (unlikely(time_status & STA_FREQHOLD || time_reftime == 0))
+	if (unlikely(time_status & STA_FREQHOLD))
 		secs = 0;
 
 	time_reftime = xtime.tv_sec;
@@ -394,6 +394,13 @@ int do_adjtimex(struct timex *txc)
 			}
 			/* only set allowed bits */
 			time_status &= STA_RONLY;
+			/*
+			 * If we turn on PLL adjustments then reset the
+			 * reference time to current time.
+			 */
+			if (!(time_status & STA_PLL) && (txc->status & STA_PLL))
+				time_reftime = xtime.tv_sec;
+
 			time_status |= txc->status & ~STA_RONLY;
 
 			switch (time_state) {
-- 
cgit v1.2.3


From 80f2257116474ceed5fccab510b4f7245c0f49d7 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 22 Feb 2009 15:15:32 +0100
Subject: time: ntp: refactor do_adjtimex()

Impact: cleanup, no functionality changed

do_adjtimex() is currently a monster function with a maze of
branches. Refactor the txc->modes setting aspects of it into
two new helper functions:

	process_adj_status()
	process_adjtimex_modes()

kernel/time/ntp.o:

   text	   data	    bss	    dec	    hex	filename
   2512	    114	    136	   2762	    aca	ntp.o.before
   2512	    114	    136	   2762	    aca	ntp.o.after

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/time/ntp.c | 182 +++++++++++++++++++++++++++++-------------------------
 1 file changed, 99 insertions(+), 83 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index fc08eb10ced4..aded09be98cc 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -332,6 +332,102 @@ static void notify_cmos_timer(void)
 static inline void notify_cmos_timer(void) { }
 #endif
 
+
+/*
+ * Propagate a new txc->status value into the NTP state:
+ */
+static inline void process_adj_status(struct timex *txc, struct timespec *ts)
+{
+	long now;
+
+	if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) {
+		time_state = TIME_OK;
+		time_status = STA_UNSYNC;
+	}
+	/* only set allowed bits */
+	time_status &= STA_RONLY;
+
+	/*
+	 * If we turn on PLL adjustments then reset the
+	 * reference time to current time.
+	 */
+	if (!(time_status & STA_PLL) && (txc->status & STA_PLL))
+		time_reftime = xtime.tv_sec;
+
+	time_status |= txc->status & ~STA_RONLY;
+
+	switch (time_state) {
+	case TIME_OK:
+	start_timer:
+		now = ts->tv_sec;
+		if (time_status & STA_INS) {
+			time_state = TIME_INS;
+			now += 86400 - now % 86400;
+			hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS);
+		} else if (time_status & STA_DEL) {
+			time_state = TIME_DEL;
+			now += 86400 - (now + 1) % 86400;
+			hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS);
+		}
+		break;
+	case TIME_INS:
+	case TIME_DEL:
+		time_state = TIME_OK;
+		goto start_timer;
+	case TIME_WAIT:
+		if (!(time_status & (STA_INS | STA_DEL)))
+			time_state = TIME_OK;
+		break;
+	case TIME_OOP:
+		hrtimer_restart(&leap_timer);
+		break;
+	}
+}
+/*
+ * Called with the xtime lock held, so we can access and modify
+ * all the global NTP state:
+ */
+static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts)
+{
+	if (txc->modes & ADJ_STATUS)
+		process_adj_status(txc, ts);
+
+	if (txc->modes & ADJ_NANO)
+		time_status |= STA_NANO;
+	if (txc->modes & ADJ_MICRO)
+		time_status &= ~STA_NANO;
+
+	if (txc->modes & ADJ_FREQUENCY) {
+		time_freq = (s64)txc->freq * PPM_SCALE;
+		time_freq = min(time_freq, MAXFREQ_SCALED);
+		time_freq = max(time_freq, -MAXFREQ_SCALED);
+	}
+
+	if (txc->modes & ADJ_MAXERROR)
+		time_maxerror = txc->maxerror;
+	if (txc->modes & ADJ_ESTERROR)
+		time_esterror = txc->esterror;
+
+	if (txc->modes & ADJ_TIMECONST) {
+		time_constant = txc->constant;
+		if (!(time_status & STA_NANO))
+			time_constant += 4;
+		time_constant = min(time_constant, (long)MAXTC);
+		time_constant = max(time_constant, 0l);
+	}
+
+	if (txc->modes & ADJ_TAI && txc->constant > 0)
+		time_tai = txc->constant;
+
+	if (txc->modes & ADJ_OFFSET)
+		ntp_update_offset(txc->offset);
+	if (txc->modes & ADJ_TICK)
+		tick_usec = txc->tick;
+
+	if (txc->modes & (ADJ_TICK|ADJ_FREQUENCY|ADJ_OFFSET))
+		ntp_update_frequency();
+}
+
 /*
  * adjtimex mainly allows reading (and writing, if superuser) of
  * kernel time-keeping variables. used by xntpd.
@@ -383,90 +479,10 @@ int do_adjtimex(struct timex *txc)
 		txc->offset = save_adjust;
 		goto adj_done;
 	}
-	if (txc->modes) {
-		long sec;
-
-		if (txc->modes & ADJ_STATUS) {
-			if ((time_status & STA_PLL) &&
-			    !(txc->status & STA_PLL)) {
-				time_state = TIME_OK;
-				time_status = STA_UNSYNC;
-			}
-			/* only set allowed bits */
-			time_status &= STA_RONLY;
-			/*
-			 * If we turn on PLL adjustments then reset the
-			 * reference time to current time.
-			 */
-			if (!(time_status & STA_PLL) && (txc->status & STA_PLL))
-				time_reftime = xtime.tv_sec;
-
-			time_status |= txc->status & ~STA_RONLY;
-
-			switch (time_state) {
-			case TIME_OK:
-			start_timer:
-				sec = ts.tv_sec;
-				if (time_status & STA_INS) {
-					time_state = TIME_INS;
-					sec += 86400 - sec % 86400;
-					hrtimer_start(&leap_timer, ktime_set(sec, 0), HRTIMER_MODE_ABS);
-				} else if (time_status & STA_DEL) {
-					time_state = TIME_DEL;
-					sec += 86400 - (sec + 1) % 86400;
-					hrtimer_start(&leap_timer, ktime_set(sec, 0), HRTIMER_MODE_ABS);
-				}
-				break;
-			case TIME_INS:
-			case TIME_DEL:
-				time_state = TIME_OK;
-				goto start_timer;
-				break;
-			case TIME_WAIT:
-				if (!(time_status & (STA_INS | STA_DEL)))
-					time_state = TIME_OK;
-				break;
-			case TIME_OOP:
-				hrtimer_restart(&leap_timer);
-				break;
-			}
-		}
-
-		if (txc->modes & ADJ_NANO)
-			time_status |= STA_NANO;
-		if (txc->modes & ADJ_MICRO)
-			time_status &= ~STA_NANO;
 
-		if (txc->modes & ADJ_FREQUENCY) {
-			time_freq = (s64)txc->freq * PPM_SCALE;
-			time_freq = min(time_freq, MAXFREQ_SCALED);
-			time_freq = max(time_freq, -MAXFREQ_SCALED);
-		}
-
-		if (txc->modes & ADJ_MAXERROR)
-			time_maxerror = txc->maxerror;
-		if (txc->modes & ADJ_ESTERROR)
-			time_esterror = txc->esterror;
-
-		if (txc->modes & ADJ_TIMECONST) {
-			time_constant = txc->constant;
-			if (!(time_status & STA_NANO))
-				time_constant += 4;
-			time_constant = min(time_constant, (long)MAXTC);
-			time_constant = max(time_constant, 0l);
-		}
-
-		if (txc->modes & ADJ_TAI && txc->constant > 0)
-			time_tai = txc->constant;
-
-		if (txc->modes & ADJ_OFFSET)
-			ntp_update_offset(txc->offset);
-		if (txc->modes & ADJ_TICK)
-			tick_usec = txc->tick;
-
-		if (txc->modes & (ADJ_TICK|ADJ_FREQUENCY|ADJ_OFFSET))
-			ntp_update_frequency();
-	}
+	/* If there are input parameters, then process them: */
+	if (txc->modes)
+		process_adjtimex_modes(txc, &ts);
 
 	txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ,
 				  NTP_SCALE_SHIFT);
-- 
cgit v1.2.3


From e96291653b2e4df02f160b574070f6e632868e5e Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 22 Feb 2009 15:35:18 +0100
Subject: time: ntp: refactor do_adjtimex() some more

Impact: cleanup, no functionality changed

Further simplify do_adjtimex():

 - introduce the ntp_start_leap_timer() helper function
 - eliminate the goto adj_done complication

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/time/ntp.c | 61 ++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 36 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index aded09be98cc..4346ed6e623f 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -332,14 +332,33 @@ static void notify_cmos_timer(void)
 static inline void notify_cmos_timer(void) { }
 #endif
 
+/*
+ * Start the leap seconds timer:
+ */
+static inline void ntp_start_leap_timer(struct timespec *ts)
+{
+	long now = ts->tv_sec;
+
+	if (time_status & STA_INS) {
+		time_state = TIME_INS;
+		now += 86400 - now % 86400;
+		hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS);
+
+		return;
+	}
+
+	if (time_status & STA_DEL) {
+		time_state = TIME_DEL;
+		now += 86400 - (now + 1) % 86400;
+		hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS);
+	}
+}
 
 /*
  * Propagate a new txc->status value into the NTP state:
  */
 static inline void process_adj_status(struct timex *txc, struct timespec *ts)
 {
-	long now;
-
 	if ((time_status & STA_PLL) && !(txc->status & STA_PLL)) {
 		time_state = TIME_OK;
 		time_status = STA_UNSYNC;
@@ -358,22 +377,12 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts)
 
 	switch (time_state) {
 	case TIME_OK:
-	start_timer:
-		now = ts->tv_sec;
-		if (time_status & STA_INS) {
-			time_state = TIME_INS;
-			now += 86400 - now % 86400;
-			hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS);
-		} else if (time_status & STA_DEL) {
-			time_state = TIME_DEL;
-			now += 86400 - (now + 1) % 86400;
-			hrtimer_start(&leap_timer, ktime_set(now, 0), HRTIMER_MODE_ABS);
-		}
+		ntp_start_leap_timer(ts);
 		break;
 	case TIME_INS:
 	case TIME_DEL:
 		time_state = TIME_OK;
-		goto start_timer;
+		ntp_start_leap_timer(ts);
 	case TIME_WAIT:
 		if (!(time_status & (STA_INS | STA_DEL)))
 			time_state = TIME_OK;
@@ -394,6 +403,7 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts
 
 	if (txc->modes & ADJ_NANO)
 		time_status |= STA_NANO;
+
 	if (txc->modes & ADJ_MICRO)
 		time_status &= ~STA_NANO;
 
@@ -405,6 +415,7 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts
 
 	if (txc->modes & ADJ_MAXERROR)
 		time_maxerror = txc->maxerror;
+
 	if (txc->modes & ADJ_ESTERROR)
 		time_esterror = txc->esterror;
 
@@ -421,6 +432,7 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts
 
 	if (txc->modes & ADJ_OFFSET)
 		ntp_update_offset(txc->offset);
+
 	if (txc->modes & ADJ_TICK)
 		tick_usec = txc->tick;
 
@@ -457,7 +469,7 @@ int do_adjtimex(struct timex *txc)
 		if (txc->modes & ADJ_TICK &&
 		    (txc->tick <  900000/USER_HZ ||
 		     txc->tick > 1100000/USER_HZ))
-				return -EINVAL;
+			return -EINVAL;
 
 		if (txc->modes & ADJ_STATUS && time_state != TIME_OK)
 			hrtimer_cancel(&leap_timer);
@@ -467,7 +479,6 @@ int do_adjtimex(struct timex *txc)
 
 	write_seqlock_irq(&xtime_lock);
 
-	/* If there are input parameters, then process them */
 	if (txc->modes & ADJ_ADJTIME) {
 		long save_adjust = time_adjust;
 
@@ -477,19 +488,18 @@ int do_adjtimex(struct timex *txc)
 			ntp_update_frequency();
 		}
 		txc->offset = save_adjust;
-		goto adj_done;
-	}
+	} else {
 
-	/* If there are input parameters, then process them: */
-	if (txc->modes)
-		process_adjtimex_modes(txc, &ts);
+		/* If there are input parameters, then process them: */
+		if (txc->modes)
+			process_adjtimex_modes(txc, &ts);
 
-	txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ,
+		txc->offset = shift_right(time_offset * NTP_INTERVAL_FREQ,
 				  NTP_SCALE_SHIFT);
-	if (!(time_status & STA_NANO))
-		txc->offset /= NSEC_PER_USEC;
+		if (!(time_status & STA_NANO))
+			txc->offset /= NSEC_PER_USEC;
+	}
 
-adj_done:
 	result = time_state;	/* mostly `TIME_OK' */
 	if (time_status & (STA_UNSYNC|STA_CLOCKERR))
 		result = TIME_ERROR;
@@ -514,6 +524,7 @@ adj_done:
 	txc->calcnt	   = 0;
 	txc->errcnt	   = 0;
 	txc->stbcnt	   = 0;
+
 	write_sequnlock_irq(&xtime_lock);
 
 	txc->time.tv_sec = ts.tv_sec;
-- 
cgit v1.2.3


From 2b9d1496e7835a603c340e8f0dd81f4b74d5f248 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 22 Feb 2009 15:48:43 +0100
Subject: time: ntp: make 64-bit constants more robust

Impact: cleanup, no functionality changed

 - make PPM_SCALE an explicit s64 constant, to
   remove (s64) casts from usage sites.

kernel/time/ntp.o:

   text	   data	    bss	    dec	    hex	filename
   2536	    114	    136	   2786	    ae2	ntp.o.before
   2536	    114	    136	   2786	    ae2	ntp.o.after

md5:
   40a7728d1188aa18e83e21a81fa7b150  ntp.o.before.asm
   40a7728d1188aa18e83e21a81fa7b150  ntp.o.after.asm

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/timex.h | 2 +-
 kernel/time/ntp.c     | 4 ++--
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/timex.h b/include/linux/timex.h
index 998a55d80acf..aa3475fcff64 100644
--- a/include/linux/timex.h
+++ b/include/linux/timex.h
@@ -190,7 +190,7 @@ struct timex {
  * offset and maximum frequency tolerance.
  */
 #define SHIFT_USEC 16		/* frequency offset scale (shift) */
-#define PPM_SCALE (NSEC_PER_USEC << (NTP_SCALE_SHIFT - SHIFT_USEC))
+#define PPM_SCALE ((s64)NSEC_PER_USEC << (NTP_SCALE_SHIFT - SHIFT_USEC))
 #define PPM_SCALE_INV_SHIFT 19
 #define PPM_SCALE_INV ((1ll << (PPM_SCALE_INV_SHIFT + NTP_SCALE_SHIFT)) / \
 		       PPM_SCALE + 1)
diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 4346ed6e623f..7447d57e021a 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -408,7 +408,7 @@ static inline void process_adjtimex_modes(struct timex *txc, struct timespec *ts
 		time_status &= ~STA_NANO;
 
 	if (txc->modes & ADJ_FREQUENCY) {
-		time_freq = (s64)txc->freq * PPM_SCALE;
+		time_freq = txc->freq * PPM_SCALE;
 		time_freq = min(time_freq, MAXFREQ_SCALED);
 		time_freq = max(time_freq, -MAXFREQ_SCALED);
 	}
@@ -505,7 +505,7 @@ int do_adjtimex(struct timex *txc)
 		result = TIME_ERROR;
 
 	txc->freq	   = shift_right((time_freq >> PPM_SCALE_INV_SHIFT) *
-					 (s64)PPM_SCALE_INV, NTP_SCALE_SHIFT);
+					 PPM_SCALE_INV, NTP_SCALE_SHIFT);
 	txc->maxerror	   = time_maxerror;
 	txc->esterror	   = time_esterror;
 	txc->status	   = time_status;
-- 
cgit v1.2.3


From 069569e025706f27f939785f86a94d5d8ce55dce Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 22 Feb 2009 16:03:37 +0100
Subject: time: ntp: simplify ntp_tick_adj calculations

Impact: micro-optimization

Convert the (internal) ntp_tick_adj value we store from unscaled
units to scaled units. This is a constant that we never modify,
so scaling it up once during bootup is enough - we dont have to
do it for every adjustment step.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/time/ntp.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index 7447d57e021a..a3fe7ef2d83b 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -71,7 +71,8 @@ static long			time_reftime;
 
 long				time_adjust;
 
-static long			ntp_tick_adj;
+/* constant (boot-param configurable) NTP tick adjustment (upscaled)	*/
+static s64			ntp_tick_adj;
 
 /*
  * NTP methods:
@@ -89,7 +90,7 @@ static void ntp_update_frequency(void)
 	second_length		 = (u64)(tick_usec * NSEC_PER_USEC * USER_HZ)
 						<< NTP_SCALE_SHIFT;
 
-	second_length		+= (s64)ntp_tick_adj << NTP_SCALE_SHIFT;
+	second_length		+= ntp_tick_adj;
 	second_length		+= time_freq;
 
 	tick_nsec		 = div_u64(second_length, HZ) >> NTP_SCALE_SHIFT;
@@ -540,6 +541,8 @@ int do_adjtimex(struct timex *txc)
 static int __init ntp_tick_adj_setup(char *str)
 {
 	ntp_tick_adj = simple_strtol(str, NULL, 0);
+	ntp_tick_adj <<= NTP_SCALE_SHIFT;
+
 	return 1;
 }
 
-- 
cgit v1.2.3


From 39854fe8c165872d743f6a0c4860ca2de8e45ac9 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 22 Feb 2009 16:06:58 +0100
Subject: time: ntp: clean up second_overflow()

Impact: cleanup, no functionality changed

The 'time_adj' local variable is named in a very confusing
way because it almost shadows the 'time_adjust' global
variable - which is used in this same function.

Rename it to 'delta' - to make them stand apart more clearly.

kernel/time/ntp.o:

   text	   data	    bss	    dec	    hex	filename
   2545	    114	    144	   2803	    af3	ntp.o.before
   2545	    114	    144	   2803	    af3	ntp.o.after

md5:
   1bf0b3be564512279ba7cee299d1d2be  ntp.o.before.asm
   1bf0b3be564512279ba7cee299d1d2be  ntp.o.after.asm

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/time/ntp.c | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index a3fe7ef2d83b..c74eb7d9d854 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -236,7 +236,7 @@ static enum hrtimer_restart ntp_leap_second(struct hrtimer *timer)
  */
 void second_overflow(void)
 {
-	s64 time_adj;
+	s64 delta;
 
 	/* Bump the maxerror field */
 	time_maxerror += MAXFREQ / NSEC_PER_USEC;
@@ -249,10 +249,11 @@ void second_overflow(void)
 	 * Compute the phase adjustment for the next second. The offset is
 	 * reduced by a fixed factor times the time constant.
 	 */
-	tick_length	= tick_length_base;
-	time_adj	= shift_right(time_offset, SHIFT_PLL + time_constant);
-	time_offset	-= time_adj;
-	tick_length	+= time_adj;
+	tick_length	 = tick_length_base;
+
+	delta		 = shift_right(time_offset, SHIFT_PLL + time_constant);
+	time_offset	-= delta;
+	tick_length	+= delta;
 
 	if (!time_adjust)
 		return;
-- 
cgit v1.2.3


From eef62a6826b8ab530cefff5aa55c1661a209c803 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 25 Feb 2009 15:49:52 -0500
Subject: tracing: rename DEFINE_TRACE_FMT to just TRACE_FORMAT

There's been a bit confusion to whether DEFINE/DECLARE_TRACE_FMT should
be a DEFINE or a DECLARE. Ingo Molnar suggested simply calling it
TRACE_FORMAT.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 include/linux/tracepoint.h        |  2 +-
 include/trace/sched_event_types.h | 26 +++++++++++++-------------
 kernel/trace/trace_events.h       |  4 ++--
 3 files changed, 16 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 34ae464effff..3de09fa8e01d 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -153,7 +153,7 @@ static inline void tracepoint_synchronize_unregister(void)
 	synchronize_sched();
 }
 
-#define DEFINE_TRACE_FMT(name, proto, args, fmt)		\
+#define TRACE_FORMAT(name, proto, args, fmt)		\
 	DECLARE_TRACE(name, TPPROTO(proto), TPARGS(args))
 
 #endif
diff --git a/include/trace/sched_event_types.h b/include/trace/sched_event_types.h
index a4f662940f4e..a3d3d66a51c8 100644
--- a/include/trace/sched_event_types.h
+++ b/include/trace/sched_event_types.h
@@ -1,72 +1,72 @@
 
 /* use <trace/sched.h> instead */
-#ifndef DEFINE_TRACE_FMT
+#ifndef TRACE_FORMAT
 # error Do not include this file directly.
 # error Unless you know what you are doing.
 #endif
 
-DEFINE_TRACE_FMT(sched_kthread_stop,
+TRACE_FORMAT(sched_kthread_stop,
 	TPPROTO(struct task_struct *t),
 	TPARGS(t),
 	TPFMT("task %s:%d", t->comm, t->pid));
 
-DEFINE_TRACE_FMT(sched_kthread_stop_ret,
+TRACE_FORMAT(sched_kthread_stop_ret,
 	TPPROTO(int ret),
 	TPARGS(ret),
 	TPFMT("ret=%d", ret));
 
-DEFINE_TRACE_FMT(sched_wait_task,
+TRACE_FORMAT(sched_wait_task,
 	TPPROTO(struct rq *rq, struct task_struct *p),
 	TPARGS(rq, p),
 	TPFMT("task %s:%d", p->comm, p->pid));
 
-DEFINE_TRACE_FMT(sched_wakeup,
+TRACE_FORMAT(sched_wakeup,
 	TPPROTO(struct rq *rq, struct task_struct *p, int success),
 	TPARGS(rq, p, success),
 	TPFMT("task %s:%d %s",
 	      p->comm, p->pid, success?"succeeded":"failed"));
 
-DEFINE_TRACE_FMT(sched_wakeup_new,
+TRACE_FORMAT(sched_wakeup_new,
 	TPPROTO(struct rq *rq, struct task_struct *p, int success),
 	TPARGS(rq, p, success),
 	TPFMT("task %s:%d",
 	      p->comm, p->pid, success?"succeeded":"failed"));
 
-DEFINE_TRACE_FMT(sched_switch,
+TRACE_FORMAT(sched_switch,
 	TPPROTO(struct rq *rq, struct task_struct *prev,
 		struct task_struct *next),
 	TPARGS(rq, prev, next),
 	TPFMT("task %s:%d ==> %s:%d",
 	      prev->comm, prev->pid, next->comm, next->pid));
 
-DEFINE_TRACE_FMT(sched_migrate_task,
+TRACE_FORMAT(sched_migrate_task,
 	TPPROTO(struct task_struct *p, int orig_cpu, int dest_cpu),
 	TPARGS(p, orig_cpu, dest_cpu),
 	TPFMT("task %s:%d from: %d  to: %d",
 	      p->comm, p->pid, orig_cpu, dest_cpu));
 
-DEFINE_TRACE_FMT(sched_process_free,
+TRACE_FORMAT(sched_process_free,
 	TPPROTO(struct task_struct *p),
 	TPARGS(p),
 	TPFMT("task %s:%d", p->comm, p->pid));
 
-DEFINE_TRACE_FMT(sched_process_exit,
+TRACE_FORMAT(sched_process_exit,
 	TPPROTO(struct task_struct *p),
 	TPARGS(p),
 	TPFMT("task %s:%d", p->comm, p->pid));
 
-DEFINE_TRACE_FMT(sched_process_wait,
+TRACE_FORMAT(sched_process_wait,
 	TPPROTO(struct pid *pid),
 	TPARGS(pid),
 	TPFMT("pid %d", pid));
 
-DEFINE_TRACE_FMT(sched_process_fork,
+TRACE_FORMAT(sched_process_fork,
 	TPPROTO(struct task_struct *parent, struct task_struct *child),
 	TPARGS(parent, child),
 	TPFMT("parent %s:%d  child %s:%d",
 	      parent->comm, parent->pid, child->comm, child->pid));
 
-DEFINE_TRACE_FMT(sched_signal_send,
+TRACE_FORMAT(sched_signal_send,
 	TPPROTO(int sig, struct task_struct *p),
 	TPARGS(sig, p),
 	TPFMT("sig: %d   task %s:%d", sig, p->comm, p->pid));
diff --git a/kernel/trace/trace_events.h b/kernel/trace/trace_events.h
index cb8455b3ac9a..deb95e5006c8 100644
--- a/kernel/trace/trace_events.h
+++ b/kernel/trace/trace_events.h
@@ -17,8 +17,8 @@ struct ftrace_event_call {
 #undef TPFMT
 #define TPFMT(fmt, args...)	fmt "\n", ##args
 
-#undef DEFINE_TRACE_FMT
-#define DEFINE_TRACE_FMT(call, proto, args, fmt)			\
+#undef TRACE_FORMAT
+#define TRACE_FORMAT(call, proto, args, fmt)				\
 static void ftrace_event_##call(proto)					\
 {									\
 	event_trace_printk(_RET_IP_, "(" #call ") " fmt);		\
-- 
cgit v1.2.3


From a682604838763981613e42015cd0e39f2989d6bb Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 25 Feb 2009 18:03:42 -0800
Subject: rcu: Teach RCU that idle task is not quiscent state at boot

This patch fixes a bug located by Vegard Nossum with the aid of
kmemcheck, updated based on review comments from Nick Piggin,
Ingo Molnar, and Andrew Morton.  And cleans up the variable-name
and function-name language.  ;-)

The boot CPU runs in the context of its idle thread during boot-up.
During this time, idle_cpu(0) will always return nonzero, which will
fool Classic and Hierarchical RCU into deciding that a large chunk of
the boot-up sequence is a big long quiescent state.  This in turn causes
RCU to prematurely end grace periods during this time.

This patch changes the rcutree.c and rcuclassic.c rcu_check_callbacks()
function to ignore the idle task as a quiescent state until the
system has started up the scheduler in rest_init(), introducing a
new non-API function rcu_idle_now_means_idle() to inform RCU of this
transition.  RCU maintains an internal rcu_idle_cpu_truthful variable
to track this state, which is then used by rcu_check_callback() to
determine if it should believe idle_cpu().

Because this patch has the effect of disallowing RCU grace periods
during long stretches of the boot-up sequence, this patch also introduces
Josh Triplett's UP-only optimization that makes synchronize_rcu() be a
no-op if num_online_cpus() returns 1.  This allows boot-time code that
calls synchronize_rcu() to proceed normally.  Note, however, that RCU
callbacks registered by call_rcu() will likely queue up until later in
the boot sequence.  Although rcuclassic and rcutree can also use this
same optimization after boot completes, rcupreempt must restrict its
use of this optimization to the portion of the boot sequence before the
scheduler starts up, given that an rcupreempt RCU read-side critical
section may be preeempted.

In addition, this patch takes Nick Piggin's suggestion to make the
system_state global variable be __read_mostly.

Changes since v4:

o	Changes the name of the introduced function and variable to
	be less emotional.  ;-)

Changes since v3:

o	WARN_ON(nr_context_switches() > 0) to verify that RCU
	switches out of boot-time mode before the first context
	switch, as suggested by Nick Piggin.

Changes since v2:

o	Created rcu_blocking_is_gp() internal-to-RCU API that
	determines whether a call to synchronize_rcu() is itself
	a grace period.

o	The definition of rcu_blocking_is_gp() for rcuclassic and
	rcutree checks to see if but a single CPU is online.

o	The definition of rcu_blocking_is_gp() for rcupreempt
	checks to see both if but a single CPU is online and if
	the system is still in early boot.

	This allows rcupreempt to again work correctly if running
	on a single CPU after booting is complete.

o	Added check to rcupreempt's synchronize_sched() for there
	being but one online CPU.

Tested all three variants both SMP and !SMP, booted fine, passed a short
rcutorture test on both x86 and Power.

Located-by: Vegard Nossum <vegard.nossum@gmail.com>
Tested-by: Vegard Nossum <vegard.nossum@gmail.com>
Tested-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/rcuclassic.h |  6 ++++++
 include/linux/rcupdate.h   |  4 ++++
 include/linux/rcupreempt.h | 15 +++++++++++++++
 include/linux/rcutree.h    |  6 ++++++
 init/main.c                |  3 ++-
 kernel/rcuclassic.c        |  4 ++--
 kernel/rcupdate.c          | 12 ++++++++++++
 kernel/rcupreempt.c        |  3 +++
 kernel/rcutree.c           |  4 ++--
 9 files changed, 52 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h
index f3f697df1d71..80044a4f3ab9 100644
--- a/include/linux/rcuclassic.h
+++ b/include/linux/rcuclassic.h
@@ -181,4 +181,10 @@ extern long rcu_batches_completed_bh(void);
 #define rcu_enter_nohz()	do { } while (0)
 #define rcu_exit_nohz()		do { } while (0)
 
+/* A context switch is a grace period for rcuclassic. */
+static inline int rcu_blocking_is_gp(void)
+{
+	return num_online_cpus() == 1;
+}
+
 #endif /* __LINUX_RCUCLASSIC_H */
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 921340a7b71c..528343e6da51 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -52,6 +52,9 @@ struct rcu_head {
 	void (*func)(struct rcu_head *head);
 };
 
+/* Internal to kernel, but needed by rcupreempt.h. */
+extern int rcu_scheduler_active;
+
 #if defined(CONFIG_CLASSIC_RCU)
 #include <linux/rcuclassic.h>
 #elif defined(CONFIG_TREE_RCU)
@@ -265,6 +268,7 @@ extern void rcu_barrier_sched(void);
 
 /* Internal to kernel */
 extern void rcu_init(void);
+extern void rcu_scheduler_starting(void);
 extern int rcu_needs_cpu(int cpu);
 
 #endif /* __LINUX_RCUPDATE_H */
diff --git a/include/linux/rcupreempt.h b/include/linux/rcupreempt.h
index 3e05c09b54a2..74304b4538d8 100644
--- a/include/linux/rcupreempt.h
+++ b/include/linux/rcupreempt.h
@@ -142,4 +142,19 @@ static inline void rcu_exit_nohz(void)
 #define rcu_exit_nohz()		do { } while (0)
 #endif /* CONFIG_NO_HZ */
 
+/*
+ * A context switch is a grace period for rcupreempt synchronize_rcu()
+ * only during early boot, before the scheduler has been initialized.
+ * So, how the heck do we get a context switch?  Well, if the caller
+ * invokes synchronize_rcu(), they are willing to accept a context
+ * switch, so we simply pretend that one happened.
+ *
+ * After boot, there might be a blocked or preempted task in an RCU
+ * read-side critical section, so we cannot then take the fastpath.
+ */
+static inline int rcu_blocking_is_gp(void)
+{
+	return num_online_cpus() == 1 && !rcu_scheduler_active;
+}
+
 #endif /* __LINUX_RCUPREEMPT_H */
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index d4368b7975c3..a722fb67bb2d 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -326,4 +326,10 @@ static inline void rcu_exit_nohz(void)
 }
 #endif /* CONFIG_NO_HZ */
 
+/* A context switch is a grace period for rcutree. */
+static inline int rcu_blocking_is_gp(void)
+{
+	return num_online_cpus() == 1;
+}
+
 #endif /* __LINUX_RCUTREE_H */
diff --git a/init/main.c b/init/main.c
index 844209453c02..83697e160b3a 100644
--- a/init/main.c
+++ b/init/main.c
@@ -97,7 +97,7 @@ static inline void mark_rodata_ro(void) { }
 extern void tc_init(void);
 #endif
 
-enum system_states system_state;
+enum system_states system_state __read_mostly;
 EXPORT_SYMBOL(system_state);
 
 /*
@@ -463,6 +463,7 @@ static noinline void __init_refok rest_init(void)
 	 * at least once to get things moving:
 	 */
 	init_idle_bootup_task(current);
+	rcu_scheduler_starting();
 	preempt_enable_no_resched();
 	schedule();
 	preempt_disable();
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index bd5a9003497c..654c640a6b9c 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -679,8 +679,8 @@ int rcu_needs_cpu(int cpu)
 void rcu_check_callbacks(int cpu, int user)
 {
 	if (user ||
-	    (idle_cpu(cpu) && !in_softirq() &&
-				hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
+	    (idle_cpu(cpu) && rcu_scheduler_active &&
+	     !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
 
 		/*
 		 * Get here if this CPU took its interrupt from user
diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index d92a76a881aa..cae8a059cf47 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -44,6 +44,7 @@
 #include <linux/cpu.h>
 #include <linux/mutex.h>
 #include <linux/module.h>
+#include <linux/kernel_stat.h>
 
 enum rcu_barrier {
 	RCU_BARRIER_STD,
@@ -55,6 +56,7 @@ static DEFINE_PER_CPU(struct rcu_head, rcu_barrier_head) = {NULL};
 static atomic_t rcu_barrier_cpu_count;
 static DEFINE_MUTEX(rcu_barrier_mutex);
 static struct completion rcu_barrier_completion;
+int rcu_scheduler_active __read_mostly;
 
 /*
  * Awaken the corresponding synchronize_rcu() instance now that a
@@ -80,6 +82,10 @@ void wakeme_after_rcu(struct rcu_head  *head)
 void synchronize_rcu(void)
 {
 	struct rcu_synchronize rcu;
+
+	if (rcu_blocking_is_gp())
+		return;
+
 	init_completion(&rcu.completion);
 	/* Will wake me after RCU finished. */
 	call_rcu(&rcu.head, wakeme_after_rcu);
@@ -175,3 +181,9 @@ void __init rcu_init(void)
 	__rcu_init();
 }
 
+void rcu_scheduler_starting(void)
+{
+	WARN_ON(num_online_cpus() != 1);
+	WARN_ON(nr_context_switches() > 0);
+	rcu_scheduler_active = 1;
+}
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index 33cfc50781f9..5d59e850fb71 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -1181,6 +1181,9 @@ void __synchronize_sched(void)
 {
 	struct rcu_synchronize rcu;
 
+	if (num_online_cpus() == 1)
+		return;  /* blocking is gp if only one CPU! */
+
 	init_completion(&rcu.completion);
 	/* Will wake me after RCU finished. */
 	call_rcu_sched(&rcu.head, wakeme_after_rcu);
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index b2fd602a6f6f..97ce31579ec0 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -948,8 +948,8 @@ static void rcu_do_batch(struct rcu_data *rdp)
 void rcu_check_callbacks(int cpu, int user)
 {
 	if (user ||
-	    (idle_cpu(cpu) && !in_softirq() &&
-				hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
+	    (idle_cpu(cpu) && rcu_scheduler_active &&
+	     !in_softirq() && hardirq_count() <= (1 << HARDIRQ_SHIFT))) {
 
 		/*
 		 * Get here if this CPU took its interrupt from user
-- 
cgit v1.2.3


From c40c6f85a7594ad842233885386a0ca4cd40eafe Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 26 Feb 2009 15:40:15 +0800
Subject: cpuacct: add a branch prediction

cpuacct_charge() is in fast-path, and checking of !cpuacct_susys.active
always returns false after cpuacct has been initialized at system boot.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Menage <menage@google.com>
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 5475d56a20f1..8e63ffb6ed05 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -9684,7 +9684,7 @@ static void cpuacct_charge(struct task_struct *tsk, u64 cputime)
 	struct cpuacct *ca;
 	int cpu;
 
-	if (!cpuacct_subsys.active)
+	if (unlikely(!cpuacct_subsys.active))
 		return;
 
 	cpu = task_cpu(tsk);
-- 
cgit v1.2.3


From 8656e7a2fa6afcd8682990f804a2a9674568738f Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 26 Feb 2009 00:41:38 +0100
Subject: tracing/core: make the per cpu trace files in per cpu directories

Impact: restructure the VFS layout of per CPU trace buffers

The per cpu trace files are all in a single directory:
/debug/tracing/per_cpu. In case of a large number of cpu, the
content of this directory becomes messy so we create now one
directory per cpu inside /debug/tracing/per_cpu which contain
each their own trace_pipe and trace files.

Ie:

 /debug/tracing$ ls -R per_cpu
 per_cpu:
 cpu0  cpu1

 per_cpu/cpu0:
 trace  trace_pipe

 per_cpu/cpu1:
 trace  trace_pipe

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 25 ++++++++++++++-----------
 1 file changed, 14 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d8d899f3bd6f..bdaf60d3d337 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3061,28 +3061,31 @@ struct dentry *tracing_dentry_percpu(void)
 static void tracing_init_debugfs_percpu(long cpu)
 {
 	struct dentry *d_percpu = tracing_dentry_percpu();
-	struct dentry *entry;
-	/* strlen(trace_pipe) + MAX(log10(cpu)) + '\0' */
-	char filename[17];
+	struct dentry *entry, *d_cpu;
+	/* strlen(cpu) + MAX(log10(cpu)) + '\0' */
+	char cpu_dir[7];
 
 	if (cpu > 999 || cpu < 0)
 		return;
 
-	/* per cpu trace_pipe */
-	sprintf(filename, "trace_pipe%ld", cpu);
+	sprintf(cpu_dir, "cpu%ld", cpu);
+	d_cpu = debugfs_create_dir(cpu_dir, d_percpu);
+	if (!d_cpu) {
+		pr_warning("Could not create debugfs '%s' entry\n", cpu_dir);
+		return;
+	}
 
-	entry = debugfs_create_file(filename, 0444, d_percpu,
+	/* per cpu trace_pipe */
+	entry = debugfs_create_file("trace_pipe", 0444, d_cpu,
 				(void *) cpu, &tracing_pipe_fops);
 	if (!entry)
-		pr_warning("Could not create debugfs '%s' entry\n", filename);
+		pr_warning("Could not create debugfs 'trace_pipe' entry\n");
 
 	/* per cpu trace */
-	sprintf(filename, "trace%ld", cpu);
-
-	entry = debugfs_create_file(filename, 0444, d_percpu,
+	entry = debugfs_create_file("trace", 0444, d_cpu,
 				(void *) cpu, &tracing_fops);
 	if (!entry)
-		pr_warning("Could not create debugfs '%s' entry\n", filename);
+		pr_warning("Could not create debugfs 'trace' entry\n");
 }
 
 #ifdef CONFIG_FTRACE_SELFTEST
-- 
cgit v1.2.3


From cac64d00c256e65776d575e82aaf540632b66178 Mon Sep 17 00:00:00 2001
From: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Date: Wed, 25 Feb 2009 09:59:26 -0800
Subject: sched_rt: don't start timer when rt bandwidth disabled

Impact: fix incorrect condition check

No need to start rt bandwidth timer when rt bandwidth is disabled.
If this timer starts, it may stop at sched_rt_period_timer() on the first time.

Signed-off-by: Hiroshi Shimamoto <h-shimamoto@ct.jp.nec.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 410eec404133..c3baa9653d1d 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -223,7 +223,7 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
 {
 	ktime_t now;
 
-	if (rt_bandwidth_enabled() && rt_b->rt_runtime == RUNTIME_INF)
+	if (!rt_bandwidth_enabled() || rt_b->rt_runtime == RUNTIME_INF)
 		return;
 
 	if (hrtimer_active(&rt_b->rt_period_timer))
-- 
cgit v1.2.3


From af39241b90a345556b8884adff87096afe71b050 Mon Sep 17 00:00:00 2001
From: Jason Baron <jbaron@redhat.com>
Date: Thu, 26 Feb 2009 10:11:05 -0500
Subject: tracing, genirq: add irq enter and exit trace events

Impact: add new tracepoints

Add them to the generic IRQ code, that way every architecture
gets these new tracepoints, not just x86.

Using Steve's new 'TRACE_FORMAT', I can get function graph
trace as follows using the original two IRQ tracepoints:

 3)               |    handle_IRQ_event() {
 3)               |    /* (irq_handler_entry) irq=28 handler=eth0 */
 3)               |    e1000_intr_msi() {
 3)   2.460 us    |      __napi_schedule();
 3)   9.416 us    |    }
 3)               |    /* (irq_handler_exit) irq=28 handler=eth0 return=handled */
 3) + 22.935 us   |  }

Signed-off-by: Jason Baron <jbaron@redhat.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Acked-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Mathieu Desnoyers <compudj@krystal.dyndns.org>
Cc: "Frank Ch. Eigler" <fche@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/trace/irq.h             |  9 +++++++++
 include/trace/irq_event_types.h | 17 +++++++++++++++++
 kernel/irq/handle.c             |  6 ++++++
 kernel/trace/events.c           |  2 ++
 4 files changed, 34 insertions(+)
 create mode 100644 include/trace/irq.h
 create mode 100644 include/trace/irq_event_types.h

(limited to 'kernel')

diff --git a/include/trace/irq.h b/include/trace/irq.h
new file mode 100644
index 000000000000..ff5d4495dc37
--- /dev/null
+++ b/include/trace/irq.h
@@ -0,0 +1,9 @@
+#ifndef _TRACE_IRQ_H
+#define _TRACE_IRQ_H
+
+#include <linux/interrupt.h>
+#include <linux/tracepoint.h>
+
+#include <trace/irq_event_types.h>
+
+#endif
diff --git a/include/trace/irq_event_types.h b/include/trace/irq_event_types.h
new file mode 100644
index 000000000000..5d0919fdd2d4
--- /dev/null
+++ b/include/trace/irq_event_types.h
@@ -0,0 +1,17 @@
+
+/* use <trace/irq.h> instead */
+#ifndef TRACE_FORMAT
+# error Do not include this file directly.
+# error Unless you know what you are doing.
+#endif
+
+TRACE_FORMAT(irq_handler_entry,
+	TPPROTO(int irq, struct irqaction *action),
+	TPARGS(irq, action),
+	TPFMT("irq=%d handler=%s", irq, action->name));
+
+TRACE_FORMAT(irq_handler_exit,
+	TPPROTO(int irq, struct irqaction *action, int ret),
+	TPARGS(irq, action, ret),
+	TPFMT("irq=%d handler=%s return=%s",
+		irq, action->name, ret ? "handled" : "unhandled"));
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 3aba8d12f328..4709a7c870d7 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -17,6 +17,7 @@
 #include <linux/kernel_stat.h>
 #include <linux/rculist.h>
 #include <linux/hash.h>
+#include <trace/irq.h>
 
 #include "internals.h"
 
@@ -316,6 +317,9 @@ irqreturn_t no_action(int cpl, void *dev_id)
 	return IRQ_NONE;
 }
 
+DEFINE_TRACE(irq_handler_entry);
+DEFINE_TRACE(irq_handler_exit);
+
 /**
  * handle_IRQ_event - irq action chain handler
  * @irq:	the interrupt number
@@ -332,7 +336,9 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
 		local_irq_enable_in_hardirq();
 
 	do {
+		trace_irq_handler_entry(irq, action);
 		ret = action->handler(irq, action->dev_id);
+		trace_irq_handler_exit(irq, action, ret);
 		if (ret == IRQ_HANDLED)
 			status |= action->flags;
 		retval |= ret;
diff --git a/kernel/trace/events.c b/kernel/trace/events.c
index 38c89eef99ee..3c75623893cc 100644
--- a/kernel/trace/events.c
+++ b/kernel/trace/events.c
@@ -6,8 +6,10 @@
 
 /* trace/<type>.h here */
 #include <trace/sched.h>
+#include <trace/irq.h>
 
 #include "trace_events.h"
 
 /* trace/<type>_event_types.h here */
 #include <trace/sched_event_types.h>
+#include <trace/irq_event_types.h>
-- 
cgit v1.2.3


From 6409c4da289d6905f7ae2bd0630438368439bda2 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 12 May 2008 21:21:14 +0200
Subject: sched: sched_clock() improvement: use in_nmi()

make sure we dont execute more complex sched_clock() code in NMI context.

Acked-by: Peter Zijlstra <peterz@infradead.org>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_clock.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'kernel')

diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index a0b0852414cc..db69174b1178 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -29,6 +29,7 @@
 #include <linux/spinlock.h>
 #include <linux/ktime.h>
 #include <linux/module.h>
+#include <linux/hardirq.h>
 
 /*
  * Scheduler clock - returns current time in nanosec units.
@@ -151,6 +152,13 @@ u64 sched_clock_cpu(int cpu)
 	struct sched_clock_data *scd = cpu_sdc(cpu);
 	u64 now, clock, this_clock, remote_clock;
 
+	/*
+	 * Normally this is not called in NMI context - but if it is,
+	 * trying to do any locking here is totally lethal.
+	 */
+	if (unlikely(in_nmi()))
+		return scd->clock;
+
 	if (unlikely(!sched_clock_running))
 		return 0ull;
 
-- 
cgit v1.2.3


From 14131f2f98ac350ee9e73faed916d2238a8b6a0d Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 26 Feb 2009 18:47:11 +0100
Subject: tracing: implement trace_clock_*() APIs

Impact: implement new tracing timestamp APIs

Add three trace clock variants, with differing scalability/precision
tradeoffs:

 -   local: CPU-local trace clock
 -  medium: scalable global clock with some jitter
 -  global: globally monotonic, serialized clock

Make the ring-buffer use the local trace clock internally.

Acked-by: Peter Zijlstra <peterz@infradead.org>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/trace_clock.h |  19 +++++++++
 kernel/trace/Makefile       |   1 +
 kernel/trace/ring_buffer.c  |   5 +--
 kernel/trace/trace_clock.c  | 101 ++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 123 insertions(+), 3 deletions(-)
 create mode 100644 include/linux/trace_clock.h
 create mode 100644 kernel/trace/trace_clock.c

(limited to 'kernel')

diff --git a/include/linux/trace_clock.h b/include/linux/trace_clock.h
new file mode 100644
index 000000000000..7a8130384087
--- /dev/null
+++ b/include/linux/trace_clock.h
@@ -0,0 +1,19 @@
+#ifndef _LINUX_TRACE_CLOCK_H
+#define _LINUX_TRACE_CLOCK_H
+
+/*
+ * 3 trace clock variants, with differing scalability/precision
+ * tradeoffs:
+ *
+ *  -   local: CPU-local trace clock
+ *  -  medium: scalable global clock with some jitter
+ *  -  global: globally monotonic, serialized clock
+ */
+#include <linux/compiler.h>
+#include <linux/types.h>
+
+extern u64 notrace trace_clock_local(void);
+extern u64 notrace trace_clock(void);
+extern u64 notrace trace_clock_global(void);
+
+#endif /* _LINUX_TRACE_CLOCK_H */
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 664b6c0dc75a..c931fe0560cb 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -19,6 +19,7 @@ obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o
 obj-$(CONFIG_RING_BUFFER) += ring_buffer.o
 
 obj-$(CONFIG_TRACING) += trace.o
+obj-$(CONFIG_TRACING) += trace_clock.o
 obj-$(CONFIG_TRACING) += trace_output.o
 obj-$(CONFIG_TRACING) += trace_stat.o
 obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 8f19f1aa42b0..a8c275c01e83 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -4,6 +4,7 @@
  * Copyright (C) 2008 Steven Rostedt <srostedt@redhat.com>
  */
 #include <linux/ring_buffer.h>
+#include <linux/trace_clock.h>
 #include <linux/ftrace_irq.h>
 #include <linux/spinlock.h>
 #include <linux/debugfs.h>
@@ -12,7 +13,6 @@
 #include <linux/module.h>
 #include <linux/percpu.h>
 #include <linux/mutex.h>
-#include <linux/sched.h>	/* used for sched_clock() (for now) */
 #include <linux/init.h>
 #include <linux/hash.h>
 #include <linux/list.h>
@@ -112,14 +112,13 @@ EXPORT_SYMBOL_GPL(tracing_is_on);
 /* Up this if you want to test the TIME_EXTENTS and normalization */
 #define DEBUG_SHIFT 0
 
-/* FIXME!!! */
 u64 ring_buffer_time_stamp(int cpu)
 {
 	u64 time;
 
 	preempt_disable_notrace();
 	/* shift to debug/test normalization and TIME_EXTENTS */
-	time = sched_clock() << DEBUG_SHIFT;
+	time = trace_clock_local() << DEBUG_SHIFT;
 	preempt_enable_no_resched_notrace();
 
 	return time;
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
new file mode 100644
index 000000000000..2d4953f93560
--- /dev/null
+++ b/kernel/trace/trace_clock.c
@@ -0,0 +1,101 @@
+/*
+ * tracing clocks
+ *
+ *  Copyright (C) 2009 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
+ *
+ * Implements 3 trace clock variants, with differing scalability/precision
+ * tradeoffs:
+ *
+ *  -   local: CPU-local trace clock
+ *  -  medium: scalable global clock with some jitter
+ *  -  global: globally monotonic, serialized clock
+ *
+ * Tracer plugins will chose a default from these clocks.
+ */
+#include <linux/spinlock.h>
+#include <linux/hardirq.h>
+#include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/ktime.h>
+
+/*
+ * trace_clock_local(): the simplest and least coherent tracing clock.
+ *
+ * Useful for tracing that does not cross to other CPUs nor
+ * does it go through idle events.
+ */
+u64 notrace trace_clock_local(void)
+{
+	/*
+	 * sched_clock() is an architecture implemented, fast, scalable,
+	 * lockless clock. It is not guaranteed to be coherent across
+	 * CPUs, nor across CPU idle events.
+	 */
+	return sched_clock();
+}
+
+/*
+ * trace_clock(): 'inbetween' trace clock. Not completely serialized,
+ * but not completely incorrect when crossing CPUs either.
+ *
+ * This is based on cpu_clock(), which will allow at most ~1 jiffy of
+ * jitter between CPUs. So it's a pretty scalable clock, but there
+ * can be offsets in the trace data.
+ */
+u64 notrace trace_clock(void)
+{
+	return cpu_clock(raw_smp_processor_id());
+}
+
+
+/*
+ * trace_clock_global(): special globally coherent trace clock
+ *
+ * It has higher overhead than the other trace clocks but is still
+ * an order of magnitude faster than GTOD derived hardware clocks.
+ *
+ * Used by plugins that need globally coherent timestamps.
+ */
+
+static u64 prev_trace_clock_time;
+
+static raw_spinlock_t trace_clock_lock ____cacheline_aligned_in_smp =
+	(raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
+
+u64 notrace trace_clock_global(void)
+{
+	unsigned long flags;
+	int this_cpu;
+	u64 now;
+
+	raw_local_irq_save(flags);
+
+	this_cpu = raw_smp_processor_id();
+	now = cpu_clock(this_cpu);
+	/*
+	 * If in an NMI context then dont risk lockups and return the
+	 * cpu_clock() time:
+	 */
+	if (unlikely(in_nmi()))
+		goto out;
+
+	__raw_spin_lock(&trace_clock_lock);
+
+	/*
+	 * TODO: if this happens often then maybe we should reset
+	 * my_scd->clock to prev_trace_clock_time+1, to make sure
+	 * we start ticking with the local clock from now on?
+	 */
+	if ((s64)(now - prev_trace_clock_time) < 0)
+		now = prev_trace_clock_time + 1;
+
+	prev_trace_clock_time = now;
+
+	__raw_spin_unlock(&trace_clock_lock);
+
+ out:
+	raw_local_irq_restore(flags);
+
+	return now;
+}
-- 
cgit v1.2.3


From a2a5ac8650b570bea3cb3614f77739dcd07d6632 Mon Sep 17 00:00:00 2001
From: John Stultz <johnstul@us.ibm.com>
Date: Thu, 26 Feb 2009 09:46:14 -0800
Subject: time: ntp: fix bug in ntp_update_offset() & do_adjtimex(), fix

The time_status conditional was accidentally placed right after we clear
the checked time_status bits, which causes us to take the conditional
every time through. This fixes it by moving the conditional to before we
clear the time_status bits.

Signed-off-by: John Stultz <johnstul@us.ibm.com>
Cc: Clark Williams <williams@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/time/ntp.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c
index c74eb7d9d854..7fc64375ff43 100644
--- a/kernel/time/ntp.c
+++ b/kernel/time/ntp.c
@@ -365,8 +365,6 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts)
 		time_state = TIME_OK;
 		time_status = STA_UNSYNC;
 	}
-	/* only set allowed bits */
-	time_status &= STA_RONLY;
 
 	/*
 	 * If we turn on PLL adjustments then reset the
@@ -375,6 +373,8 @@ static inline void process_adj_status(struct timex *txc, struct timespec *ts)
 	if (!(time_status & STA_PLL) && (txc->status & STA_PLL))
 		time_reftime = xtime.tv_sec;
 
+	/* only set allowed bits */
+	time_status &= STA_RONLY;
 	time_status |= txc->status & ~STA_RONLY;
 
 	switch (time_state) {
-- 
cgit v1.2.3


From b342501cd31e5546d0c9ca8ceff5ded1832f9e5b Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 26 Feb 2009 20:20:29 +0100
Subject: sched: allow architectures to specify sched_clock_stable

Allow CONFIG_HAVE_UNSTABLE_SCHED_CLOCK architectures to still specify
that their sched_clock() implementation is reliable.

This will be used by x86 to switch on a faster sched_clock_cpu()
implementation on certain CPU types.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h | 10 ++++++++++
 kernel/sched_clock.c  | 45 ++++++++++++++++++++-------------------------
 2 files changed, 30 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8981e52c714f..a063d19b7a7d 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1670,6 +1670,16 @@ static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
 	return set_cpus_allowed_ptr(p, &new_mask);
 }
 
+/*
+ * Architectures can set this to 1 if they have specified
+ * CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig,
+ * but then during bootup it turns out that sched_clock()
+ * is reliable after all:
+ */
+#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+extern int sched_clock_stable;
+#endif
+
 extern unsigned long long sched_clock(void);
 
 extern void sched_clock_init(void);
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index a0b0852414cc..a755d023805a 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -24,11 +24,11 @@
  * The clock: sched_clock_cpu() is monotonic per cpu, and should be somewhat
  * consistent between cpus (never more than 2 jiffies difference).
  */
-#include <linux/sched.h>
-#include <linux/percpu.h>
 #include <linux/spinlock.h>
-#include <linux/ktime.h>
 #include <linux/module.h>
+#include <linux/percpu.h>
+#include <linux/ktime.h>
+#include <linux/sched.h>
 
 /*
  * Scheduler clock - returns current time in nanosec units.
@@ -43,6 +43,10 @@ unsigned long long __attribute__((weak)) sched_clock(void)
 static __read_mostly int sched_clock_running;
 
 #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+__read_mostly int sched_clock_stable;
+#else
+static const int sched_clock_stable = 1;
+#endif
 
 struct sched_clock_data {
 	/*
@@ -87,7 +91,7 @@ void sched_clock_init(void)
 }
 
 /*
- * min,max except they take wrapping into account
+ * min, max except they take wrapping into account
  */
 
 static inline u64 wrap_min(u64 x, u64 y)
@@ -116,10 +120,13 @@ static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now)
 	if (unlikely(delta < 0))
 		delta = 0;
 
+	if (unlikely(!sched_clock_running))
+		return 0ull;
+
 	/*
 	 * scd->clock = clamp(scd->tick_gtod + delta,
-	 * 		      max(scd->tick_gtod, scd->clock),
-	 * 		      scd->tick_gtod + TICK_NSEC);
+	 *		      max(scd->tick_gtod, scd->clock),
+	 *		      scd->tick_gtod + TICK_NSEC);
 	 */
 
 	clock = scd->tick_gtod + delta;
@@ -148,12 +155,13 @@ static void lock_double_clock(struct sched_clock_data *data1,
 
 u64 sched_clock_cpu(int cpu)
 {
-	struct sched_clock_data *scd = cpu_sdc(cpu);
 	u64 now, clock, this_clock, remote_clock;
+	struct sched_clock_data *scd;
 
-	if (unlikely(!sched_clock_running))
-		return 0ull;
+	if (sched_clock_stable)
+		return sched_clock();
 
+	scd = cpu_sdc(cpu);
 	WARN_ON_ONCE(!irqs_disabled());
 	now = sched_clock();
 
@@ -193,6 +201,8 @@ u64 sched_clock_cpu(int cpu)
 	return clock;
 }
 
+#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
+
 void sched_clock_tick(void)
 {
 	struct sched_clock_data *scd = this_scd();
@@ -235,22 +245,7 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
 }
 EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
 
-#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
-
-void sched_clock_init(void)
-{
-	sched_clock_running = 1;
-}
-
-u64 sched_clock_cpu(int cpu)
-{
-	if (unlikely(!sched_clock_running))
-		return 0;
-
-	return sched_clock();
-}
-
-#endif
+#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
 
 unsigned long long cpu_clock(int cpu)
 {
-- 
cgit v1.2.3


From 8325d9c09dedf45476f4d6261d1b6a72e4a7453f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Thu, 26 Feb 2009 21:40:16 +0100
Subject: sched_clock: cleanups

- remove superfluous checks in __update_sched_clock()
- skip sched_clock_tick() for sched_clock_stable
- reinstate the simple !HAVE_UNSTABLE_SCHED_CLOCK code to please the bloatwatch

Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_clock.c | 31 ++++++++++++++++++++-----------
 1 file changed, 20 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index a755d023805a..390f33234bd0 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -44,9 +44,6 @@ static __read_mostly int sched_clock_running;
 
 #ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
 __read_mostly int sched_clock_stable;
-#else
-static const int sched_clock_stable = 1;
-#endif
 
 struct sched_clock_data {
 	/*
@@ -115,14 +112,9 @@ static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now)
 	s64 delta = now - scd->tick_raw;
 	u64 clock, min_clock, max_clock;
 
-	WARN_ON_ONCE(!irqs_disabled());
-
 	if (unlikely(delta < 0))
 		delta = 0;
 
-	if (unlikely(!sched_clock_running))
-		return 0ull;
-
 	/*
 	 * scd->clock = clamp(scd->tick_gtod + delta,
 	 *		      max(scd->tick_gtod, scd->clock),
@@ -201,18 +193,20 @@ u64 sched_clock_cpu(int cpu)
 	return clock;
 }
 
-#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
-
 void sched_clock_tick(void)
 {
-	struct sched_clock_data *scd = this_scd();
+	struct sched_clock_data *scd;
 	u64 now, now_gtod;
 
+	if (sched_clock_stable)
+		return;
+
 	if (unlikely(!sched_clock_running))
 		return;
 
 	WARN_ON_ONCE(!irqs_disabled());
 
+	scd = this_scd();
 	now_gtod = ktime_to_ns(ktime_get());
 	now = sched_clock();
 
@@ -245,6 +239,21 @@ void sched_clock_idle_wakeup_event(u64 delta_ns)
 }
 EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
 
+#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
+
+void sched_clock_init(void)
+{
+	sched_clock_running = 1;
+}
+
+u64 sched_clock_cpu(int cpu)
+{
+	if (unlikely(!sched_clock_running))
+		return 0;
+
+	return sched_clock();
+}
+
 #endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */
 
 unsigned long long cpu_clock(int cpu)
-- 
cgit v1.2.3


From 1d1e97562e5e2ac60fb7b25437ba619f95f67fab Mon Sep 17 00:00:00 2001
From: "Serge E. Hallyn" <serue@us.ibm.com>
Date: Thu, 26 Feb 2009 18:27:38 -0600
Subject: keys: distinguish per-uid keys in different namespaces

per-uid keys were looked by uid only.  Use the user namespace
to distinguish the same uid in different namespaces.

This does not address key_permission.  So a task can for instance
try to join a keyring owned by the same uid in another namespace.
That will be handled by a separate patch.

Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
Acked-by: David Howells <dhowells@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>
---
 kernel/user.c                |  2 +-
 security/keys/internal.h     |  4 +++-
 security/keys/key.c          | 11 +++++++++--
 security/keys/keyctl.c       |  2 +-
 security/keys/process_keys.c |  2 ++
 security/keys/request_key.c  |  2 +-
 6 files changed, 17 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/user.c b/kernel/user.c
index 477b6660f447..d8b332c3ae3a 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -20,7 +20,7 @@
 
 struct user_namespace init_user_ns = {
 	.kref = {
-		.refcount	= ATOMIC_INIT(1),
+		.refcount	= ATOMIC_INIT(2),
 	},
 	.creator = &root_user,
 };
diff --git a/security/keys/internal.h b/security/keys/internal.h
index 81932abefe7b..9fb679c66b8a 100644
--- a/security/keys/internal.h
+++ b/security/keys/internal.h
@@ -53,6 +53,7 @@ struct key_user {
 	atomic_t		nkeys;		/* number of keys */
 	atomic_t		nikeys;		/* number of instantiated keys */
 	uid_t			uid;
+	struct user_namespace	*user_ns;
 	int			qnkeys;		/* number of keys allocated to this user */
 	int			qnbytes;	/* number of bytes allocated to this user */
 };
@@ -61,7 +62,8 @@ extern struct rb_root	key_user_tree;
 extern spinlock_t	key_user_lock;
 extern struct key_user	root_key_user;
 
-extern struct key_user *key_user_lookup(uid_t uid);
+extern struct key_user *key_user_lookup(uid_t uid,
+					struct user_namespace *user_ns);
 extern void key_user_put(struct key_user *user);
 
 /*
diff --git a/security/keys/key.c b/security/keys/key.c
index f76c8a546fd3..4a1297d1ada4 100644
--- a/security/keys/key.c
+++ b/security/keys/key.c
@@ -18,6 +18,7 @@
 #include <linux/workqueue.h>
 #include <linux/random.h>
 #include <linux/err.h>
+#include <linux/user_namespace.h>
 #include "internal.h"
 
 static struct kmem_cache	*key_jar;
@@ -60,7 +61,7 @@ void __key_check(const struct key *key)
  * get the key quota record for a user, allocating a new record if one doesn't
  * already exist
  */
-struct key_user *key_user_lookup(uid_t uid)
+struct key_user *key_user_lookup(uid_t uid, struct user_namespace *user_ns)
 {
 	struct key_user *candidate = NULL, *user;
 	struct rb_node *parent = NULL;
@@ -79,6 +80,10 @@ struct key_user *key_user_lookup(uid_t uid)
 			p = &(*p)->rb_left;
 		else if (uid > user->uid)
 			p = &(*p)->rb_right;
+		else if (user_ns < user->user_ns)
+			p = &(*p)->rb_left;
+		else if (user_ns > user->user_ns)
+			p = &(*p)->rb_right;
 		else
 			goto found;
 	}
@@ -106,6 +111,7 @@ struct key_user *key_user_lookup(uid_t uid)
 	atomic_set(&candidate->nkeys, 0);
 	atomic_set(&candidate->nikeys, 0);
 	candidate->uid = uid;
+	candidate->user_ns = get_user_ns(user_ns);
 	candidate->qnkeys = 0;
 	candidate->qnbytes = 0;
 	spin_lock_init(&candidate->lock);
@@ -136,6 +142,7 @@ void key_user_put(struct key_user *user)
 	if (atomic_dec_and_lock(&user->usage, &key_user_lock)) {
 		rb_erase(&user->node, &key_user_tree);
 		spin_unlock(&key_user_lock);
+		put_user_ns(user->user_ns);
 
 		kfree(user);
 	}
@@ -234,7 +241,7 @@ struct key *key_alloc(struct key_type *type, const char *desc,
 	quotalen = desclen + type->def_datalen;
 
 	/* get hold of the key tracking for this user */
-	user = key_user_lookup(uid);
+	user = key_user_lookup(uid, cred->user->user_ns);
 	if (!user)
 		goto no_memory_1;
 
diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c
index b1ec3b4ee17d..7f09fb897d2b 100644
--- a/security/keys/keyctl.c
+++ b/security/keys/keyctl.c
@@ -726,7 +726,7 @@ long keyctl_chown_key(key_serial_t id, uid_t uid, gid_t gid)
 	/* change the UID */
 	if (uid != (uid_t) -1 && uid != key->uid) {
 		ret = -ENOMEM;
-		newowner = key_user_lookup(uid);
+		newowner = key_user_lookup(uid, current_user_ns());
 		if (!newowner)
 			goto error_put;
 
diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c
index 2f5d89e92b85..276d27882ce8 100644
--- a/security/keys/process_keys.c
+++ b/security/keys/process_keys.c
@@ -17,6 +17,7 @@
 #include <linux/fs.h>
 #include <linux/err.h>
 #include <linux/mutex.h>
+#include <linux/user_namespace.h>
 #include <asm/uaccess.h>
 #include "internal.h"
 
@@ -34,6 +35,7 @@ struct key_user root_key_user = {
 	.nkeys		= ATOMIC_INIT(2),
 	.nikeys		= ATOMIC_INIT(2),
 	.uid		= 0,
+	.user_ns	= &init_user_ns,
 };
 
 /*****************************************************************************/
diff --git a/security/keys/request_key.c b/security/keys/request_key.c
index 0e04f72ef2d4..22a31582bfaa 100644
--- a/security/keys/request_key.c
+++ b/security/keys/request_key.c
@@ -365,7 +365,7 @@ static struct key *construct_key_and_link(struct key_type *type,
 
 	kenter("");
 
-	user = key_user_lookup(current_fsuid());
+	user = key_user_lookup(current_fsuid(), current_user_ns());
 	if (!user)
 		return ERR_PTR(-ENOMEM);
 
-- 
cgit v1.2.3


From a8259075074fb09c230b4cd2c8d3ee3c49d6ecd1 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 26 Feb 2009 22:19:12 -0500
Subject: tracing: add options directory and core option files

This patch creates an options directory in the debugfs, that contains
the available tracing options. These files contain 1 or 0, where 1
is the option is enabled and 0 it is disabled.

Simply echoing in 1 will enable the option and 0 will disable it.
This patch only contains the core options, not the tracer options.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c | 117 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 117 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index bdaf60d3d337..40e983ed994f 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3093,6 +3093,121 @@ static void tracing_init_debugfs_percpu(long cpu)
 #include "trace_selftest.c"
 #endif
 
+static ssize_t
+trace_options_core_read(struct file *filp, char __user *ubuf, size_t cnt,
+			loff_t *ppos)
+{
+	long index = (long)filp->private_data;
+	char *buf;
+
+	if (trace_flags & (1 << index))
+		buf = "1\n";
+	else
+		buf = "0\n";
+
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2);
+}
+
+static ssize_t
+trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,
+			 loff_t *ppos)
+{
+	long index = (long)filp->private_data;
+	char buf[64];
+	unsigned long val;
+	int ret;
+
+	if (cnt >= sizeof(buf))
+		return -EINVAL;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+
+	buf[cnt] = 0;
+
+	ret = strict_strtoul(buf, 10, &val);
+	if (ret < 0)
+		return ret;
+
+	switch (val) {
+	case 0:
+		trace_flags &= ~(1 << index);
+		break;
+	case 1:
+		trace_flags |= 1 << index;
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	*ppos += cnt;
+
+	return cnt;
+}
+
+
+static const struct file_operations trace_options_core_fops = {
+	.open = tracing_open_generic,
+	.read = trace_options_core_read,
+	.write = trace_options_core_write,
+};
+
+static struct dentry *trace_options_init_dentry(void)
+{
+	struct dentry *d_tracer;
+	static struct dentry *t_options;
+
+	if (t_options)
+		return t_options;
+
+	d_tracer = tracing_init_dentry();
+	if (!d_tracer)
+		return NULL;
+
+	t_options = debugfs_create_dir("options", d_tracer);
+	if (!t_options) {
+		pr_warning("Could not create debugfs directory 'options'\n");
+		return NULL;
+	}
+
+	return t_options;
+}
+
+static struct dentry *
+create_trace_option_core_file(const char *option, long index)
+{
+	struct dentry *t_options;
+	struct dentry *entry;
+
+	t_options = trace_options_init_dentry();
+	if (!t_options)
+		return NULL;
+
+	entry = debugfs_create_file(option, 0644, t_options, (void *)index,
+				    &trace_options_core_fops);
+
+	return entry;
+}
+
+static __init void create_trace_options_dir(void)
+{
+	struct dentry *t_options;
+	struct dentry *entry;
+	int i;
+
+	t_options = trace_options_init_dentry();
+	if (!t_options)
+		return;
+
+	for (i = 0; trace_options[i]; i++) {
+		entry = create_trace_option_core_file(trace_options[i], i);
+		if (!entry)
+			pr_warning("Could not create debugfs %s entry\n",
+				   trace_options[i]);
+	}
+}
+
 static __init int tracer_init_debugfs(void)
 {
 	struct dentry *d_tracer;
@@ -3111,6 +3226,8 @@ static __init int tracer_init_debugfs(void)
 	if (!entry)
 		pr_warning("Could not create debugfs 'trace_options' entry\n");
 
+	create_trace_options_dir();
+
 	entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer,
 				    NULL, &tracing_cpumask_fops);
 	if (!entry)
-- 
cgit v1.2.3


From 577b785f55168d5acb3d123ba41bfe8d7981e044 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 26 Feb 2009 23:43:05 -0500
Subject: tracing: add tracer dependent options to options directory

This patch adds the tracer dependent options dynamically to the
options directory when the tracer is activated. These options are
removed when the tracer is deactivated.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c | 174 ++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 173 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 40e983ed994f..485c6e7f4465 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2275,8 +2275,17 @@ int tracer_init(struct tracer *t, struct trace_array *tr)
 	return t->init(tr);
 }
 
+struct trace_option_dentry;
+
+static struct trace_option_dentry *
+create_trace_option_files(struct tracer *tracer);
+
+static void
+destroy_trace_option_files(struct trace_option_dentry *topts);
+
 static int tracing_set_tracer(const char *buf)
 {
+	static struct trace_option_dentry *topts;
 	struct trace_array *tr = &global_trace;
 	struct tracer *t;
 	int ret = 0;
@@ -2297,7 +2306,12 @@ static int tracing_set_tracer(const char *buf)
 	if (current_trace && current_trace->reset)
 		current_trace->reset(tr);
 
+	destroy_trace_option_files(topts);
+
 	current_trace = t;
+
+	topts = create_trace_option_files(current_trace);
+
 	if (t->init) {
 		ret = tracer_init(t, tr);
 		if (ret)
@@ -3093,6 +3107,95 @@ static void tracing_init_debugfs_percpu(long cpu)
 #include "trace_selftest.c"
 #endif
 
+struct trace_option_dentry {
+	struct tracer_opt		*opt;
+	struct tracer_flags		*flags;
+	struct dentry			*entry;
+};
+
+static ssize_t
+trace_options_read(struct file *filp, char __user *ubuf, size_t cnt,
+			loff_t *ppos)
+{
+	struct trace_option_dentry *topt = filp->private_data;
+	char *buf;
+
+	if (topt->flags->val & topt->opt->bit)
+		buf = "1\n";
+	else
+		buf = "0\n";
+
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, 2);
+}
+
+static ssize_t
+trace_options_write(struct file *filp, const char __user *ubuf, size_t cnt,
+			 loff_t *ppos)
+{
+	struct trace_option_dentry *topt = filp->private_data;
+	unsigned long val;
+	char buf[64];
+	int ret;
+
+	if (cnt >= sizeof(buf))
+		return -EINVAL;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+
+	buf[cnt] = 0;
+
+	ret = strict_strtoul(buf, 10, &val);
+	if (ret < 0)
+		return ret;
+
+	ret = 0;
+	switch (val) {
+	case 0:
+		/* do nothing if already cleared */
+		if (!(topt->flags->val & topt->opt->bit))
+			break;
+
+		mutex_lock(&trace_types_lock);
+		if (current_trace->set_flag)
+			ret = current_trace->set_flag(topt->flags->val,
+						      topt->opt->bit, 0);
+		mutex_unlock(&trace_types_lock);
+		if (ret)
+			return ret;
+		topt->flags->val &= ~topt->opt->bit;
+		break;
+	case 1:
+		/* do nothing if already set */
+		if (topt->flags->val & topt->opt->bit)
+			break;
+
+		mutex_lock(&trace_types_lock);
+		if (current_trace->set_flag)
+			ret = current_trace->set_flag(topt->flags->val,
+						      topt->opt->bit, 1);
+		mutex_unlock(&trace_types_lock);
+		if (ret)
+			return ret;
+		topt->flags->val |= topt->opt->bit;
+		break;
+
+	default:
+		return -EINVAL;
+	}
+
+	*ppos += cnt;
+
+	return cnt;
+}
+
+
+static const struct file_operations trace_options_fops = {
+	.open = tracing_open_generic,
+	.read = trace_options_read,
+	.write = trace_options_write,
+};
+
 static ssize_t
 trace_options_core_read(struct file *filp, char __user *ubuf, size_t cnt,
 			loff_t *ppos)
@@ -3146,7 +3249,6 @@ trace_options_core_write(struct file *filp, const char __user *ubuf, size_t cnt,
 	return cnt;
 }
 
-
 static const struct file_operations trace_options_core_fops = {
 	.open = tracing_open_generic,
 	.read = trace_options_core_read,
@@ -3174,6 +3276,76 @@ static struct dentry *trace_options_init_dentry(void)
 	return t_options;
 }
 
+static void
+create_trace_option_file(struct trace_option_dentry *topt,
+			 struct tracer_flags *flags,
+			 struct tracer_opt *opt)
+{
+	struct dentry *t_options;
+	struct dentry *entry;
+
+	t_options = trace_options_init_dentry();
+	if (!t_options)
+		return;
+
+	topt->flags = flags;
+	topt->opt = opt;
+
+	entry = debugfs_create_file(opt->name, 0644, t_options, topt,
+				    &trace_options_fops);
+
+	topt->entry = entry;
+
+}
+
+static struct trace_option_dentry *
+create_trace_option_files(struct tracer *tracer)
+{
+	struct trace_option_dentry *topts;
+	struct tracer_flags *flags;
+	struct tracer_opt *opts;
+	int cnt;
+
+	if (!tracer)
+		return NULL;
+
+	flags = tracer->flags;
+
+	if (!flags || !flags->opts)
+		return NULL;
+
+	opts = flags->opts;
+
+	for (cnt = 0; opts[cnt].name; cnt++)
+		;
+
+	topts = kzalloc(sizeof(*topts) * (cnt + 1), GFP_KERNEL);
+	if (!topts)
+		return NULL;
+
+	for (cnt = 0; opts[cnt].name; cnt++)
+		create_trace_option_file(&topts[cnt], flags,
+					 &opts[cnt]);
+
+	return topts;
+}
+
+static void
+destroy_trace_option_files(struct trace_option_dentry *topts)
+{
+	int cnt;
+
+	if (!topts)
+		return;
+
+	for (cnt = 0; topts[cnt].opt; cnt++) {
+		if (topts[cnt].entry)
+			debugfs_remove(topts[cnt].entry);
+	}
+
+	kfree(topts);
+}
+
 static struct dentry *
 create_trace_option_core_file(const char *option, long index)
 {
-- 
cgit v1.2.3


From d8e83d26b5ab3b31ee0ff6d093a2627707a1e221 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 26 Feb 2009 23:55:58 -0500
Subject: tracing: add protection around open use of current_tracer

Impact: fix to possible race conditions

There's some uses of current_tracer that is not protected by the
trace_types_lock. There is a small chance that a sysadmin changes
the tracer while the current_tracer is being referenced.

If the race is hit, it is unlikely to cause any harm since the
tracers are constant and are not freed. But some strang side
effects may occur.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 485c6e7f4465..6c89ec6cbd48 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2024,12 +2024,12 @@ static ssize_t
 tracing_trace_options_read(struct file *filp, char __user *ubuf,
 		       size_t cnt, loff_t *ppos)
 {
-	int i;
+	struct tracer_opt *trace_opts;
+	u32 tracer_flags;
+	int len = 0;
 	char *buf;
 	int r = 0;
-	int len = 0;
-	u32 tracer_flags = current_trace->flags->val;
-	struct tracer_opt *trace_opts = current_trace->flags->opts;
+	int i;
 
 
 	/* calculate max size */
@@ -2038,6 +2038,10 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf,
 		len += 3; /* "no" and space */
 	}
 
+	mutex_lock(&trace_types_lock);
+	tracer_flags = current_trace->flags->val;
+	trace_opts = current_trace->flags->opts;
+
 	/*
 	 * Increase the size with names of options specific
 	 * of the current tracer.
@@ -2049,8 +2053,10 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf,
 
 	/* +2 for \n and \0 */
 	buf = kmalloc(len + 2, GFP_KERNEL);
-	if (!buf)
+	if (!buf) {
+		mutex_unlock(&trace_types_lock);
 		return -ENOMEM;
+	}
 
 	for (i = 0; trace_options[i]; i++) {
 		if (trace_flags & (1 << i))
@@ -2067,6 +2073,7 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf,
 			r += sprintf(buf + r, "no%s ",
 				trace_opts[i].name);
 	}
+	mutex_unlock(&trace_types_lock);
 
 	r += sprintf(buf + r, "\n");
 	WARN_ON(r >= len + 2);
@@ -2074,7 +2081,6 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf,
 	r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
 
 	kfree(buf);
-
 	return r;
 }
 
@@ -2149,7 +2155,9 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
 
 	/* If no option could be set, test the specific tracer options */
 	if (!trace_options[i]) {
+		mutex_lock(&trace_types_lock);
 		ret = set_tracer_option(current_trace, cmp, neg);
+		mutex_unlock(&trace_types_lock);
 		if (ret)
 			return ret;
 	}
-- 
cgit v1.2.3


From 85a2f9b46f8cd8aaa11c64c715e1ea3ec27ec486 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 27 Feb 2009 00:12:38 -0500
Subject: tracing: use pointer error returns for __tracing_open

Impact: fix compile warning and clean up

When I first wrote __tracing_open, instead of passing the error
code via the ERR_PTR macros, I lazily used a separate parameter
to hold the return for errors.

When Frederic Weisbecker updated that function, he used the Linux
kernel ERR_PTR for the returns. This caused the parameter return
to possibly not be initialized on error. gcc correctly pointed this
out with a warning.

This patch converts the entire function to use the Linux kernel
ERR_PTR macro methods.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c | 46 ++++++++++++++++++++++++----------------------
 1 file changed, 24 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 6c89ec6cbd48..304e02ce39c4 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1684,23 +1684,20 @@ static struct seq_operations tracer_seq_ops = {
 };
 
 static struct trace_iterator *
-__tracing_open(struct inode *inode, struct file *file, int *ret)
+__tracing_open(struct inode *inode, struct file *file)
 {
 	long cpu_file = (long) inode->i_private;
+	void *fail_ret = ERR_PTR(-ENOMEM);
 	struct trace_iterator *iter;
 	struct seq_file *m;
-	int cpu;
+	int cpu, ret;
 
-	if (tracing_disabled) {
-		*ret = -ENODEV;
-		return NULL;
-	}
+	if (tracing_disabled)
+		return ERR_PTR(-ENODEV);
 
 	iter = kzalloc(sizeof(*iter), GFP_KERNEL);
-	if (!iter) {
-		*ret = -ENOMEM;
-		goto out;
-	}
+	if (!iter)
+		return ERR_PTR(-ENOMEM);
 
 	/*
 	 * We make a copy of the current tracer to avoid concurrent
@@ -1708,10 +1705,9 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
 	 */
 	mutex_lock(&trace_types_lock);
 	iter->trace = kzalloc(sizeof(*iter->trace), GFP_KERNEL);
-	if (!iter->trace) {
-		*ret = -ENOMEM;
+	if (!iter->trace)
 		goto fail;
-	}
+
 	if (current_trace)
 		*iter->trace = *current_trace;
 
@@ -1750,9 +1746,11 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
 	}
 
 	/* TODO stop tracer */
-	*ret = seq_open(file, &tracer_seq_ops);
-	if (*ret)
+	ret = seq_open(file, &tracer_seq_ops);
+	if (ret < 0) {
+		fail_ret = ERR_PTR(ret);
 		goto fail_buffer;
+	}
 
 	m = file->private_data;
 	m->private = iter;
@@ -1762,7 +1760,6 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
 
 	mutex_unlock(&trace_types_lock);
 
- out:
 	return iter;
 
  fail_buffer:
@@ -1775,7 +1772,7 @@ __tracing_open(struct inode *inode, struct file *file, int *ret)
 	kfree(iter->trace);
 	kfree(iter);
 
-	return ERR_PTR(-ENOMEM);
+	return fail_ret;
 }
 
 int tracing_open_generic(struct inode *inode, struct file *filp)
@@ -1815,9 +1812,12 @@ static int tracing_release(struct inode *inode, struct file *file)
 
 static int tracing_open(struct inode *inode, struct file *file)
 {
-	int ret;
+	struct trace_iterator *iter;
+	int ret = 0;
 
-	__tracing_open(inode, file, &ret);
+	iter = __tracing_open(inode, file);
+	if (IS_ERR(iter))
+		ret = PTR_ERR(iter);
 
 	return ret;
 }
@@ -1825,11 +1825,13 @@ static int tracing_open(struct inode *inode, struct file *file)
 static int tracing_lt_open(struct inode *inode, struct file *file)
 {
 	struct trace_iterator *iter;
-	int ret;
+	int ret = 0;
 
-	iter = __tracing_open(inode, file, &ret);
+	iter = __tracing_open(inode, file);
 
-	if (!ret)
+	if (IS_ERR(iter))
+		ret = PTR_ERR(iter);
+	else
 		iter->iter_flags |= TRACE_FILE_LAT_FMT;
 
 	return ret;
-- 
cgit v1.2.3


From 5c6a3ae1b4beebb56e2916b84f1208d96a9e32ff Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 27 Feb 2009 00:22:21 -0500
Subject: tracing: use newline separator for trace options list

Impact: clean up

Instead of listing the trace options like:

 # cat /debug/tracing/trace_options
print-parent nosym-offset nosym-addr noverbose noraw nohex nobin noblock nostacktrace nosched-tree ftrace_printk noftrace_preempt nobranch annotate nouserstacktrace nosym-userobj

We now list them like:

 # cat /debug/tracing/trace_options
print-parent
nosym-offset
nosym-addr
noverbose
noraw
nohex
nobin
noblock
nostacktrace
nosched-tree
ftrace_printk
noftrace_preempt
nobranch
annotate
nouserstacktrace
nosym-userobj

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 304e02ce39c4..5db7485158df 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2037,7 +2037,7 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf,
 	/* calculate max size */
 	for (i = 0; trace_options[i]; i++) {
 		len += strlen(trace_options[i]);
-		len += 3; /* "no" and space */
+		len += 3; /* "no" and newline */
 	}
 
 	mutex_lock(&trace_types_lock);
@@ -2050,7 +2050,7 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf,
 	 */
 	for (i = 0; trace_opts[i].name; i++) {
 		len += strlen(trace_opts[i].name);
-		len += 3; /* "no" and space */
+		len += 3; /* "no" and newline */
 	}
 
 	/* +2 for \n and \0 */
@@ -2062,22 +2062,21 @@ tracing_trace_options_read(struct file *filp, char __user *ubuf,
 
 	for (i = 0; trace_options[i]; i++) {
 		if (trace_flags & (1 << i))
-			r += sprintf(buf + r, "%s ", trace_options[i]);
+			r += sprintf(buf + r, "%s\n", trace_options[i]);
 		else
-			r += sprintf(buf + r, "no%s ", trace_options[i]);
+			r += sprintf(buf + r, "no%s\n", trace_options[i]);
 	}
 
 	for (i = 0; trace_opts[i].name; i++) {
 		if (tracer_flags & trace_opts[i].bit)
-			r += sprintf(buf + r, "%s ",
+			r += sprintf(buf + r, "%s\n",
 				trace_opts[i].name);
 		else
-			r += sprintf(buf + r, "no%s ",
+			r += sprintf(buf + r, "no%s\n",
 				trace_opts[i].name);
 	}
 	mutex_unlock(&trace_types_lock);
 
-	r += sprintf(buf + r, "\n");
 	WARN_ON(r >= len + 2);
 
 	r = simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
-- 
cgit v1.2.3


From 54e991242850edc8c53f71fa5aa3ba7a93ce38f5 Mon Sep 17 00:00:00 2001
From: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Date: Fri, 27 Feb 2009 15:13:54 +0530
Subject: sched: don't allow setuid to succeed if the user does not have rt
 bandwidth

Impact: fix hung task with certain (non-default) rt-limit settings

Corey Hickey reported that on using setuid to change the uid of a
rt process, the process would be unkillable and not be running.
This is because there was no rt runtime for that user group. Add
in a check to see if a user can attach an rt task to its task group.
On failure, return EINVAL, which is also returned in
CONFIG_CGROUP_SCHED.

Reported-by: Corey Hickey <bugfood-ml@fatooh.org>
Signed-off-by: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h |  4 ++++
 kernel/sched.c        | 13 +++++++++++--
 kernel/sys.c          | 31 ++++++++++++++++++++-----------
 kernel/user.c         | 18 ++++++++++++++++++
 4 files changed, 53 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 8981e52c714f..8c216e057c94 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2291,9 +2291,13 @@ extern long sched_group_rt_runtime(struct task_group *tg);
 extern int sched_group_set_rt_period(struct task_group *tg,
 				      long rt_period_us);
 extern long sched_group_rt_period(struct task_group *tg);
+extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk);
 #endif
 #endif
 
+extern int task_can_switch_user(struct user_struct *up,
+					struct task_struct *tsk);
+
 #ifdef CONFIG_TASK_XACCT
 static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
 {
diff --git a/kernel/sched.c b/kernel/sched.c
index c3baa9653d1d..8e2558c2ba67 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -9224,6 +9224,16 @@ static int sched_rt_global_constraints(void)
 
 	return ret;
 }
+
+int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk)
+{
+	/* Don't accept realtime tasks when there is no way for them to run */
+	if (rt_task(tsk) && tg->rt_bandwidth.rt_runtime == 0)
+		return 0;
+
+	return 1;
+}
+
 #else /* !CONFIG_RT_GROUP_SCHED */
 static int sched_rt_global_constraints(void)
 {
@@ -9317,8 +9327,7 @@ cpu_cgroup_can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
 		      struct task_struct *tsk)
 {
 #ifdef CONFIG_RT_GROUP_SCHED
-	/* Don't accept realtime tasks when there is no way for them to run */
-	if (rt_task(tsk) && cgroup_tg(cgrp)->rt_bandwidth.rt_runtime == 0)
+	if (!sched_rt_can_attach(cgroup_tg(cgrp), tsk))
 		return -EINVAL;
 #else
 	/* We don't support RT-tasks being in separate groups */
diff --git a/kernel/sys.c b/kernel/sys.c
index f145c415bc16..37f458e6882a 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -559,7 +559,7 @@ error:
 	abort_creds(new);
 	return retval;
 }
-  
+
 /*
  * change the user struct in a credentials set to match the new UID
  */
@@ -571,6 +571,11 @@ static int set_user(struct cred *new)
 	if (!new_user)
 		return -EAGAIN;
 
+	if (!task_can_switch_user(new_user, current)) {
+		free_uid(new_user);
+		return -EINVAL;
+	}
+
 	if (atomic_read(&new_user->processes) >=
 				current->signal->rlim[RLIMIT_NPROC].rlim_cur &&
 			new_user != INIT_USER) {
@@ -631,10 +636,11 @@ SYSCALL_DEFINE2(setreuid, uid_t, ruid, uid_t, euid)
 			goto error;
 	}
 
-	retval = -EAGAIN;
-	if (new->uid != old->uid && set_user(new) < 0)
-		goto error;
-
+	if (new->uid != old->uid) {
+		retval = set_user(new);
+		if (retval < 0)
+			goto error;
+	}
 	if (ruid != (uid_t) -1 ||
 	    (euid != (uid_t) -1 && euid != old->uid))
 		new->suid = new->euid;
@@ -680,9 +686,10 @@ SYSCALL_DEFINE1(setuid, uid_t, uid)
 	retval = -EPERM;
 	if (capable(CAP_SETUID)) {
 		new->suid = new->uid = uid;
-		if (uid != old->uid && set_user(new) < 0) {
-			retval = -EAGAIN;
-			goto error;
+		if (uid != old->uid) {
+			retval = set_user(new);
+			if (retval < 0)
+				goto error;
 		}
 	} else if (uid != old->uid && uid != new->suid) {
 		goto error;
@@ -734,11 +741,13 @@ SYSCALL_DEFINE3(setresuid, uid_t, ruid, uid_t, euid, uid_t, suid)
 			goto error;
 	}
 
-	retval = -EAGAIN;
 	if (ruid != (uid_t) -1) {
 		new->uid = ruid;
-		if (ruid != old->uid && set_user(new) < 0)
-			goto error;
+		if (ruid != old->uid) {
+			retval = set_user(new);
+			if (retval < 0)
+				goto error;
+		}
 	}
 	if (euid != (uid_t) -1)
 		new->euid = euid;
diff --git a/kernel/user.c b/kernel/user.c
index 3551ac742395..6a9b696128c8 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -362,6 +362,24 @@ static void free_user(struct user_struct *up, unsigned long flags)
 
 #endif
 
+#if defined(CONFIG_RT_GROUP_SCHED) && defined(CONFIG_USER_SCHED)
+/*
+ * We need to check if a setuid can take place. This function should be called
+ * before successfully completing the setuid.
+ */
+int task_can_switch_user(struct user_struct *up, struct task_struct *tsk)
+{
+
+	return sched_rt_can_attach(up->tg, tsk);
+
+}
+#else
+int task_can_switch_user(struct user_struct *up, struct task_struct *tsk)
+{
+	return 1;
+}
+#endif
+
 /*
  * Locate the user_struct for the passed UID.  If found, take a ref on it.  The
  * caller must undo that ref with free_uid().
-- 
cgit v1.2.3


From 0cfe82451dfa3ebf4e69158f2eb450f2fbb6b715 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 27 Feb 2009 10:51:10 -0500
Subject: tracing: replace kzalloc with kcalloc

Impact: clean up

kcalloc is a better approach to allocate a NULL array.

Reported-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 5db7485158df..9c5987aca74b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3328,7 +3328,7 @@ create_trace_option_files(struct tracer *tracer)
 	for (cnt = 0; opts[cnt].name; cnt++)
 		;
 
-	topts = kzalloc(sizeof(*topts) * (cnt + 1), GFP_KERNEL);
+	topts = kcalloc(cnt + 1, sizeof(*topts), GFP_KERNEL);
 	if (!topts)
 		return NULL;
 
-- 
cgit v1.2.3


From 5170836679185357dc1b7660bad13287b39e1e33 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 27 Feb 2009 14:03:03 -0800
Subject: Fix recursive lock in free_uid()/free_user_ns()

free_uid() and free_user_ns() are corecursive when CONFIG_USER_SCHED=n,
but free_user_ns() is called from free_uid() by way of uid_hash_remove(),
which requires uidhash_lock to be held.  free_user_ns() then calls
free_uid() to complete the destruction.

Fix this by deferring the destruction of the user_namespace.

Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/user_namespace.h |  1 +
 kernel/user_namespace.c        | 21 +++++++++++++++++----
 2 files changed, 18 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h
index 315bcd375224..cc4f45361dbb 100644
--- a/include/linux/user_namespace.h
+++ b/include/linux/user_namespace.h
@@ -13,6 +13,7 @@ struct user_namespace {
 	struct kref		kref;
 	struct hlist_head	uidhash_table[UIDHASH_SZ];
 	struct user_struct	*creator;
+	struct work_struct	destroyer;
 };
 
 extern struct user_namespace init_user_ns;
diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c
index 79084311ee57..076c7c8215b0 100644
--- a/kernel/user_namespace.c
+++ b/kernel/user_namespace.c
@@ -60,12 +60,25 @@ int create_user_ns(struct cred *new)
 	return 0;
 }
 
-void free_user_ns(struct kref *kref)
+/*
+ * Deferred destructor for a user namespace.  This is required because
+ * free_user_ns() may be called with uidhash_lock held, but we need to call
+ * back to free_uid() which will want to take the lock again.
+ */
+static void free_user_ns_work(struct work_struct *work)
 {
-	struct user_namespace *ns;
-
-	ns = container_of(kref, struct user_namespace, kref);
+	struct user_namespace *ns =
+		container_of(work, struct user_namespace, destroyer);
 	free_uid(ns->creator);
 	kfree(ns);
 }
+
+void free_user_ns(struct kref *kref)
+{
+	struct user_namespace *ns =
+		container_of(kref, struct user_namespace, kref);
+
+	INIT_WORK(&ns->destroyer, free_user_ns_work);
+	schedule_work(&ns->destroyer);
+}
 EXPORT_SYMBOL(free_user_ns);
-- 
cgit v1.2.3


From eb594e45f6979cd10b18d87f7b3f02119e00a108 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 27 Feb 2009 17:36:06 -0500
Subject: tracing: move trace point formats to files in include/trace directory

Impact: clean up

To further facilitate the ease of adding trace points for developers, this
patch creates include/trace/trace_events.h and
include/trace/trace_event_types.h.

The former file will hold the trace/<type>.h files and the latter will hold
the trace/<type>_event_types.h files.

To create new tracepoints and to have them automatically
appear in the event tracer, a developer makes the trace/<type>.h file
which includes <linux/tracepoint.h> and the trace/<type>_event_types.h file.

The trace/<type>_event_types.h file will hold the TRACE_FORMAT
macros.

Then add the trace/<type>.h file to trace/trace_events.h,
and add the trace/<type>_event_types.h to the trace_event_types.h file.

No need to modify files elsewhere.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 include/trace/trace_event_types.h |  4 ++++
 include/trace/trace_events.h      |  4 ++++
 kernel/trace/events.c             | 10 ++--------
 3 files changed, 10 insertions(+), 8 deletions(-)
 create mode 100644 include/trace/trace_event_types.h
 create mode 100644 include/trace/trace_events.h

(limited to 'kernel')

diff --git a/include/trace/trace_event_types.h b/include/trace/trace_event_types.h
new file mode 100644
index 000000000000..33c8ed5ccb6c
--- /dev/null
+++ b/include/trace/trace_event_types.h
@@ -0,0 +1,4 @@
+/* trace/<type>_event_types.h here */
+
+#include <trace/sched_event_types.h>
+#include <trace/irq_event_types.h>
diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h
new file mode 100644
index 000000000000..ea2ef2051762
--- /dev/null
+++ b/include/trace/trace_events.h
@@ -0,0 +1,4 @@
+/* trace/<type>.h here */
+
+#include <trace/sched.h>
+#include <trace/irq.h>
diff --git a/kernel/trace/events.c b/kernel/trace/events.c
index 3c75623893cc..46e27ad2487e 100644
--- a/kernel/trace/events.c
+++ b/kernel/trace/events.c
@@ -1,15 +1,9 @@
 /*
  * This is the place to register all trace points as events.
- * Include the trace/<type>.h at the top.
- * Include the trace/<type>_event_types.h at the bottom.
  */
 
-/* trace/<type>.h here */
-#include <trace/sched.h>
-#include <trace/irq.h>
+#include <trace/trace_events.h>
 
 #include "trace_events.h"
 
-/* trace/<type>_event_types.h here */
-#include <trace/sched_event_types.h>
-#include <trace/irq_event_types.h>
+#include <trace/trace_event_types.h>
-- 
cgit v1.2.3


From 6ecc2d1ca39177edb6fbdb7412948b0e9f409d02 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 27 Feb 2009 21:33:02 -0500
Subject: tracing: add subsystem level to trace events

If a trace point header defines TRACE_SYSTEM, then it will add the
following trace points into that event system.

If include/trace/irq_event_types.h has:

 #define TRACE_SYSTEM irq

at the top and

 #undef TRACE_SYSTEM

at the bottom, then a directory "irq" will be created in the
/debug/tracing/events directory. Inside that directory will contain the
two trace points that are defined in include/trace/irq_event_types.h.

Only adding the above to irq and not to sched, we get:

 # ls /debug/tracing/events/
irq                     sched_process_exit  sched_signal_send  sched_wakeup_new
sched_kthread_stop      sched_process_fork  sched_switch
sched_kthread_stop_ret  sched_process_free  sched_wait_task
sched_migrate_task      sched_process_wait  sched_wakeup

 # ls /debug/tracing/events/irq
irq_handler_entry  irq_handler_exit

If we add #define TRACE_SYSTEM sched to the trace/sched_event_types.h
then the rest of the trace events will be put in a sched directory
within the events directory.

I've been playing with this idea of the subsystem for a while, but
recently Tom Zanussi posted some patches to lkml that included this
method. Tom's approach was clean and got me to finally put some effort
to clean up the event trace points.

Thanks to Tom Zanussi for demonstrating how nice the subsystem
method is.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/events.c       |  4 ++++
 kernel/trace/trace_events.c | 48 +++++++++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace_events.h |  2 ++
 3 files changed, 54 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/events.c b/kernel/trace/events.c
index 46e27ad2487e..4e4e45860c58 100644
--- a/kernel/trace/events.c
+++ b/kernel/trace/events.c
@@ -2,6 +2,10 @@
  * This is the place to register all trace points as events.
  */
 
+/* someday this needs to go in a generic header */
+#define __STR(x) #x
+#define STR(x) __STR(x)
+
 #include <trace/trace_events.h>
 
 #include "trace_events.h"
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 3bcb9df93342..19332200c457 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -345,11 +345,59 @@ static struct dentry *event_trace_events_dir(void)
 	return d_events;
 }
 
+struct event_subsystem {
+	struct list_head	list;
+	const char		*name;
+	struct dentry		*entry;
+};
+
+static LIST_HEAD(event_subsystems);
+
+static struct dentry *
+event_subsystem_dir(const char *name, struct dentry *d_events)
+{
+	struct event_subsystem *system;
+
+	/* First see if we did not already create this dir */
+	list_for_each_entry(system, &event_subsystems, list) {
+		if (strcmp(system->name, name) == 0)
+			return system->entry;
+	}
+
+	/* need to create new entry */
+	system = kmalloc(sizeof(*system), GFP_KERNEL);
+	if (!system) {
+		pr_warning("No memory to create event subsystem %s\n",
+			   name);
+		return d_events;
+	}
+
+	system->entry = debugfs_create_dir(name, d_events);
+	if (!system->entry) {
+		pr_warning("Could not create event subsystem %s\n",
+			   name);
+		kfree(system);
+		return d_events;
+	}
+
+	system->name = name;
+	list_add(&system->list, &event_subsystems);
+
+	return system->entry;
+}
+
 static int
 event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
 {
 	struct dentry *entry;
 
+	/*
+	 * If the trace point header did not define TRACE_SYSTEM
+	 * then the system would be called "TRACE_SYSTEM".
+	 */
+	if (strcmp(call->system, "TRACE_SYSTEM") != 0)
+		d_events = event_subsystem_dir(call->system, d_events);
+
 	call->dir = debugfs_create_dir(call->name, d_events);
 	if (!call->dir) {
 		pr_warning("Could not create debugfs "
diff --git a/kernel/trace/trace_events.h b/kernel/trace/trace_events.h
index deb95e5006c8..b015d7b19878 100644
--- a/kernel/trace/trace_events.h
+++ b/kernel/trace/trace_events.h
@@ -7,6 +7,7 @@
 
 struct ftrace_event_call {
 	char		*name;
+	char		*system;
 	struct dentry	*dir;
 	int		enabled;
 	int		(*regfunc)(void);
@@ -44,6 +45,7 @@ static struct ftrace_event_call __used					\
 __attribute__((__aligned__(4)))						\
 __attribute__((section("_ftrace_events"))) event_##call = {		\
 	.name 			= #call,				\
+	.system			= STR(TRACE_SYSTEM),			\
 	.regfunc		= ftrace_reg_event_##call,		\
 	.unregfunc		= ftrace_unreg_event_##call,		\
 }
-- 
cgit v1.2.3


From b628b3e629b1436710e59a21cc020fbb04a52ce1 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 27 Feb 2009 23:32:58 -0500
Subject: tracing: make the set_event and available_events subsystem aware

This patch makes the event files, set_event and available_events
aware of the subsystem.

Now you can enable an entire subsystem with:

  echo 'irq:*' > set_event

Note: the '*' is not needed.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_events.c | 43 ++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 40 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 19332200c457..b811eb343522 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -12,6 +12,8 @@
 
 #include "trace_events.h"
 
+#define TRACE_SYSTEM "TRACE_SYSTEM"
+
 #define events_for_each(event)						\
 	for (event = __start_ftrace_events;				\
 	     (unsigned long)event < (unsigned long)__stop_ftrace_events; \
@@ -45,14 +47,47 @@ static void ftrace_clear_events(void)
 static int ftrace_set_clr_event(char *buf, int set)
 {
 	struct ftrace_event_call *call = __start_ftrace_events;
+	char *event = NULL, *sub = NULL, *match;
+	int ret = -EINVAL;
+
+	/*
+	 * The buf format can be <subsystem>:<event-name>
+	 *  *:<event-name> means any event by that name.
+	 *  :<event-name> is the same.
+	 *
+	 *  <subsystem>:* means all events in that subsystem
+	 *  <subsystem>: means the same.
+	 *
+	 *  <name> (no ':') means all events in a subsystem with
+	 *  the name <name> or any event that matches <name>
+	 */
+
+	match = strsep(&buf, ":");
+	if (buf) {
+		sub = match;
+		event = buf;
+		match = NULL;
 
+		if (!strlen(sub) || strcmp(sub, "*") == 0)
+			sub = NULL;
+		if (!strlen(event) || strcmp(event, "*") == 0)
+			event = NULL;
+	}
 
 	events_for_each(call) {
 
 		if (!call->name)
 			continue;
 
-		if (strcmp(buf, call->name) != 0)
+		if (match &&
+		    strcmp(match, call->name) != 0 &&
+		    strcmp(match, call->system) != 0)
+			continue;
+
+		if (sub && strcmp(sub, call->system) != 0)
+			continue;
+
+		if (event && strcmp(event, call->name) != 0)
 			continue;
 
 		if (set) {
@@ -68,9 +103,9 @@ static int ftrace_set_clr_event(char *buf, int set)
 			call->enabled = 0;
 			call->unregfunc();
 		}
-		return 0;
+		ret = 0;
 	}
-	return -EINVAL;
+	return ret;
 }
 
 /* 128 should be much more than enough */
@@ -200,6 +235,8 @@ static int t_show(struct seq_file *m, void *v)
 {
 	struct ftrace_event_call *call = v;
 
+	if (strcmp(call->system, TRACE_SYSTEM) != 0)
+		seq_printf(m, "%s:", call->system);
 	seq_printf(m, "%s\n", call->name);
 
 	return 0;
-- 
cgit v1.2.3


From ef5580d0fffce6e0a01043bac0625128b5d409a7 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 27 Feb 2009 19:38:04 -0500
Subject: tracing: add interface to write into current tracer buffer

Right now all tracers must manage their own trace buffers. This was
to enforce tracers to be independent in case we finally decide to
allow each tracer to have their own trace buffer.

But now we are adding event tracing that writes to the current tracer's
buffer. This adds an interface to allow events to write to the current
tracer buffer without having to manage its own. Since event tracing
has no "tracer", and is just a way to hook into any other tracer.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c | 14 ++++++++++++++
 kernel/trace/trace.h |  6 ++++++
 2 files changed, 20 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 9c5987aca74b..c5e39cd7310d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -846,6 +846,20 @@ void trace_buffer_unlock_commit(struct trace_array *tr,
 	trace_wake_up();
 }
 
+struct ring_buffer_event *
+trace_current_buffer_lock_reserve(unsigned char type, unsigned long len,
+				  unsigned long flags, int pc)
+{
+	return trace_buffer_lock_reserve(&global_trace,
+					 type, len, flags, pc);
+}
+
+void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
+					unsigned long flags, int pc)
+{
+	return trace_buffer_unlock_commit(&global_trace, event, flags, pc);
+}
+
 void
 trace_function(struct trace_array *tr,
 	       unsigned long ip, unsigned long parent_ip, unsigned long flags,
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 632191770aac..adf161f6dd11 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -442,6 +442,12 @@ void trace_buffer_unlock_commit(struct trace_array *tr,
 				struct ring_buffer_event *event,
 				unsigned long flags, int pc);
 
+struct ring_buffer_event *
+trace_current_buffer_lock_reserve(unsigned char type, unsigned long len,
+				  unsigned long flags, int pc);
+void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
+					unsigned long flags, int pc);
+
 struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
 						struct trace_array_cpu *data);
 
-- 
cgit v1.2.3


From c32e827b25054cb17b79cf97fb5e63ae4ce2223c Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 27 Feb 2009 19:12:30 -0500
Subject: tracing: add raw trace point recording infrastructure

Impact: lower overhead tracing

The current event tracer can automatically pick up trace points
that are registered with the TRACE_FORMAT macro. But it required
a printf format string and parsing. Although, this adds the ability
to get guaranteed information like task names and such, it took
a hit in overhead processing. This processing can add about 500-1000
nanoseconds overhead, but in some cases that too is considered
too much and we want to shave off as much from this overhead as
possible.

Tom Zanussi recently posted tracing patches to lkml that are based
on a nice idea about capturing the data via C structs using
STRUCT_ENTER, STRUCT_EXIT type of macros.

I liked that method very much, but did not like the implementation
that required a developer to add data/code in several disjoint
locations.

This patch extends the event_tracer macros to do a similar "raw C"
approach that Tom Zanussi did. But instead of having the developers
needing to tweak a bunch of code all over the place, they can do it
all in one macro - preferably placed near the code that it is
tracing. That makes it much more likely that tracepoints will be
maintained on an ongoing basis by the code they modify.

The new macro TRACE_EVENT_FORMAT is created for this approach. (Note,
a developer may still utilize the more low level DECLARE_TRACE macros
if they don't care about getting their traces automatically in the event
tracer.)

They can also use the existing TRACE_FORMAT if they don't need to code
the tracepoint in C, but just want to use the convenience of printf.

So if the developer wants to "hardwire" a tracepoint in the fastest
possible way, and wants to acquire their data via a user space utility
in a raw binary format, or wants to see it in the trace output but not
sacrifice any performance, then they can implement the faster but
more complex TRACE_EVENT_FORMAT macro.

Here's what usage looks like:

  TRACE_EVENT_FORMAT(name,
	TPPROTO(proto),
	TPARGS(args),
	TPFMT(fmt, fmt_args),
	TRACE_STUCT(
		TRACE_FIELD(type1, item1, assign1)
		TRACE_FIELD(type2, item2, assign2)
			[...]
	),
	TPRAWFMT(raw_fmt)
	);

Note name, proto, args, and fmt, are all identical to what TRACE_FORMAT
uses.

 name: is the unique identifier of the trace point
 proto: The proto type that the trace point uses
 args: the args in the proto type
 fmt: printf format to use with the event printf tracer
 fmt_args: the printf argments to match fmt

 TRACE_STRUCT starts the ability to create a structure.
 Each item in the structure is defined with a TRACE_FIELD

  TRACE_FIELD(type, item, assign)

 type: the C type of item.
 item: the name of the item in the stucture
 assign: what to assign the item in the trace point callback

 raw_fmt is a way to pretty print the struct. It must match
  the order of the items are added in TRACE_STUCT

 An example of this would be:

 TRACE_EVENT_FORMAT(sched_wakeup,
	TPPROTO(struct rq *rq, struct task_struct *p, int success),
	TPARGS(rq, p, success),
	TPFMT("task %s:%d %s",
	      p->comm, p->pid, success?"succeeded":"failed"),
	TRACE_STRUCT(
		TRACE_FIELD(pid_t, pid, p->pid)
		TRACE_FIELD(int, success, success)
	),
	TPRAWFMT("task %d success=%d")
	);

 This creates us a unique struct of:

 struct {
	pid_t		pid;
	int		success;
 };

 And the way the call back would assign these values would be:

	entry->pid = p->pid;
	entry->success = success;

The nice part about this is that the creation of the assignent is done
via macro magic in the event tracer.  Once the TRACE_EVENT_FORMAT is
created, the developer will then have a faster method to record
into the ring buffer. They do not need to worry about the tracer itself.

The developer would only need to touch the files in include/trace/*.h

Again, I would like to give special thanks to Tom Zanussi for this
nice idea.

Idea-from: Tom Zanussi <tzanussi@gmail.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/events.c               |   6 +-
 kernel/trace/trace.h                |  19 ++++
 kernel/trace/trace_events.c         |   2 +-
 kernel/trace/trace_events.h         |  57 ----------
 kernel/trace/trace_events_stage_1.h |  34 ++++++
 kernel/trace/trace_events_stage_2.h |  72 ++++++++++++
 kernel/trace/trace_events_stage_3.h | 219 ++++++++++++++++++++++++++++++++++++
 7 files changed, 350 insertions(+), 59 deletions(-)
 delete mode 100644 kernel/trace/trace_events.h
 create mode 100644 kernel/trace/trace_events_stage_1.h
 create mode 100644 kernel/trace/trace_events_stage_2.h
 create mode 100644 kernel/trace/trace_events_stage_3.h

(limited to 'kernel')

diff --git a/kernel/trace/events.c b/kernel/trace/events.c
index 4e4e45860c58..f2509cbaacea 100644
--- a/kernel/trace/events.c
+++ b/kernel/trace/events.c
@@ -8,6 +8,10 @@
 
 #include <trace/trace_events.h>
 
-#include "trace_events.h"
+#include "trace_output.h"
+
+#include "trace_events_stage_1.h"
+#include "trace_events_stage_2.h"
+#include "trace_events_stage_3.h"
 
 #include <trace/trace_event_types.h>
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index adf161f6dd11..aa1ab0cb80ab 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -726,4 +726,23 @@ static inline void trace_branch_disable(void)
 }
 #endif /* CONFIG_BRANCH_TRACER */
 
+struct ftrace_event_call {
+	char		*name;
+	char		*system;
+	struct dentry	*dir;
+	int		enabled;
+	int		(*regfunc)(void);
+	void		(*unregfunc)(void);
+	int		id;
+	struct dentry	*raw_dir;
+	int		raw_enabled;
+	int		(*raw_init)(void);
+	int		(*raw_reg)(void);
+	void		(*raw_unreg)(void);
+};
+
+void event_trace_printk(unsigned long ip, const char *fmt, ...);
+extern struct ftrace_event_call __start_ftrace_events[];
+extern struct ftrace_event_call __stop_ftrace_events[];
+
 #endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index b811eb343522..77a5c02bd634 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -10,7 +10,7 @@
 #include <linux/module.h>
 #include <linux/ctype.h>
 
-#include "trace_events.h"
+#include "trace.h"
 
 #define TRACE_SYSTEM "TRACE_SYSTEM"
 
diff --git a/kernel/trace/trace_events.h b/kernel/trace/trace_events.h
deleted file mode 100644
index b015d7b19878..000000000000
--- a/kernel/trace/trace_events.h
+++ /dev/null
@@ -1,57 +0,0 @@
-#ifndef _LINUX_KERNEL_TRACE_EVENTS_H
-#define _LINUX_KERNEL_TRACE_EVENTS_H
-
-#include <linux/debugfs.h>
-#include <linux/ftrace.h>
-#include "trace.h"
-
-struct ftrace_event_call {
-	char		*name;
-	char		*system;
-	struct dentry	*dir;
-	int		enabled;
-	int		(*regfunc)(void);
-	void		(*unregfunc)(void);
-};
-
-
-#undef TPFMT
-#define TPFMT(fmt, args...)	fmt "\n", ##args
-
-#undef TRACE_FORMAT
-#define TRACE_FORMAT(call, proto, args, fmt)				\
-static void ftrace_event_##call(proto)					\
-{									\
-	event_trace_printk(_RET_IP_, "(" #call ") " fmt);		\
-}									\
-									\
-static int ftrace_reg_event_##call(void)				\
-{									\
-	int ret;							\
-									\
-	ret = register_trace_##call(ftrace_event_##call);		\
-	if (!ret)							\
-		pr_info("event trace: Could not activate trace point "	\
-			"probe to " #call);				\
-	return ret;							\
-}									\
-									\
-static void ftrace_unreg_event_##call(void)				\
-{									\
-	unregister_trace_##call(ftrace_event_##call);			\
-}									\
-									\
-static struct ftrace_event_call __used					\
-__attribute__((__aligned__(4)))						\
-__attribute__((section("_ftrace_events"))) event_##call = {		\
-	.name 			= #call,				\
-	.system			= STR(TRACE_SYSTEM),			\
-	.regfunc		= ftrace_reg_event_##call,		\
-	.unregfunc		= ftrace_unreg_event_##call,		\
-}
-
-void event_trace_printk(unsigned long ip, const char *fmt, ...);
-extern struct ftrace_event_call __start_ftrace_events[];
-extern struct ftrace_event_call __stop_ftrace_events[];
-
-#endif /* _LINUX_KERNEL_TRACE_EVENTS_H */
diff --git a/kernel/trace/trace_events_stage_1.h b/kernel/trace/trace_events_stage_1.h
new file mode 100644
index 000000000000..fd3bf9382d37
--- /dev/null
+++ b/kernel/trace/trace_events_stage_1.h
@@ -0,0 +1,34 @@
+/*
+ * Stage 1 of the trace events.
+ *
+ * Override the macros in <trace/trace_event_types.h> to include the following:
+ *
+ * struct ftrace_raw_<call> {
+ *	struct trace_entry		ent;
+ *	<type>				<item>;
+ *	[...]
+ * };
+ *
+ * The <type> <item> is created by the TRACE_FIELD(type, item, assign)
+ * macro. We simply do "type item;", and that will create the fields
+ * in the structure.
+ */
+
+#undef TRACE_FORMAT
+#define TRACE_FORMAT(call, proto, args, fmt)
+
+#undef TRACE_EVENT_FORMAT
+#define TRACE_EVENT_FORMAT(name, proto, args, fmt, tstruct, tpfmt)	\
+	struct ftrace_raw_##name {					\
+		struct trace_entry	ent;				\
+		tstruct							\
+	};								\
+	static struct ftrace_event_call event_##name
+
+#undef TRACE_STRUCT
+#define TRACE_STRUCT(args...) args
+
+#define TRACE_FIELD(type, item, assign) \
+	type item;
+
+#include <trace/trace_event_types.h>
diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h
new file mode 100644
index 000000000000..3eaaef5f19e1
--- /dev/null
+++ b/kernel/trace/trace_events_stage_2.h
@@ -0,0 +1,72 @@
+/*
+ * Stage 2 of the trace events.
+ *
+ * Override the macros in <trace/trace_event_types.h> to include the following:
+ *
+ * enum print_line_t
+ * ftrace_raw_output_<call>(struct trace_iterator *iter, int flags)
+ * {
+ *	struct trace_seq *s = &iter->seq;
+ *	struct ftrace_raw_<call> *field; <-- defined in stage 1
+ *	struct trace_entry *entry;
+ *	int ret;
+ *
+ *	entry = iter->ent;
+ *
+ *	if (entry->type != event_<call>.id) {
+ *		WARN_ON_ONCE(1);
+ *		return TRACE_TYPE_UNHANDLED;
+ *	}
+ *
+ *	field = (typeof(field))entry;
+ *
+ *	ret = trace_seq_printf(s, <TPRAWFMT> "%s", <ARGS> "\n");
+ *	if (!ret)
+ *		return TRACE_TYPE_PARTIAL_LINE;
+ *
+ *	return TRACE_TYPE_HANDLED;
+ * }
+ *
+ * This is the method used to print the raw event to the trace
+ * output format. Note, this is not needed if the data is read
+ * in binary.
+ */
+
+#undef TRACE_STRUCT
+#define TRACE_STRUCT(args...) args
+
+#undef TRACE_FIELD
+#define TRACE_FIELD(type, item, assign) \
+	field->item,
+
+
+#undef TPRAWFMT
+#define TPRAWFMT(args...)	args
+
+#undef TRACE_EVENT_FORMAT
+#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)	\
+enum print_line_t							\
+ftrace_raw_output_##call(struct trace_iterator *iter, int flags)	\
+{									\
+	struct trace_seq *s = &iter->seq;				\
+	struct ftrace_raw_##call *field;				\
+	struct trace_entry *entry;					\
+	int ret;							\
+									\
+	entry = iter->ent;						\
+									\
+	if (entry->type != event_##call.id) {				\
+		WARN_ON_ONCE(1);					\
+		return TRACE_TYPE_UNHANDLED;				\
+	}								\
+									\
+	field = (typeof(field))entry;					\
+									\
+	ret = trace_seq_printf(s, tpfmt "%s", tstruct "\n");		\
+	if (!ret)							\
+		return TRACE_TYPE_PARTIAL_LINE;				\
+									\
+	return TRACE_TYPE_HANDLED;					\
+}
+
+#include <trace/trace_event_types.h>
diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h
new file mode 100644
index 000000000000..7a161c49deb4
--- /dev/null
+++ b/kernel/trace/trace_events_stage_3.h
@@ -0,0 +1,219 @@
+/*
+ * Stage 3 of the trace events.
+ *
+ * Override the macros in <trace/trace_event_types.h> to include the following:
+ *
+ * static void ftrace_event_<call>(proto)
+ * {
+ * 	event_trace_printk(_RET_IP_, "(<call>) " <fmt>);
+ * }
+ *
+ * static int ftrace_reg_event_<call>(void)
+ * {
+ * 	int ret;
+ *
+ * 	ret = register_trace_<call>(ftrace_event_<call>);
+ * 	if (!ret)
+ * 		pr_info("event trace: Could not activate trace point "
+ * 			"probe to  <call>");
+ * 	return ret;
+ * }
+ *
+ * static void ftrace_unreg_event_<call>(void)
+ * {
+ * 	unregister_trace_<call>(ftrace_event_<call>);
+ * }
+ *
+ * For those macros defined with TRACE_FORMAT:
+ *
+ * static struct ftrace_event_call __used
+ * __attribute__((__aligned__(4)))
+ * __attribute__((section("_ftrace_events"))) event_<call> = {
+ * 	.name 			= "<call>",
+ * 	.regfunc		= ftrace_reg_event_<call>,
+ * 	.unregfunc		= ftrace_unreg_event_<call>,
+ * }
+ *
+ *
+ * For those macros defined with TRACE_EVENT_FORMAT:
+ *
+ * static struct ftrace_event_call event_<call>;
+ *
+ * static void ftrace_raw_event_<call>(proto)
+ * {
+ * 	struct ring_buffer_event *event;
+ * 	struct ftrace_raw_<call> *entry; <-- defined in stage 1
+ * 	unsigned long irq_flags;
+ * 	int pc;
+ *
+ * 	local_save_flags(irq_flags);
+ * 	pc = preempt_count();
+ *
+ * 	event = trace_current_buffer_lock_reserve(event_<call>.id,
+ * 				  sizeof(struct ftrace_raw_<call>),
+ * 				  irq_flags, pc);
+ * 	if (!event)
+ * 		return;
+ * 	entry	= ring_buffer_event_data(event);
+ *
+ * 	<tstruct>;  <-- Here we assign the entries by the TRACE_FIELD.
+ *
+ * 	trace_current_buffer_unlock_commit(event, irq_flags, pc);
+ * }
+ *
+ * static int ftrace_raw_reg_event_<call>(void)
+ * {
+ * 	int ret;
+ *
+ * 	ret = register_trace_<call>(ftrace_raw_event_<call>);
+ * 	if (!ret)
+ * 		pr_info("event trace: Could not activate trace point "
+ * 			"probe to <call>");
+ * 	return ret;
+ * }
+ *
+ * static void ftrace_unreg_event_<call>(void)
+ * {
+ * 	unregister_trace_<call>(ftrace_raw_event_<call>);
+ * }
+ *
+ * static struct trace_event ftrace_event_type_<call> = {
+ * 	.trace			= ftrace_raw_output_<call>, <-- stage 2
+ * };
+ *
+ * static int ftrace_raw_init_event_<call>(void)
+ * {
+ * 	int id;
+ *
+ * 	id = register_ftrace_event(&ftrace_event_type_<call>);
+ * 	if (!id)
+ * 		return -ENODEV;
+ * 	event_<call>.id = id;
+ * 	return 0;
+ * }
+ *
+ * static struct ftrace_event_call __used
+ * __attribute__((__aligned__(4)))
+ * __attribute__((section("_ftrace_events"))) event_<call> = {
+ * 	.name 			= "<call>",
+ * 	.regfunc		= ftrace_reg_event_<call>,
+ * 	.unregfunc		= ftrace_unreg_event_<call>,
+ * 	.raw_init		= ftrace_raw_init_event_<call>,
+ * 	.raw_reg		= ftrace_raw_reg_event_<call>,
+ * 	.raw_unreg		= ftrace_raw_unreg_event_<call>,
+ * }
+ *
+ */
+
+#undef TPFMT
+#define TPFMT(fmt, args...)	fmt "\n", ##args
+
+#define _TRACE_FORMAT(call, proto, args, fmt)				\
+static void ftrace_event_##call(proto)					\
+{									\
+	event_trace_printk(_RET_IP_, "(" #call ") " fmt);		\
+}									\
+									\
+static int ftrace_reg_event_##call(void)				\
+{									\
+	int ret;							\
+									\
+	ret = register_trace_##call(ftrace_event_##call);		\
+	if (!ret)							\
+		pr_info("event trace: Could not activate trace point "	\
+			"probe to " #call);				\
+	return ret;							\
+}									\
+									\
+static void ftrace_unreg_event_##call(void)				\
+{									\
+	unregister_trace_##call(ftrace_event_##call);			\
+}									\
+
+
+#undef TRACE_FORMAT
+#define TRACE_FORMAT(call, proto, args, fmt)				\
+_TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt))		\
+static struct ftrace_event_call __used					\
+__attribute__((__aligned__(4)))						\
+__attribute__((section("_ftrace_events"))) event_##call = {		\
+	.name 			= #call,				\
+	.system			= STR(TRACE_SYSTEM),			\
+	.regfunc		= ftrace_reg_event_##call,		\
+	.unregfunc		= ftrace_unreg_event_##call,		\
+}
+
+#undef TRACE_FIELD
+#define TRACE_FIELD(type, item, assign)\
+	entry->item = assign;
+
+#undef TRACE_EVENT_FORMAT
+#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)	\
+_TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt))		\
+									\
+static struct ftrace_event_call event_##call;				\
+									\
+static void ftrace_raw_event_##call(proto)				\
+{									\
+	struct ring_buffer_event *event;				\
+	struct ftrace_raw_##call *entry;				\
+	unsigned long irq_flags;					\
+	int pc;								\
+									\
+	local_save_flags(irq_flags);					\
+	pc = preempt_count();						\
+									\
+	event = trace_current_buffer_lock_reserve(event_##call.id,	\
+				  sizeof(struct ftrace_raw_##call), 	\
+				  irq_flags, pc);			\
+	if (!event)							\
+		return;							\
+	entry	= ring_buffer_event_data(event);			\
+									\
+	tstruct;							\
+									\
+	trace_current_buffer_unlock_commit(event, irq_flags, pc);	\
+}									\
+									\
+static int ftrace_raw_reg_event_##call(void)				\
+{									\
+	int ret;							\
+									\
+	ret = register_trace_##call(ftrace_raw_event_##call);		\
+	if (!ret)							\
+		pr_info("event trace: Could not activate trace point "	\
+			"probe to " #call);				\
+	return ret;							\
+}									\
+									\
+static void ftrace_raw_unreg_event_##call(void)				\
+{									\
+	unregister_trace_##call(ftrace_raw_event_##call);		\
+}									\
+									\
+static struct trace_event ftrace_event_type_##call = {			\
+	.trace			= ftrace_raw_output_##call,		\
+};									\
+									\
+static int ftrace_raw_init_event_##call(void)				\
+{									\
+	int id;								\
+									\
+	id = register_ftrace_event(&ftrace_event_type_##call);		\
+	if (!id)							\
+		return -ENODEV;						\
+	event_##call.id = id;						\
+	return 0;							\
+}									\
+									\
+static struct ftrace_event_call __used					\
+__attribute__((__aligned__(4)))						\
+__attribute__((section("_ftrace_events"))) event_##call = {		\
+	.name 			= #call,				\
+	.system			= STR(TRACE_SYSTEM),			\
+	.regfunc		= ftrace_reg_event_##call,		\
+	.unregfunc		= ftrace_unreg_event_##call,		\
+	.raw_init		= ftrace_raw_init_event_##call,		\
+	.raw_reg		= ftrace_raw_reg_event_##call,		\
+	.raw_unreg		= ftrace_raw_unreg_event_##call,	\
+}
-- 
cgit v1.2.3


From fd99498989f3b3feeab89dcadf537138ba136d24 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Sat, 28 Feb 2009 02:41:25 -0500
Subject: tracing: add raw fast tracing interface for trace events

This patch adds the interface to enable the C style trace points.
In the directory /debugfs/tracing/events/subsystem/event
We now have three files:

 enable : values 0 or 1 to enable or disable the trace event.

 available_types: values 'raw' and 'printf' which indicate the tracing
       types available for the trace point. If a developer does not
       use the TRACE_EVENT_FORMAT macro and just uses the TRACE_FORMAT
       macro, then only 'printf' will be available. This file is
       read only.

 type: values 'raw' or 'printf'. This indicates which type of tracing
       is active for that trace point. 'printf' is the default and
       if 'raw' is not available, this file is read only.

 # echo raw > /debug/tracing/events/sched/sched_wakeup/type
 # echo 1 > /debug/tracing/events/sched/sched_wakeup/enable

 Will enable the C style tracing for the sched_wakeup trace point.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.h        |   7 ++
 kernel/trace/trace_events.c | 199 ++++++++++++++++++++++++++++++++++++++------
 2 files changed, 181 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index aa1ab0cb80ab..f6fa0b9f83a8 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -726,6 +726,12 @@ static inline void trace_branch_disable(void)
 }
 #endif /* CONFIG_BRANCH_TRACER */
 
+/* trace event type bit fields, not numeric */
+enum {
+	TRACE_EVENT_TYPE_PRINTF		= 1,
+	TRACE_EVENT_TYPE_RAW		= 2,
+};
+
 struct ftrace_event_call {
 	char		*name;
 	char		*system;
@@ -736,6 +742,7 @@ struct ftrace_event_call {
 	int		id;
 	struct dentry	*raw_dir;
 	int		raw_enabled;
+	int		type;
 	int		(*raw_init)(void);
 	int		(*raw_reg)(void);
 	void		(*raw_unreg)(void);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 77a5c02bd634..1d07f800a9ce 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -44,6 +44,36 @@ static void ftrace_clear_events(void)
 	}
 }
 
+static void ftrace_event_enable_disable(struct ftrace_event_call *call,
+					int enable)
+{
+
+	switch (enable) {
+	case 0:
+		if (call->enabled) {
+			call->enabled = 0;
+			call->unregfunc();
+		}
+		if (call->raw_enabled) {
+			call->raw_enabled = 0;
+			call->raw_unreg();
+		}
+		break;
+	case 1:
+		if (!call->enabled &&
+		    (call->type & TRACE_EVENT_TYPE_PRINTF)) {
+			call->enabled = 1;
+			call->regfunc();
+		}
+		if (!call->raw_enabled &&
+		    (call->type & TRACE_EVENT_TYPE_RAW)) {
+			call->raw_enabled = 1;
+			call->raw_reg();
+		}
+		break;
+	}
+}
+
 static int ftrace_set_clr_event(char *buf, int set)
 {
 	struct ftrace_event_call *call = __start_ftrace_events;
@@ -90,19 +120,8 @@ static int ftrace_set_clr_event(char *buf, int set)
 		if (event && strcmp(event, call->name) != 0)
 			continue;
 
-		if (set) {
-			/* Already set? */
-			if (call->enabled)
-				return 0;
-			call->enabled = 1;
-			call->regfunc();
-		} else {
-			/* Already cleared? */
-			if (!call->enabled)
-				return 0;
-			call->enabled = 0;
-			call->unregfunc();
-		}
+		ftrace_event_enable_disable(call, set);
+
 		ret = 0;
 	}
 	return ret;
@@ -273,7 +292,7 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
 	struct ftrace_event_call *call = filp->private_data;
 	char *buf;
 
-	if (call->enabled)
+	if (call->enabled || call->raw_enabled)
 		buf = "1\n";
 	else
 		buf = "0\n";
@@ -304,18 +323,8 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
 
 	switch (val) {
 	case 0:
-		if (!call->enabled)
-			break;
-
-		call->enabled = 0;
-		call->unregfunc();
-		break;
 	case 1:
-		if (call->enabled)
-			break;
-
-		call->enabled = 1;
-		call->regfunc();
+		ftrace_event_enable_disable(call, val);
 		break;
 
 	default:
@@ -327,6 +336,107 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
 	return cnt;
 }
 
+static ssize_t
+event_type_read(struct file *filp, char __user *ubuf, size_t cnt,
+		loff_t *ppos)
+{
+	struct ftrace_event_call *call = filp->private_data;
+	char buf[16];
+	int r = 0;
+
+	if (call->type & TRACE_EVENT_TYPE_PRINTF)
+		r += sprintf(buf, "printf\n");
+
+	if (call->type & TRACE_EVENT_TYPE_RAW)
+		r += sprintf(buf+r, "raw\n");
+
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
+static ssize_t
+event_type_write(struct file *filp, const char __user *ubuf, size_t cnt,
+		 loff_t *ppos)
+{
+	struct ftrace_event_call *call = filp->private_data;
+	char buf[64];
+
+	/*
+	 * If there's only one type, we can't change it.
+	 * And currently we always have printf type, and we
+	 * may or may not have raw type.
+	 *
+	 * This is a redundant check, the file should be read
+	 * only if this is the case anyway.
+	 */
+
+	if (!call->raw_init)
+		return -EPERM;
+
+	if (cnt >= sizeof(buf))
+		return -EINVAL;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+
+	buf[cnt] = 0;
+
+	if (!strncmp(buf, "printf", 6) &&
+	    (!buf[6] || isspace(buf[6]))) {
+
+		call->type = TRACE_EVENT_TYPE_PRINTF;
+
+		/*
+		 * If raw enabled, the disable it and enable
+		 * printf type.
+		 */
+		if (call->raw_enabled) {
+			call->raw_enabled = 0;
+			call->raw_unreg();
+
+			call->enabled = 1;
+			call->regfunc();
+		}
+
+	} else if (!strncmp(buf, "raw", 3) &&
+	    (!buf[3] || isspace(buf[3]))) {
+
+		call->type = TRACE_EVENT_TYPE_RAW;
+
+		/*
+		 * If printf enabled, the disable it and enable
+		 * raw type.
+		 */
+		if (call->enabled) {
+			call->enabled = 0;
+			call->unregfunc();
+
+			call->raw_enabled = 1;
+			call->raw_reg();
+		}
+	} else
+		return -EINVAL;
+
+	*ppos += cnt;
+
+	return cnt;
+}
+
+static ssize_t
+event_available_types_read(struct file *filp, char __user *ubuf, size_t cnt,
+			   loff_t *ppos)
+{
+	struct ftrace_event_call *call = filp->private_data;
+	char buf[16];
+	int r = 0;
+
+	r += sprintf(buf, "printf\n");
+
+	if (call->raw_init)
+		r += sprintf(buf+r, "raw\n");
+
+	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
+}
+
 static const struct seq_operations show_event_seq_ops = {
 	.start = t_start,
 	.next = t_next,
@@ -362,6 +472,17 @@ static const struct file_operations ftrace_enable_fops = {
 	.write = event_enable_write,
 };
 
+static const struct file_operations ftrace_type_fops = {
+	.open = tracing_open_generic,
+	.read = event_type_read,
+	.write = event_type_write,
+};
+
+static const struct file_operations ftrace_available_types_fops = {
+	.open = tracing_open_generic,
+	.read = event_available_types_read,
+};
+
 static struct dentry *event_trace_events_dir(void)
 {
 	static struct dentry *d_tracer;
@@ -427,6 +548,7 @@ static int
 event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
 {
 	struct dentry *entry;
+	int ret;
 
 	/*
 	 * If the trace point header did not define TRACE_SYSTEM
@@ -435,6 +557,18 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
 	if (strcmp(call->system, "TRACE_SYSTEM") != 0)
 		d_events = event_subsystem_dir(call->system, d_events);
 
+	if (call->raw_init) {
+		ret = call->raw_init();
+		if (ret < 0) {
+			pr_warning("Could not initialize trace point"
+				   " events/%s\n", call->name);
+			return ret;
+		}
+	}
+
+	/* default the output to printf */
+	call->type = TRACE_EVENT_TYPE_PRINTF;
+
 	call->dir = debugfs_create_dir(call->name, d_events);
 	if (!call->dir) {
 		pr_warning("Could not create debugfs "
@@ -448,6 +582,21 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
 		pr_warning("Could not create debugfs "
 			   "'%s/enable' entry\n", call->name);
 
+	/* Only let type be writable, if we can change it */
+	entry = debugfs_create_file("type",
+				    call->raw_init ? 0644 : 0444,
+				    call->dir, call,
+				    &ftrace_type_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'%s/type' entry\n", call->name);
+
+	entry = debugfs_create_file("available_types", 0444, call->dir, call,
+				    &ftrace_available_types_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'%s/type' available_types\n", call->name);
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From b67802ea8061393f7bd2d4db934646e76096027c Mon Sep 17 00:00:00 2001
From: Wang Chen <wangchen@cn.fujitsu.com>
Date: Mon, 2 Mar 2009 13:55:26 +0800
Subject: sched: kill unused parameter of pick_next_task()

Impact: micro-optimization

Parameter "prev" is not used really.

Signed-off-by: Wang Chen <wangchen@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index dfae1bf6d5b2..9fe8e17574af 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4603,7 +4603,7 @@ static inline void schedule_debug(struct task_struct *prev)
  * Pick up the highest-prio task:
  */
 static inline struct task_struct *
-pick_next_task(struct rq *rq, struct task_struct *prev)
+pick_next_task(struct rq *rq)
 {
 	const struct sched_class *class;
 	struct task_struct *p;
@@ -4678,7 +4678,7 @@ need_resched_nonpreemptible:
 		idle_balance(cpu, rq);
 
 	prev->sched_class->put_prev_task(rq, prev);
-	next = pick_next_task(rq, prev);
+	next = pick_next_task(rq);
 
 	if (likely(prev != next)) {
 		sched_info_switch(prev, next);
@@ -6514,7 +6514,7 @@ static void migrate_dead_tasks(unsigned int dead_cpu)
 		if (!rq->nr_running)
 			break;
 		update_rq_clock(rq);
-		next = pick_next_task(rq, rq->curr);
+		next = pick_next_task(rq);
 		if (!next)
 			break;
 		next->sched_class->put_prev_task(rq, next);
-- 
cgit v1.2.3


From d20e3b03842bfeb9d21817ff19054c277cc3eac0 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 2 Mar 2009 10:53:15 -0500
Subject: tracing: add TRACE_FIELD_SPECIAL to record complex entries

Tom Zanussi pointed out that the simple TRACE_FIELD was not enough to
record trace data that required memcpy. This patch addresses this issue
by adding a TRACE_FIELD_SPECIAL. The format is similar to TRACE_FIELD
but looks like so:

  TRACE_FIELD_SPECIAL(type_item, item, cmd)

What TRACE_FIELD gave was:

  TRACE_FIELD(type, item, assign)

The TRACE_FIELD would be used in declaring a structure:

  struct {
	type	item;
  };

And later assign it via:

  entry->item = assign;

What TRACE_FIELD_SPECIAL gives us is:

In the declaration of the structure:

  struct {
	type_item;
  };

And the assignment:

  cmd;

This change log will explain the one example used in the patch:

 TRACE_EVENT_FORMAT(sched_switch,
	TPPROTO(struct rq *rq, struct task_struct *prev,
		struct task_struct *next),
	TPARGS(rq, prev, next),
	TPFMT("task %s:%d ==> %s:%d",
	      prev->comm, prev->pid, next->comm, next->pid),
	TRACE_STRUCT(
		TRACE_FIELD(pid_t, prev_pid, prev->pid)
		TRACE_FIELD(int, prev_prio, prev->prio)
		TRACE_FIELD_SPECIAL(char next_comm[TASK_COMM_LEN],
				    next_comm,
				    TPCMD(memcpy(TRACE_ENTRY->next_comm,
						 next->comm,
						 TASK_COMM_LEN)))
		TRACE_FIELD(pid_t, next_pid, next->pid)
		TRACE_FIELD(int, next_prio, next->prio)
	),
	TPRAWFMT("prev %d:%d ==> next %s:%d:%d")
	);

 The struct will be create as:

  struct {
	pid_t		prev_pid;
	int		prev_prio;
	char next_comm[TASK_COMM_LEN];
	pid_t		next_pid;
	int		next_prio;
  };

Note the TRACE_ENTRY in the cmd part of TRACE_SPECIAL. TRACE_ENTRY will
be set by the tracer to point to the structure inside the trace buffer.

  entry->prev_pid	= prev->pid;
  entry->prev_prio	= prev->prio;
  memcpy(entry->next_comm, next->comm, TASK_COMM_LEN);
  entry->next_pid	= next->pid;
  entry->next_prio	= next->prio

Reported-by: Tom Zanussi <tzanussi@gmail.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 include/trace/sched_event_types.h   |  7 ++++++-
 kernel/trace/trace_events_stage_1.h |  2 ++
 kernel/trace/trace_events_stage_2.h |  4 ++++
 kernel/trace/trace_events_stage_3.h | 14 ++++++++++++++
 4 files changed, 26 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/trace/sched_event_types.h b/include/trace/sched_event_types.h
index ba059c10b58a..a6de5c1601a0 100644
--- a/include/trace/sched_event_types.h
+++ b/include/trace/sched_event_types.h
@@ -71,10 +71,15 @@ TRACE_EVENT_FORMAT(sched_switch,
 	TRACE_STRUCT(
 		TRACE_FIELD(pid_t, prev_pid, prev->pid)
 		TRACE_FIELD(int, prev_prio, prev->prio)
+		TRACE_FIELD_SPECIAL(char next_comm[TASK_COMM_LEN],
+				    next_comm,
+				    TPCMD(memcpy(TRACE_ENTRY->next_comm,
+						 next->comm,
+						 TASK_COMM_LEN)))
 		TRACE_FIELD(pid_t, next_pid, next->pid)
 		TRACE_FIELD(int, next_prio, next->prio)
 	),
-	TPRAWFMT("prev %d:%d ==> next %d:%d")
+	TPRAWFMT("prev %d:%d ==> next %s:%d:%d")
 	);
 
 TRACE_EVENT_FORMAT(sched_migrate_task,
diff --git a/kernel/trace/trace_events_stage_1.h b/kernel/trace/trace_events_stage_1.h
index fd3bf9382d37..3830a731424c 100644
--- a/kernel/trace/trace_events_stage_1.h
+++ b/kernel/trace/trace_events_stage_1.h
@@ -30,5 +30,7 @@
 
 #define TRACE_FIELD(type, item, assign) \
 	type item;
+#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \
+	type_item;
 
 #include <trace/trace_event_types.h>
diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h
index 3eaaef5f19e1..dc79fe3a2ecb 100644
--- a/kernel/trace/trace_events_stage_2.h
+++ b/kernel/trace/trace_events_stage_2.h
@@ -39,6 +39,10 @@
 #define TRACE_FIELD(type, item, assign) \
 	field->item,
 
+#undef TRACE_FIELD_SPECIAL
+#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \
+	field->item,
+
 
 #undef TPRAWFMT
 #define TPRAWFMT(args...)	args
diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h
index 7a161c49deb4..2ab65e958223 100644
--- a/kernel/trace/trace_events_stage_3.h
+++ b/kernel/trace/trace_events_stage_3.h
@@ -147,6 +147,20 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
 #define TRACE_FIELD(type, item, assign)\
 	entry->item = assign;
 
+#undef TRACE_FIELD
+#define TRACE_FIELD(type, item, assign)\
+	entry->item = assign;
+
+#undef TPCMD
+#define TPCMD(cmd...)	cmd
+
+#undef TRACE_ENTRY
+#define TRACE_ENTRY	entry
+
+#undef TRACE_FIELD_SPECIAL
+#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \
+	cmd;
+
 #undef TRACE_EVENT_FORMAT
 #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)	\
 _TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt))		\
-- 
cgit v1.2.3


From 11a241a3302277db05561e01477528629d806c4e Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 2 Mar 2009 11:49:04 -0500
Subject: tracing: add protection around modify trace event fields

The trace event objects are currently not proctected against
reentrancy. This patch adds a mutex around the modifications of
the trace event fields.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_events.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 1d07f800a9ce..26069fa6b3b0 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -14,6 +14,8 @@
 
 #define TRACE_SYSTEM "TRACE_SYSTEM"
 
+static DEFINE_MUTEX(event_mutex);
+
 #define events_for_each(event)						\
 	for (event = __start_ftrace_events;				\
 	     (unsigned long)event < (unsigned long)__stop_ftrace_events; \
@@ -104,6 +106,7 @@ static int ftrace_set_clr_event(char *buf, int set)
 			event = NULL;
 	}
 
+	mutex_lock(&event_mutex);
 	events_for_each(call) {
 
 		if (!call->name)
@@ -124,6 +127,8 @@ static int ftrace_set_clr_event(char *buf, int set)
 
 		ret = 0;
 	}
+	mutex_unlock(&event_mutex);
+
 	return ret;
 }
 
@@ -324,7 +329,9 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
 	switch (val) {
 	case 0:
 	case 1:
+		mutex_lock(&event_mutex);
 		ftrace_event_enable_disable(call, val);
+		mutex_unlock(&event_mutex);
 		break;
 
 	default:
-- 
cgit v1.2.3


From f9520750c4c9924c14325cd951efae5fae58104c Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 2 Mar 2009 14:04:40 -0500
Subject: tracing: make trace_seq_reset global and rename to trace_seq_init

Impact: clean up

The trace_seq functions may be used separately outside of the ftrace
iterator. The trace_seq_reset is needed for these operations.

This patch also renames trace_seq_reset to the more appropriate
trace_seq_init.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c | 17 +++++------------
 kernel/trace/trace.h |  8 ++++++++
 2 files changed, 13 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c5e39cd7310d..ea055aa21cd9 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -342,13 +342,6 @@ __update_max_tr(struct trace_array *tr, struct task_struct *tsk, int cpu)
 	tracing_record_cmdline(tsk);
 }
 
-static void
-trace_seq_reset(struct trace_seq *s)
-{
-	s->len = 0;
-	s->readpos = 0;
-}
-
 ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)
 {
 	int len;
@@ -395,7 +388,7 @@ trace_print_seq(struct seq_file *m, struct trace_seq *s)
 	s->buffer[len] = 0;
 	seq_puts(m, s->buffer);
 
-	trace_seq_reset(s);
+	trace_seq_init(s);
 }
 
 /**
@@ -2620,7 +2613,7 @@ tracing_read_pipe(struct file *filp, char __user *ubuf,
 	if (sret != -EBUSY)
 		return sret;
 
-	trace_seq_reset(&iter->seq);
+	trace_seq_init(&iter->seq);
 
 	/* copy the tracer to avoid using a global lock all around */
 	mutex_lock(&trace_types_lock);
@@ -2682,7 +2675,7 @@ waitagain:
 	/* Now copy what we have to the user */
 	sret = trace_seq_to_user(&iter->seq, ubuf, cnt);
 	if (iter->seq.readpos >= iter->seq.len)
-		trace_seq_reset(&iter->seq);
+		trace_seq_init(&iter->seq);
 
 	/*
 	 * If there was nothing to send to user, inspite of consuming trace
@@ -2819,7 +2812,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp,
 		partial[i].offset = 0;
 		partial[i].len = iter->seq.len;
 
-		trace_seq_reset(&iter->seq);
+		trace_seq_init(&iter->seq);
 	}
 
 	mutex_unlock(&iter->mutex);
@@ -3631,7 +3624,7 @@ trace_printk_seq(struct trace_seq *s)
 
 	printk(KERN_TRACE "%s", s->buffer);
 
-	trace_seq_reset(s);
+	trace_seq_init(s);
 }
 
 void ftrace_dump(void)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index f6fa0b9f83a8..cf6ba4181b14 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -395,6 +395,14 @@ struct trace_seq {
 	unsigned int		readpos;
 };
 
+static inline void
+trace_seq_init(struct trace_seq *s)
+{
+	s->len = 0;
+	s->readpos = 0;
+}
+
+
 #define TRACE_PIPE_ALL_CPU	-1
 
 /*
-- 
cgit v1.2.3


From 981d081ec8b958b7d962ee40d433581a55d40fc5 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 2 Mar 2009 13:53:59 -0500
Subject: tracing: add format file to describe event struct fields

This patch adds the "format" file to the trace point event directory.
This is based off of work by Tom Zanussi, in which a file is exported
to be tread from user land such that a user space app may read the
binary record stored in the ring buffer.

 # cat /debug/tracing/events/sched/sched_switch/format
        field:pid_t prev_pid;   offset:12;      size:4;
        field:int prev_prio;    offset:16;      size:4;
        field special:char next_comm[TASK_COMM_LEN];    offset:20;      size:16;
        field:pid_t next_pid;   offset:36;      size:4;
        field:int next_prio;    offset:40;      size:4;

Idea-from: Tom Zanussi <tzanussi@gmail.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.h                |  1 +
 kernel/trace/trace_events.c         | 56 ++++++++++++++++++++++++++++++++++++-
 kernel/trace/trace_events_stage_2.h | 52 ++++++++++++++++++++++++++++++++++
 kernel/trace/trace_events_stage_3.h |  2 ++
 4 files changed, 110 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index cf6ba4181b14..e606633fb498 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -754,6 +754,7 @@ struct ftrace_event_call {
 	int		(*raw_init)(void);
 	int		(*raw_reg)(void);
 	void		(*raw_unreg)(void);
+	int		(*show_format)(struct trace_seq *s);
 };
 
 void event_trace_printk(unsigned long ip, const char *fmt, ...);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 26069fa6b3b0..d57a772981c1 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -3,6 +3,9 @@
  *
  * Copyright (C) 2008 Red Hat Inc, Steven Rostedt <srostedt@redhat.com>
  *
+ *  - Added format output of fields of the trace point.
+ *    This was based off of work by Tom Zanussi <tzanussi@gmail.com>.
+ *
  */
 
 #include <linux/debugfs.h>
@@ -444,6 +447,42 @@ event_available_types_read(struct file *filp, char __user *ubuf, size_t cnt,
 	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
 }
 
+static ssize_t
+event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
+		  loff_t *ppos)
+{
+	struct ftrace_event_call *call = filp->private_data;
+	struct trace_seq *s;
+	char *buf;
+	int r;
+
+	s = kmalloc(sizeof(*s), GFP_KERNEL);
+	if (!s)
+		return -ENOMEM;
+
+	trace_seq_init(s);
+
+	if (*ppos)
+		return 0;
+
+	r = call->show_format(s);
+	if (!r) {
+		/*
+		 * ug!  The format output is bigger than a PAGE!!
+		 */
+		buf = "FORMAT TOO BIG\n";
+		r = simple_read_from_buffer(ubuf, cnt, ppos,
+					      buf, strlen(buf));
+		goto out;
+	}
+
+	r = simple_read_from_buffer(ubuf, cnt, ppos,
+				    s->buffer, s->len);
+ out:
+	kfree(s);
+	return r;
+}
+
 static const struct seq_operations show_event_seq_ops = {
 	.start = t_start,
 	.next = t_next,
@@ -490,6 +529,11 @@ static const struct file_operations ftrace_available_types_fops = {
 	.read = event_available_types_read,
 };
 
+static const struct file_operations ftrace_event_format_fops = {
+	.open = tracing_open_generic,
+	.read = event_format_read,
+};
+
 static struct dentry *event_trace_events_dir(void)
 {
 	static struct dentry *d_tracer;
@@ -602,7 +646,17 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
 				    &ftrace_available_types_fops);
 	if (!entry)
 		pr_warning("Could not create debugfs "
-			   "'%s/type' available_types\n", call->name);
+			   "'%s/available_types' entry\n", call->name);
+
+	/* A trace may not want to export its format */
+	if (!call->show_format)
+		return 0;
+
+	entry = debugfs_create_file("format", 0444, call->dir, call,
+				    &ftrace_event_format_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'%s/format' entry\n", call->name);
 
 	return 0;
 }
diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h
index dc79fe3a2ecb..3a80ea4e92cb 100644
--- a/kernel/trace/trace_events_stage_2.h
+++ b/kernel/trace/trace_events_stage_2.h
@@ -74,3 +74,55 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags)	\
 }
 
 #include <trace/trace_event_types.h>
+
+/*
+ * Setup the showing format of trace point.
+ *
+ * int
+ * ftrace_format_##call(struct trace_seq *s)
+ * {
+ * 	struct ftrace_raw_##call field;
+ * 	int ret;
+ *
+ * 	ret = trace_seq_printf(s, #type " " #item ";"
+ * 			       " size:%d; offset:%d;\n",
+ * 			       sizeof(field.type),
+ * 			       offsetof(struct ftrace_raw_##call,
+ * 					item));
+ *
+ * }
+ */
+
+#undef TRACE_FIELD
+#define TRACE_FIELD(type, item, assign)					\
+	ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t"	\
+			       "offset:%lu;\tsize:%lu;\n",		\
+			       offsetof(typeof(field), item),		\
+			       sizeof(field.item));			\
+	if (!ret)							\
+		return 0;
+
+
+#undef TRACE_FIELD_SPECIAL
+#define TRACE_FIELD_SPECIAL(type_item, item, cmd)			\
+	ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t"	\
+			       "offset:%lu;\tsize:%lu;\n",		\
+			       offsetof(typeof(field), item),		\
+			       sizeof(field.item));			\
+	if (!ret)							\
+		return 0;
+
+#undef TRACE_EVENT_FORMAT
+#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)	\
+int									\
+ftrace_format_##call(struct trace_seq *s)				\
+{									\
+	struct ftrace_raw_##call field;					\
+	int ret;							\
+									\
+	tstruct;							\
+									\
+	return ret;							\
+}
+
+#include <trace/trace_event_types.h>
diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h
index 2ab65e958223..c62a4d2a5283 100644
--- a/kernel/trace/trace_events_stage_3.h
+++ b/kernel/trace/trace_events_stage_3.h
@@ -101,6 +101,7 @@
  * 	.raw_init		= ftrace_raw_init_event_<call>,
  * 	.raw_reg		= ftrace_raw_reg_event_<call>,
  * 	.raw_unreg		= ftrace_raw_unreg_event_<call>,
+ *	.show_format		= ftrace_format_<call>,
  * }
  *
  */
@@ -230,4 +231,5 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
 	.raw_init		= ftrace_raw_init_event_##call,		\
 	.raw_reg		= ftrace_raw_reg_event_##call,		\
 	.raw_unreg		= ftrace_raw_unreg_event_##call,	\
+	.show_format		= ftrace_format_##call,			\
 }
-- 
cgit v1.2.3


From 91729ef96661bfa7dc53923746cd90b62d5495cc Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 2 Mar 2009 15:03:01 -0500
Subject: tracing: add ftrace headers to event format files

This patch includes the ftrace header to the event formats files:

 # cat /debug/tracing/events/sched/sched_switch/format
        field:unsigned char type;       offset:0;       size:1;
        field:unsigned char flags;      offset:1;       size:1;
        field:unsigned char preempt_count;      offset:2;       size:1;
        field:int pid;  offset:4;       size:4;
        field:int tgid; offset:8;       size:4;

        field:pid_t prev_pid;   offset:12;      size:4;
        field:int prev_prio;    offset:16;      size:4;
        field special:char next_comm[TASK_COMM_LEN];    offset:20;      size:16;
        field:pid_t next_pid;   offset:36;      size:4;
        field:int next_prio;    offset:40;      size:4;

A blank line is used as a deliminator between the ftrace header and the
trace point fields.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_events.c | 27 ++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index d57a772981c1..cdcc3aed76fd 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -13,7 +13,7 @@
 #include <linux/module.h>
 #include <linux/ctype.h>
 
-#include "trace.h"
+#include "trace_output.h"
 
 #define TRACE_SYSTEM "TRACE_SYSTEM"
 
@@ -447,6 +447,28 @@ event_available_types_read(struct file *filp, char __user *ubuf, size_t cnt,
 	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
 }
 
+#undef FIELD
+#define FIELD(type, name) \
+	#type, #name, offsetof(typeof(field), name), sizeof(field.name)
+
+static int trace_write_header(struct trace_seq *s)
+{
+	struct trace_entry field;
+
+	/* struct trace_entry */
+	return trace_seq_printf(s,
+				"\tfield:%s %s;\toffset:%lu;\tsize:%lu;\n"
+				"\tfield:%s %s;\toffset:%lu;\tsize:%lu;\n"
+				"\tfield:%s %s;\toffset:%lu;\tsize:%lu;\n"
+				"\tfield:%s %s;\toffset:%lu;\tsize:%lu;\n"
+				"\tfield:%s %s;\toffset:%lu;\tsize:%lu;\n"
+				"\n",
+				FIELD(unsigned char, type),
+				FIELD(unsigned char, flags),
+				FIELD(unsigned char, preempt_count),
+				FIELD(int, pid),
+				FIELD(int, tgid));
+}
 static ssize_t
 event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
 		  loff_t *ppos)
@@ -465,6 +487,9 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
 	if (*ppos)
 		return 0;
 
+	/* If this fails, so will the show_format. */
+	trace_write_header(s);
+
 	r = call->show_format(s);
 	if (!r) {
 		/*
-- 
cgit v1.2.3


From c5e4e19271edfdf1abd4184933d40d646da6a091 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 2 Mar 2009 15:10:02 -0500
Subject: tracing: add trace name and id to event formats

To be able to identify the trace in the binary format output, the
id of the trace event (which is dynamically assigned) must also be listed.

This patch adds the name of the trace point as well as the id assigned.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_events.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index cdcc3aed76fd..210e71ff82db 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -487,7 +487,11 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
 	if (*ppos)
 		return 0;
 
-	/* If this fails, so will the show_format. */
+	/* If any of the first writes fail, so will the show_format. */
+
+	trace_seq_printf(s, "name: %s\n", call->name);
+	trace_seq_printf(s, "ID: %d\n", call->id);
+	trace_seq_printf(s, "format:\n");
 	trace_write_header(s);
 
 	r = call->show_format(s);
-- 
cgit v1.2.3


From 96ccd21cd13140221bda74a4fc4e53ffeba7c7d4 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 2 Mar 2009 15:22:21 -0500
Subject: tracing: add print format to event trace format files

This patch adds the internal print format used to print the raw events
to the event trace point format file.

 # cat /debug/tracing/events/sched/sched_switch/format
name: sched_switch
ID: 29
format:
        field:unsigned char type;       offset:0;       size:1;
        field:unsigned char flags;      offset:1;       size:1;
        field:unsigned char preempt_count;      offset:2;       size:1;
        field:int pid;  offset:4;       size:4;
        field:int tgid; offset:8;       size:4;

        field:pid_t prev_pid;   offset:12;      size:4;
        field:int prev_prio;    offset:16;      size:4;
        field special:char next_comm[TASK_COMM_LEN];    offset:20;      size:16;
        field:pid_t next_pid;   offset:36;      size:4;
        field:int next_prio;    offset:40;      size:4;

print fmt: "prev %d:%d ==> next %s:%d:%d"

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_events_stage_2.h | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h
index 3a80ea4e92cb..b1cebba1d9b4 100644
--- a/kernel/trace/trace_events_stage_2.h
+++ b/kernel/trace/trace_events_stage_2.h
@@ -122,6 +122,8 @@ ftrace_format_##call(struct trace_seq *s)				\
 									\
 	tstruct;							\
 									\
+	trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt);		\
+									\
 	return ret;							\
 }
 
-- 
cgit v1.2.3


From 044d408409cc4e1bc75c886e27ca85c270db104c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Mon, 2 Mar 2009 16:13:32 +0100
Subject: genirq: assert that irq handlers are indeed running in hardirq
 context

Make sure the genirq layer handlers are indeed running handlers
in hardirq context. That is the genirq expectation and doing
anything else is broken.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <1236006812.5330.632.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/irq/handle.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 3aba8d12f328..a2ee682bca2e 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -328,6 +328,8 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
 	irqreturn_t ret, retval = IRQ_NONE;
 	unsigned int status = 0;
 
+	WARN_ONCE(!in_irq(), "BUG: IRQ handler called from non-hardirq context!");
+
 	if (!(action->flags & IRQF_DISABLED))
 		local_irq_enable_in_hardirq();
 
-- 
cgit v1.2.3


From 5b1017404aea6d2e552e991b3fd814d839e9cd67 Mon Sep 17 00:00:00 2001
From: Roland McGrath <roland@redhat.com>
Date: Fri, 27 Feb 2009 23:25:54 -0800
Subject: x86-64: seccomp: fix 32/64 syscall hole

On x86-64, a 32-bit process (TIF_IA32) can switch to 64-bit mode with
ljmp, and then use the "syscall" instruction to make a 64-bit system
call.  A 64-bit process make a 32-bit system call with int $0x80.

In both these cases under CONFIG_SECCOMP=y, secure_computing() will use
the wrong system call number table.  The fix is simple: test TS_COMPAT
instead of TIF_IA32.  Here is an example exploit:

	/* test case for seccomp circumvention on x86-64

	   There are two failure modes: compile with -m64 or compile with -m32.

	   The -m64 case is the worst one, because it does "chmod 777 ." (could
	   be any chmod call).  The -m32 case demonstrates it was able to do
	   stat(), which can glean information but not harm anything directly.

	   A buggy kernel will let the test do something, print, and exit 1; a
	   fixed kernel will make it exit with SIGKILL before it does anything.
	*/

	#define _GNU_SOURCE
	#include <assert.h>
	#include <inttypes.h>
	#include <stdio.h>
	#include <linux/prctl.h>
	#include <sys/stat.h>
	#include <unistd.h>
	#include <asm/unistd.h>

	int
	main (int argc, char **argv)
	{
	  char buf[100];
	  static const char dot[] = ".";
	  long ret;
	  unsigned st[24];

	  if (prctl (PR_SET_SECCOMP, 1, 0, 0, 0) != 0)
	    perror ("prctl(PR_SET_SECCOMP) -- not compiled into kernel?");

	#ifdef __x86_64__
	  assert ((uintptr_t) dot < (1UL << 32));
	  asm ("int $0x80 # %0 <- %1(%2 %3)"
	       : "=a" (ret) : "0" (15), "b" (dot), "c" (0777));
	  ret = snprintf (buf, sizeof buf,
			  "result %ld (check mode on .!)\n", ret);
	#elif defined __i386__
	  asm (".code32\n"
	       "pushl %%cs\n"
	       "pushl $2f\n"
	       "ljmpl $0x33, $1f\n"
	       ".code64\n"
	       "1: syscall # %0 <- %1(%2 %3)\n"
	       "lretl\n"
	       ".code32\n"
	       "2:"
	       : "=a" (ret) : "0" (4), "D" (dot), "S" (&st));
	  if (ret == 0)
	    ret = snprintf (buf, sizeof buf,
			    "stat . -> st_uid=%u\n", st[7]);
	  else
	    ret = snprintf (buf, sizeof buf, "result %ld\n", ret);
	#else
	# error "not this one"
	#endif

	  write (1, buf, ret);

	  syscall (__NR_exit, 1);
	  return 2;
	}

Signed-off-by: Roland McGrath <roland@redhat.com>
[ I don't know if anybody actually uses seccomp, but it's enabled in
  at least both Fedora and SuSE kernels, so maybe somebody is. - Linus ]
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/mips/include/asm/seccomp.h    | 1 -
 arch/powerpc/include/asm/compat.h  | 5 +++++
 arch/powerpc/include/asm/seccomp.h | 4 ----
 arch/sparc/include/asm/compat.h    | 5 +++++
 arch/sparc/include/asm/seccomp.h   | 6 ------
 arch/x86/include/asm/seccomp_32.h  | 6 ------
 arch/x86/include/asm/seccomp_64.h  | 8 --------
 kernel/seccomp.c                   | 7 ++++---
 8 files changed, 14 insertions(+), 28 deletions(-)

(limited to 'kernel')

diff --git a/arch/mips/include/asm/seccomp.h b/arch/mips/include/asm/seccomp.h
index 36ed44070256..a6772e9507f5 100644
--- a/arch/mips/include/asm/seccomp.h
+++ b/arch/mips/include/asm/seccomp.h
@@ -1,6 +1,5 @@
 #ifndef __ASM_SECCOMP_H
 
-#include <linux/thread_info.h>
 #include <linux/unistd.h>
 
 #define __NR_seccomp_read __NR_read
diff --git a/arch/powerpc/include/asm/compat.h b/arch/powerpc/include/asm/compat.h
index d811a8cd7b58..4774c2f92232 100644
--- a/arch/powerpc/include/asm/compat.h
+++ b/arch/powerpc/include/asm/compat.h
@@ -210,5 +210,10 @@ struct compat_shmid64_ds {
 	compat_ulong_t __unused6;
 };
 
+static inline int is_compat_task(void)
+{
+	return test_thread_flag(TIF_32BIT);
+}
+
 #endif /* __KERNEL__ */
 #endif /* _ASM_POWERPC_COMPAT_H */
diff --git a/arch/powerpc/include/asm/seccomp.h b/arch/powerpc/include/asm/seccomp.h
index 853765eb1f65..00c1d9133cfe 100644
--- a/arch/powerpc/include/asm/seccomp.h
+++ b/arch/powerpc/include/asm/seccomp.h
@@ -1,10 +1,6 @@
 #ifndef _ASM_POWERPC_SECCOMP_H
 #define _ASM_POWERPC_SECCOMP_H
 
-#ifdef __KERNEL__
-#include <linux/thread_info.h>
-#endif
-
 #include <linux/unistd.h>
 
 #define __NR_seccomp_read __NR_read
diff --git a/arch/sparc/include/asm/compat.h b/arch/sparc/include/asm/compat.h
index f260b58f5ce9..0e706257918f 100644
--- a/arch/sparc/include/asm/compat.h
+++ b/arch/sparc/include/asm/compat.h
@@ -240,4 +240,9 @@ struct compat_shmid64_ds {
 	unsigned int	__unused2;
 };
 
+static inline int is_compat_task(void)
+{
+	return test_thread_flag(TIF_32BIT);
+}
+
 #endif /* _ASM_SPARC64_COMPAT_H */
diff --git a/arch/sparc/include/asm/seccomp.h b/arch/sparc/include/asm/seccomp.h
index 7fcd9968192b..adca1bce41d4 100644
--- a/arch/sparc/include/asm/seccomp.h
+++ b/arch/sparc/include/asm/seccomp.h
@@ -1,11 +1,5 @@
 #ifndef _ASM_SECCOMP_H
 
-#include <linux/thread_info.h> /* already defines TIF_32BIT */
-
-#ifndef TIF_32BIT
-#error "unexpected TIF_32BIT on sparc64"
-#endif
-
 #include <linux/unistd.h>
 
 #define __NR_seccomp_read __NR_read
diff --git a/arch/x86/include/asm/seccomp_32.h b/arch/x86/include/asm/seccomp_32.h
index a6ad87b352c4..b811d6f5780c 100644
--- a/arch/x86/include/asm/seccomp_32.h
+++ b/arch/x86/include/asm/seccomp_32.h
@@ -1,12 +1,6 @@
 #ifndef _ASM_X86_SECCOMP_32_H
 #define _ASM_X86_SECCOMP_32_H
 
-#include <linux/thread_info.h>
-
-#ifdef TIF_32BIT
-#error "unexpected TIF_32BIT on i386"
-#endif
-
 #include <linux/unistd.h>
 
 #define __NR_seccomp_read __NR_read
diff --git a/arch/x86/include/asm/seccomp_64.h b/arch/x86/include/asm/seccomp_64.h
index 4171bb794e9e..84ec1bd161a5 100644
--- a/arch/x86/include/asm/seccomp_64.h
+++ b/arch/x86/include/asm/seccomp_64.h
@@ -1,14 +1,6 @@
 #ifndef _ASM_X86_SECCOMP_64_H
 #define _ASM_X86_SECCOMP_64_H
 
-#include <linux/thread_info.h>
-
-#ifdef TIF_32BIT
-#error "unexpected TIF_32BIT on x86_64"
-#else
-#define TIF_32BIT TIF_IA32
-#endif
-
 #include <linux/unistd.h>
 #include <asm/ia32_unistd.h>
 
diff --git a/kernel/seccomp.c b/kernel/seccomp.c
index ad64fcb731f2..57d4b13b631d 100644
--- a/kernel/seccomp.c
+++ b/kernel/seccomp.c
@@ -8,6 +8,7 @@
 
 #include <linux/seccomp.h>
 #include <linux/sched.h>
+#include <linux/compat.h>
 
 /* #define SECCOMP_DEBUG 1 */
 #define NR_SECCOMP_MODES 1
@@ -22,7 +23,7 @@ static int mode1_syscalls[] = {
 	0, /* null terminated */
 };
 
-#ifdef TIF_32BIT
+#ifdef CONFIG_COMPAT
 static int mode1_syscalls_32[] = {
 	__NR_seccomp_read_32, __NR_seccomp_write_32, __NR_seccomp_exit_32, __NR_seccomp_sigreturn_32,
 	0, /* null terminated */
@@ -37,8 +38,8 @@ void __secure_computing(int this_syscall)
 	switch (mode) {
 	case 1:
 		syscall = mode1_syscalls;
-#ifdef TIF_32BIT
-		if (test_thread_flag(TIF_32BIT))
+#ifdef CONFIG_COMPAT
+		if (is_compat_task())
 			syscall = mode1_syscalls_32;
 #endif
 		do {
-- 
cgit v1.2.3


From 633ddaa7f471e9db181f993c1458d6f4bae321ca Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 3 Mar 2009 09:43:50 -0500
Subject: tracing: fix return value to registering events

The registering of events had the return value check backwards.
A zero returned is success, the check had it as a failure.

This patch also fixes a missing "\n" in the warning that the check
failed.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_events_stage_3.h | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h
index c62a4d2a5283..041789ffbac1 100644
--- a/kernel/trace/trace_events_stage_3.h
+++ b/kernel/trace/trace_events_stage_3.h
@@ -120,9 +120,9 @@ static int ftrace_reg_event_##call(void)				\
 	int ret;							\
 									\
 	ret = register_trace_##call(ftrace_event_##call);		\
-	if (!ret)							\
+	if (ret)							\
 		pr_info("event trace: Could not activate trace point "	\
-			"probe to " #call);				\
+			"probe to " #call "\n");			\
 	return ret;							\
 }									\
 									\
@@ -195,9 +195,9 @@ static int ftrace_raw_reg_event_##call(void)				\
 	int ret;							\
 									\
 	ret = register_trace_##call(ftrace_raw_event_##call);		\
-	if (!ret)							\
+	if (ret)							\
 		pr_info("event trace: Could not activate trace point "	\
-			"probe to " #call);				\
+			"probe to " #call "\n");			\
 	return ret;							\
 }									\
 									\
-- 
cgit v1.2.3


From 41be4da4e85e58520b934040966a6ae919c66c2d Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 2 Mar 2009 20:56:48 -0500
Subject: ring-buffer: reset write field for ring_buffer_read_page

Impact: fix ring_buffer_read_page

After a page is swapped into the ring buffer, the write field must
also be reset.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ring_buffer.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index a8c275c01e83..9baad7ee4b36 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2492,6 +2492,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 		rb_init_page(bpage);
 		bpage = cpu_buffer->reader_page->page;
 		cpu_buffer->reader_page->page = *data_page;
+		local_set(&cpu_buffer->reader_page->write, 0);
 		cpu_buffer->reader_page->read = 0;
 		*data_page = bpage;
 	}
-- 
cgit v1.2.3


From ef7a4a161472b952941bf78855a9cd95703c024e Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 3 Mar 2009 00:27:49 -0500
Subject: ring-buffer: fix ring_buffer_read_page

The ring_buffer_read_page was broken if it were to only copy part
of the page. This patch fixes that up as well as adds a parameter
to allow a length field, in order to only copy part of the buffer page.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 include/linux/ring_buffer.h |  7 +++-
 kernel/trace/ring_buffer.c  | 92 +++++++++++++++++++++++++++++----------------
 2 files changed, 64 insertions(+), 35 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index f5e793d69bd3..79fcbc4b09d6 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -121,6 +121,9 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu);
 u64 ring_buffer_time_stamp(int cpu);
 void ring_buffer_normalize_time_stamp(int cpu, u64 *ts);
 
+size_t ring_buffer_page_len(void *page);
+
+
 /*
  * The below functions are fine to use outside the tracing facility.
  */
@@ -138,8 +141,8 @@ static inline int tracing_is_on(void) { return 0; }
 
 void *ring_buffer_alloc_read_page(struct ring_buffer *buffer);
 void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data);
-int ring_buffer_read_page(struct ring_buffer *buffer,
-			  void **data_page, int cpu, int full);
+int ring_buffer_read_page(struct ring_buffer *buffer, void **data_page,
+			  size_t len, int cpu, int full);
 
 enum ring_buffer_flags {
 	RB_FL_OVERWRITE		= 1 << 0,
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 9baad7ee4b36..2ad6bae95a3d 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -234,6 +234,11 @@ static void rb_init_page(struct buffer_data_page *bpage)
 	local_set(&bpage->commit, 0);
 }
 
+size_t ring_buffer_page_len(void *page)
+{
+	return local_read(&((struct buffer_data_page *)page)->commit);
+}
+
 /*
  * Also stolen from mm/slob.c. Thanks to Mathieu Desnoyers for pointing
  * this issue out.
@@ -2378,8 +2383,8 @@ static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer,
  */
 void *ring_buffer_alloc_read_page(struct ring_buffer *buffer)
 {
-	unsigned long addr;
 	struct buffer_data_page *bpage;
+	unsigned long addr;
 
 	addr = __get_free_page(GFP_KERNEL);
 	if (!addr)
@@ -2387,6 +2392,8 @@ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer)
 
 	bpage = (void *)addr;
 
+	rb_init_page(bpage);
+
 	return bpage;
 }
 
@@ -2406,6 +2413,7 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data)
  * ring_buffer_read_page - extract a page from the ring buffer
  * @buffer: buffer to extract from
  * @data_page: the page to use allocated from ring_buffer_alloc_read_page
+ * @len: amount to extract
  * @cpu: the cpu of the buffer to extract
  * @full: should the extraction only happen when the page is full.
  *
@@ -2418,7 +2426,7 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data)
  *	rpage = ring_buffer_alloc_read_page(buffer);
  *	if (!rpage)
  *		return error;
- *	ret = ring_buffer_read_page(buffer, &rpage, cpu, 0);
+ *	ret = ring_buffer_read_page(buffer, &rpage, len, cpu, 0);
  *	if (ret >= 0)
  *		process_page(rpage, ret);
  *
@@ -2435,71 +2443,89 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data)
  *  <0 if no data has been transferred.
  */
 int ring_buffer_read_page(struct ring_buffer *buffer,
-			    void **data_page, int cpu, int full)
+			  void **data_page, size_t len, int cpu, int full)
 {
 	struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
 	struct ring_buffer_event *event;
 	struct buffer_data_page *bpage;
+	struct buffer_page *reader;
 	unsigned long flags;
+	unsigned int commit;
 	unsigned int read;
 	int ret = -1;
 
 	if (!data_page)
-		return 0;
+		return -1;
 
 	bpage = *data_page;
 	if (!bpage)
-		return 0;
+		return -1;
 
 	spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
 
-	/*
-	 * rb_buffer_peek will get the next ring buffer if
-	 * the current reader page is empty.
-	 */
-	event = rb_buffer_peek(buffer, cpu, NULL);
-	if (!event)
+	reader = rb_get_reader_page(cpu_buffer);
+	if (!reader)
 		goto out;
 
-	/* check for data */
-	if (!local_read(&cpu_buffer->reader_page->page->commit))
-		goto out;
+	event = rb_reader_event(cpu_buffer);
+
+	read = reader->read;
+	commit = rb_page_commit(reader);
 
-	read = cpu_buffer->reader_page->read;
 	/*
-	 * If the writer is already off of the read page, then simply
-	 * switch the read page with the given page. Otherwise
-	 * we need to copy the data from the reader to the writer.
+	 * If len > what's left on the page, and the writer is also off of
+	 * the read page, then simply switch the read page with the given
+	 * page. Otherwise we need to copy the data from the reader to the
+	 * writer.
 	 */
-	if (cpu_buffer->reader_page == cpu_buffer->commit_page) {
-		unsigned int commit = rb_page_commit(cpu_buffer->reader_page);
+	if ((len < (commit - read)) ||
+	    cpu_buffer->reader_page == cpu_buffer->commit_page) {
 		struct buffer_data_page *rpage = cpu_buffer->reader_page->page;
+		unsigned int pos = read;
+		unsigned int size;
 
 		if (full)
 			goto out;
-		/* The writer is still on the reader page, we must copy */
-		memcpy(bpage->data + read, rpage->data + read, commit - read);
 
-		/* consume what was read */
-		cpu_buffer->reader_page->read = commit;
+		if (len > (commit - read))
+			len = (commit - read);
+
+		size = rb_event_length(event);
+
+		if (len < size)
+			goto out;
+
+		/* Need to copy one event at a time */
+		do {
+			memcpy(bpage->data + pos, rpage->data + pos, size);
+
+			len -= size;
+
+			rb_advance_reader(cpu_buffer);
+			pos = reader->read;
+
+			event = rb_reader_event(cpu_buffer);
+			size = rb_event_length(event);
+		} while (len > size);
 
 		/* update bpage */
-		local_set(&bpage->commit, commit);
-		if (!read)
-			bpage->time_stamp = rpage->time_stamp;
+		local_set(&bpage->commit, pos);
+		bpage->time_stamp = rpage->time_stamp;
+
 	} else {
 		/* swap the pages */
 		rb_init_page(bpage);
-		bpage = cpu_buffer->reader_page->page;
-		cpu_buffer->reader_page->page = *data_page;
-		local_set(&cpu_buffer->reader_page->write, 0);
-		cpu_buffer->reader_page->read = 0;
+		bpage = reader->page;
+		reader->page = *data_page;
+		local_set(&reader->write, 0);
+		reader->read = 0;
 		*data_page = bpage;
+
+		/* update the entry counter */
+		rb_remove_entries(cpu_buffer, bpage, read);
 	}
 	ret = read;
 
-	/* update the entry counter */
-	rb_remove_entries(cpu_buffer, bpage, read);
  out:
 	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
-- 
cgit v1.2.3


From e3d6bf0a0781a269f34250fd41e0d3dbfe540cf1 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 3 Mar 2009 13:53:07 -0500
Subject: ring-buffer: replace sizeof of event header with offsetof

Impact: fix to possible alignment problems on some archs.

Some arch compilers include an NULL char array in the sizeof field.
Since the ring_buffer_event type includes one of these, it is better
to use the "offsetof" instead, to avoid strange bugs on these archs.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ring_buffer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 2ad6bae95a3d..27cf834d8b4e 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -132,7 +132,7 @@ void ring_buffer_normalize_time_stamp(int cpu, u64 *ts)
 }
 EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp);
 
-#define RB_EVNT_HDR_SIZE (sizeof(struct ring_buffer_event))
+#define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
 #define RB_ALIGNMENT		4U
 #define RB_MAX_SMALL_DATA	28
 
-- 
cgit v1.2.3


From 474d32b68d6d842f3e710e9ae9fe2568c53339f8 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 3 Mar 2009 19:51:40 -0500
Subject: ring-buffer: make ring_buffer_read_page read from start on partial
 page

Impact: dont leave holes in read buffer page

The ring_buffer_read_page swaps a given page with the reader page
of the ring buffer, if certain conditions are set:

 1) requested length is big enough to hold entire page data

 2) a writer is not currently on the page

 3) the page is not partially consumed.

Instead of swapping with the supplied page. It copies the data to
the supplied page instead. But currently the data is copied in the
same offset as the source page. This causes a hole at the start
of the reader page. This complicates the use of this function.
Instead, it should copy the data at the beginning of the function
and update the index fields accordingly.

Other small clean ups are also done in this patch.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ring_buffer.c | 43 +++++++++++++++++++++++++++++++++----------
 1 file changed, 33 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 27cf834d8b4e..f2a163db52f9 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -61,6 +61,8 @@ enum {
 
 static unsigned long ring_buffer_flags __read_mostly = RB_BUFFERS_ON;
 
+#define BUF_PAGE_HDR_SIZE offsetof(struct buffer_data_page, data)
+
 /**
  * tracing_on - enable all tracing buffers
  *
@@ -234,9 +236,16 @@ static void rb_init_page(struct buffer_data_page *bpage)
 	local_set(&bpage->commit, 0);
 }
 
+/**
+ * ring_buffer_page_len - the size of data on the page.
+ * @page: The page to read
+ *
+ * Returns the amount of data on the page, including buffer page header.
+ */
 size_t ring_buffer_page_len(void *page)
 {
-	return local_read(&((struct buffer_data_page *)page)->commit);
+	return local_read(&((struct buffer_data_page *)page)->commit)
+		+ BUF_PAGE_HDR_SIZE;
 }
 
 /*
@@ -259,7 +268,7 @@ static inline int test_time_stamp(u64 delta)
 	return 0;
 }
 
-#define BUF_PAGE_SIZE (PAGE_SIZE - offsetof(struct buffer_data_page, data))
+#define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE)
 
 /*
  * head_page == tail_page && head == tail then buffer is empty.
@@ -2454,6 +2463,15 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 	unsigned int read;
 	int ret = -1;
 
+	/*
+	 * If len is not big enough to hold the page header, then
+	 * we can not copy anything.
+	 */
+	if (len <= BUF_PAGE_HDR_SIZE)
+		return -1;
+
+	len -= BUF_PAGE_HDR_SIZE;
+
 	if (!data_page)
 		return -1;
 
@@ -2473,15 +2491,17 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 	commit = rb_page_commit(reader);
 
 	/*
-	 * If len > what's left on the page, and the writer is also off of
-	 * the read page, then simply switch the read page with the given
-	 * page. Otherwise we need to copy the data from the reader to the
-	 * writer.
+	 * If this page has been partially read or
+	 * if len is not big enough to read the rest of the page or
+	 * a writer is still on the page, then
+	 * we must copy the data from the page to the buffer.
+	 * Otherwise, we can simply swap the page with the one passed in.
 	 */
-	if ((len < (commit - read)) ||
+	if (read || (len < (commit - read)) ||
 	    cpu_buffer->reader_page == cpu_buffer->commit_page) {
 		struct buffer_data_page *rpage = cpu_buffer->reader_page->page;
-		unsigned int pos = read;
+		unsigned int rpos = read;
+		unsigned int pos = 0;
 		unsigned int size;
 
 		if (full)
@@ -2497,12 +2517,13 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 
 		/* Need to copy one event at a time */
 		do {
-			memcpy(bpage->data + pos, rpage->data + pos, size);
+			memcpy(bpage->data + pos, rpage->data + rpos, size);
 
 			len -= size;
 
 			rb_advance_reader(cpu_buffer);
-			pos = reader->read;
+			rpos = reader->read;
+			pos += size;
 
 			event = rb_reader_event(cpu_buffer);
 			size = rb_event_length(event);
@@ -2512,6 +2533,8 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 		local_set(&bpage->commit, pos);
 		bpage->time_stamp = rpage->time_stamp;
 
+		/* we copied everything to the beginning */
+		read = 0;
 	} else {
 		/* swap the pages */
 		rb_init_page(bpage);
-- 
cgit v1.2.3


From 2cadf9135eb3b6d84b6427314be827ddd443c308 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 1 Dec 2008 22:20:19 -0500
Subject: tracing: add binary buffer files for use with splice

Impact: new feature

This patch creates a directory of files that correspond to the
per CPU ring buffers. These are binary files and are made to
be used with splice. This is the fastest way to extract data from
the ftrace ring buffers.

Thanks to Jiaying Zhang for pushing me to get this code fixed,
 and to Eduard - Gabriel Munteanu for his splice code that helped
 me debug my code.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c | 274 +++++++++++++++++++++++++++++++++++++++++++++++++--
 kernel/trace/trace.h |   1 +
 2 files changed, 268 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ea055aa21cd9..12539f72f4a5 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -11,31 +11,30 @@
  *  Copyright (C) 2004-2006 Ingo Molnar
  *  Copyright (C) 2004 William Lee Irwin III
  */
+#include <linux/ring_buffer.h>
 #include <linux/utsrelease.h>
+#include <linux/stacktrace.h>
+#include <linux/writeback.h>
 #include <linux/kallsyms.h>
 #include <linux/seq_file.h>
 #include <linux/notifier.h>
+#include <linux/irqflags.h>
 #include <linux/debugfs.h>
 #include <linux/pagemap.h>
 #include <linux/hardirq.h>
 #include <linux/linkage.h>
 #include <linux/uaccess.h>
+#include <linux/kprobes.h>
 #include <linux/ftrace.h>
 #include <linux/module.h>
 #include <linux/percpu.h>
+#include <linux/splice.h>
 #include <linux/kdebug.h>
 #include <linux/ctype.h>
 #include <linux/init.h>
 #include <linux/poll.h>
 #include <linux/gfp.h>
 #include <linux/fs.h>
-#include <linux/kprobes.h>
-#include <linux/writeback.h>
-#include <linux/splice.h>
-
-#include <linux/stacktrace.h>
-#include <linux/ring_buffer.h>
-#include <linux/irqflags.h>
 
 #include "trace.h"
 #include "trace_output.h"
@@ -3005,6 +3004,246 @@ static struct file_operations tracing_mark_fops = {
 	.write		= tracing_mark_write,
 };
 
+struct ftrace_buffer_info {
+	struct trace_array	*tr;
+	void			*spare;
+	int			cpu;
+	unsigned int		read;
+};
+
+static int tracing_buffers_open(struct inode *inode, struct file *filp)
+{
+	int cpu = (int)(long)inode->i_private;
+	struct ftrace_buffer_info *info;
+
+	if (tracing_disabled)
+		return -ENODEV;
+
+	info = kzalloc(sizeof(*info), GFP_KERNEL);
+	if (!info)
+		return -ENOMEM;
+
+	info->tr	= &global_trace;
+	info->cpu	= cpu;
+	info->spare	= ring_buffer_alloc_read_page(info->tr->buffer);
+	/* Force reading ring buffer for first read */
+	info->read	= (unsigned int)-1;
+	if (!info->spare)
+		goto out;
+
+	filp->private_data = info;
+
+	return 0;
+
+ out:
+	kfree(info);
+	return -ENOMEM;
+}
+
+static ssize_t
+tracing_buffers_read(struct file *filp, char __user *ubuf,
+		     size_t count, loff_t *ppos)
+{
+	struct ftrace_buffer_info *info = filp->private_data;
+	unsigned int pos;
+	ssize_t ret;
+	size_t size;
+
+	/* Do we have previous read data to read? */
+	if (info->read < PAGE_SIZE)
+		goto read;
+
+	info->read = 0;
+
+	ret = ring_buffer_read_page(info->tr->buffer,
+				    &info->spare,
+				    count,
+				    info->cpu, 0);
+	if (ret < 0)
+		return 0;
+
+	pos = ring_buffer_page_len(info->spare);
+
+	if (pos < PAGE_SIZE)
+		memset(info->spare + pos, 0, PAGE_SIZE - pos);
+
+read:
+	size = PAGE_SIZE - info->read;
+	if (size > count)
+		size = count;
+
+	ret = copy_to_user(ubuf, info->spare + info->read, size);
+	if (ret)
+		return -EFAULT;
+	*ppos += size;
+	info->read += size;
+
+	return size;
+}
+
+static int tracing_buffers_release(struct inode *inode, struct file *file)
+{
+	struct ftrace_buffer_info *info = file->private_data;
+
+	ring_buffer_free_read_page(info->tr->buffer, info->spare);
+	kfree(info);
+
+	return 0;
+}
+
+struct buffer_ref {
+	struct ring_buffer	*buffer;
+	void			*page;
+	int			ref;
+};
+
+static void buffer_pipe_buf_release(struct pipe_inode_info *pipe,
+				    struct pipe_buffer *buf)
+{
+	struct buffer_ref *ref = (struct buffer_ref *)buf->private;
+
+	if (--ref->ref)
+		return;
+
+	ring_buffer_free_read_page(ref->buffer, ref->page);
+	kfree(ref);
+	buf->private = 0;
+}
+
+static int buffer_pipe_buf_steal(struct pipe_inode_info *pipe,
+				 struct pipe_buffer *buf)
+{
+	return 1;
+}
+
+static void buffer_pipe_buf_get(struct pipe_inode_info *pipe,
+				struct pipe_buffer *buf)
+{
+	struct buffer_ref *ref = (struct buffer_ref *)buf->private;
+
+	ref->ref++;
+}
+
+/* Pipe buffer operations for a buffer. */
+static struct pipe_buf_operations buffer_pipe_buf_ops = {
+	.can_merge		= 0,
+	.map			= generic_pipe_buf_map,
+	.unmap			= generic_pipe_buf_unmap,
+	.confirm		= generic_pipe_buf_confirm,
+	.release		= buffer_pipe_buf_release,
+	.steal			= buffer_pipe_buf_steal,
+	.get			= buffer_pipe_buf_get,
+};
+
+/*
+ * Callback from splice_to_pipe(), if we need to release some pages
+ * at the end of the spd in case we error'ed out in filling the pipe.
+ */
+static void buffer_spd_release(struct splice_pipe_desc *spd, unsigned int i)
+{
+	struct buffer_ref *ref =
+		(struct buffer_ref *)spd->partial[i].private;
+
+	if (--ref->ref)
+		return;
+
+	ring_buffer_free_read_page(ref->buffer, ref->page);
+	kfree(ref);
+	spd->partial[i].private = 0;
+}
+
+static ssize_t
+tracing_buffers_splice_read(struct file *file, loff_t *ppos,
+			    struct pipe_inode_info *pipe, size_t len,
+			    unsigned int flags)
+{
+	struct ftrace_buffer_info *info = file->private_data;
+	struct partial_page partial[PIPE_BUFFERS];
+	struct page *pages[PIPE_BUFFERS];
+	struct splice_pipe_desc spd = {
+		.pages		= pages,
+		.partial	= partial,
+		.flags		= flags,
+		.ops		= &buffer_pipe_buf_ops,
+		.spd_release	= buffer_spd_release,
+	};
+	struct buffer_ref *ref;
+	int size, i;
+	size_t ret;
+
+	/*
+	 * We can't seek on a buffer input
+	 */
+	if (unlikely(*ppos))
+		return -ESPIPE;
+
+
+	for (i = 0; i < PIPE_BUFFERS && len; i++, len -= size) {
+		struct page *page;
+		int r;
+
+		ref = kzalloc(sizeof(*ref), GFP_KERNEL);
+		if (!ref)
+			break;
+
+		ref->buffer = info->tr->buffer;
+		ref->page = ring_buffer_alloc_read_page(ref->buffer);
+		if (!ref->page) {
+			kfree(ref);
+			break;
+		}
+
+		r = ring_buffer_read_page(ref->buffer, &ref->page,
+					  len, info->cpu, 0);
+		if (r < 0) {
+			ring_buffer_free_read_page(ref->buffer,
+						   ref->page);
+			kfree(ref);
+			break;
+		}
+
+		/*
+		 * zero out any left over data, this is going to
+		 * user land.
+		 */
+		size = ring_buffer_page_len(ref->page);
+		if (size < PAGE_SIZE)
+			memset(ref->page + size, 0, PAGE_SIZE - size);
+
+		page = virt_to_page(ref->page);
+
+		spd.pages[i] = page;
+		spd.partial[i].len = PAGE_SIZE;
+		spd.partial[i].offset = 0;
+		spd.partial[i].private = (unsigned long)ref;
+		spd.nr_pages++;
+	}
+
+	spd.nr_pages = i;
+
+	/* did we read anything? */
+	if (!spd.nr_pages) {
+		if (flags & SPLICE_F_NONBLOCK)
+			ret = -EAGAIN;
+		else
+			ret = 0;
+		/* TODO: block */
+		return ret;
+	}
+
+	ret = splice_to_pipe(pipe, &spd);
+
+	return ret;
+}
+
+static const struct file_operations tracing_buffers_fops = {
+	.open		= tracing_buffers_open,
+	.read		= tracing_buffers_read,
+	.release	= tracing_buffers_release,
+	.splice_read	= tracing_buffers_splice_read,
+	.llseek		= no_llseek,
+};
+
 #ifdef CONFIG_DYNAMIC_FTRACE
 
 int __weak ftrace_arch_read_dyn_info(char *buf, int size)
@@ -3399,6 +3638,7 @@ static __init void create_trace_options_dir(void)
 static __init int tracer_init_debugfs(void)
 {
 	struct dentry *d_tracer;
+	struct dentry *buffers;
 	struct dentry *entry;
 	int cpu;
 
@@ -3471,6 +3711,26 @@ static __init int tracer_init_debugfs(void)
 		pr_warning("Could not create debugfs "
 			   "'trace_marker' entry\n");
 
+	buffers = debugfs_create_dir("binary_buffers", d_tracer);
+
+	if (!buffers)
+		pr_warning("Could not create buffers directory\n");
+	else {
+		int cpu;
+		char buf[64];
+
+		for_each_tracing_cpu(cpu) {
+			sprintf(buf, "%d", cpu);
+
+			entry = debugfs_create_file(buf, 0444, buffers,
+						    (void *)(long)cpu,
+						    &tracing_buffers_fops);
+			if (!entry)
+				pr_warning("Could not create debugfs buffers "
+					   "'%s' entry\n", buf);
+		}
+	}
+
 #ifdef CONFIG_DYNAMIC_FTRACE
 	entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer,
 				    &ftrace_update_tot_cnt,
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index e606633fb498..561bb5c5d988 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -217,6 +217,7 @@ enum trace_flag_type {
  */
 struct trace_array_cpu {
 	atomic_t		disabled;
+	void			*buffer_page;	/* ring buffer spare */
 
 	/* these fields get copied into max-trace: */
 	unsigned long		trace_idx;
-- 
cgit v1.2.3


From 1c21f14ec48a2256fb03682b24dddd23eacdc96f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 4 Mar 2009 13:51:13 +0100
Subject: lockdep: fix incorrect state name

In the recent mark_lock_irq() rework a bug snuck in that would report the
state of write locks causing irq inversion under a read lock as a read
lock.

Fix this by masking the read bit of the state when validating write
dependencies.

Reported-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1236172646.5330.7450.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 022d2ed7fd8b..ef6584fd9fe5 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -2015,7 +2015,8 @@ typedef int (*check_usage_f)(struct task_struct *, struct held_lock *,
 			     enum lock_usage_bit bit, const char *name);
 
 static int
-mark_lock_irq(struct task_struct *curr, struct held_lock *this, int new_bit)
+mark_lock_irq(struct task_struct *curr, struct held_lock *this,
+		enum lock_usage_bit new_bit)
 {
 	int excl_bit = exclusive_bit(new_bit);
 	int read = new_bit & 1;
@@ -2043,7 +2044,7 @@ mark_lock_irq(struct task_struct *curr, struct held_lock *this, int new_bit)
 	 * states.
 	 */
 	if ((!read || !dir || STRICT_READ_CHECKS) &&
-			!usage(curr, this, excl_bit, state_name(new_bit)))
+			!usage(curr, this, excl_bit, state_name(new_bit & ~1)))
 		return 0;
 
 	/*
-- 
cgit v1.2.3


From 26575e28df5eb2050c02369843faba38cecb4d8c Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 4 Mar 2009 14:53:24 +0100
Subject: lockdep: remove extra "irq" string

Impact: clarify lockdep printk text

print_irq_inversion_bug() gets handed state strings of the form

  "HARDIRQ", "SOFTIRQ", "RECLAIM_FS"

and appends "-irq-{un,}safe" to them, which is either redudant for *IRQ or
confusing in the RECLAIM_FS case.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1236175192.5330.7585.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index ef6584fd9fe5..02014f7ccc86 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -1900,9 +1900,9 @@ print_irq_inversion_bug(struct task_struct *curr, struct lock_class *other,
 		curr->comm, task_pid_nr(curr));
 	print_lock(this);
 	if (forwards)
-		printk("but this lock took another, %s-irq-unsafe lock in the past:\n", irqclass);
+		printk("but this lock took another, %s-unsafe lock in the past:\n", irqclass);
 	else
-		printk("but this lock was taken by another, %s-irq-safe lock in the past:\n", irqclass);
+		printk("but this lock was taken by another, %s-safe lock in the past:\n", irqclass);
 	print_lock_name(other);
 	printk("\n\nand interrupts could create inverse lock ordering between them.\n\n");
 
-- 
cgit v1.2.3


From efed792d6738964f399a508ef9e831cd60fa4657 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Wed, 4 Mar 2009 12:32:55 +0100
Subject: tracing: add lockdep tracepoints for lock acquire/release

Augment the traces with lock names when lockdep is available:

 1)               |  down_read_trylock() {
 1)               |    _spin_lock_irqsave() {
 1)               |      /* lock_acquire: &sem->wait_lock */
 1)   4.201 us    |    }
 1)               |    _spin_unlock_irqrestore() {
 1)               |      /* lock_release: &sem->wait_lock */
 1)   3.523 us    |    }
 1)               |  /* lock_acquire: try read &mm->mmap_sem */
 1) + 13.386 us   |  }
 1)   1.635 us    |  find_vma();
 1)               |  handle_mm_fault() {
 1)               |    __do_fault() {
 1)               |      filemap_fault() {
 1)               |        find_lock_page() {
 1)               |          find_get_page() {
 1)               |            /* lock_acquire: read rcu_read_lock */
 1)               |            /* lock_release: rcu_read_lock */
 1)   5.697 us    |          }
 1)   8.158 us    |        }
 1) + 11.079 us   |      }
 1)               |      _spin_lock() {
 1)               |        /* lock_acquire: __pte_lockptr(page) */
 1)   3.949 us    |      }
 1)   1.460 us    |      page_add_file_rmap();
 1)               |      _spin_unlock() {
 1)               |        /* lock_release: __pte_lockptr(page) */
 1)   3.115 us    |      }
 1)               |      unlock_page() {
 1)   1.421 us    |        page_waitqueue();
 1)   1.220 us    |        __wake_up_bit();
 1)   6.519 us    |      }
 1) + 34.328 us   |    }
 1) + 37.452 us   |  }
 1)               |  up_read() {
 1)               |  /* lock_release: &mm->mmap_sem */
 1)               |    _spin_lock_irqsave() {
 1)               |      /* lock_acquire: &sem->wait_lock */
 1)   3.865 us    |    }
 1)               |    _spin_unlock_irqrestore() {
 1)               |      /* lock_release: &sem->wait_lock */
 1)   8.562 us    |    }
 1) + 17.370 us   |  }

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: =?ISO-8859-1?Q?T=F6r=F6k?= Edwin <edwintorok@gmail.com>
Cc: Jason Baron <jbaron@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1236166375.5330.7209.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/trace/lockdep.h             |  9 ++++++++
 include/trace/lockdep_event_types.h | 44 +++++++++++++++++++++++++++++++++++++
 include/trace/trace_event_types.h   |  1 +
 include/trace/trace_events.h        |  1 +
 kernel/lockdep.c                    | 17 ++++++++++++++
 kernel/trace/trace.c                | 14 +++++++-----
 kernel/trace/trace_events_stage_3.h |  4 ++--
 7 files changed, 82 insertions(+), 8 deletions(-)
 create mode 100644 include/trace/lockdep.h
 create mode 100644 include/trace/lockdep_event_types.h

(limited to 'kernel')

diff --git a/include/trace/lockdep.h b/include/trace/lockdep.h
new file mode 100644
index 000000000000..5ca67df87f2a
--- /dev/null
+++ b/include/trace/lockdep.h
@@ -0,0 +1,9 @@
+#ifndef _TRACE_LOCKDEP_H
+#define _TRACE_LOCKDEP_H
+
+#include <linux/lockdep.h>
+#include <linux/tracepoint.h>
+
+#include <trace/lockdep_event_types.h>
+
+#endif
diff --git a/include/trace/lockdep_event_types.h b/include/trace/lockdep_event_types.h
new file mode 100644
index 000000000000..f713d74a82b4
--- /dev/null
+++ b/include/trace/lockdep_event_types.h
@@ -0,0 +1,44 @@
+
+#ifndef TRACE_EVENT_FORMAT
+# error Do not include this file directly.
+# error Unless you know what you are doing.
+#endif
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM lock
+
+#ifdef CONFIG_LOCKDEP
+
+TRACE_FORMAT(lock_acquire,
+	TPPROTO(struct lockdep_map *lock, unsigned int subclass,
+		int trylock, int read, int check,
+		struct lockdep_map *next_lock, unsigned long ip),
+	TPARGS(lock, subclass, trylock, read, check, next_lock, ip),
+	TPFMT("%s%s%s", trylock ? "try " : "",
+		read ? "read " : "", lock->name)
+	);
+
+TRACE_FORMAT(lock_release,
+	TPPROTO(struct lockdep_map *lock, int nested, unsigned long ip),
+	TPARGS(lock, nested, ip),
+	TPFMT("%s", lock->name)
+	);
+
+#ifdef CONFIG_LOCK_STAT
+
+TRACE_FORMAT(lock_contended,
+	TPPROTO(struct lockdep_map *lock, unsigned long ip),
+	TPARGS(lock, ip),
+	TPFMT("%s", lock->name)
+	);
+
+TRACE_FORMAT(lock_acquired,
+	TPPROTO(struct lockdep_map *lock, unsigned long ip),
+	TPARGS(lock, ip),
+	TPFMT("%s", lock->name)
+	);
+
+#endif
+#endif
+
+#undef TRACE_SYSTEM
diff --git a/include/trace/trace_event_types.h b/include/trace/trace_event_types.h
index 33c8ed5ccb6c..df56f5694be6 100644
--- a/include/trace/trace_event_types.h
+++ b/include/trace/trace_event_types.h
@@ -2,3 +2,4 @@
 
 #include <trace/sched_event_types.h>
 #include <trace/irq_event_types.h>
+#include <trace/lockdep_event_types.h>
diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h
index ea2ef2051762..fd13750ca4ba 100644
--- a/include/trace/trace_events.h
+++ b/include/trace/trace_events.h
@@ -2,3 +2,4 @@
 
 #include <trace/sched.h>
 #include <trace/irq.h>
+#include <trace/lockdep.h>
diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 02014f7ccc86..cb70c1db85d0 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -42,6 +42,7 @@
 #include <linux/hash.h>
 #include <linux/ftrace.h>
 #include <linux/stringify.h>
+#include <trace/lockdep.h>
 
 #include <asm/sections.h>
 
@@ -2913,6 +2914,8 @@ void lock_set_class(struct lockdep_map *lock, const char *name,
 }
 EXPORT_SYMBOL_GPL(lock_set_class);
 
+DEFINE_TRACE(lock_acquire);
+
 /*
  * We are not always called with irqs disabled - do that here,
  * and also avoid lockdep recursion:
@@ -2923,6 +2926,8 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 {
 	unsigned long flags;
 
+	trace_lock_acquire(lock, subclass, trylock, read, check, nest_lock, ip);
+
 	if (unlikely(current->lockdep_recursion))
 		return;
 
@@ -2937,11 +2942,15 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 }
 EXPORT_SYMBOL_GPL(lock_acquire);
 
+DEFINE_TRACE(lock_release);
+
 void lock_release(struct lockdep_map *lock, int nested,
 			  unsigned long ip)
 {
 	unsigned long flags;
 
+	trace_lock_release(lock, nested, ip);
+
 	if (unlikely(current->lockdep_recursion))
 		return;
 
@@ -3090,10 +3099,14 @@ found_it:
 	lock->ip = ip;
 }
 
+DEFINE_TRACE(lock_contended);
+
 void lock_contended(struct lockdep_map *lock, unsigned long ip)
 {
 	unsigned long flags;
 
+	trace_lock_contended(lock, ip);
+
 	if (unlikely(!lock_stat))
 		return;
 
@@ -3109,10 +3122,14 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip)
 }
 EXPORT_SYMBOL_GPL(lock_contended);
 
+DEFINE_TRACE(lock_acquired);
+
 void lock_acquired(struct lockdep_map *lock, unsigned long ip)
 {
 	unsigned long flags;
 
+	trace_lock_acquired(lock, ip);
+
 	if (unlikely(!lock_stat))
 		return;
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 12539f72f4a5..c8abbb0c8397 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -623,7 +623,7 @@ static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
 static unsigned map_cmdline_to_pid[SAVED_CMDLINES];
 static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN];
 static int cmdline_idx;
-static DEFINE_SPINLOCK(trace_cmdline_lock);
+static raw_spinlock_t trace_cmdline_lock = __RAW_SPIN_LOCK_UNLOCKED;
 
 /* temporary disable recording */
 static atomic_t trace_record_cmdline_disabled __read_mostly;
@@ -735,7 +735,7 @@ static void trace_save_cmdline(struct task_struct *tsk)
 	 * nor do we want to disable interrupts,
 	 * so if we miss here, then better luck next time.
 	 */
-	if (!spin_trylock(&trace_cmdline_lock))
+	if (!__raw_spin_trylock(&trace_cmdline_lock))
 		return;
 
 	idx = map_pid_to_cmdline[tsk->pid];
@@ -753,7 +753,7 @@ static void trace_save_cmdline(struct task_struct *tsk)
 
 	memcpy(&saved_cmdlines[idx], tsk->comm, TASK_COMM_LEN);
 
-	spin_unlock(&trace_cmdline_lock);
+	__raw_spin_unlock(&trace_cmdline_lock);
 }
 
 char *trace_find_cmdline(int pid)
@@ -3751,7 +3751,7 @@ static __init int tracer_init_debugfs(void)
 
 int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
 {
-	static DEFINE_SPINLOCK(trace_buf_lock);
+	static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED;
 	static char trace_buf[TRACE_BUF_SIZE];
 
 	struct ring_buffer_event *event;
@@ -3773,7 +3773,8 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
 		goto out;
 
 	pause_graph_tracing();
-	spin_lock_irqsave(&trace_buf_lock, irq_flags);
+	raw_local_irq_save(irq_flags);
+	__raw_spin_lock(&trace_buf_lock);
 	len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args);
 
 	len = min(len, TRACE_BUF_SIZE-1);
@@ -3792,7 +3793,8 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
 	ring_buffer_unlock_commit(tr->buffer, event);
 
  out_unlock:
-	spin_unlock_irqrestore(&trace_buf_lock, irq_flags);
+	__raw_spin_unlock(&trace_buf_lock);
+	raw_local_irq_restore(irq_flags);
 	unpause_graph_tracing();
  out:
 	preempt_enable_notrace();
diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h
index 041789ffbac1..2c8d76c7dbed 100644
--- a/kernel/trace/trace_events_stage_3.h
+++ b/kernel/trace/trace_events_stage_3.h
@@ -5,7 +5,7 @@
  *
  * static void ftrace_event_<call>(proto)
  * {
- * 	event_trace_printk(_RET_IP_, "(<call>) " <fmt>);
+ * 	event_trace_printk(_RET_IP_, "<call>: " <fmt>);
  * }
  *
  * static int ftrace_reg_event_<call>(void)
@@ -112,7 +112,7 @@
 #define _TRACE_FORMAT(call, proto, args, fmt)				\
 static void ftrace_event_##call(proto)					\
 {									\
-	event_trace_printk(_RET_IP_, "(" #call ") " fmt);		\
+	event_trace_printk(_RET_IP_, #call ": " fmt);			\
 }									\
 									\
 static int ftrace_reg_event_##call(void)				\
-- 
cgit v1.2.3


From 64ca5ab913f1594ef316556e65f5eae63ff50cee Mon Sep 17 00:00:00 2001
From: Eric Dumazet <dada1@cosmosbay.com>
Date: Wed, 4 Mar 2009 12:11:56 -0800
Subject: rcu: increment quiescent state counter in ksoftirqd()

If a machine is flooded by network frames, a cpu can loop
100% of its time inside ksoftirqd() without calling schedule().
This can delay RCU grace period to insane values.

Adding rcu_qsctr_inc() call in ksoftirqd() solves this problem.

Paul: "This regression was a result of the recent change from
"schedule()" to "cond_resched()", which got rid of that quiescent
state in the common case where a reschedule is not needed".

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/softirq.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/softirq.c b/kernel/softirq.c
index bdbe9de9cd8d..9041ea7948fe 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -626,6 +626,7 @@ static int ksoftirqd(void * __bind_cpu)
 			preempt_enable_no_resched();
 			cond_resched();
 			preempt_disable();
+			rcu_qsctr_inc((long)__bind_cpu);
 		}
 		preempt_enable();
 		set_current_state(TASK_INTERRUPTIBLE);
-- 
cgit v1.2.3


From e543ad76914abec1acf6631604a4154cd7a2ca6b Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 4 Mar 2009 18:20:36 -0500
Subject: tracing: add cpu_file intialization for ftrace_dump

Impact: fix to ftrace_dump output corruption

The commit: b04cc6b1f6398b0e0b60d37e27ce51b4899672ec
  tracing/core: introduce per cpu tracing files

added a new field to the iterator called cpu_file. This was a handle
to differentiate between the per cpu trace output files and the
all cpu "trace" file. The all cpu "trace" file required setting this
to TRACE_PIPE_ALL_CPU.

The problem is that the ftrace_dump sets up its own iterator but was
not updated to handle this change. The result was only CPU 0 printing
out on crash and a lot of "<0>"'s also being printed.

Reported-by: Thomas Gleixner <tglx@linuxtronix.de>
Tested-by: Darren Hart <dvhtc@us.ibm.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c8abbb0c8397..ab5cbcae43a1 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3918,8 +3918,10 @@ void ftrace_dump(void)
 
 	printk(KERN_TRACE "Dumping ftrace buffer:\n");
 
+	/* Simulate the iterator */
 	iter.tr = &global_trace;
 	iter.trace = current_trace;
+	iter.cpu_file = TRACE_PIPE_ALL_CPU;
 
 	/*
 	 * We need to stop all tracing on all CPUS to read the
-- 
cgit v1.2.3


From 4f3640f8a358f2183a8c966f299eeb55ca523e06 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 3 Mar 2009 23:52:42 -0500
Subject: ring-buffer: fix timestamp in partial ring_buffer_page_read

If a partial ring_buffer_page_read happens, then some of the
incremental timestamps may be lost. This patch writes the
recent timestamp into the page that is passed back to the caller.

A partial ring_buffer_page_read is where the full page would not
be written back to the user, and instead, just part of the page
is copied to the user. A full page would be a page swap with the
ring buffer and the timestamps would be correct.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ring_buffer.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index f2a163db52f9..f7473645b9c6 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2461,6 +2461,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 	unsigned long flags;
 	unsigned int commit;
 	unsigned int read;
+	u64 save_timestamp;
 	int ret = -1;
 
 	/*
@@ -2515,6 +2516,9 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 		if (len < size)
 			goto out;
 
+		/* save the current timestamp, since the user will need it */
+		save_timestamp = cpu_buffer->read_stamp;
+
 		/* Need to copy one event at a time */
 		do {
 			memcpy(bpage->data + pos, rpage->data + rpos, size);
@@ -2531,7 +2535,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 
 		/* update bpage */
 		local_set(&bpage->commit, pos);
-		bpage->time_stamp = rpage->time_stamp;
+		bpage->time_stamp = save_timestamp;
 
 		/* we copied everything to the beginning */
 		read = 0;
-- 
cgit v1.2.3


From 2dc5d12b1f43134e9bc5037f69f4739cfdfab93e Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 4 Mar 2009 19:10:05 -0500
Subject: tracing: do not return EFAULT if read copied anything

Impact: fix trace read to conform to standards

Andrew Morton, Theodore Tso and H. Peter Anvin brought to my attention
that a userspace read should not return -EFAULT if it succeeded in
copying anything. It should only return -EFAULT if it failed to copy
at all.

This patch modifies the check of copy_from_user and updates the return
code appropriately.

I also used H. Peter Anvin's short cut rule to just test ret == count.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ab5cbcae43a1..57155dc53530 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -346,6 +346,9 @@ ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)
 	int len;
 	int ret;
 
+	if (!cnt)
+		return 0;
+
 	if (s->len <= s->readpos)
 		return -EBUSY;
 
@@ -353,9 +356,11 @@ ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)
 	if (cnt > len)
 		cnt = len;
 	ret = copy_to_user(ubuf, s->buffer + s->readpos, cnt);
-	if (ret)
+	if (ret == cnt)
 		return -EFAULT;
 
+	cnt -= ret;
+
 	s->readpos += len;
 	return cnt;
 }
@@ -3049,6 +3054,9 @@ tracing_buffers_read(struct file *filp, char __user *ubuf,
 	ssize_t ret;
 	size_t size;
 
+	if (!count)
+		return 0;
+
 	/* Do we have previous read data to read? */
 	if (info->read < PAGE_SIZE)
 		goto read;
@@ -3073,8 +3081,10 @@ read:
 		size = count;
 
 	ret = copy_to_user(ubuf, info->spare + info->read, size);
-	if (ret)
+	if (ret == size)
 		return -EFAULT;
+	size -= ret;
+
 	*ppos += size;
 	info->read += size;
 
-- 
cgit v1.2.3


From e74da5235cec6cb71eb338c987f876ecc793138b Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 4 Mar 2009 20:31:11 -0500
Subject: tracing: fix seq read from trace files

The buffer used by trace_seq was updated incorrectly. Instead
of consuming what was actually read, it consumed the rest of the
buffer on reads.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 57155dc53530..2e53e6f09440 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -361,7 +361,7 @@ ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)
 
 	cnt -= ret;
 
-	s->readpos += len;
+	s->readpos += cnt;
 	return cnt;
 }
 
@@ -380,7 +380,7 @@ ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
 	if (!ret)
 		return -EFAULT;
 
-	s->readpos += len;
+	s->readpos += cnt;
 	return cnt;
 }
 
-- 
cgit v1.2.3


From c032ef64d680717e4e8ce3da65da6419a35f8a2c Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 4 Mar 2009 20:34:24 -0500
Subject: tracing: add latency output format option

With the removal of the latency_trace file, we lost the ability
to see some of the finer details in a trace. Like the state of
interrupts enabled, the preempt count, need resched, and if we
are in an interrupt handler, softirq handler or not.

This patch simply creates an option to bring back the old format.
This also removes the warning about an unused variable that held
the latency_trace file operations.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c | 24 ++----------------------
 kernel/trace/trace.h |  3 ++-
 2 files changed, 4 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 2e53e6f09440..55fcbb567950 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -299,6 +299,7 @@ static const char *trace_options[] = {
 	"sym-userobj",
 	"printk-msg-only",
 	"context-info",
+	"latency-format",
 	NULL
 };
 
@@ -1829,26 +1830,12 @@ static int tracing_open(struct inode *inode, struct file *file)
 	iter = __tracing_open(inode, file);
 	if (IS_ERR(iter))
 		ret = PTR_ERR(iter);
-
-	return ret;
-}
-
-static int tracing_lt_open(struct inode *inode, struct file *file)
-{
-	struct trace_iterator *iter;
-	int ret = 0;
-
-	iter = __tracing_open(inode, file);
-
-	if (IS_ERR(iter))
-		ret = PTR_ERR(iter);
-	else
+	else if (trace_flags & TRACE_ITER_LATENCY_FMT)
 		iter->iter_flags |= TRACE_FILE_LAT_FMT;
 
 	return ret;
 }
 
-
 static void *
 t_next(struct seq_file *m, void *v, loff_t *pos)
 {
@@ -1927,13 +1914,6 @@ static struct file_operations tracing_fops = {
 	.release	= tracing_release,
 };
 
-static struct file_operations tracing_lt_fops = {
-	.open		= tracing_lt_open,
-	.read		= seq_read,
-	.llseek		= seq_lseek,
-	.release	= tracing_release,
-};
-
 static struct file_operations show_traces_fops = {
 	.open		= show_traces_open,
 	.read		= seq_read,
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 561bb5c5d988..12cd119cca32 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -651,7 +651,8 @@ enum trace_iterator_flags {
 	TRACE_ITER_USERSTACKTRACE       = 0x4000,
 	TRACE_ITER_SYM_USEROBJ          = 0x8000,
 	TRACE_ITER_PRINTK_MSGONLY	= 0x10000,
-	TRACE_ITER_CONTEXT_INFO		= 0x20000 /* Print pid/cpu/time */
+	TRACE_ITER_CONTEXT_INFO		= 0x20000, /* Print pid/cpu/time */
+	TRACE_ITER_LATENCY_FMT		= 0x40000,
 };
 
 /*
-- 
cgit v1.2.3


From 5fd73f862468280d4cbb5ba4321502f911f9f89a Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 4 Mar 2009 21:42:04 -0500
Subject: tracing: remove extra latency_trace method from trace structure

Impact: clean up

The trace and latency_trace function pointers are identical for
every tracer but the function tracer. The differences in the function
tracer are trivial (latency output puts paranthesis around parent).

This patch removes the latency_trace pointer and all prints will
now just use the trace output function pointer.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/blktrace.c     |  1 -
 kernel/trace/trace.c        |  2 +-
 kernel/trace/trace_branch.c |  1 -
 kernel/trace/trace_output.c | 32 --------------------------------
 kernel/trace/trace_output.h |  1 -
 5 files changed, 1 insertion(+), 36 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index e82cb9e930cc..e39679a72a3b 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1231,7 +1231,6 @@ static struct tracer blk_tracer __read_mostly = {
 static struct trace_event trace_blk_event = {
 	.type	 	= TRACE_BLK,
 	.trace		= blk_trace_event_print,
-	.latency_trace	= blk_trace_event_print,
 	.binary		= blk_trace_event_print_binary,
 };
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 55fcbb567950..21b89ecb8480 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1485,7 +1485,7 @@ static enum print_line_t print_lat_fmt(struct trace_iterator *iter)
 	}
 
 	if (event)
-		return event->latency_trace(iter, sym_flags);
+		return event->trace(iter, sym_flags);
 
 	if (!trace_seq_printf(s, "Unknown type %d\n", entry->type))
 		goto partial;
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index c2e68d440c4d..aaa0755268b9 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -159,7 +159,6 @@ static enum print_line_t trace_branch_print(struct trace_iterator *iter,
 static struct trace_event trace_branch_event = {
 	.type	 	= TRACE_BRANCH,
 	.trace		= trace_branch_print,
-	.latency_trace	= trace_branch_print,
 };
 
 static struct tracer branch_trace __read_mostly =
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 9fc815031b09..306fef84c503 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -437,8 +437,6 @@ int register_ftrace_event(struct trace_event *event)
 
 	if (event->trace == NULL)
 		event->trace = trace_nop_print;
-	if (event->latency_trace == NULL)
-		event->latency_trace = trace_nop_print;
 	if (event->raw == NULL)
 		event->raw = trace_nop_print;
 	if (event->hex == NULL)
@@ -480,29 +478,6 @@ enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags)
 }
 
 /* TRACE_FN */
-static enum print_line_t trace_fn_latency(struct trace_iterator *iter,
-					  int flags)
-{
-	struct ftrace_entry *field;
-	struct trace_seq *s = &iter->seq;
-
-	trace_assign_type(field, iter->ent);
-
-	if (!seq_print_ip_sym(s, field->ip, flags))
-		goto partial;
-	if (!trace_seq_puts(s, " ("))
-		goto partial;
-	if (!seq_print_ip_sym(s, field->parent_ip, flags))
-		goto partial;
-	if (!trace_seq_puts(s, ")\n"))
-		goto partial;
-
-	return TRACE_TYPE_HANDLED;
-
- partial:
-	return TRACE_TYPE_PARTIAL_LINE;
-}
-
 static enum print_line_t trace_fn_trace(struct trace_iterator *iter, int flags)
 {
 	struct ftrace_entry *field;
@@ -573,7 +548,6 @@ static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags)
 static struct trace_event trace_fn_event = {
 	.type	 	= TRACE_FN,
 	.trace		= trace_fn_trace,
-	.latency_trace	= trace_fn_latency,
 	.raw		= trace_fn_raw,
 	.hex		= trace_fn_hex,
 	.binary		= trace_fn_bin,
@@ -705,7 +679,6 @@ static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter,
 static struct trace_event trace_ctx_event = {
 	.type	 	= TRACE_CTX,
 	.trace		= trace_ctx_print,
-	.latency_trace	= trace_ctx_print,
 	.raw		= trace_ctx_raw,
 	.hex		= trace_ctx_hex,
 	.binary		= trace_ctxwake_bin,
@@ -714,7 +687,6 @@ static struct trace_event trace_ctx_event = {
 static struct trace_event trace_wake_event = {
 	.type	 	= TRACE_WAKE,
 	.trace		= trace_wake_print,
-	.latency_trace	= trace_wake_print,
 	.raw		= trace_wake_raw,
 	.hex		= trace_wake_hex,
 	.binary		= trace_ctxwake_bin,
@@ -770,7 +742,6 @@ static enum print_line_t trace_special_bin(struct trace_iterator *iter,
 static struct trace_event trace_special_event = {
 	.type	 	= TRACE_SPECIAL,
 	.trace		= trace_special_print,
-	.latency_trace	= trace_special_print,
 	.raw		= trace_special_print,
 	.hex		= trace_special_hex,
 	.binary		= trace_special_bin,
@@ -808,7 +779,6 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
 static struct trace_event trace_stack_event = {
 	.type	 	= TRACE_STACK,
 	.trace		= trace_stack_print,
-	.latency_trace	= trace_stack_print,
 	.raw		= trace_special_print,
 	.hex		= trace_special_hex,
 	.binary		= trace_special_bin,
@@ -838,7 +808,6 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
 static struct trace_event trace_user_stack_event = {
 	.type	 	= TRACE_USER_STACK,
 	.trace		= trace_user_stack_print,
-	.latency_trace	= trace_user_stack_print,
 	.raw		= trace_special_print,
 	.hex		= trace_special_hex,
 	.binary		= trace_special_bin,
@@ -883,7 +852,6 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags)
 static struct trace_event trace_print_event = {
 	.type	 	= TRACE_PRINT,
 	.trace		= trace_print_print,
-	.latency_trace	= trace_print_print,
 	.raw		= trace_print_raw,
 };
 
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 551a25a72217..8a34d688ed63 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -10,7 +10,6 @@ struct trace_event {
 	struct hlist_node	node;
 	int			type;
 	trace_print_func	trace;
-	trace_print_func	latency_trace;
 	trace_print_func	raw;
 	trace_print_func	hex;
 	trace_print_func	binary;
-- 
cgit v1.2.3


From 27d48be84477d2f0a2e2ac3738a3971dece631d5 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 4 Mar 2009 21:57:29 -0500
Subject: tracing: consolidate print_lat_fmt and print_trace_fmt

Impact: clean up

Both print_lat_fmt and print_trace_fmt do pretty much the same thing
except for one different function call. This patch consolidates the
two functions and adds an if statement to perform the difference.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c | 39 +++++++--------------------------------
 1 file changed, 7 insertions(+), 32 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 21b89ecb8480..d1ef43999d9e 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1468,33 +1468,6 @@ static void test_cpu_buff_start(struct trace_iterator *iter)
 	trace_seq_printf(s, "##### CPU %u buffer started ####\n", iter->cpu);
 }
 
-static enum print_line_t print_lat_fmt(struct trace_iterator *iter)
-{
-	struct trace_seq *s = &iter->seq;
-	unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
-	struct trace_event *event;
-	struct trace_entry *entry = iter->ent;
-
-	test_cpu_buff_start(iter);
-
-	event = ftrace_find_event(entry->type);
-
-	if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
-		if (!trace_print_lat_context(iter))
-			goto partial;
-	}
-
-	if (event)
-		return event->trace(iter, sym_flags);
-
-	if (!trace_seq_printf(s, "Unknown type %d\n", entry->type))
-		goto partial;
-
-	return TRACE_TYPE_HANDLED;
-partial:
-	return TRACE_TYPE_PARTIAL_LINE;
-}
-
 static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
 {
 	struct trace_seq *s = &iter->seq;
@@ -1509,8 +1482,13 @@ static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
 	event = ftrace_find_event(entry->type);
 
 	if (trace_flags & TRACE_ITER_CONTEXT_INFO) {
-		if (!trace_print_context(iter))
-			goto partial;
+		if (iter->iter_flags & TRACE_FILE_LAT_FMT) {
+			if (!trace_print_lat_context(iter))
+				goto partial;
+		} else {
+			if (!trace_print_context(iter))
+				goto partial;
+		}
 	}
 
 	if (event)
@@ -1652,9 +1630,6 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter)
 	if (trace_flags & TRACE_ITER_RAW)
 		return print_raw_fmt(iter);
 
-	if (iter->iter_flags & TRACE_FILE_LAT_FMT)
-		return print_lat_fmt(iter);
-
 	return print_trace_fmt(iter);
 }
 
-- 
cgit v1.2.3


From e9d25fe6eaa2c720bb3ea661b660e58d54fa38bf Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 4 Mar 2009 22:15:30 -0500
Subject: tracing: have latency tracers set the latency format

The latency tracers (irqsoff, preemptoff, preemptirqsoff, and wakeup)
are pretty useless with the default output format. This patch makes them
automatically enable the latency format when they are selected. They
also record the state of the latency option, and if it was not enabled
when selected, they disable it on reset.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_irqsoff.c      | 8 ++++++++
 kernel/trace/trace_sched_wakeup.c | 8 ++++++++
 2 files changed, 16 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c
index 9e5ebd844158..b923d13e2fad 100644
--- a/kernel/trace/trace_irqsoff.c
+++ b/kernel/trace/trace_irqsoff.c
@@ -32,6 +32,8 @@ enum {
 
 static int trace_type __read_mostly;
 
+static int save_lat_flag;
+
 #ifdef CONFIG_PREEMPT_TRACER
 static inline int
 preempt_trace(void)
@@ -370,6 +372,9 @@ static void stop_irqsoff_tracer(struct trace_array *tr)
 
 static void __irqsoff_tracer_init(struct trace_array *tr)
 {
+	save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT;
+	trace_flags |= TRACE_ITER_LATENCY_FMT;
+
 	tracing_max_latency = 0;
 	irqsoff_trace = tr;
 	/* make sure that the tracer is visible */
@@ -380,6 +385,9 @@ static void __irqsoff_tracer_init(struct trace_array *tr)
 static void irqsoff_tracer_reset(struct trace_array *tr)
 {
 	stop_irqsoff_tracer(tr);
+
+	if (!save_lat_flag)
+		trace_flags &= ~TRACE_ITER_LATENCY_FMT;
 }
 
 static void irqsoff_tracer_start(struct trace_array *tr)
diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index db55f7aaa640..3c5ad6b2ec84 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -32,6 +32,8 @@ static raw_spinlock_t wakeup_lock =
 
 static void __wakeup_reset(struct trace_array *tr);
 
+static int save_lat_flag;
+
 #ifdef CONFIG_FUNCTION_TRACER
 /*
  * irqsoff uses its own tracer function to keep the overhead down:
@@ -324,6 +326,9 @@ static void stop_wakeup_tracer(struct trace_array *tr)
 
 static int __wakeup_tracer_init(struct trace_array *tr)
 {
+	save_lat_flag = trace_flags & TRACE_ITER_LATENCY_FMT;
+	trace_flags |= TRACE_ITER_LATENCY_FMT;
+
 	tracing_max_latency = 0;
 	wakeup_trace = tr;
 	start_wakeup_tracer(tr);
@@ -347,6 +352,9 @@ static void wakeup_tracer_reset(struct trace_array *tr)
 	stop_wakeup_tracer(tr);
 	/* make sure we put back any tasks we are tracing */
 	wakeup_reset(tr);
+
+	if (!save_lat_flag)
+		trace_flags &= ~TRACE_ITER_LATENCY_FMT;
 }
 
 static void wakeup_tracer_start(struct trace_array *tr)
-- 
cgit v1.2.3


From 5e1607a00bd082972629d3d68c95c8bcf902b55a Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 5 Mar 2009 10:24:48 +0100
Subject: tracing: rename ftrace_printk() => trace_printk()

Impact: cleanup

Use a more generic name - this also allows the prototype to move
to kernel.h and be generally available to kernel developers who
want to do some quick tracing.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/ftrace.txt |  6 +++---
 include/linux/ftrace.h   | 18 +++++++++---------
 kernel/trace/trace.c     |  8 ++++----
 kernel/trace/trace.h     |  2 +-
 4 files changed, 17 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/ftrace.txt b/Documentation/ftrace.txt
index 2041ee951c1a..22614bef6359 100644
--- a/Documentation/ftrace.txt
+++ b/Documentation/ftrace.txt
@@ -1466,11 +1466,11 @@ want, depending on your needs.
 
 
 You can put some comments on specific functions by using
-ftrace_printk() For example, if you want to put a comment inside
+trace_printk() For example, if you want to put a comment inside
 the __might_sleep() function, you just have to include
-<linux/ftrace.h> and call ftrace_printk() inside __might_sleep()
+<linux/ftrace.h> and call trace_printk() inside __might_sleep()
 
-ftrace_printk("I'm a comment!\n")
+trace_printk("I'm a comment!\n")
 
 will produce:
 
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 1f69ac7c1587..fbb9c364e166 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -329,11 +329,11 @@ extern void
 ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3);
 
 /**
- * ftrace_printk - printf formatting in the ftrace buffer
+ * trace_printk - printf formatting in the ftrace buffer
  * @fmt: the printf format for printing
  *
- * Note: __ftrace_printk is an internal function for ftrace_printk and
- *       the @ip is passed in via the ftrace_printk macro.
+ * Note: __trace_printk is an internal function for trace_printk and
+ *       the @ip is passed in via the trace_printk macro.
  *
  * This function allows a kernel developer to debug fast path sections
  * that printk is not appropriate for. By scattering in various
@@ -341,14 +341,14 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3);
  * where problems are occurring.
  *
  * This is intended as a debugging tool for the developer only.
- * Please refrain from leaving ftrace_printks scattered around in
+ * Please refrain from leaving trace_printks scattered around in
  * your code.
  */
-# define ftrace_printk(fmt...) __ftrace_printk(_THIS_IP_, fmt)
+# define trace_printk(fmt...) __trace_printk(_THIS_IP_, fmt)
 extern int
-__ftrace_printk(unsigned long ip, const char *fmt, ...)
+__trace_printk(unsigned long ip, const char *fmt, ...)
 	__attribute__ ((format (printf, 2, 3)));
-# define ftrace_vprintk(fmt, ap) __ftrace_printk(_THIS_IP_, fmt, ap)
+# define ftrace_vprintk(fmt, ap) __trace_printk(_THIS_IP_, fmt, ap)
 extern int
 __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap);
 extern void ftrace_dump(void);
@@ -356,13 +356,13 @@ extern void ftrace_dump(void);
 static inline void
 ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3) { }
 static inline int
-ftrace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 2)));
+trace_printk(const char *fmt, ...) __attribute__ ((format (printf, 1, 2)));
 
 static inline void tracing_start(void) { }
 static inline void tracing_stop(void) { }
 static inline void ftrace_off_permanent(void) { }
 static inline int
-ftrace_printk(const char *fmt, ...)
+trace_printk(const char *fmt, ...)
 {
 	return 0;
 }
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index d1ef43999d9e..c0e9c1263393 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -48,7 +48,7 @@ unsigned long __read_mostly	tracing_thresh;
  * We need to change this state when a selftest is running.
  * A selftest will lurk into the ring-buffer to count the
  * entries inserted during the selftest although some concurrent
- * insertions into the ring-buffer such as ftrace_printk could occurred
+ * insertions into the ring-buffer such as trace_printk could occurred
  * at the same time, giving false positive or negative results.
  */
 static bool __read_mostly tracing_selftest_running;
@@ -291,7 +291,7 @@ static const char *trace_options[] = {
 	"block",
 	"stacktrace",
 	"sched-tree",
-	"ftrace_printk",
+	"trace_printk",
 	"ftrace_preempt",
 	"branch",
 	"annotate",
@@ -3768,7 +3768,7 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
 }
 EXPORT_SYMBOL_GPL(trace_vprintk);
 
-int __ftrace_printk(unsigned long ip, const char *fmt, ...)
+int __trace_printk(unsigned long ip, const char *fmt, ...)
 {
 	int ret;
 	va_list ap;
@@ -3781,7 +3781,7 @@ int __ftrace_printk(unsigned long ip, const char *fmt, ...)
 	va_end(ap);
 	return ret;
 }
-EXPORT_SYMBOL_GPL(__ftrace_printk);
+EXPORT_SYMBOL_GPL(__trace_printk);
 
 int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap)
 {
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 12cd119cca32..8beff03fda68 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -115,7 +115,7 @@ struct userstack_entry {
 };
 
 /*
- * ftrace_printk entry:
+ * trace_printk entry:
  */
 struct print_entry {
 	struct trace_entry	ent;
-- 
cgit v1.2.3


From 03d78913f01e8f6599823f00357ed17b32747d3d Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Thu, 5 Mar 2009 02:29:05 -0800
Subject: lockdep: remove duplicate CONFIG_DEBUG_LOCKDEP definitions

Impact: cleanup

The atomic debug modifiers are already defined in
kernel/lockdep_internals.h.

Signed-off-by: David Rientjes <rientjes@google.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <alpine.DEB.2.00.0903050222160.30401@chino.kir.corp.google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 7 -------
 1 file changed, 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 02014f7ccc86..9a1e2bcc4b8d 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -433,13 +433,6 @@ atomic_t nr_find_usage_forwards_checks;
 atomic_t nr_find_usage_forwards_recursions;
 atomic_t nr_find_usage_backwards_checks;
 atomic_t nr_find_usage_backwards_recursions;
-# define debug_atomic_inc(ptr)		atomic_inc(ptr)
-# define debug_atomic_dec(ptr)		atomic_dec(ptr)
-# define debug_atomic_read(ptr)		atomic_read(ptr)
-#else
-# define debug_atomic_inc(ptr)		do { } while (0)
-# define debug_atomic_dec(ptr)		do { } while (0)
-# define debug_atomic_read(ptr)		0
 #endif
 
 /*
-- 
cgit v1.2.3


From 0012693ad4f636c720fed3802027f9427962f540 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 5 Mar 2009 01:49:22 +0100
Subject: tracing/function-graph-tracer: use the more lightweight local clock

Impact: decrease hangs risks with the graph tracer on slow systems

Since the function graph tracer can spend too much time on timer
interrupts, it's better now to use the more lightweight local
clock. Anyway, the function graph traces are more reliable on a
per cpu trace.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <49af243d.06e9300a.53ad.ffff840c@mx.google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/ftrace.c             |  2 +-
 include/linux/ftrace.h               | 13 +++++++------
 kernel/trace/trace_functions_graph.c |  2 +-
 3 files changed, 9 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 3925ec0184b1..a85da1764b1c 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -436,7 +436,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
 		return;
 	}
 
-	calltime = cpu_clock(raw_smp_processor_id());
+	calltime = trace_clock_local();
 
 	if (ftrace_push_return_trace(old, calltime,
 				self_addr, &trace.depth) == -EBUSY) {
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 1f69ac7c1587..6ea62acbe4b6 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -1,15 +1,16 @@
 #ifndef _LINUX_FTRACE_H
 #define _LINUX_FTRACE_H
 
-#include <linux/linkage.h>
-#include <linux/fs.h>
-#include <linux/ktime.h>
-#include <linux/init.h>
-#include <linux/types.h>
-#include <linux/module.h>
+#include <linux/trace_clock.h>
 #include <linux/kallsyms.h>
+#include <linux/linkage.h>
 #include <linux/bitops.h>
+#include <linux/module.h>
+#include <linux/ktime.h>
 #include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/init.h>
+#include <linux/fs.h>
 
 #include <asm/ftrace.h>
 
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index c009553a8e81..e527f2f66c73 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -112,7 +112,7 @@ unsigned long ftrace_return_to_handler(void)
 	unsigned long ret;
 
 	ftrace_pop_return_trace(&trace, &ret);
-	trace.rettime = cpu_clock(raw_smp_processor_id());
+	trace.rettime = trace_clock_local();
 	ftrace_graph_return(&trace);
 
 	if (unlikely(!ret)) {
-- 
cgit v1.2.3


From 8a0be9ef8225638d26b455788f988c8f84ce9e75 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 5 Mar 2009 01:27:02 +0100
Subject: sched: don't rebalance if attached on NULL domain

Impact: fix function graph trace hang / drop pointless softirq on UP

While debugging a function graph trace hang on an old PII, I saw
that it consumed most of its time on the timer interrupt. And
the domain rebalancing softirq was the most concerned.

The timer interrupt calls trigger_load_balance() which will
decide if it is worth to schedule a rebalancing softirq.

In case of builtin UP kernel, no problem arises because there is
no domain question.

In case of builtin SMP kernel running on an SMP box, still no
problem, the softirq will be raised each time we reach the
next_balance time.

In case of builtin SMP kernel running on a UP box (most distros
provide default SMP kernels, whatever the box you have), then
the CPU is attached to the NULL sched domain. So a kind of
unexpected behaviour happen:

trigger_load_balance() -> raises the rebalancing softirq later
on softirq: run_rebalance_domains() -> rebalance_domains() where
the for_each_domain(cpu, sd) is not taken because of the NULL
domain we are attached at. Which means rq->next_balance is never
updated. So on the next timer tick, we will enter
trigger_load_balance() which will always reschedule() the
rebalacing softirq:

if (time_after_eq(jiffies, rq->next_balance))
	raise_softirq(SCHED_SOFTIRQ);

So for each tick, we process this pointless softirq.

This patch fixes it by checking if we are attached to the null
domain before raising the softirq, another possible fix would be
to set the maximal possible JIFFIES value to rq->next_balance if
we are attached to the NULL domain.

v2: build fix on UP

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <49af242d.1c07d00a.32d5.ffffc019@mx.google.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index dfae1bf6d5b2..e509dbd7d77f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4148,6 +4148,11 @@ static void run_rebalance_domains(struct softirq_action *h)
 #endif
 }
 
+static inline int on_null_domain(int cpu)
+{
+	return !rcu_dereference(cpu_rq(cpu)->sd);
+}
+
 /*
  * Trigger the SCHED_SOFTIRQ if it is time to do periodic load balancing.
  *
@@ -4205,7 +4210,9 @@ static inline void trigger_load_balance(struct rq *rq, int cpu)
 	    cpumask_test_cpu(cpu, nohz.cpu_mask))
 		return;
 #endif
-	if (time_after_eq(jiffies, rq->next_balance))
+	/* Don't need to rebalance while attached to NULL domain */
+	if (time_after_eq(jiffies, rq->next_balance) &&
+	    likely(!on_null_domain(cpu)))
 		raise_softirq(SCHED_SOFTIRQ);
 }
 
-- 
cgit v1.2.3


From 40ada30f9621fbd831ac2437b9a2a399aad34b00 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 5 Mar 2009 21:19:55 +0100
Subject: tracing: clean up menu

Clean up menu structure, introduce TRACING_SUPPORT switch that signals
whether an architecture supports various instrumentation mechanisms.

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/Kconfig         |  1 +
 kernel/trace/Kconfig | 29 ++++++++++++++++-------------
 2 files changed, 17 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/arch/Kconfig b/arch/Kconfig
index 550dab22daa1..a092dc77c24d 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -6,6 +6,7 @@ config OPROFILE
 	tristate "OProfile system profiling (EXPERIMENTAL)"
 	depends on PROFILING
 	depends on HAVE_OPROFILE
+	depends on TRACING_SUPPORT
 	select TRACING
 	select RING_BUFFER
 	help
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 999c6a2485df..5d733da5345a 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -53,12 +53,22 @@ config TRACING
 	select TRACEPOINTS
 	select NOP_TRACER
 
+#
+# Minimum requirements an architecture has to meet for us to
+# be able to offer generic tracing facilities:
+#
+config TRACING_SUPPORT
+	bool
+	depends on TRACE_IRQFLAGS_SUPPORT
+	depends on STACKTRACE_SUPPORT
+
+if TRACING_SUPPORT
+
 menu "Tracers"
 
 config FUNCTION_TRACER
 	bool "Kernel Function Tracer"
 	depends on HAVE_FUNCTION_TRACER
-	depends on DEBUG_KERNEL
 	select FRAME_POINTER
 	select KALLSYMS
 	select TRACING
@@ -91,7 +101,6 @@ config IRQSOFF_TRACER
 	default n
 	depends on TRACE_IRQFLAGS_SUPPORT
 	depends on GENERIC_TIME
-	depends on DEBUG_KERNEL
 	select TRACE_IRQFLAGS
 	select TRACING
 	select TRACER_MAX_TRACE
@@ -114,7 +123,6 @@ config PREEMPT_TRACER
 	default n
 	depends on GENERIC_TIME
 	depends on PREEMPT
-	depends on DEBUG_KERNEL
 	select TRACING
 	select TRACER_MAX_TRACE
 	help
@@ -142,7 +150,6 @@ config SYSPROF_TRACER
 
 config SCHED_TRACER
 	bool "Scheduling Latency Tracer"
-	depends on DEBUG_KERNEL
 	select TRACING
 	select CONTEXT_SWITCH_TRACER
 	select TRACER_MAX_TRACE
@@ -152,7 +159,6 @@ config SCHED_TRACER
 
 config CONTEXT_SWITCH_TRACER
 	bool "Trace process context switches"
-	depends on DEBUG_KERNEL
 	select TRACING
 	select MARKERS
 	help
@@ -161,7 +167,6 @@ config CONTEXT_SWITCH_TRACER
 
 config EVENT_TRACER
 	bool "Trace various events in the kernel"
-	depends on DEBUG_KERNEL
 	select TRACING
 	help
 	  This tracer hooks to various trace points in the kernel
@@ -170,7 +175,6 @@ config EVENT_TRACER
 
 config BOOT_TRACER
 	bool "Trace boot initcalls"
-	depends on DEBUG_KERNEL
 	select TRACING
 	select CONTEXT_SWITCH_TRACER
 	help
@@ -188,7 +192,6 @@ config BOOT_TRACER
 
 config TRACE_BRANCH_PROFILING
 	bool "Trace likely/unlikely profiler"
-	depends on DEBUG_KERNEL
 	select TRACING
 	help
 	  This tracer profiles all the the likely and unlikely macros
@@ -241,7 +244,6 @@ config BRANCH_TRACER
 
 config POWER_TRACER
 	bool "Trace power consumption behavior"
-	depends on DEBUG_KERNEL
 	depends on X86
 	select TRACING
 	help
@@ -253,7 +255,6 @@ config POWER_TRACER
 config STACK_TRACER
 	bool "Trace max stack"
 	depends on HAVE_FUNCTION_TRACER
-	depends on DEBUG_KERNEL
 	select FUNCTION_TRACER
 	select STACKTRACE
 	select KALLSYMS
@@ -343,7 +344,6 @@ config DYNAMIC_FTRACE
 	bool "enable/disable ftrace tracepoints dynamically"
 	depends on FUNCTION_TRACER
 	depends on HAVE_DYNAMIC_FTRACE
-	depends on DEBUG_KERNEL
 	default y
 	help
          This option will modify all the calls to ftrace dynamically
@@ -369,7 +369,7 @@ config FTRACE_SELFTEST
 
 config FTRACE_STARTUP_TEST
 	bool "Perform a startup test on ftrace"
-	depends on TRACING && DEBUG_KERNEL
+	depends on TRACING
 	select FTRACE_SELFTEST
 	help
 	  This option performs a series of startup tests on ftrace. On bootup
@@ -379,7 +379,7 @@ config FTRACE_STARTUP_TEST
 
 config MMIOTRACE
 	bool "Memory mapped IO tracing"
-	depends on HAVE_MMIOTRACE_SUPPORT && DEBUG_KERNEL && PCI
+	depends on HAVE_MMIOTRACE_SUPPORT && PCI
 	select TRACING
 	help
 	  Mmiotrace traces Memory Mapped I/O access and is meant for
@@ -401,3 +401,6 @@ config MMIOTRACE_TEST
 	  Say N, unless you absolutely know what you are doing.
 
 endmenu
+
+endif # TRACING_SUPPORT
+
-- 
cgit v1.2.3


From 5e2336a0d47c9661a40cc5ef85135ce1406af6e8 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 5 Mar 2009 21:44:55 -0500
Subject: tracing: make all file_operations const

Impact: cleanup

All file_operations structures should be constant. No one is going to
change them.

Reported-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ftrace.c        | 10 +++++-----
 kernel/trace/ring_buffer.c   |  2 +-
 kernel/trace/trace.c         | 24 ++++++++++++------------
 kernel/trace/trace_sysprof.c |  2 +-
 4 files changed, 19 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 5a3a06b21eee..d7a06a0d9447 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1869,21 +1869,21 @@ ftrace_notrace_release(struct inode *inode, struct file *file)
 	return ftrace_regex_release(inode, file, 0);
 }
 
-static struct file_operations ftrace_avail_fops = {
+static const struct file_operations ftrace_avail_fops = {
 	.open = ftrace_avail_open,
 	.read = seq_read,
 	.llseek = seq_lseek,
 	.release = ftrace_avail_release,
 };
 
-static struct file_operations ftrace_failures_fops = {
+static const struct file_operations ftrace_failures_fops = {
 	.open = ftrace_failures_open,
 	.read = seq_read,
 	.llseek = seq_lseek,
 	.release = ftrace_avail_release,
 };
 
-static struct file_operations ftrace_filter_fops = {
+static const struct file_operations ftrace_filter_fops = {
 	.open = ftrace_filter_open,
 	.read = ftrace_regex_read,
 	.write = ftrace_filter_write,
@@ -1891,7 +1891,7 @@ static struct file_operations ftrace_filter_fops = {
 	.release = ftrace_filter_release,
 };
 
-static struct file_operations ftrace_notrace_fops = {
+static const struct file_operations ftrace_notrace_fops = {
 	.open = ftrace_notrace_open,
 	.read = ftrace_regex_read,
 	.write = ftrace_notrace_write,
@@ -2423,7 +2423,7 @@ ftrace_pid_write(struct file *filp, const char __user *ubuf,
 	return cnt;
 }
 
-static struct file_operations ftrace_pid_fops = {
+static const struct file_operations ftrace_pid_fops = {
 	.read = ftrace_pid_read,
 	.write = ftrace_pid_write,
 };
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index f7473645b9c6..178858492a89 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -2606,7 +2606,7 @@ rb_simple_write(struct file *filp, const char __user *ubuf,
 	return cnt;
 }
 
-static struct file_operations rb_simple_fops = {
+static const struct file_operations rb_simple_fops = {
 	.open		= tracing_open_generic,
 	.read		= rb_simple_read,
 	.write		= rb_simple_write,
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c0e9c1263393..e6144acf2b75 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1882,14 +1882,14 @@ static int show_traces_open(struct inode *inode, struct file *file)
 	return ret;
 }
 
-static struct file_operations tracing_fops = {
+static const struct file_operations tracing_fops = {
 	.open		= tracing_open,
 	.read		= seq_read,
 	.llseek		= seq_lseek,
 	.release	= tracing_release,
 };
 
-static struct file_operations show_traces_fops = {
+static const struct file_operations show_traces_fops = {
 	.open		= show_traces_open,
 	.read		= seq_read,
 	.release	= seq_release,
@@ -1982,7 +1982,7 @@ err_unlock:
 	return err;
 }
 
-static struct file_operations tracing_cpumask_fops = {
+static const struct file_operations tracing_cpumask_fops = {
 	.open		= tracing_open_generic,
 	.read		= tracing_cpumask_read,
 	.write		= tracing_cpumask_write,
@@ -2134,7 +2134,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
 	return cnt;
 }
 
-static struct file_operations tracing_iter_fops = {
+static const struct file_operations tracing_iter_fops = {
 	.open		= tracing_open_generic,
 	.read		= tracing_trace_options_read,
 	.write		= tracing_trace_options_write,
@@ -2167,7 +2167,7 @@ tracing_readme_read(struct file *filp, char __user *ubuf,
 					readme_msg, strlen(readme_msg));
 }
 
-static struct file_operations tracing_readme_fops = {
+static const struct file_operations tracing_readme_fops = {
 	.open		= tracing_open_generic,
 	.read		= tracing_readme_read,
 };
@@ -2927,25 +2927,25 @@ tracing_mark_write(struct file *filp, const char __user *ubuf,
 	return cnt;
 }
 
-static struct file_operations tracing_max_lat_fops = {
+static const struct file_operations tracing_max_lat_fops = {
 	.open		= tracing_open_generic,
 	.read		= tracing_max_lat_read,
 	.write		= tracing_max_lat_write,
 };
 
-static struct file_operations tracing_ctrl_fops = {
+static const struct file_operations tracing_ctrl_fops = {
 	.open		= tracing_open_generic,
 	.read		= tracing_ctrl_read,
 	.write		= tracing_ctrl_write,
 };
 
-static struct file_operations set_tracer_fops = {
+static const struct file_operations set_tracer_fops = {
 	.open		= tracing_open_generic,
 	.read		= tracing_set_trace_read,
 	.write		= tracing_set_trace_write,
 };
 
-static struct file_operations tracing_pipe_fops = {
+static const struct file_operations tracing_pipe_fops = {
 	.open		= tracing_open_pipe,
 	.poll		= tracing_poll_pipe,
 	.read		= tracing_read_pipe,
@@ -2953,13 +2953,13 @@ static struct file_operations tracing_pipe_fops = {
 	.release	= tracing_release_pipe,
 };
 
-static struct file_operations tracing_entries_fops = {
+static const struct file_operations tracing_entries_fops = {
 	.open		= tracing_open_generic,
 	.read		= tracing_entries_read,
 	.write		= tracing_entries_write,
 };
 
-static struct file_operations tracing_mark_fops = {
+static const struct file_operations tracing_mark_fops = {
 	.open		= tracing_open_generic,
 	.write		= tracing_mark_write,
 };
@@ -3240,7 +3240,7 @@ tracing_read_dyn_info(struct file *filp, char __user *ubuf,
 	return r;
 }
 
-static struct file_operations tracing_dyn_info_fops = {
+static const struct file_operations tracing_dyn_info_fops = {
 	.open		= tracing_open_generic,
 	.read		= tracing_read_dyn_info,
 };
diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c
index c771af4e8f1a..91fd19c2149f 100644
--- a/kernel/trace/trace_sysprof.c
+++ b/kernel/trace/trace_sysprof.c
@@ -314,7 +314,7 @@ sysprof_sample_write(struct file *filp, const char __user *ubuf,
 	return cnt;
 }
 
-static struct file_operations sysprof_sample_fops = {
+static const struct file_operations sysprof_sample_fops = {
 	.read		= sysprof_sample_read,
 	.write		= sysprof_sample_write,
 };
-- 
cgit v1.2.3


From 33b0c229e3abeae00493ed1d6f0b07191977a0a2 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 5 Mar 2009 11:45:43 -0500
Subject: tracing: move print of event format to separate file

Impact: clean up

Move the macro that creates the event format file to a separate header.
This will allow the default ftrace events to use this same macro
to create the formats to read those events.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_events_stage_2.h | 53 +----------------------------------
 kernel/trace/trace_format.h         | 55 +++++++++++++++++++++++++++++++++++++
 2 files changed, 56 insertions(+), 52 deletions(-)
 create mode 100644 kernel/trace/trace_format.h

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h
index b1cebba1d9b4..d24a97e74aea 100644
--- a/kernel/trace/trace_events_stage_2.h
+++ b/kernel/trace/trace_events_stage_2.h
@@ -75,56 +75,5 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags)	\
 
 #include <trace/trace_event_types.h>
 
-/*
- * Setup the showing format of trace point.
- *
- * int
- * ftrace_format_##call(struct trace_seq *s)
- * {
- * 	struct ftrace_raw_##call field;
- * 	int ret;
- *
- * 	ret = trace_seq_printf(s, #type " " #item ";"
- * 			       " size:%d; offset:%d;\n",
- * 			       sizeof(field.type),
- * 			       offsetof(struct ftrace_raw_##call,
- * 					item));
- *
- * }
- */
-
-#undef TRACE_FIELD
-#define TRACE_FIELD(type, item, assign)					\
-	ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t"	\
-			       "offset:%lu;\tsize:%lu;\n",		\
-			       offsetof(typeof(field), item),		\
-			       sizeof(field.item));			\
-	if (!ret)							\
-		return 0;
-
-
-#undef TRACE_FIELD_SPECIAL
-#define TRACE_FIELD_SPECIAL(type_item, item, cmd)			\
-	ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t"	\
-			       "offset:%lu;\tsize:%lu;\n",		\
-			       offsetof(typeof(field), item),		\
-			       sizeof(field.item));			\
-	if (!ret)							\
-		return 0;
-
-#undef TRACE_EVENT_FORMAT
-#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)	\
-int									\
-ftrace_format_##call(struct trace_seq *s)				\
-{									\
-	struct ftrace_raw_##call field;					\
-	int ret;							\
-									\
-	tstruct;							\
-									\
-	trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt);		\
-									\
-	return ret;							\
-}
-
+#include "trace_format.h"
 #include <trace/trace_event_types.h>
diff --git a/kernel/trace/trace_format.h b/kernel/trace/trace_format.h
new file mode 100644
index 000000000000..53a6b1357116
--- /dev/null
+++ b/kernel/trace/trace_format.h
@@ -0,0 +1,55 @@
+/*
+ * Setup the showing format of trace point.
+ *
+ * int
+ * ftrace_format_##call(struct trace_seq *s)
+ * {
+ *	struct ftrace_raw_##call field;
+ *	int ret;
+ *
+ *	ret = trace_seq_printf(s, #type " " #item ";"
+ *			       " size:%d; offset:%d;\n",
+ *			       sizeof(field.type),
+ *			       offsetof(struct ftrace_raw_##call,
+ *					item));
+ *
+ * }
+ */
+
+#undef TRACE_STRUCT
+#define TRACE_STRUCT(args...) args
+
+#undef TRACE_FIELD
+#define TRACE_FIELD(type, item, assign)					\
+	ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t"	\
+			       "offset:%lu;\tsize:%lu;\n",		\
+			       offsetof(typeof(field), item),		\
+			       sizeof(field.item));			\
+	if (!ret)							\
+		return 0;
+
+
+#undef TRACE_FIELD_SPECIAL
+#define TRACE_FIELD_SPECIAL(type_item, item, cmd)			\
+	ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t"	\
+			       "offset:%lu;\tsize:%lu;\n",		\
+			       offsetof(typeof(field), item),		\
+			       sizeof(field.item));			\
+	if (!ret)							\
+		return 0;
+
+#undef TRACE_EVENT_FORMAT
+#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)	\
+int									\
+ftrace_format_##call(struct trace_seq *s)				\
+{									\
+	struct ftrace_raw_##call field;					\
+	int ret;							\
+									\
+	tstruct;							\
+									\
+	trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt);		\
+									\
+	return ret;							\
+}
+
-- 
cgit v1.2.3


From 770cb24345c0f6e0d47bd2b94aa6d67bea6f8b54 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 5 Mar 2009 21:35:29 -0500
Subject: tracing: add format files for ftrace default entries

Impact: allow user apps to read binary format of basic ftrace entries

Currently, only defined raw events export their formats so a binary
reader can parse them. There's no reason that the default ftrace entries
can't export their formats.

This patch adds a subsystem called "ftrace" in the events directory
that includes the ftrace entries for basic ftrace recorded items.

These only have three files in the events directory:

 type             : printf
 available_types  : printf
 format           : format for the event entry

For example:

 # cat /debug/tracing/events/ftrace/wakeup/format
name: wakeup
ID: 3
format:
        field:unsigned char type;       offset:0;       size:1;
        field:unsigned char flags;      offset:1;       size:1;
        field:unsigned char preempt_count;      offset:2;       size:1;
        field:int pid;  offset:4;       size:4;
        field:int tgid; offset:8;       size:4;

        field:unsigned int prev_pid;    offset:12;      size:4;
        field:unsigned char prev_prio;  offset:16;      size:1;
        field:unsigned char prev_state; offset:17;      size:1;
        field:unsigned int next_pid;    offset:20;      size:4;
        field:unsigned char next_prio;  offset:24;      size:1;
        field:unsigned char next_state; offset:25;      size:1;
        field:unsigned int next_cpu;    offset:28;      size:4;

print fmt: "%u:%u:%u  ==+ %u:%u:%u [%03u]"

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/Makefile            |   1 +
 kernel/trace/trace_event_types.h | 165 +++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace_events.c      |  12 +--
 kernel/trace/trace_export.c      |  81 +++++++++++++++++++
 kernel/trace/trace_format.h      |   2 +-
 5 files changed, 255 insertions(+), 6 deletions(-)
 create mode 100644 kernel/trace/trace_event_types.h
 create mode 100644 kernel/trace/trace_export.c

(limited to 'kernel')

diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index c931fe0560cb..f44736c7574a 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -41,5 +41,6 @@ obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
 obj-$(CONFIG_BLK_DEV_IO_TRACE)	+= blktrace.o
 obj-$(CONFIG_EVENT_TRACER) += trace_events.o
 obj-$(CONFIG_EVENT_TRACER) += events.o
+obj-$(CONFIG_EVENT_TRACER) += trace_export.o
 
 libftrace-y := ftrace.o
diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h
new file mode 100644
index 000000000000..fb4eba166433
--- /dev/null
+++ b/kernel/trace/trace_event_types.h
@@ -0,0 +1,165 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM	ftrace
+
+/*
+ * We cheat and use the proto type field as the ID
+ * and args as the entry type (minus 'struct')
+ */
+TRACE_EVENT_FORMAT(function, TRACE_FN, ftrace_entry, ignore,
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, ip, ip)
+		TRACE_FIELD(unsigned long, parent_ip, parent_ip)
+	),
+	TPRAWFMT(" %lx <-- %lx")
+);
+
+TRACE_EVENT_FORMAT(funcgraph_entry, TRACE_GRAPH_ENT,
+		   ftrace_graph_ent_entry, ignore,
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, graph_ent.func, func)
+		TRACE_FIELD(int, graph_ent.depth, depth)
+	),
+	TPRAWFMT("--> %lx (%d)")
+);
+
+TRACE_EVENT_FORMAT(funcgraph_exit, TRACE_GRAPH_RET,
+		   ftrace_graph_ret_entry, ignore,
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, ret.func, func)
+		TRACE_FIELD(int, ret.depth, depth)
+	),
+	TPRAWFMT("<-- %lx (%d)")
+);
+
+TRACE_EVENT_FORMAT(wakeup, TRACE_WAKE, ctx_switch_entry, ignore,
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned int, prev_pid, prev_pid)
+		TRACE_FIELD(unsigned char, prev_prio, prev_prio)
+		TRACE_FIELD(unsigned char, prev_state, prev_state)
+		TRACE_FIELD(unsigned int, next_pid, next_pid)
+		TRACE_FIELD(unsigned char, next_prio, next_prio)
+		TRACE_FIELD(unsigned char, next_state, next_state)
+		TRACE_FIELD(unsigned int, next_cpu, next_cpu)
+	),
+	TPRAWFMT("%u:%u:%u  ==+ %u:%u:%u [%03u]")
+);
+
+TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore,
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned int, prev_pid, prev_pid)
+		TRACE_FIELD(unsigned char, prev_prio, prev_prio)
+		TRACE_FIELD(unsigned char, prev_state, prev_state)
+		TRACE_FIELD(unsigned int, next_pid, next_pid)
+		TRACE_FIELD(unsigned char, next_prio, next_prio)
+		TRACE_FIELD(unsigned char, next_state, next_state)
+		TRACE_FIELD(unsigned int, next_cpu, next_cpu)
+	),
+	TPRAWFMT("%u:%u:%u  ==+ %u:%u:%u [%03u]")
+);
+
+TRACE_EVENT_FORMAT(special, TRACE_SPECIAL, special_entry, ignore,
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, arg1, arg1)
+		TRACE_FIELD(unsigned long, arg2, arg2)
+		TRACE_FIELD(unsigned long, arg3, arg3)
+	),
+	TPRAWFMT("(%08lx) (%08lx) (%08lx)")
+);
+
+/*
+ * Stack-trace entry:
+ */
+
+/* #define FTRACE_STACK_ENTRIES   8 */
+
+TRACE_EVENT_FORMAT(kernel_stack, TRACE_STACK, stack_entry, ignore,
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, caller[0], stack0)
+		TRACE_FIELD(unsigned long, caller[1], stack1)
+		TRACE_FIELD(unsigned long, caller[2], stack2)
+		TRACE_FIELD(unsigned long, caller[3], stack3)
+		TRACE_FIELD(unsigned long, caller[4], stack4)
+		TRACE_FIELD(unsigned long, caller[5], stack5)
+		TRACE_FIELD(unsigned long, caller[6], stack6)
+		TRACE_FIELD(unsigned long, caller[7], stack7)
+	),
+	TPRAWFMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
+		 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n")
+);
+
+TRACE_EVENT_FORMAT(user_stack, TRACE_USER_STACK, userstack_entry, ignore,
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, caller[0], stack0)
+		TRACE_FIELD(unsigned long, caller[1], stack1)
+		TRACE_FIELD(unsigned long, caller[2], stack2)
+		TRACE_FIELD(unsigned long, caller[3], stack3)
+		TRACE_FIELD(unsigned long, caller[4], stack4)
+		TRACE_FIELD(unsigned long, caller[5], stack5)
+		TRACE_FIELD(unsigned long, caller[6], stack6)
+		TRACE_FIELD(unsigned long, caller[7], stack7)
+	),
+	TPRAWFMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
+		 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n")
+);
+
+TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore,
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, ip, ip)
+		TRACE_FIELD(unsigned int, depth, depth)
+		TRACE_FIELD_ZERO_CHAR(buf)
+	),
+	TPRAWFMT("%08lx (%d) %s")
+);
+
+TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore,
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned int, line, line)
+		TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func, func)
+		TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file, file)
+		TRACE_FIELD(char, correct, correct)
+	),
+	TPRAWFMT("%u:%s:%s (%u)")
+);
+
+TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore,
+	TRACE_STRUCT(
+		TRACE_FIELD(u64, from, from)
+		TRACE_FIELD(u64, to, to)
+	),
+	TPRAWFMT("from: %llx to: %llx")
+);
+
+TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore,
+	TRACE_STRUCT(
+		TRACE_FIELD(ktime_t, state_data.stamp, stamp)
+		TRACE_FIELD(ktime_t, state_data.end, end)
+		TRACE_FIELD(int, state_data.type, type)
+		TRACE_FIELD(int, state_data.state, state)
+	),
+	TPRAWFMT("%llx->%llx type:%u state:%u")
+);
+
+TRACE_EVENT_FORMAT(kmem_alloc, TRACE_KMEM_ALLOC, kmemtrace_alloc_entry, ignore,
+	TRACE_STRUCT(
+		TRACE_FIELD(enum kmemtrace_type_id, type_id, type_id)
+		TRACE_FIELD(unsigned long, call_site, call_site)
+		TRACE_FIELD(const void *, ptr, ptr)
+		TRACE_FIELD(size_t, bytes_req, bytes_req)
+		TRACE_FIELD(size_t, bytes_alloc, bytes_alloc)
+		TRACE_FIELD(gfp_t, gfp_flags, gfp_flags)
+		TRACE_FIELD(int, node, node)
+	),
+	TPRAWFMT("type:%u call_site:%lx ptr:%p req:%lu alloc:%lu"
+		 " flags:%x node:%d")
+);
+
+TRACE_EVENT_FORMAT(kmem_free, TRACE_KMEM_FREE, kmemtrace_free_entry, ignore,
+	TRACE_STRUCT(
+		TRACE_FIELD(enum kmemtrace_type_id, type_id, type_id)
+		TRACE_FIELD(unsigned long, call_site, call_site)
+		TRACE_FIELD(const void *, ptr, ptr)
+	),
+	TPRAWFMT("type:%u call_site:%lx ptr:%p")
+);
+
+#undef TRACE_SYSTEM
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 210e71ff82db..4488d90e75ef 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -656,11 +656,13 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
 		return -1;
 	}
 
-	entry = debugfs_create_file("enable", 0644, call->dir, call,
-				    &ftrace_enable_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'%s/enable' entry\n", call->name);
+	if (call->regfunc) {
+		entry = debugfs_create_file("enable", 0644, call->dir, call,
+					    &ftrace_enable_fops);
+		if (!entry)
+			pr_warning("Could not create debugfs "
+				   "'%s/enable' entry\n", call->name);
+	}
 
 	/* Only let type be writable, if we can change it */
 	entry = debugfs_create_file("type",
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
new file mode 100644
index 000000000000..0fb7be73e31c
--- /dev/null
+++ b/kernel/trace/trace_export.c
@@ -0,0 +1,81 @@
+/*
+ * trace_export.c - export basic ftrace utilities to user space
+ *
+ * Copyright (C) 2009 Steven Rostedt <srostedt@redhat.com>
+ */
+#include <linux/stringify.h>
+#include <linux/kallsyms.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/ftrace.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <linux/fs.h>
+
+#include "trace_output.h"
+
+#include "trace_format.h"
+
+#undef TRACE_FIELD_ZERO_CHAR
+#define TRACE_FIELD_ZERO_CHAR(item)				\
+	ret = trace_seq_printf(s, "\tfield: char " #item ";\t"	\
+			       "offset:%lu;\tsize:0;\n",	\
+			       offsetof(typeof(field), item));	\
+	if (!ret)						\
+		return 0;
+
+
+#undef TPRAWFMT
+#define TPRAWFMT(args...) args
+
+#undef TRACE_EVENT_FORMAT
+#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)	\
+static int								\
+ftrace_format_##call(struct trace_seq *s)				\
+{									\
+	struct args field;						\
+	int ret;							\
+									\
+	tstruct;							\
+									\
+	trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt);		\
+									\
+	return ret;							\
+}
+
+#include "trace_event_types.h"
+
+#undef TRACE_ZERO_CHAR
+#define TRACE_ZERO_CHAR(arg)
+
+#undef TRACE_FIELD
+#define TRACE_FIELD(type, item, assign)\
+	entry->item = assign;
+
+#undef TRACE_FIELD
+#define TRACE_FIELD(type, item, assign)\
+	entry->item = assign;
+
+#undef TPCMD
+#define TPCMD(cmd...)	cmd
+
+#undef TRACE_ENTRY
+#define TRACE_ENTRY	entry
+
+#undef TRACE_FIELD_SPECIAL
+#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \
+	cmd;
+
+#undef TRACE_EVENT_FORMAT
+#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)	\
+									\
+static struct ftrace_event_call __used					\
+__attribute__((__aligned__(4)))						\
+__attribute__((section("_ftrace_events"))) event_##call = {		\
+	.name 			= #call,				\
+	.id			= proto,				\
+	.system			= __stringify(TRACE_SYSTEM),		\
+	.show_format		= ftrace_format_##call,			\
+}
+#include "trace_event_types.h"
diff --git a/kernel/trace/trace_format.h b/kernel/trace/trace_format.h
index 53a6b1357116..03f9a4c165ca 100644
--- a/kernel/trace/trace_format.h
+++ b/kernel/trace/trace_format.h
@@ -40,7 +40,7 @@
 
 #undef TRACE_EVENT_FORMAT
 #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)	\
-int									\
+static int								\
 ftrace_format_##call(struct trace_seq *s)				\
 {									\
 	struct ftrace_raw_##call field;					\
-- 
cgit v1.2.3


From edcb463997ed7b2ffa3bac76e3e75957318f2e01 Mon Sep 17 00:00:00 2001
From: Tejun Heo <tj@kernel.org>
Date: Fri, 6 Mar 2009 14:33:59 +0900
Subject: percpu, module: implement reserved allocation and use it for module
 percpu variables

Impact: add reserved allocation functionality and use it for module
	percpu variables

This patch implements reserved allocation from the first chunk.  When
setting up the first chunk, arch can ask to set aside certain number
of bytes right after the core static area which is available only
through a separate reserved allocator.  This will be used primarily
for module static percpu variables on architectures with limited
relocation range to ensure that the module perpcu symbols are inside
the relocatable range.

If reserved area is requested, the first chunk becomes reserved and
isn't available for regular allocation.  If the first chunk also
includes piggy-back dynamic allocation area, a separate chunk mapping
the same region is created to serve dynamic allocation.  The first one
is called static first chunk and the second dynamic first chunk.
Although they share the page map, their different area map
initializations guarantee they serve disjoint areas according to their
purposes.

If arch doesn't setup reserved area, reserved allocation is handled
like any other allocation.

Signed-off-by: Tejun Heo <tj@kernel.org>
---
 arch/x86/kernel/setup_percpu.c |   8 +--
 include/linux/percpu.h         |  10 +--
 kernel/module.c                |   2 +-
 mm/percpu.c                    | 153 +++++++++++++++++++++++++++++++++++------
 4 files changed, 144 insertions(+), 29 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 38e2b2a470a5..dd4eabc747c8 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -217,7 +217,7 @@ proceed:
 	pr_info("PERCPU: Remapped at %p with large pages, static data "
 		"%zu bytes\n", vm.addr, static_size);
 
-	ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, PMD_SIZE,
+	ret = pcpu_setup_first_chunk(pcpur_get_page, static_size, 0, PMD_SIZE,
 				     pcpur_size - static_size, vm.addr, NULL);
 	goto out_free_ar;
 
@@ -297,7 +297,7 @@ static ssize_t __init setup_pcpu_embed(size_t static_size)
 	pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n",
 		pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size);
 
-	return pcpu_setup_first_chunk(pcpue_get_page, static_size,
+	return pcpu_setup_first_chunk(pcpue_get_page, static_size, 0,
 				      pcpue_unit_size, dyn_size,
 				      pcpue_ptr, NULL);
 }
@@ -356,8 +356,8 @@ static ssize_t __init setup_pcpu_4k(size_t static_size)
 	pr_info("PERCPU: Allocated %d 4k pages, static data %zu bytes\n",
 		pcpu4k_nr_static_pages, static_size);
 
-	ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, -1, -1, NULL,
-				     pcpu4k_populate_pte);
+	ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size, 0, -1, -1,
+				     NULL, pcpu4k_populate_pte);
 	goto out_free_ar;
 
 enomem:
diff --git a/include/linux/percpu.h b/include/linux/percpu.h
index a96fc53bbd62..8ff15153ae20 100644
--- a/include/linux/percpu.h
+++ b/include/linux/percpu.h
@@ -117,10 +117,10 @@ typedef struct page * (*pcpu_get_page_fn_t)(unsigned int cpu, int pageno);
 typedef void (*pcpu_populate_pte_fn_t)(unsigned long addr);
 
 extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
-					size_t static_size,
-					ssize_t unit_size, ssize_t dyn_size,
-					void *base_addr,
-					pcpu_populate_pte_fn_t populate_pte_fn);
+				size_t static_size, size_t reserved_size,
+				ssize_t unit_size, ssize_t dyn_size,
+				void *base_addr,
+				pcpu_populate_pte_fn_t populate_pte_fn);
 
 /*
  * Use this to get to a cpu's version of the per-cpu object
@@ -129,6 +129,8 @@ extern size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
  */
 #define per_cpu_ptr(ptr, cpu)	SHIFT_PERCPU_PTR((ptr), per_cpu_offset((cpu)))
 
+extern void *__alloc_reserved_percpu(size_t size, size_t align);
+
 #else /* CONFIG_HAVE_DYNAMIC_PER_CPU_AREA */
 
 struct percpu_data {
diff --git a/kernel/module.c b/kernel/module.c
index 1f0657ae555b..f0e04d6b67d8 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -381,7 +381,7 @@ static void *percpu_modalloc(unsigned long size, unsigned long align,
 		align = PAGE_SIZE;
 	}
 
-	ptr = __alloc_percpu(size, align);
+	ptr = __alloc_reserved_percpu(size, align);
 	if (!ptr)
 		printk(KERN_WARNING
 		       "Could not allocate %lu bytes percpu data\n", size);
diff --git a/mm/percpu.c b/mm/percpu.c
index 5b47d9fe65f5..ef8e169b7731 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -94,6 +94,11 @@ static size_t pcpu_chunk_struct_size __read_mostly;
 void *pcpu_base_addr __read_mostly;
 EXPORT_SYMBOL_GPL(pcpu_base_addr);
 
+/* optional reserved chunk, only accessible for reserved allocations */
+static struct pcpu_chunk *pcpu_reserved_chunk;
+/* offset limit of the reserved chunk */
+static int pcpu_reserved_chunk_limit;
+
 /*
  * One mutex to rule them all.
  *
@@ -201,13 +206,14 @@ static void *pcpu_realloc(void *p, size_t size, size_t new_size)
  *
  * This function is called after an allocation or free changed @chunk.
  * New slot according to the changed state is determined and @chunk is
- * moved to the slot.
+ * moved to the slot.  Note that the reserved chunk is never put on
+ * chunk slots.
  */
 static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
 {
 	int nslot = pcpu_chunk_slot(chunk);
 
-	if (oslot != nslot) {
+	if (chunk != pcpu_reserved_chunk && oslot != nslot) {
 		if (oslot < nslot)
 			list_move(&chunk->list, &pcpu_slot[nslot]);
 		else
@@ -255,6 +261,15 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
 	struct rb_node *n, *parent;
 	struct pcpu_chunk *chunk;
 
+	/* is it in the reserved chunk? */
+	if (pcpu_reserved_chunk) {
+		void *start = pcpu_reserved_chunk->vm->addr;
+
+		if (addr >= start && addr < start + pcpu_reserved_chunk_limit)
+			return pcpu_reserved_chunk;
+	}
+
+	/* nah... search the regular ones */
 	n = *pcpu_chunk_rb_search(addr, &parent);
 	if (!n) {
 		/* no exactly matching chunk, the parent is the closest */
@@ -713,9 +728,10 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
 }
 
 /**
- * __alloc_percpu - allocate percpu area
+ * pcpu_alloc - the percpu allocator
  * @size: size of area to allocate in bytes
  * @align: alignment of area (max PAGE_SIZE)
+ * @reserved: allocate from the reserved chunk if available
  *
  * Allocate percpu area of @size bytes aligned at @align.  Might
  * sleep.  Might trigger writeouts.
@@ -723,7 +739,7 @@ static struct pcpu_chunk *alloc_pcpu_chunk(void)
  * RETURNS:
  * Percpu pointer to the allocated area on success, NULL on failure.
  */
-void *__alloc_percpu(size_t size, size_t align)
+static void *pcpu_alloc(size_t size, size_t align, bool reserved)
 {
 	void *ptr = NULL;
 	struct pcpu_chunk *chunk;
@@ -737,7 +753,18 @@ void *__alloc_percpu(size_t size, size_t align)
 
 	mutex_lock(&pcpu_mutex);
 
-	/* allocate area */
+	/* serve reserved allocations from the reserved chunk if available */
+	if (reserved && pcpu_reserved_chunk) {
+		chunk = pcpu_reserved_chunk;
+		if (size > chunk->contig_hint)
+			goto out_unlock;
+		off = pcpu_alloc_area(chunk, size, align);
+		if (off >= 0)
+			goto area_found;
+		goto out_unlock;
+	}
+
+	/* search through normal chunks */
 	for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
 		list_for_each_entry(chunk, &pcpu_slot[slot], list) {
 			if (size > chunk->contig_hint)
@@ -773,8 +800,41 @@ out_unlock:
 	mutex_unlock(&pcpu_mutex);
 	return ptr;
 }
+
+/**
+ * __alloc_percpu - allocate dynamic percpu area
+ * @size: size of area to allocate in bytes
+ * @align: alignment of area (max PAGE_SIZE)
+ *
+ * Allocate percpu area of @size bytes aligned at @align.  Might
+ * sleep.  Might trigger writeouts.
+ *
+ * RETURNS:
+ * Percpu pointer to the allocated area on success, NULL on failure.
+ */
+void *__alloc_percpu(size_t size, size_t align)
+{
+	return pcpu_alloc(size, align, false);
+}
 EXPORT_SYMBOL_GPL(__alloc_percpu);
 
+/**
+ * __alloc_reserved_percpu - allocate reserved percpu area
+ * @size: size of area to allocate in bytes
+ * @align: alignment of area (max PAGE_SIZE)
+ *
+ * Allocate percpu area of @size bytes aligned at @align from reserved
+ * percpu area if arch has set it up; otherwise, allocation is served
+ * from the same dynamic area.  Might sleep.  Might trigger writeouts.
+ *
+ * RETURNS:
+ * Percpu pointer to the allocated area on success, NULL on failure.
+ */
+void *__alloc_reserved_percpu(size_t size, size_t align)
+{
+	return pcpu_alloc(size, align, true);
+}
+
 static void pcpu_kill_chunk(struct pcpu_chunk *chunk)
 {
 	WARN_ON(chunk->immutable);
@@ -826,6 +886,7 @@ EXPORT_SYMBOL_GPL(free_percpu);
  * pcpu_setup_first_chunk - initialize the first percpu chunk
  * @get_page_fn: callback to fetch page pointer
  * @static_size: the size of static percpu area in bytes
+ * @reserved_size: the size of reserved percpu area in bytes
  * @unit_size: unit size in bytes, must be multiple of PAGE_SIZE, -1 for auto
  * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
  * @base_addr: mapped address, NULL for auto
@@ -844,14 +905,22 @@ EXPORT_SYMBOL_GPL(free_percpu);
  * indicates end of pages for the cpu.  Note that @get_page_fn() must
  * return the same number of pages for all cpus.
  *
+ * @reserved_size, if non-zero, specifies the amount of bytes to
+ * reserve after the static area in the first chunk.  This reserves
+ * the first chunk such that it's available only through reserved
+ * percpu allocation.  This is primarily used to serve module percpu
+ * static areas on architectures where the addressing model has
+ * limited offset range for symbol relocations to guarantee module
+ * percpu symbols fall inside the relocatable range.
+ *
  * @unit_size, if non-negative, specifies unit size and must be
  * aligned to PAGE_SIZE and equal to or larger than @static_size +
- * @dyn_size.
+ * @reserved_size + @dyn_size.
  *
  * @dyn_size, if non-negative, limits the number of bytes available
  * for dynamic allocation in the first chunk.  Specifying non-negative
  * value make percpu leave alone the area beyond @static_size +
- * @dyn_size.
+ * @reserved_size + @dyn_size.
  *
  * Non-null @base_addr means that the caller already allocated virtual
  * region for the first chunk and mapped it.  percpu must not mess
@@ -861,28 +930,36 @@ EXPORT_SYMBOL_GPL(free_percpu);
  * @populate_pte_fn is used to populate the pagetable.  NULL means the
  * caller already populated the pagetable.
  *
+ * If the first chunk ends up with both reserved and dynamic areas, it
+ * is served by two chunks - one to serve the core static and reserved
+ * areas and the other for the dynamic area.  They share the same vm
+ * and page map but uses different area allocation map to stay away
+ * from each other.  The latter chunk is circulated in the chunk slots
+ * and available for dynamic allocation like any other chunks.
+ *
  * RETURNS:
  * The determined pcpu_unit_size which can be used to initialize
  * percpu access.
  */
 size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
-				     size_t static_size,
+				     size_t static_size, size_t reserved_size,
 				     ssize_t unit_size, ssize_t dyn_size,
 				     void *base_addr,
 				     pcpu_populate_pte_fn_t populate_pte_fn)
 {
 	static struct vm_struct first_vm;
-	static int smap[2];
-	struct pcpu_chunk *schunk;
+	static int smap[2], dmap[2];
+	struct pcpu_chunk *schunk, *dchunk = NULL;
 	unsigned int cpu;
 	int nr_pages;
 	int err, i;
 
 	/* santiy checks */
-	BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC);
+	BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC ||
+		     ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC);
 	BUG_ON(!static_size);
 	if (unit_size >= 0) {
-		BUG_ON(unit_size < static_size +
+		BUG_ON(unit_size < static_size + reserved_size +
 				   (dyn_size >= 0 ? dyn_size : 0));
 		BUG_ON(unit_size & ~PAGE_MASK);
 	} else {
@@ -895,7 +972,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 		pcpu_unit_pages = unit_size >> PAGE_SHIFT;
 	else
 		pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT,
-					PFN_UP(static_size));
+					PFN_UP(static_size + reserved_size));
 
 	pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
 	pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size;
@@ -903,7 +980,7 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 		+ num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *);
 
 	if (dyn_size < 0)
-		dyn_size = pcpu_unit_size - static_size;
+		dyn_size = pcpu_unit_size - static_size - reserved_size;
 
 	/*
 	 * Allocate chunk slots.  The additional last slot is for
@@ -914,20 +991,49 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 	for (i = 0; i < pcpu_nr_slots; i++)
 		INIT_LIST_HEAD(&pcpu_slot[i]);
 
-	/* init static chunk */
+	/*
+	 * Initialize static chunk.  If reserved_size is zero, the
+	 * static chunk covers static area + dynamic allocation area
+	 * in the first chunk.  If reserved_size is not zero, it
+	 * covers static area + reserved area (mostly used for module
+	 * static percpu allocation).
+	 */
 	schunk = alloc_bootmem(pcpu_chunk_struct_size);
 	INIT_LIST_HEAD(&schunk->list);
 	schunk->vm = &first_vm;
 	schunk->map = smap;
 	schunk->map_alloc = ARRAY_SIZE(smap);
 	schunk->page = schunk->page_ar;
-	schunk->free_size = dyn_size;
+
+	if (reserved_size) {
+		schunk->free_size = reserved_size;
+		pcpu_reserved_chunk = schunk;	/* not for dynamic alloc */
+	} else {
+		schunk->free_size = dyn_size;
+		dyn_size = 0;			/* dynamic area covered */
+	}
 	schunk->contig_hint = schunk->free_size;
 
 	schunk->map[schunk->map_used++] = -static_size;
 	if (schunk->free_size)
 		schunk->map[schunk->map_used++] = schunk->free_size;
 
+	pcpu_reserved_chunk_limit = static_size + schunk->free_size;
+
+	/* init dynamic chunk if necessary */
+	if (dyn_size) {
+		dchunk = alloc_bootmem(sizeof(struct pcpu_chunk));
+		INIT_LIST_HEAD(&dchunk->list);
+		dchunk->vm = &first_vm;
+		dchunk->map = dmap;
+		dchunk->map_alloc = ARRAY_SIZE(dmap);
+		dchunk->page = schunk->page_ar;	/* share page map with schunk */
+
+		dchunk->contig_hint = dchunk->free_size = dyn_size;
+		dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit;
+		dchunk->map[dchunk->map_used++] = dchunk->free_size;
+	}
+
 	/* allocate vm address */
 	first_vm.flags = VM_ALLOC;
 	first_vm.size = pcpu_chunk_size;
@@ -937,12 +1043,14 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 	else {
 		/*
 		 * Pages already mapped.  No need to remap into
-		 * vmalloc area.  In this case the static chunk can't
-		 * be mapped or unmapped by percpu and is marked
+		 * vmalloc area.  In this case the first chunks can't
+		 * be mapped or unmapped by percpu and are marked
 		 * immutable.
 		 */
 		first_vm.addr = base_addr;
 		schunk->immutable = true;
+		if (dchunk)
+			dchunk->immutable = true;
 	}
 
 	/* assign pages */
@@ -978,8 +1086,13 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
 	}
 
 	/* link the first chunk in */
-	pcpu_chunk_relocate(schunk, -1);
-	pcpu_chunk_addr_insert(schunk);
+	if (!dchunk) {
+		pcpu_chunk_relocate(schunk, -1);
+		pcpu_chunk_addr_insert(schunk);
+	} else {
+		pcpu_chunk_relocate(dchunk, -1);
+		pcpu_chunk_addr_insert(dchunk);
+	}
 
 	/* we're done */
 	pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0);
-- 
cgit v1.2.3


From 422d3c7a577b15e1384c9d4e72a9540896b685fa Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Fri, 6 Mar 2009 10:40:53 +0900
Subject: tracing: current tip/master can't enable ftrace

After commit 40ada30f9621fbd831ac2437b9a2a399aad34b00,
"make menuconfig" doesn't display "Tracer" item.

Following modification restores it.
---
 kernel/trace/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 5d733da5345a..058d949a3214 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -61,6 +61,7 @@ config TRACING_SUPPORT
 	bool
 	depends on TRACE_IRQFLAGS_SUPPORT
 	depends on STACKTRACE_SUPPORT
+	default y
 
 if TRACING_SUPPORT
 
-- 
cgit v1.2.3


From 10dd3ebe213c31bff14b4dae3c5d32a76b1fad7c Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Fri, 6 Mar 2009 15:29:04 +0900
Subject: tracing: fix deadlock when setting set_ftrace_pid

Impact: fix deadlock while using set_ftrace_pid

Reproducer:

	# cd /sys/kernel/debug/tracing
	# echo $$ > set_ftrace_pid

	then, console becomes hung.

Details:

when writing set_ftracepid, kernel callstack is following

	ftrace_pid_write()
		mutex_lock(&ftrace_lock);
		ftrace_update_pid_func()
			mutex_lock(&ftrace_lock);
			mutex_unlock(&ftrace_lock);
		mutex_unlock(&ftrace_lock);

then, system always deadlocks when ftrace_pid_write() is called.

In past days, ftrace_pid_write() used ftrace_start_lock, but
commit e6ea44e9b4c12325337cd1c06103cd515a1c02b2 consolidated
ftrace_start_lock to ftrace_lock.

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Reviewed-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Steven Rostedt <srostedt@redhat.com>
LKML-Reference: <20090306151155.0778.A69D9226@jp.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index d7a06a0d9447..d33d306bdcf4 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -218,10 +218,8 @@ static void ftrace_update_pid_func(void)
 {
 	ftrace_func_t func;
 
-	mutex_lock(&ftrace_lock);
-
 	if (ftrace_trace_function == ftrace_stub)
-		goto out;
+		return;
 
 	func = ftrace_trace_function;
 
@@ -238,9 +236,6 @@ static void ftrace_update_pid_func(void)
 #else
 	__ftrace_trace_function = func;
 #endif
-
- out:
-	mutex_unlock(&ftrace_lock);
 }
 
 /* set when tracing only a pid */
-- 
cgit v1.2.3


From 5ed0cec0ac5f1b3759bdbe4d9df32ee4ff8afb5a Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Fri, 6 Mar 2009 19:40:20 +0800
Subject: sched: TIF_NEED_RESCHED -> need_reshed() cleanup

Impact: cleanup

Use test_tsk_need_resched(), set_tsk_need_resched(), need_resched()
instead of using TIF_NEED_RESCHED.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <49B10BA4.9070209@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c    | 10 +++++-----
 lib/kernel_lock.c |  2 +-
 2 files changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 8b92f40c147d..e0fa739a441b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -1189,10 +1189,10 @@ static void resched_task(struct task_struct *p)
 
 	assert_spin_locked(&task_rq(p)->lock);
 
-	if (unlikely(test_tsk_thread_flag(p, TIF_NEED_RESCHED)))
+	if (test_tsk_need_resched(p))
 		return;
 
-	set_tsk_thread_flag(p, TIF_NEED_RESCHED);
+	set_tsk_need_resched(p);
 
 	cpu = task_cpu(p);
 	if (cpu == smp_processor_id())
@@ -1248,7 +1248,7 @@ void wake_up_idle_cpu(int cpu)
 	 * lockless. The worst case is that the other CPU runs the
 	 * idle task through an additional NOOP schedule()
 	 */
-	set_tsk_thread_flag(rq->idle, TIF_NEED_RESCHED);
+	set_tsk_need_resched(rq->idle);
 
 	/* NEED_RESCHED must be visible before we test polling */
 	smp_mb();
@@ -4740,7 +4740,7 @@ asmlinkage void __sched preempt_schedule(void)
 		 * between schedule and now.
 		 */
 		barrier();
-	} while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
+	} while (need_resched());
 }
 EXPORT_SYMBOL(preempt_schedule);
 
@@ -4769,7 +4769,7 @@ asmlinkage void __sched preempt_schedule_irq(void)
 		 * between schedule and now.
 		 */
 		barrier();
-	} while (unlikely(test_thread_flag(TIF_NEED_RESCHED)));
+	} while (need_resched());
 }
 
 #endif /* CONFIG_PREEMPT */
diff --git a/lib/kernel_lock.c b/lib/kernel_lock.c
index 01a3c22c1b5a..39f1029e3525 100644
--- a/lib/kernel_lock.c
+++ b/lib/kernel_lock.c
@@ -39,7 +39,7 @@ static  __cacheline_aligned_in_smp DEFINE_SPINLOCK(kernel_flag);
 int __lockfunc __reacquire_kernel_lock(void)
 {
 	while (!_raw_spin_trylock(&kernel_flag)) {
-		if (test_thread_flag(TIF_NEED_RESCHED))
+		if (need_resched())
 			return -EAGAIN;
 		cpu_relax();
 	}
-- 
cgit v1.2.3


From 4460fdad85becd569f11501ad5b91814814335ff Mon Sep 17 00:00:00 2001
From: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Date: Fri, 6 Mar 2009 10:36:38 -0500
Subject: tracing, Text Edit Lock - kprobes architecture independent support

Use the mutual exclusion provided by the text edit lock in the kprobes code. It
allows coherent manipulation of the kernel code by other subsystems.

Changelog:

Move the kernel_text_lock/unlock out of the for loops.
Use text_mutex directly instead of a function.
Remove whitespace modifications.

(note : kprobes_mutex is always taken outside of text_mutex)

Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
Acked-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Acked-by: Masami Hiramatsu <mhiramat@redhat.com>
LKML-Reference: <49B14306.2080202@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/kprobes.c | 15 +++++++++++++--
 1 file changed, 13 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 7ba8cd9845cb..479d4d5672f9 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -43,6 +43,7 @@
 #include <linux/seq_file.h>
 #include <linux/debugfs.h>
 #include <linux/kdebug.h>
+#include <linux/memory.h>
 
 #include <asm-generic/sections.h>
 #include <asm/cacheflush.h>
@@ -699,9 +700,10 @@ int __kprobes register_kprobe(struct kprobe *p)
 		goto out;
 	}
 
+	mutex_lock(&text_mutex);
 	ret = arch_prepare_kprobe(p);
 	if (ret)
-		goto out;
+		goto out_unlock_text;
 
 	INIT_HLIST_NODE(&p->hlist);
 	hlist_add_head_rcu(&p->hlist,
@@ -710,6 +712,8 @@ int __kprobes register_kprobe(struct kprobe *p)
 	if (kprobe_enabled)
 		arch_arm_kprobe(p);
 
+out_unlock_text:
+	mutex_unlock(&text_mutex);
 out:
 	mutex_unlock(&kprobe_mutex);
 
@@ -746,8 +750,11 @@ valid_p:
 		 * enabled and not gone - otherwise, the breakpoint would
 		 * already have been removed. We save on flushing icache.
 		 */
-		if (kprobe_enabled && !kprobe_gone(old_p))
+		if (kprobe_enabled && !kprobe_gone(old_p)) {
+			mutex_lock(&text_mutex);
 			arch_disarm_kprobe(p);
+			mutex_unlock(&text_mutex);
+		}
 		hlist_del_rcu(&old_p->hlist);
 	} else {
 		if (p->break_handler && !kprobe_gone(p))
@@ -1280,12 +1287,14 @@ static void __kprobes enable_all_kprobes(void)
 	if (kprobe_enabled)
 		goto already_enabled;
 
+	mutex_lock(&text_mutex);
 	for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
 		head = &kprobe_table[i];
 		hlist_for_each_entry_rcu(p, node, head, hlist)
 			if (!kprobe_gone(p))
 				arch_arm_kprobe(p);
 	}
+	mutex_unlock(&text_mutex);
 
 	kprobe_enabled = true;
 	printk(KERN_INFO "Kprobes globally enabled\n");
@@ -1310,6 +1319,7 @@ static void __kprobes disable_all_kprobes(void)
 
 	kprobe_enabled = false;
 	printk(KERN_INFO "Kprobes globally disabled\n");
+	mutex_lock(&text_mutex);
 	for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
 		head = &kprobe_table[i];
 		hlist_for_each_entry_rcu(p, node, head, hlist) {
@@ -1318,6 +1328,7 @@ static void __kprobes disable_all_kprobes(void)
 		}
 	}
 
+	mutex_unlock(&text_mutex);
 	mutex_unlock(&kprobe_mutex);
 	/* Allow all currently running kprobes to complete */
 	synchronize_sched();
-- 
cgit v1.2.3


From 1427cdf0592368bdec57276edaf714040ee8744f Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Fri, 6 Mar 2009 17:21:47 +0100
Subject: tracing: infrastructure for supporting binary record

Impact: save on memory for tracing

Current tracers are typically using a struct(like struct ftrace_entry,
struct ctx_switch_entry, struct special_entr etc...)to record a binary
event. These structs can only record a their own kind of events.
A new kind of tracer need a new struct and a lot of code too handle it.

So we need a generic binary record for events. This infrastructure
is for this purpose.

[fweisbec@gmail.com: rebase against latest -tip, make it safe while sched
tracing as reported by Steven Rostedt]

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <1236356510-8381-3-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace.h       |  3 ++
 kernel/trace/Kconfig         |  6 +++
 kernel/trace/Makefile        |  1 +
 kernel/trace/trace.c         | 56 ++++++++++++++++++++++++++++
 kernel/trace/trace.h         | 12 ++++++
 kernel/trace/trace_bprintk.c | 87 ++++++++++++++++++++++++++++++++++++++++++++
 kernel/trace/trace_output.c  | 75 ++++++++++++++++++++++++++++++++++++++
 7 files changed, 240 insertions(+)
 create mode 100644 kernel/trace/trace_bprintk.c

(limited to 'kernel')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 498769425eb2..1c9cdca02580 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -223,6 +223,9 @@ extern int ftrace_make_nop(struct module *mod,
  */
 extern int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr);
 
+#ifdef CONFIG_TRACE_BPRINTK
+extern int trace_vbprintk(unsigned long ip, const char *fmt, va_list args);
+#endif
 
 /* May be defined in arch */
 extern int ftrace_arch_read_dyn_info(char *buf, int size);
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 058d949a3214..ad8d3617d0a6 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -97,6 +97,12 @@ config FUNCTION_GRAPH_TRACER
 	  This is done by setting the current return address on the current
 	  task structure into a stack of calls.
 
+config TRACE_BPRINTK
+	bool "Binary printk for tracing"
+	default y
+	depends on TRACING
+	select BINARY_PRINTF
+
 config IRQSOFF_TRACER
 	bool "Interrupts-off Latency Tracer"
 	default n
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index f44736c7574a..46557ef4c379 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -22,6 +22,7 @@ obj-$(CONFIG_TRACING) += trace.o
 obj-$(CONFIG_TRACING) += trace_clock.o
 obj-$(CONFIG_TRACING) += trace_output.o
 obj-$(CONFIG_TRACING) += trace_stat.o
+obj-$(CONFIG_TRACE_BPRINTK) += trace_bprintk.o
 obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
 obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o
 obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index e6144acf2b75..ff53509e19f8 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3792,6 +3792,62 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap)
 }
 EXPORT_SYMBOL_GPL(__ftrace_vprintk);
 
+/**
+ * trace_vbprintk - write binary msg to tracing buffer
+ *
+ * Caller must insure @fmt are valid when msg is in tracing buffer.
+ */
+int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
+{
+	static DEFINE_SPINLOCK(trace_buf_lock);
+	static u32 trace_buf[TRACE_BUF_SIZE];
+
+	struct ring_buffer_event *event;
+	struct trace_array *tr = &global_trace;
+	struct trace_array_cpu *data;
+	struct bprintk_entry *entry;
+	unsigned long flags;
+	int resched;
+	int cpu, len = 0, size, pc;
+
+	if (tracing_disabled || !trace_bprintk_enable)
+		return 0;
+
+	pc = preempt_count();
+	resched = ftrace_preempt_disable();
+	cpu = raw_smp_processor_id();
+	data = tr->data[cpu];
+
+	if (unlikely(atomic_read(&data->disabled)))
+		goto out;
+
+	spin_lock_irqsave(&trace_buf_lock, flags);
+	len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args);
+
+	if (len > TRACE_BUF_SIZE || len < 0)
+		goto out_unlock;
+
+	size = sizeof(*entry) + sizeof(u32) * len;
+	event = trace_buffer_lock_reserve(tr, TRACE_BPRINTK, size, flags, pc);
+	if (!event)
+		goto out_unlock;
+	entry = ring_buffer_event_data(event);
+	entry->ip			= ip;
+	entry->fmt			= fmt;
+
+	memcpy(entry->buf, trace_buf, sizeof(u32) * len);
+	ring_buffer_unlock_commit(tr->buffer, event);
+
+out_unlock:
+	spin_unlock_irqrestore(&trace_buf_lock, flags);
+
+out:
+	ftrace_preempt_enable(resched);
+
+	return len;
+}
+EXPORT_SYMBOL_GPL(trace_vbprintk);
+
 static int trace_panic_handler(struct notifier_block *this,
 			       unsigned long event, void *unused)
 {
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 8beff03fda68..0f5077f8f957 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -20,6 +20,7 @@ enum trace_type {
 	TRACE_WAKE,
 	TRACE_STACK,
 	TRACE_PRINT,
+	TRACE_BPRINTK,
 	TRACE_SPECIAL,
 	TRACE_MMIO_RW,
 	TRACE_MMIO_MAP,
@@ -124,6 +125,16 @@ struct print_entry {
 	char			buf[];
 };
 
+struct bprintk_entry {
+	struct trace_entry ent;
+	unsigned long ip;
+	const char *fmt;
+	u32 buf[];
+};
+#ifdef CONFIG_TRACE_BPRINTK
+extern int trace_bprintk_enable;
+#endif
+
 #define TRACE_OLD_SIZE		88
 
 struct trace_field_cont {
@@ -285,6 +296,7 @@ extern void __ftrace_bad_type(void);
 		IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK);	\
 		IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\
 		IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT);	\
+		IF_ASSIGN(var, ent, struct bprintk_entry, TRACE_BPRINTK);\
 		IF_ASSIGN(var, ent, struct special_entry, 0);		\
 		IF_ASSIGN(var, ent, struct trace_mmiotrace_rw,		\
 			  TRACE_MMIO_RW);				\
diff --git a/kernel/trace/trace_bprintk.c b/kernel/trace/trace_bprintk.c
new file mode 100644
index 000000000000..1f8e532c3fb9
--- /dev/null
+++ b/kernel/trace/trace_bprintk.c
@@ -0,0 +1,87 @@
+/*
+ * trace binary printk
+ *
+ * Copyright (C) 2008 Lai Jiangshan <laijs@cn.fujitsu.com>
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/ftrace.h>
+#include <linux/string.h>
+#include <linux/ctype.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/seq_file.h>
+#include <linux/fs.h>
+#include <linux/marker.h>
+#include <linux/uaccess.h>
+
+#include "trace.h"
+
+/* binary printk basic */
+static DEFINE_MUTEX(btrace_mutex);
+static int btrace_metadata_count;
+
+static inline void lock_btrace(void)
+{
+	mutex_lock(&btrace_mutex);
+}
+
+static inline void unlock_btrace(void)
+{
+	mutex_unlock(&btrace_mutex);
+}
+
+static void get_btrace_metadata(void)
+{
+	lock_btrace();
+	btrace_metadata_count++;
+	unlock_btrace();
+}
+
+static void put_btrace_metadata(void)
+{
+	lock_btrace();
+	btrace_metadata_count--;
+	unlock_btrace();
+}
+
+/* events tracer */
+int trace_bprintk_enable;
+
+static void start_bprintk_trace(struct trace_array *tr)
+{
+	get_btrace_metadata();
+	tracing_reset_online_cpus(tr);
+	trace_bprintk_enable = 1;
+}
+
+static void stop_bprintk_trace(struct trace_array *tr)
+{
+	trace_bprintk_enable = 0;
+	tracing_reset_online_cpus(tr);
+	put_btrace_metadata();
+}
+
+static int init_bprintk_trace(struct trace_array *tr)
+{
+	start_bprintk_trace(tr);
+	return 0;
+}
+
+static struct tracer bprintk_trace __read_mostly =
+{
+	.name	     = "events",
+	.init	     = init_bprintk_trace,
+	.reset	     = stop_bprintk_trace,
+	.start	     = start_bprintk_trace,
+	.stop	     = stop_bprintk_trace,
+};
+
+static __init int init_bprintk(void)
+{
+	return register_tracer(&bprintk_trace);
+}
+
+device_initcall(init_bprintk);
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 306fef84c503..4ab71201862e 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -53,6 +53,26 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
 	return len;
 }
 
+static int
+trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
+{
+	int len = (PAGE_SIZE - 1) - s->len;
+	int ret;
+
+	if (!len)
+		return 0;
+
+	ret = bstr_printf(s->buffer + s->len, len, fmt, binary);
+
+	/* If we can't write it all, don't bother writing anything */
+	if (ret >= len)
+		return 0;
+
+	s->len += ret;
+
+	return len;
+}
+
 /**
  * trace_seq_puts - trace sequence printing of simple string
  * @s: trace sequence descriptor
@@ -855,6 +875,60 @@ static struct trace_event trace_print_event = {
 	.raw		= trace_print_raw,
 };
 
+/* TRACE_BPRINTK */
+static enum print_line_t
+trace_bprintk_print(struct trace_iterator *iter, int flags)
+{
+	struct trace_entry *entry = iter->ent;
+	struct trace_seq *s = &iter->seq;
+	struct bprintk_entry *field;
+
+	trace_assign_type(field, entry);
+
+	if (!seq_print_ip_sym(s, field->ip, flags))
+		goto partial;
+
+	if (!trace_seq_puts(s, ": "))
+		goto partial;
+
+	if (!trace_seq_bprintf(s, field->fmt, field->buf))
+		goto partial;
+
+	return TRACE_TYPE_HANDLED;
+
+ partial:
+	return TRACE_TYPE_PARTIAL_LINE;
+}
+
+static enum print_line_t
+trace_bprintk_raw(struct trace_iterator *iter, int flags)
+{
+	struct trace_entry *entry = iter->ent;
+	struct trace_seq *s = &iter->seq;
+	struct bprintk_entry *field;
+
+	trace_assign_type(field, entry);
+
+	if (!trace_seq_printf(s, ": %lx : ", field->ip))
+		goto partial;
+
+	if (!trace_seq_bprintf(s, field->fmt, field->buf))
+		goto partial;
+
+	return TRACE_TYPE_HANDLED;
+
+ partial:
+	return TRACE_TYPE_PARTIAL_LINE;
+}
+
+static struct trace_event trace_bprintk_event = {
+	.type	 	= TRACE_BPRINTK,
+	.trace		= trace_bprintk_print,
+	.raw		= trace_bprintk_raw,
+	.hex		= trace_nop_print,
+	.binary		= trace_nop_print,
+};
+
 static struct trace_event *events[] __initdata = {
 	&trace_fn_event,
 	&trace_ctx_event,
@@ -863,6 +937,7 @@ static struct trace_event *events[] __initdata = {
 	&trace_stack_event,
 	&trace_user_stack_event,
 	&trace_print_event,
+	&trace_bprintk_event,
 	NULL
 };
 
-- 
cgit v1.2.3


From 1ba28e02a18cbdbea123836f6c98efb09cbf59ec Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Fri, 6 Mar 2009 17:21:48 +0100
Subject: tracing: add trace_bprintk()

Impact: add a generic printk() for tracing, like trace_printk()

trace_bprintk() uses the infrastructure to record events on ring_buffer.

[ fweisbec@gmail.com: ported to latest -tip, made it work if
  !CONFIG_MODULES, never free the format strings from modules
  because we can't keep track of them and conditionnaly create
  the ftrace format strings section (reported by Steven Rostedt) ]

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <1236356510-8381-4-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/asm-generic/vmlinux.lds.h |  9 ++++
 include/linux/ftrace.h            | 21 ++++++++++
 include/linux/module.h            |  5 +++
 kernel/module.c                   |  6 +++
 kernel/trace/trace.c              | 15 +++++++
 kernel/trace/trace_bprintk.c      | 87 ++++++++++++++++++++++++++++++++++-----
 6 files changed, 133 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 0add6b28c366..48ade3168b13 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -69,6 +69,14 @@
 #define FTRACE_EVENTS()
 #endif
 
+#ifdef CONFIG_TRACING
+#define TRACE_PRINTKS() VMLINUX_SYMBOL(__start___trace_bprintk_fmt) = .;      \
+			 *(__trace_printk_fmt) /* Trace_printk fmt' pointer */ \
+			 VMLINUX_SYMBOL(__stop___trace_bprintk_fmt) = .;
+#else
+#define TRACE_PRINTKS()
+#endif
+
 /* .data section */
 #define DATA_DATA							\
 	*(.data)							\
@@ -100,6 +108,7 @@
 		*(__vermagic)		/* Kernel version magic */	\
 		*(__markers_strings)	/* Markers: strings */		\
 		*(__tracepoints_strings)/* Tracepoints: strings */	\
+		TRACE_PRINTKS()					\
 	}								\
 									\
 	.rodata1          : AT(ADDR(.rodata1) - LOAD_OFFSET) {		\
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 1c9cdca02580..1cc8ca453a9b 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -225,6 +225,27 @@ extern int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr);
 
 #ifdef CONFIG_TRACE_BPRINTK
 extern int trace_vbprintk(unsigned long ip, const char *fmt, va_list args);
+extern int __trace_bprintk(unsigned long ip, const char *fmt, ...)
+		__attribute__ ((format (printf, 2, 3)));
+
+static inline void  ____trace_bprintk_check_format(const char *fmt, ...)
+		__attribute__ ((format (printf, 1, 2)));
+static inline void ____trace_bprintk_check_format(const char *fmt, ...) {}
+#define __trace_bprintk_check_format(fmt, args...)			\
+do {									\
+	if (0)								\
+		____trace_bprintk_check_format(fmt, ##args);		\
+} while (0)
+
+#define trace_bprintk(fmt, args...)					\
+do {									\
+	static char *__attribute__((section("__trace_bprintk_fmt")))	\
+			trace_bprintk_fmt = fmt;			\
+	__trace_bprintk_check_format(fmt, ##args);			\
+	__trace_bprintk(_THIS_IP_, trace_bprintk_fmt, ##args);	\
+} while (0)
+#else
+#define trace_bprintk trace_printk
 #endif
 
 /* May be defined in arch */
diff --git a/include/linux/module.h b/include/linux/module.h
index 145a75528cc1..8cbec972d8e7 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -329,6 +329,11 @@ struct module
 	unsigned int num_tracepoints;
 #endif
 
+#ifdef CONFIG_TRACE_BPRINTK
+	const char **trace_bprintk_fmt_start;
+	unsigned int num_trace_bprintk_fmt;
+#endif
+
 #ifdef CONFIG_MODULE_UNLOAD
 	/* What modules depend on me? */
 	struct list_head modules_which_use_me;
diff --git a/kernel/module.c b/kernel/module.c
index 22d7379709da..2dece104f9a1 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2158,6 +2158,12 @@ static noinline struct module *load_module(void __user *umod,
 					&mod->num_tracepoints);
 #endif
 
+#ifdef CONFIG_TRACE_BPRINTK
+	mod->trace_bprintk_fmt_start = section_objs(hdr, sechdrs, secstrings,
+			"__trace_bprintk_fmt", sizeof(char *),
+			&mod->num_trace_bprintk_fmt);
+#endif
+
 #ifdef CONFIG_MODVERSIONS
 	if ((mod->num_syms && !mod->crcs)
 	    || (mod->num_gpl_syms && !mod->gpl_crcs)
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ff53509e19f8..46b3cd7a5752 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3848,6 +3848,21 @@ out:
 }
 EXPORT_SYMBOL_GPL(trace_vbprintk);
 
+int __trace_bprintk(unsigned long ip, const char *fmt, ...)
+{
+	int ret;
+	va_list ap;
+
+	if (!fmt)
+		return 0;
+
+	va_start(ap, fmt);
+	ret = trace_vbprintk(ip, fmt, ap);
+	va_end(ap);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(__trace_bprintk);
+
 static int trace_panic_handler(struct notifier_block *this,
 			       unsigned long event, void *unused)
 {
diff --git a/kernel/trace/trace_bprintk.c b/kernel/trace/trace_bprintk.c
index 1f8e532c3fb9..f4c245a5cd33 100644
--- a/kernel/trace/trace_bprintk.c
+++ b/kernel/trace/trace_bprintk.c
@@ -19,9 +19,21 @@
 
 #include "trace.h"
 
+#ifdef CONFIG_MODULES
+
 /* binary printk basic */
 static DEFINE_MUTEX(btrace_mutex);
-static int btrace_metadata_count;
+/*
+ * modules trace_bprintk()'s formats are autosaved in struct trace_bprintk_fmt
+ * which are queued on trace_bprintk_fmt_list.
+ */
+static LIST_HEAD(trace_bprintk_fmt_list);
+
+struct trace_bprintk_fmt {
+	struct list_head list;
+	char fmt[0];
+};
+
 
 static inline void lock_btrace(void)
 {
@@ -33,26 +45,75 @@ static inline void unlock_btrace(void)
 	mutex_unlock(&btrace_mutex);
 }
 
-static void get_btrace_metadata(void)
+
+static inline struct trace_bprintk_fmt *lookup_format(const char *fmt)
 {
-	lock_btrace();
-	btrace_metadata_count++;
-	unlock_btrace();
+	struct trace_bprintk_fmt *pos;
+	list_for_each_entry(pos, &trace_bprintk_fmt_list, list) {
+		if (!strcmp(pos->fmt, fmt))
+			return pos;
+	}
+	return NULL;
 }
 
-static void put_btrace_metadata(void)
+static
+void hold_module_trace_bprintk_format(const char **start, const char **end)
 {
+	const char **iter;
 	lock_btrace();
-	btrace_metadata_count--;
+	for (iter = start; iter < end; iter++) {
+		struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter);
+		if (tb_fmt) {
+			*iter = tb_fmt->fmt;
+			continue;
+		}
+
+		tb_fmt = kmalloc(offsetof(struct trace_bprintk_fmt, fmt)
+				+ strlen(*iter) + 1, GFP_KERNEL);
+		if (tb_fmt) {
+			list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list);
+			strcpy(tb_fmt->fmt, *iter);
+			*iter = tb_fmt->fmt;
+		} else
+			*iter = NULL;
+	}
 	unlock_btrace();
 }
 
+static int module_trace_bprintk_format_notify(struct notifier_block *self,
+		unsigned long val, void *data)
+{
+	struct module *mod = data;
+	if (mod->num_trace_bprintk_fmt) {
+		const char **start = mod->trace_bprintk_fmt_start;
+		const char **end = start + mod->num_trace_bprintk_fmt;
+
+		if (val == MODULE_STATE_COMING)
+			hold_module_trace_bprintk_format(start, end);
+	}
+	return 0;
+}
+
+#else /* !CONFIG_MODULES */
+__init static int
+module_trace_bprintk_format_notify(struct notifier_block *self,
+		unsigned long val, void *data)
+{
+	return 0;
+}
+#endif /* CONFIG_MODULES */
+
+
+__initdata_or_module static
+struct notifier_block module_trace_bprintk_format_nb = {
+	.notifier_call = module_trace_bprintk_format_notify,
+};
+
 /* events tracer */
 int trace_bprintk_enable;
 
 static void start_bprintk_trace(struct trace_array *tr)
 {
-	get_btrace_metadata();
 	tracing_reset_online_cpus(tr);
 	trace_bprintk_enable = 1;
 }
@@ -61,7 +122,6 @@ static void stop_bprintk_trace(struct trace_array *tr)
 {
 	trace_bprintk_enable = 0;
 	tracing_reset_online_cpus(tr);
-	put_btrace_metadata();
 }
 
 static int init_bprintk_trace(struct trace_array *tr)
@@ -81,7 +141,14 @@ static struct tracer bprintk_trace __read_mostly =
 
 static __init int init_bprintk(void)
 {
-	return register_tracer(&bprintk_trace);
+	int ret = register_module_notifier(&module_trace_bprintk_format_nb);
+	if (ret)
+		return ret;
+
+	ret = register_tracer(&bprintk_trace);
+	if (ret)
+		unregister_module_notifier(&module_trace_bprintk_format_nb);
+	return ret;
 }
 
 device_initcall(init_bprintk);
-- 
cgit v1.2.3


From 769b0441f438c4bb4872cb8560eb6fe51bcc09ee Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 6 Mar 2009 17:21:49 +0100
Subject: tracing/core: drop the old trace_printk() implementation in favour of
 trace_bprintk()

Impact: faster and lighter tracing

Now that we have trace_bprintk() which is faster and consume lesser
memory than trace_printk() and has the same purpose, we can now drop
the old implementation in favour of the binary one from trace_bprintk(),
which means we move all the implementation of trace_bprintk() to
trace_printk(), so the Api doesn't change except that we must now use
trace_seq_bprintk() to print the TRACE_PRINT entries.

Some changes result of this:

- Previously, trace_bprintk depended of a single tracer and couldn't
  work without. This tracer has been dropped and the whole implementation
  of trace_printk() (like the module formats management) is now integrated
  in the tracing core (comes with CONFIG_TRACING), though we keep the file
  trace_printk (previously trace_bprintk.c) where we can find the module
  management. Thus we don't overflow trace.c

- changes some parts to use trace_seq_bprintk() to print TRACE_PRINT entries.

- change a bit trace_printk/trace_vprintk macros to support non-builtin formats
  constants, and fix 'const' qualifiers warnings. But this is all transparent for
  developers.

- etc...

V2:

- Rebase against last changes
- Fix mispell on the changelog

V3:

- Rebase against last changes (moving trace_printk() to kernel.h)

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <1236356510-8381-5-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace.h               |  25 -----
 include/linux/kernel.h               |  34 +++++-
 include/linux/module.h               |   2 +-
 kernel/trace/Kconfig                 |   7 +-
 kernel/trace/Makefile                |   2 +-
 kernel/trace/trace.c                 | 212 ++++++++++-------------------------
 kernel/trace/trace.h                 |  14 +--
 kernel/trace/trace_bprintk.c         | 154 -------------------------
 kernel/trace/trace_functions_graph.c |   6 +-
 kernel/trace/trace_mmiotrace.c       |   9 +-
 kernel/trace/trace_output.c          |  70 ++----------
 kernel/trace/trace_output.h          |   2 +
 kernel/trace/trace_printk.c          | 138 +++++++++++++++++++++++
 13 files changed, 262 insertions(+), 413 deletions(-)
 delete mode 100644 kernel/trace/trace_bprintk.c
 create mode 100644 kernel/trace/trace_printk.c

(limited to 'kernel')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 1cc8ca453a9b..e1583f2639b0 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -223,31 +223,6 @@ extern int ftrace_make_nop(struct module *mod,
  */
 extern int ftrace_make_call(struct dyn_ftrace *rec, unsigned long addr);
 
-#ifdef CONFIG_TRACE_BPRINTK
-extern int trace_vbprintk(unsigned long ip, const char *fmt, va_list args);
-extern int __trace_bprintk(unsigned long ip, const char *fmt, ...)
-		__attribute__ ((format (printf, 2, 3)));
-
-static inline void  ____trace_bprintk_check_format(const char *fmt, ...)
-		__attribute__ ((format (printf, 1, 2)));
-static inline void ____trace_bprintk_check_format(const char *fmt, ...) {}
-#define __trace_bprintk_check_format(fmt, args...)			\
-do {									\
-	if (0)								\
-		____trace_bprintk_check_format(fmt, ##args);		\
-} while (0)
-
-#define trace_bprintk(fmt, args...)					\
-do {									\
-	static char *__attribute__((section("__trace_bprintk_fmt")))	\
-			trace_bprintk_fmt = fmt;			\
-	__trace_bprintk_check_format(fmt, ##args);			\
-	__trace_bprintk(_THIS_IP_, trace_bprintk_fmt, ##args);	\
-} while (0)
-#else
-#define trace_bprintk trace_printk
-#endif
-
 /* May be defined in arch */
 extern int ftrace_arch_read_dyn_info(char *buf, int size);
 
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 7aef15c4645e..4e726b9a71ec 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -423,6 +423,16 @@ extern void ftrace_off_permanent(void);
 extern void
 ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3);
 
+static inline void __attribute__ ((format (printf, 1, 2)))
+____trace_printk_check_format(const char *fmt, ...)
+{
+}
+#define __trace_printk_check_format(fmt, args...)			\
+do {									\
+	if (0)								\
+		____trace_printk_check_format(fmt, ##args);		\
+} while (0)
+
 /**
  * trace_printk - printf formatting in the ftrace buffer
  * @fmt: the printf format for printing
@@ -439,13 +449,31 @@ ftrace_special(unsigned long arg1, unsigned long arg2, unsigned long arg3);
  * Please refrain from leaving trace_printks scattered around in
  * your code.
  */
-# define trace_printk(fmt...) __trace_printk(_THIS_IP_, fmt)
+
+#define trace_printk(fmt, args...)					\
+do {									\
+	static const char *trace_printk_fmt				\
+	__attribute__((section("__trace_printk_fmt")));			\
+	trace_printk_fmt = fmt;					\
+	__trace_printk_check_format(fmt, ##args);			\
+	__trace_printk(_THIS_IP_, trace_printk_fmt, ##args);		\
+} while (0)
+
 extern int
 __trace_printk(unsigned long ip, const char *fmt, ...)
 	__attribute__ ((format (printf, 2, 3)));
-# define ftrace_vprintk(fmt, ap) __trace_printk(_THIS_IP_, fmt, ap)
+
+#define ftrace_vprintk(fmt, vargs)					\
+do {									\
+	static const char *trace_printk_fmt				\
+	__attribute__((section("__trace_printk_fmt")));			\
+	trace_printk_fmt = fmt;					\
+	__ftrace_vprintk(_THIS_IP_, trace_printk_fmt, vargs);		\
+} while (0)
+
 extern int
 __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap);
+
 extern void ftrace_dump(void);
 #else
 static inline void
@@ -467,7 +495,7 @@ ftrace_vprintk(const char *fmt, va_list ap)
 	return 0;
 }
 static inline void ftrace_dump(void) { }
-#endif
+#endif /* CONFIG_TRACING */
 
 /*
  *      Display an IP address in readable format.
diff --git a/include/linux/module.h b/include/linux/module.h
index 8cbec972d8e7..22d9878e868c 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -329,7 +329,7 @@ struct module
 	unsigned int num_tracepoints;
 #endif
 
-#ifdef CONFIG_TRACE_BPRINTK
+#ifdef CONFIG_TRACING
 	const char **trace_bprintk_fmt_start;
 	unsigned int num_trace_bprintk_fmt;
 #endif
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index ad8d3617d0a6..8e4a2a61cd75 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -52,6 +52,7 @@ config TRACING
 	select STACKTRACE if STACKTRACE_SUPPORT
 	select TRACEPOINTS
 	select NOP_TRACER
+	select BINARY_PRINTF
 
 #
 # Minimum requirements an architecture has to meet for us to
@@ -97,12 +98,6 @@ config FUNCTION_GRAPH_TRACER
 	  This is done by setting the current return address on the current
 	  task structure into a stack of calls.
 
-config TRACE_BPRINTK
-	bool "Binary printk for tracing"
-	default y
-	depends on TRACING
-	select BINARY_PRINTF
-
 config IRQSOFF_TRACER
 	bool "Interrupts-off Latency Tracer"
 	default n
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 46557ef4c379..c7a2943796eb 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -22,7 +22,7 @@ obj-$(CONFIG_TRACING) += trace.o
 obj-$(CONFIG_TRACING) += trace_clock.o
 obj-$(CONFIG_TRACING) += trace_output.o
 obj-$(CONFIG_TRACING) += trace_stat.o
-obj-$(CONFIG_TRACE_BPRINTK) += trace_bprintk.o
+obj-$(CONFIG_TRACING) += trace_printk.o
 obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o
 obj-$(CONFIG_SYSPROF_TRACER) += trace_sysprof.o
 obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 46b3cd7a5752..cc94f8642485 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1169,6 +1169,67 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
 }
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
 
+
+/**
+ * trace_vprintk - write binary msg to tracing buffer
+ *
+ */
+int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
+{
+	static DEFINE_SPINLOCK(trace_buf_lock);
+	static u32 trace_buf[TRACE_BUF_SIZE];
+
+	struct ring_buffer_event *event;
+	struct trace_array *tr = &global_trace;
+	struct trace_array_cpu *data;
+	struct print_entry *entry;
+	unsigned long flags;
+	int resched;
+	int cpu, len = 0, size, pc;
+
+	if (unlikely(tracing_selftest_running || tracing_disabled))
+		return 0;
+
+	/* Don't pollute graph traces with trace_vprintk internals */
+	pause_graph_tracing();
+
+	pc = preempt_count();
+	resched = ftrace_preempt_disable();
+	cpu = raw_smp_processor_id();
+	data = tr->data[cpu];
+
+	if (unlikely(atomic_read(&data->disabled)))
+		goto out;
+
+	spin_lock_irqsave(&trace_buf_lock, flags);
+	len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args);
+
+	if (len > TRACE_BUF_SIZE || len < 0)
+		goto out_unlock;
+
+	size = sizeof(*entry) + sizeof(u32) * len;
+	event = trace_buffer_lock_reserve(tr, TRACE_PRINT, size, flags, pc);
+	if (!event)
+		goto out_unlock;
+	entry = ring_buffer_event_data(event);
+	entry->ip			= ip;
+	entry->depth			= depth;
+	entry->fmt			= fmt;
+
+	memcpy(entry->buf, trace_buf, sizeof(u32) * len);
+	ring_buffer_unlock_commit(tr->buffer, event);
+
+out_unlock:
+	spin_unlock_irqrestore(&trace_buf_lock, flags);
+
+out:
+	ftrace_preempt_enable(resched);
+	unpause_graph_tracing();
+
+	return len;
+}
+EXPORT_SYMBOL_GPL(trace_vprintk);
+
 enum trace_file_type {
 	TRACE_FILE_LAT_FMT	= 1,
 	TRACE_FILE_ANNOTATE	= 2,
@@ -1564,7 +1625,7 @@ static enum print_line_t print_printk_msg_only(struct trace_iterator *iter)
 
 	trace_assign_type(field, entry);
 
-	ret = trace_seq_printf(s, "%s", field->buf);
+	ret = trace_seq_bprintf(s, field->fmt, field->buf);
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
@@ -3714,155 +3775,6 @@ static __init int tracer_init_debugfs(void)
 	return 0;
 }
 
-int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
-{
-	static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED;
-	static char trace_buf[TRACE_BUF_SIZE];
-
-	struct ring_buffer_event *event;
-	struct trace_array *tr = &global_trace;
-	struct trace_array_cpu *data;
-	int cpu, len = 0, size, pc;
-	struct print_entry *entry;
-	unsigned long irq_flags;
-
-	if (tracing_disabled || tracing_selftest_running)
-		return 0;
-
-	pc = preempt_count();
-	preempt_disable_notrace();
-	cpu = raw_smp_processor_id();
-	data = tr->data[cpu];
-
-	if (unlikely(atomic_read(&data->disabled)))
-		goto out;
-
-	pause_graph_tracing();
-	raw_local_irq_save(irq_flags);
-	__raw_spin_lock(&trace_buf_lock);
-	len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args);
-
-	len = min(len, TRACE_BUF_SIZE-1);
-	trace_buf[len] = 0;
-
-	size = sizeof(*entry) + len + 1;
-	event = trace_buffer_lock_reserve(tr, TRACE_PRINT, size, irq_flags, pc);
-	if (!event)
-		goto out_unlock;
-	entry = ring_buffer_event_data(event);
-	entry->ip			= ip;
-	entry->depth			= depth;
-
-	memcpy(&entry->buf, trace_buf, len);
-	entry->buf[len] = 0;
-	ring_buffer_unlock_commit(tr->buffer, event);
-
- out_unlock:
-	__raw_spin_unlock(&trace_buf_lock);
-	raw_local_irq_restore(irq_flags);
-	unpause_graph_tracing();
- out:
-	preempt_enable_notrace();
-
-	return len;
-}
-EXPORT_SYMBOL_GPL(trace_vprintk);
-
-int __trace_printk(unsigned long ip, const char *fmt, ...)
-{
-	int ret;
-	va_list ap;
-
-	if (!(trace_flags & TRACE_ITER_PRINTK))
-		return 0;
-
-	va_start(ap, fmt);
-	ret = trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap);
-	va_end(ap);
-	return ret;
-}
-EXPORT_SYMBOL_GPL(__trace_printk);
-
-int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap)
-{
-	if (!(trace_flags & TRACE_ITER_PRINTK))
-		return 0;
-
-	return trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap);
-}
-EXPORT_SYMBOL_GPL(__ftrace_vprintk);
-
-/**
- * trace_vbprintk - write binary msg to tracing buffer
- *
- * Caller must insure @fmt are valid when msg is in tracing buffer.
- */
-int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
-{
-	static DEFINE_SPINLOCK(trace_buf_lock);
-	static u32 trace_buf[TRACE_BUF_SIZE];
-
-	struct ring_buffer_event *event;
-	struct trace_array *tr = &global_trace;
-	struct trace_array_cpu *data;
-	struct bprintk_entry *entry;
-	unsigned long flags;
-	int resched;
-	int cpu, len = 0, size, pc;
-
-	if (tracing_disabled || !trace_bprintk_enable)
-		return 0;
-
-	pc = preempt_count();
-	resched = ftrace_preempt_disable();
-	cpu = raw_smp_processor_id();
-	data = tr->data[cpu];
-
-	if (unlikely(atomic_read(&data->disabled)))
-		goto out;
-
-	spin_lock_irqsave(&trace_buf_lock, flags);
-	len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args);
-
-	if (len > TRACE_BUF_SIZE || len < 0)
-		goto out_unlock;
-
-	size = sizeof(*entry) + sizeof(u32) * len;
-	event = trace_buffer_lock_reserve(tr, TRACE_BPRINTK, size, flags, pc);
-	if (!event)
-		goto out_unlock;
-	entry = ring_buffer_event_data(event);
-	entry->ip			= ip;
-	entry->fmt			= fmt;
-
-	memcpy(entry->buf, trace_buf, sizeof(u32) * len);
-	ring_buffer_unlock_commit(tr->buffer, event);
-
-out_unlock:
-	spin_unlock_irqrestore(&trace_buf_lock, flags);
-
-out:
-	ftrace_preempt_enable(resched);
-
-	return len;
-}
-EXPORT_SYMBOL_GPL(trace_vbprintk);
-
-int __trace_bprintk(unsigned long ip, const char *fmt, ...)
-{
-	int ret;
-	va_list ap;
-
-	if (!fmt)
-		return 0;
-
-	va_start(ap, fmt);
-	ret = trace_vbprintk(ip, fmt, ap);
-	va_end(ap);
-	return ret;
-}
-EXPORT_SYMBOL_GPL(__trace_bprintk);
-
 static int trace_panic_handler(struct notifier_block *this,
 			       unsigned long event, void *unused)
 {
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 0f5077f8f957..6140922392c8 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -20,7 +20,6 @@ enum trace_type {
 	TRACE_WAKE,
 	TRACE_STACK,
 	TRACE_PRINT,
-	TRACE_BPRINTK,
 	TRACE_SPECIAL,
 	TRACE_MMIO_RW,
 	TRACE_MMIO_MAP,
@@ -120,16 +119,10 @@ struct userstack_entry {
  */
 struct print_entry {
 	struct trace_entry	ent;
-	unsigned long		ip;
+	unsigned long 		ip;
 	int			depth;
-	char			buf[];
-};
-
-struct bprintk_entry {
-	struct trace_entry ent;
-	unsigned long ip;
-	const char *fmt;
-	u32 buf[];
+	const char		*fmt;
+	u32 			buf[];
 };
 #ifdef CONFIG_TRACE_BPRINTK
 extern int trace_bprintk_enable;
@@ -296,7 +289,6 @@ extern void __ftrace_bad_type(void);
 		IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK);	\
 		IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\
 		IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT);	\
-		IF_ASSIGN(var, ent, struct bprintk_entry, TRACE_BPRINTK);\
 		IF_ASSIGN(var, ent, struct special_entry, 0);		\
 		IF_ASSIGN(var, ent, struct trace_mmiotrace_rw,		\
 			  TRACE_MMIO_RW);				\
diff --git a/kernel/trace/trace_bprintk.c b/kernel/trace/trace_bprintk.c
deleted file mode 100644
index f4c245a5cd33..000000000000
--- a/kernel/trace/trace_bprintk.c
+++ /dev/null
@@ -1,154 +0,0 @@
-/*
- * trace binary printk
- *
- * Copyright (C) 2008 Lai Jiangshan <laijs@cn.fujitsu.com>
- *
- */
-#include <linux/kernel.h>
-#include <linux/ftrace.h>
-#include <linux/string.h>
-#include <linux/ctype.h>
-#include <linux/list.h>
-#include <linux/mutex.h>
-#include <linux/slab.h>
-#include <linux/module.h>
-#include <linux/seq_file.h>
-#include <linux/fs.h>
-#include <linux/marker.h>
-#include <linux/uaccess.h>
-
-#include "trace.h"
-
-#ifdef CONFIG_MODULES
-
-/* binary printk basic */
-static DEFINE_MUTEX(btrace_mutex);
-/*
- * modules trace_bprintk()'s formats are autosaved in struct trace_bprintk_fmt
- * which are queued on trace_bprintk_fmt_list.
- */
-static LIST_HEAD(trace_bprintk_fmt_list);
-
-struct trace_bprintk_fmt {
-	struct list_head list;
-	char fmt[0];
-};
-
-
-static inline void lock_btrace(void)
-{
-	mutex_lock(&btrace_mutex);
-}
-
-static inline void unlock_btrace(void)
-{
-	mutex_unlock(&btrace_mutex);
-}
-
-
-static inline struct trace_bprintk_fmt *lookup_format(const char *fmt)
-{
-	struct trace_bprintk_fmt *pos;
-	list_for_each_entry(pos, &trace_bprintk_fmt_list, list) {
-		if (!strcmp(pos->fmt, fmt))
-			return pos;
-	}
-	return NULL;
-}
-
-static
-void hold_module_trace_bprintk_format(const char **start, const char **end)
-{
-	const char **iter;
-	lock_btrace();
-	for (iter = start; iter < end; iter++) {
-		struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter);
-		if (tb_fmt) {
-			*iter = tb_fmt->fmt;
-			continue;
-		}
-
-		tb_fmt = kmalloc(offsetof(struct trace_bprintk_fmt, fmt)
-				+ strlen(*iter) + 1, GFP_KERNEL);
-		if (tb_fmt) {
-			list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list);
-			strcpy(tb_fmt->fmt, *iter);
-			*iter = tb_fmt->fmt;
-		} else
-			*iter = NULL;
-	}
-	unlock_btrace();
-}
-
-static int module_trace_bprintk_format_notify(struct notifier_block *self,
-		unsigned long val, void *data)
-{
-	struct module *mod = data;
-	if (mod->num_trace_bprintk_fmt) {
-		const char **start = mod->trace_bprintk_fmt_start;
-		const char **end = start + mod->num_trace_bprintk_fmt;
-
-		if (val == MODULE_STATE_COMING)
-			hold_module_trace_bprintk_format(start, end);
-	}
-	return 0;
-}
-
-#else /* !CONFIG_MODULES */
-__init static int
-module_trace_bprintk_format_notify(struct notifier_block *self,
-		unsigned long val, void *data)
-{
-	return 0;
-}
-#endif /* CONFIG_MODULES */
-
-
-__initdata_or_module static
-struct notifier_block module_trace_bprintk_format_nb = {
-	.notifier_call = module_trace_bprintk_format_notify,
-};
-
-/* events tracer */
-int trace_bprintk_enable;
-
-static void start_bprintk_trace(struct trace_array *tr)
-{
-	tracing_reset_online_cpus(tr);
-	trace_bprintk_enable = 1;
-}
-
-static void stop_bprintk_trace(struct trace_array *tr)
-{
-	trace_bprintk_enable = 0;
-	tracing_reset_online_cpus(tr);
-}
-
-static int init_bprintk_trace(struct trace_array *tr)
-{
-	start_bprintk_trace(tr);
-	return 0;
-}
-
-static struct tracer bprintk_trace __read_mostly =
-{
-	.name	     = "events",
-	.init	     = init_bprintk_trace,
-	.reset	     = stop_bprintk_trace,
-	.start	     = start_bprintk_trace,
-	.stop	     = stop_bprintk_trace,
-};
-
-static __init int init_bprintk(void)
-{
-	int ret = register_module_notifier(&module_trace_bprintk_format_nb);
-	if (ret)
-		return ret;
-
-	ret = register_tracer(&bprintk_trace);
-	if (ret)
-		unregister_module_notifier(&module_trace_bprintk_format_nb);
-	return ret;
-}
-
-device_initcall(init_bprintk);
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index e527f2f66c73..453ebd3b636e 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -742,7 +742,11 @@ print_graph_comment(struct print_entry *trace, struct trace_seq *s,
 		}
 
 	/* The comment */
-	ret = trace_seq_printf(s, "/* %s", trace->buf);
+	ret = trace_seq_printf(s, "/* ");
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	ret = trace_seq_bprintf(s, trace->fmt, trace->buf);
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index c401b908e805..23e346a734ca 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -254,15 +254,18 @@ static enum print_line_t mmio_print_mark(struct trace_iterator *iter)
 {
 	struct trace_entry *entry = iter->ent;
 	struct print_entry *print = (struct print_entry *)entry;
-	const char *msg		= print->buf;
 	struct trace_seq *s	= &iter->seq;
 	unsigned long long t	= ns2usecs(iter->ts);
-	unsigned long usec_rem	= do_div(t, 1000000ULL);
+	unsigned long usec_rem	= do_div(t, USEC_PER_SEC);
 	unsigned secs		= (unsigned long)t;
 	int ret;
 
 	/* The trailing newline must be in the message. */
-	ret = trace_seq_printf(s, "MARK %u.%06lu %s", secs, usec_rem, msg);
+	ret = trace_seq_printf(s, "MARK %u.%06lu ", secs, usec_rem);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	ret = trace_seq_bprintf(s, print->fmt, print->buf);
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 4ab71201862e..ef8fd661b217 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -53,8 +53,7 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
 	return len;
 }
 
-static int
-trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
+int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary)
 {
 	int len = (PAGE_SIZE - 1) - s->len;
 	int ret;
@@ -834,54 +833,12 @@ static struct trace_event trace_user_stack_event = {
 };
 
 /* TRACE_PRINT */
-static enum print_line_t trace_print_print(struct trace_iterator *iter,
-					   int flags)
-{
-	struct print_entry *field;
-	struct trace_seq *s = &iter->seq;
-
-	trace_assign_type(field, iter->ent);
-
-	if (!seq_print_ip_sym(s, field->ip, flags))
-		goto partial;
-
-	if (!trace_seq_printf(s, ": %s", field->buf))
-		goto partial;
-
-	return TRACE_TYPE_HANDLED;
-
- partial:
-	return TRACE_TYPE_PARTIAL_LINE;
-}
-
-static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags)
-{
-	struct print_entry *field;
-
-	trace_assign_type(field, iter->ent);
-
-	if (!trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf))
-		goto partial;
-
-	return TRACE_TYPE_HANDLED;
-
- partial:
-	return TRACE_TYPE_PARTIAL_LINE;
-}
-
-static struct trace_event trace_print_event = {
-	.type	 	= TRACE_PRINT,
-	.trace		= trace_print_print,
-	.raw		= trace_print_raw,
-};
-
-/* TRACE_BPRINTK */
 static enum print_line_t
-trace_bprintk_print(struct trace_iterator *iter, int flags)
+trace_print_print(struct trace_iterator *iter, int flags)
 {
 	struct trace_entry *entry = iter->ent;
 	struct trace_seq *s = &iter->seq;
-	struct bprintk_entry *field;
+	struct print_entry *field;
 
 	trace_assign_type(field, entry);
 
@@ -900,14 +857,13 @@ trace_bprintk_print(struct trace_iterator *iter, int flags)
 	return TRACE_TYPE_PARTIAL_LINE;
 }
 
-static enum print_line_t
-trace_bprintk_raw(struct trace_iterator *iter, int flags)
+
+static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags)
 {
-	struct trace_entry *entry = iter->ent;
+	struct print_entry *field;
 	struct trace_seq *s = &iter->seq;
-	struct bprintk_entry *field;
 
-	trace_assign_type(field, entry);
+	trace_assign_type(field, iter->ent);
 
 	if (!trace_seq_printf(s, ": %lx : ", field->ip))
 		goto partial;
@@ -921,12 +877,11 @@ trace_bprintk_raw(struct trace_iterator *iter, int flags)
 	return TRACE_TYPE_PARTIAL_LINE;
 }
 
-static struct trace_event trace_bprintk_event = {
-	.type	 	= TRACE_BPRINTK,
-	.trace		= trace_bprintk_print,
-	.raw		= trace_bprintk_raw,
-	.hex		= trace_nop_print,
-	.binary		= trace_nop_print,
+
+static struct trace_event trace_print_event = {
+	.type	 	= TRACE_PRINT,
+	.trace		= trace_print_print,
+	.raw		= trace_print_raw,
 };
 
 static struct trace_event *events[] __initdata = {
@@ -937,7 +892,6 @@ static struct trace_event *events[] __initdata = {
 	&trace_stack_event,
 	&trace_user_stack_event,
 	&trace_print_event,
-	&trace_bprintk_event,
 	NULL
 };
 
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 8a34d688ed63..3b90e6ade1aa 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -18,6 +18,8 @@ struct trace_event {
 extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
 	__attribute__ ((format (printf, 2, 3)));
 extern int
+trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary);
+extern int
 seq_print_ip_sym(struct trace_seq *s, unsigned long ip,
 		unsigned long sym_flags);
 extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
new file mode 100644
index 000000000000..a50aea22e929
--- /dev/null
+++ b/kernel/trace/trace_printk.c
@@ -0,0 +1,138 @@
+/*
+ * trace binary printk
+ *
+ * Copyright (C) 2008 Lai Jiangshan <laijs@cn.fujitsu.com>
+ *
+ */
+#include <linux/kernel.h>
+#include <linux/ftrace.h>
+#include <linux/string.h>
+#include <linux/ctype.h>
+#include <linux/list.h>
+#include <linux/mutex.h>
+#include <linux/slab.h>
+#include <linux/module.h>
+#include <linux/seq_file.h>
+#include <linux/fs.h>
+#include <linux/marker.h>
+#include <linux/uaccess.h>
+
+#include "trace.h"
+
+#ifdef CONFIG_MODULES
+
+/*
+ * modules trace_printk()'s formats are autosaved in struct trace_bprintk_fmt
+ * which are queued on trace_bprintk_fmt_list.
+ */
+static LIST_HEAD(trace_bprintk_fmt_list);
+
+/* serialize accesses to trace_bprintk_fmt_list */
+static DEFINE_MUTEX(btrace_mutex);
+
+struct trace_bprintk_fmt {
+	struct list_head list;
+	char fmt[0];
+};
+
+static inline struct trace_bprintk_fmt *lookup_format(const char *fmt)
+{
+	struct trace_bprintk_fmt *pos;
+	list_for_each_entry(pos, &trace_bprintk_fmt_list, list) {
+		if (!strcmp(pos->fmt, fmt))
+			return pos;
+	}
+	return NULL;
+}
+
+static
+void hold_module_trace_bprintk_format(const char **start, const char **end)
+{
+	const char **iter;
+
+	mutex_lock(&btrace_mutex);
+	for (iter = start; iter < end; iter++) {
+		struct trace_bprintk_fmt *tb_fmt = lookup_format(*iter);
+		if (tb_fmt) {
+			*iter = tb_fmt->fmt;
+			continue;
+		}
+
+		tb_fmt = kmalloc(offsetof(struct trace_bprintk_fmt, fmt)
+				+ strlen(*iter) + 1, GFP_KERNEL);
+		if (tb_fmt) {
+			list_add_tail(&tb_fmt->list, &trace_bprintk_fmt_list);
+			strcpy(tb_fmt->fmt, *iter);
+			*iter = tb_fmt->fmt;
+		} else
+			*iter = NULL;
+	}
+	mutex_unlock(&btrace_mutex);
+}
+
+static int module_trace_bprintk_format_notify(struct notifier_block *self,
+		unsigned long val, void *data)
+{
+	struct module *mod = data;
+	if (mod->num_trace_bprintk_fmt) {
+		const char **start = mod->trace_bprintk_fmt_start;
+		const char **end = start + mod->num_trace_bprintk_fmt;
+
+		if (val == MODULE_STATE_COMING)
+			hold_module_trace_bprintk_format(start, end);
+	}
+	return 0;
+}
+
+#else /* !CONFIG_MODULES */
+__init static int
+module_trace_bprintk_format_notify(struct notifier_block *self,
+		unsigned long val, void *data)
+{
+	return 0;
+}
+#endif /* CONFIG_MODULES */
+
+
+__initdata_or_module static
+struct notifier_block module_trace_bprintk_format_nb = {
+	.notifier_call = module_trace_bprintk_format_notify,
+};
+
+int __trace_printk(unsigned long ip, const char *fmt, ...)
+ {
+	int ret;
+	va_list ap;
+
+	if (unlikely(!fmt))
+		return 0;
+
+	if (!(trace_flags & TRACE_ITER_PRINTK))
+		return 0;
+
+	va_start(ap, fmt);
+	ret = trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap);
+	va_end(ap);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(__trace_printk);
+
+int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap)
+ {
+	if (unlikely(!fmt))
+		return 0;
+
+	if (!(trace_flags & TRACE_ITER_PRINTK))
+		return 0;
+
+	return trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap);
+}
+EXPORT_SYMBOL_GPL(__ftrace_vprintk);
+
+
+static __init int init_trace_printk(void)
+{
+	return register_module_notifier(&module_trace_bprintk_format_nb);
+}
+
+early_initcall(init_trace_printk);
-- 
cgit v1.2.3


From 9de36825b321fe9fe9cf73260554251af579f4ca Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 6 Mar 2009 17:52:03 +0100
Subject: tracing: trace_bprintk() cleanups

Impact: cleanup

Remove a few leftovers and clean up the code a bit.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <1236356510-8381-5-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/module.c      |  6 ------
 kernel/trace/trace.h | 19 ++++++++-----------
 2 files changed, 8 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 2dece104f9a1..22d7379709da 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2158,12 +2158,6 @@ static noinline struct module *load_module(void __user *umod,
 					&mod->num_tracepoints);
 #endif
 
-#ifdef CONFIG_TRACE_BPRINTK
-	mod->trace_bprintk_fmt_start = section_objs(hdr, sechdrs, secstrings,
-			"__trace_bprintk_fmt", sizeof(char *),
-			&mod->num_trace_bprintk_fmt);
-#endif
-
 #ifdef CONFIG_MODVERSIONS
 	if ((mod->num_syms && !mod->crcs)
 	    || (mod->num_gpl_syms && !mod->gpl_crcs)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 6140922392c8..2bfb7d11fc17 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -119,14 +119,11 @@ struct userstack_entry {
  */
 struct print_entry {
 	struct trace_entry	ent;
-	unsigned long 		ip;
+	unsigned long		ip;
 	int			depth;
 	const char		*fmt;
-	u32 			buf[];
+	u32			buf[];
 };
-#ifdef CONFIG_TRACE_BPRINTK
-extern int trace_bprintk_enable;
-#endif
 
 #define TRACE_OLD_SIZE		88
 
@@ -199,7 +196,7 @@ struct kmemtrace_free_entry {
  * trace_flag_type is an enumeration that holds different
  * states when a trace occurs. These are:
  *  IRQS_OFF		- interrupts were disabled
- *  IRQS_NOSUPPORT 	- arch does not support irqs_disabled_flags
+ *  IRQS_NOSUPPORT	- arch does not support irqs_disabled_flags
  *  NEED_RESCED		- reschedule is requested
  *  HARDIRQ		- inside an interrupt handler
  *  SOFTIRQ		- inside a softirq handler
@@ -302,7 +299,7 @@ extern void __ftrace_bad_type(void);
 		IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry,	\
 			  TRACE_GRAPH_RET);		\
 		IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\
- 		IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \
+		IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \
 		IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry,	\
 			  TRACE_KMEM_ALLOC);	\
 		IF_ASSIGN(var, ent, struct kmemtrace_free_entry,	\
@@ -325,8 +322,8 @@ enum print_line_t {
  * flags value in struct tracer_flags.
  */
 struct tracer_opt {
-	const char 	*name; /* Will appear on the trace_options file */
-	u32 		bit; /* Mask assigned in val field in tracer_flags */
+	const char	*name; /* Will appear on the trace_options file */
+	u32		bit; /* Mask assigned in val field in tracer_flags */
 };
 
 /*
@@ -335,7 +332,7 @@ struct tracer_opt {
  */
 struct tracer_flags {
 	u32			val;
-	struct tracer_opt 	*opts;
+	struct tracer_opt	*opts;
 };
 
 /* Makes more easy to define a tracer opt */
@@ -390,7 +387,7 @@ struct tracer {
 	int			(*set_flag)(u32 old_flags, u32 bit, int set);
 	struct tracer		*next;
 	int			print_max;
-	struct tracer_flags 	*flags;
+	struct tracer_flags	*flags;
 	struct tracer_stat	*stats;
 };
 
-- 
cgit v1.2.3


From 888b55dc314d26239d84c3b187dae555a81c1605 Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Sun, 8 Mar 2009 13:12:43 +0900
Subject: ftrace: tracing header should put '#' at the beginning of a line

In a recent discussion, Andrew Morton pointed out that tracing header
should put '#' at the beginning of a line.

Then, we can easily filtered the header by following grep usage:

  cat trace | grep -v '^#'

Wakeup trace also has the same header problem.

Comparison of headers displayed:

before this patch:

 # tracer: wakeup
 #
 wakeup latency trace v1.1.5 on 2.6.29-rc7-tip-tip
 --------------------------------------------------------------------
  latency: 19059 us, #21277/21277, CPU#1 | (M:desktop VP:0, KP:0, SP:0 HP:0 #P:4)
     -----------------
     | task: kondemand/1-1644 (uid:0 nice:-5 policy:0 rt_prio:0)
     -----------------

 #                  _------=> CPU#
 #                 / _-----=> irqs-off
 #                | / _----=> need-resched
 #                || / _---=> hardirq/softirq
 #                ||| / _--=> preempt-depth
 #                |||| /
 #                |||||     delay
 #  cmd     pid   ||||| time  |   caller
 #     \   /      |||||   \   |   /
 irqbalan-1887    1d.s.    0us :   1887:120:R   + [001]  1644:115:S kondemand/1
 irqbalan-1887    1d.s.    1us : default_wake_function <-autoremove_wake_function
 irqbalan-1887    1d.s.    2us : check_preempt_wakeup <-try_to_wake_up

after this patch:

 # tracer: wakeup
 #
 # wakeup latency trace v1.1.5 on 2.6.29-rc7-tip-tip
 # --------------------------------------------------------------------
 # latency: 529 us, #530/530, CPU#0 | (M:desktop VP:0, KP:0, SP:0 HP:0 #P:4)
 #    -----------------
 #    | task: kondemand/0-1641 (uid:0 nice:-5 policy:0 rt_prio:0)
 #    -----------------
 #
 #                  _------=> CPU#
 #                 / _-----=> irqs-off
 #                | / _----=> need-resched
 #                || / _---=> hardirq/softirq
 #                ||| / _--=> preempt-depth
 #                |||| /
 #                |||||     delay
 #  cmd     pid   ||||| time  |   caller
 #     \   /      |||||   \   |   /
     sshd-2496    0d.s.    0us :   2496:120:R   + [000]  1641:115:S kondemand/0
     sshd-2496    0d.s.    1us : default_wake_function <-autoremove_wake_function
     sshd-2496    0d.s.    1us : check_preempt_wakeup <-try_to_wake_up

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
LKML-Reference: <20090308124421.23C3.A69D9226@jp.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index cc94f8642485..e5b56199e5e0 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1466,11 +1466,11 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
 	total = entries +
 		ring_buffer_overruns(iter->tr->buffer);
 
-	seq_printf(m, "%s latency trace v1.1.5 on %s\n",
+	seq_printf(m, "# %s latency trace v1.1.5 on %s\n",
 		   name, UTS_RELEASE);
-	seq_puts(m, "-----------------------------------"
+	seq_puts(m, "# -----------------------------------"
 		 "---------------------------------\n");
-	seq_printf(m, " latency: %lu us, #%lu/%lu, CPU#%d |"
+	seq_printf(m, "# latency: %lu us, #%lu/%lu, CPU#%d |"
 		   " (M:%s VP:%d, KP:%d, SP:%d HP:%d",
 		   nsecs_to_usecs(data->saved_latency),
 		   entries,
@@ -1492,24 +1492,24 @@ print_trace_header(struct seq_file *m, struct trace_iterator *iter)
 #else
 	seq_puts(m, ")\n");
 #endif
-	seq_puts(m, "    -----------------\n");
-	seq_printf(m, "    | task: %.16s-%d "
+	seq_puts(m, "#    -----------------\n");
+	seq_printf(m, "#    | task: %.16s-%d "
 		   "(uid:%d nice:%ld policy:%ld rt_prio:%ld)\n",
 		   data->comm, data->pid, data->uid, data->nice,
 		   data->policy, data->rt_priority);
-	seq_puts(m, "    -----------------\n");
+	seq_puts(m, "#    -----------------\n");
 
 	if (data->critical_start) {
-		seq_puts(m, " => started at: ");
+		seq_puts(m, "#  => started at: ");
 		seq_print_ip_sym(&iter->seq, data->critical_start, sym_flags);
 		trace_print_seq(m, &iter->seq);
-		seq_puts(m, "\n => ended at:   ");
+		seq_puts(m, "\n#  => ended at:   ");
 		seq_print_ip_sym(&iter->seq, data->critical_end, sym_flags);
 		trace_print_seq(m, &iter->seq);
-		seq_puts(m, "\n");
+		seq_puts(m, "#\n");
 	}
 
-	seq_puts(m, "\n");
+	seq_puts(m, "#\n");
 }
 
 static void test_cpu_buff_start(struct trace_iterator *iter)
-- 
cgit v1.2.3


From c3ffc7a40b7e94b094efe1c8ab4e24370a782b65 Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Mon, 9 Mar 2009 18:15:34 +0900
Subject: tracing: Don't use tracing_record_cmdline() in workqueue tracer

Impact: improve workqueue tracer output

Currently, /sys/kernel/debug/tracing/trace_stat/workqueues can display
wrong and strange thread names.

Why?

Currently, ftrace has tracing_record_cmdline()/trace_find_cmdline()
convenience function that implements a task->comm string cache.

This can avoid unnecessary memcpy overhead and the workqueue tracer
uses it.

However, in general, any trace statistics feature shouldn't use
tracing_record_cmdline() because trace statistics can display
very old process. Then comm cache can return wrong string because
recent process overrides the cache.

Fortunately, workqueue trace guarantees that displayed processes
are live. Thus we can search comm string from PID at display time.

<before>

% cat workqueues
 # CPU  INSERTED  EXECUTED   NAME
 # |      |         |          |

   7 431913     431913       kondemand/7
   7      0          0       tail
   7     21         21       git
   7      0          0       ls
   7      9          9       cat
   7 832632     832632       unix_chkpwd
   7 236292     236292       ls

Note: tail, git, ls, cat unix_chkpwd are obiously not workqueue thread.

<after>

% cat workqueues
 # CPU  INSERTED  EXECUTED   NAME
 # |      |         |          |

   7    510        510       kondemand/7
   7      0          0       kmpathd/7
   7     15         15       ata/7
   7      0          0       aio/7
   7     11         11       kblockd/7
   7   1063       1063       work_on_cpu/7
   7    167        167       events/7

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Steven Rostedt <srostedt@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_workqueue.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index 4664990fe9c5..46c8dc896bd3 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -99,8 +99,6 @@ static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu)
 		pr_warning("trace_workqueue: not enough memory\n");
 		return;
 	}
-	tracing_record_cmdline(wq_thread);
-
 	INIT_LIST_HEAD(&cws->list);
 	cws->cpu = cpu;
 
@@ -195,11 +193,12 @@ static int workqueue_stat_show(struct seq_file *s, void *p)
 	struct cpu_workqueue_stats *cws = p;
 	unsigned long flags;
 	int cpu = cws->cpu;
+	struct task_struct *tsk = find_task_by_vpid(cws->pid);
 
 	seq_printf(s, "%3d %6d     %6u       %s\n", cws->cpu,
 		   atomic_read(&cws->inserted),
 		   cws->executed,
-		   trace_find_cmdline(cws->pid));
+		   tsk ? tsk->comm : "<...>");
 
 	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
 	if (&cws->list == workqueue_cpu_stat(cpu)->list.next)
-- 
cgit v1.2.3


From 6d5b5acca9e566515ef3f1ed617e7295c4f94345 Mon Sep 17 00:00:00 2001
From: Heiko Carstens <heiko.carstens@de.ibm.com>
Date: Mon, 9 Mar 2009 13:31:59 +0100
Subject: Fix fixpoint divide exception in acct_update_integrals

Frans Pop reported the crash below when running an s390 kernel under Hercules:

  Kernel BUG at 000738b4  verbose debug info unavailable!
  fixpoint divide exception: 0009  #1! SMP
  Modules linked in: nfs lockd nfs_acl sunrpc ctcm fsm tape_34xx
     cu3088 tape ccwgroup tape_class ext3 jbd mbcache dm_mirror dm_log dm_snapshot
     dm_mod dasd_eckd_mod dasd_mod
  CPU: 0 Not tainted 2.6.27.19 #13
  Process awk (pid: 2069, task: 0f9ed9b8, ksp: 0f4f7d18)
  Krnl PSW : 070c1000 800738b4 (acct_update_integrals+0x4c/0x118)
             R:0 T:1 IO:1 EX:1 Key:0 M:1 W:0 P:0 AS:0 CC:1 PM:0
  Krnl GPRS: 00000000 000007d0 7fffffff fffff830
             00000000 ffffffff 00000002 0f9ed9b8
             00000000 00008ca0 00000000 0f9ed9b8
             0f9edda4 8007386e 0f4f7ec8 0f4f7e98
  Krnl Code: 800738aa: a71807d0         lhi     %r1,2000
             800738ae: 8c200001         srdl    %r2,1
             800738b2: 1d21             dr      %r2,%r1
            >800738b4: 5810d10e         l       %r1,270(%r13)
             800738b8: 1823             lr      %r2,%r3
             800738ba: 4130f060         la      %r3,96(%r15)
             800738be: 0de1             basr    %r14,%r1
             800738c0: 5800f060         l       %r0,96(%r15)
  Call Trace:
  ( <000000000004fdea>! blocking_notifier_call_chain+0x1e/0x2c)
    <0000000000038502>! do_exit+0x106/0x7c0
    <0000000000038c36>! do_group_exit+0x7a/0xb4
    <0000000000038c8e>! SyS_exit_group+0x1e/0x30
    <0000000000021c28>! sysc_do_restart+0x12/0x16
    <0000000077e7e924>! 0x77e7e924

Reason for this is that cpu time accounting usually only happens from
interrupt context, but acct_update_integrals gets also called from
process context with interrupts enabled.

So in acct_update_integrals we may end up with the following scenario:

Between reading tsk->stime/tsk->utime and tsk->acct_timexpd an interrupt
happens which updates accouting values.  This causes acct_timexpd to be
greater than the former stime + utime.  The subsequent calculation of

	dtime = cputime_sub(time, tsk->acct_timexpd);

will be negative and the division performed by

	cputime_to_jiffies(dtime)

will generate an exception since the result won't fit into a 32 bit
register.

In order to fix this just always disable interrupts while accessing any
of the accounting values.

Reported by: Frans Pop <elendil@planet.nl>
Tested by: Frans Pop <elendil@planet.nl>
Cc: stable@kernel.org
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/tsacct.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/tsacct.c b/kernel/tsacct.c
index 43f891b05a4b..00d59d048edf 100644
--- a/kernel/tsacct.c
+++ b/kernel/tsacct.c
@@ -122,8 +122,10 @@ void acct_update_integrals(struct task_struct *tsk)
 	if (likely(tsk->mm)) {
 		cputime_t time, dtime;
 		struct timeval value;
+		unsigned long flags;
 		u64 delta;
 
+		local_irq_save(flags);
 		time = tsk->stime + tsk->utime;
 		dtime = cputime_sub(time, tsk->acct_timexpd);
 		jiffies_to_timeval(cputime_to_jiffies(dtime), &value);
@@ -131,10 +133,12 @@ void acct_update_integrals(struct task_struct *tsk)
 		delta = delta * USEC_PER_SEC + value.tv_usec;
 
 		if (delta == 0)
-			return;
+			goto out;
 		tsk->acct_timexpd = time;
 		tsk->acct_rss_mem1 += delta * get_mm_rss(tsk->mm);
 		tsk->acct_vm_mem1 += delta * tsk->mm->total_vm;
+	out:
+		local_irq_restore(flags);
 	}
 }
 
-- 
cgit v1.2.3


From 2d5516cbb9daf7d0e342a2e3b0fc6f8c39a81205 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 2 Mar 2009 22:58:45 +0100
Subject: copy_process: fix CLONE_PARENT && parent_exec_id interaction

CLONE_PARENT can fool the ->self_exec_id/parent_exec_id logic. If we
re-use the old parent, we must also re-use ->parent_exec_id to make
sure exit_notify() sees the right ->xxx_exec_id's when the CLONE_PARENT'ed
task exits.

Also, move down the "p->parent_exec_id = p->self_exec_id" thing, to place
two different cases together.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Serge E. Hallyn <serge@hallyn.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index a66fbde20715..4854c2c4a82e 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1179,10 +1179,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 #endif
 	clear_all_latency_tracing(p);
 
-	/* Our parent execution domain becomes current domain
-	   These must match for thread signalling to apply */
-	p->parent_exec_id = p->self_exec_id;
-
 	/* ok, now we should be set up.. */
 	p->exit_signal = (clone_flags & CLONE_THREAD) ? -1 : (clone_flags & CSIGNAL);
 	p->pdeath_signal = 0;
@@ -1220,10 +1216,13 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		set_task_cpu(p, smp_processor_id());
 
 	/* CLONE_PARENT re-uses the old parent */
-	if (clone_flags & (CLONE_PARENT|CLONE_THREAD))
+	if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) {
 		p->real_parent = current->real_parent;
-	else
+		p->parent_exec_id = current->parent_exec_id;
+	} else {
 		p->real_parent = current;
+		p->parent_exec_id = current->self_exec_id;
+	}
 
 	spin_lock(&current->sighand->siglock);
 
-- 
cgit v1.2.3


From 156b5f172a64103bcb13b6d26288388b9019caa3 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 6 Mar 2009 10:50:53 -0500
Subject: tracing: typecast sizeof and offsetof to unsigned int

Impact: fix compiler warnings

On x86_64 sizeof and offsetof are treated as long, where as on x86_32
they are int. This patch typecasts them to unsigned int to avoid
one arch giving warnings while the other does not.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_events.c | 15 ++++++++-------
 kernel/trace/trace_export.c | 10 +++++-----
 kernel/trace/trace_format.h | 12 ++++++------
 3 files changed, 19 insertions(+), 18 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 4488d90e75ef..fa32ca320767 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -448,8 +448,9 @@ event_available_types_read(struct file *filp, char __user *ubuf, size_t cnt,
 }
 
 #undef FIELD
-#define FIELD(type, name) \
-	#type, #name, offsetof(typeof(field), name), sizeof(field.name)
+#define FIELD(type, name)						\
+	#type, #name, (unsigned int)offsetof(typeof(field), name),	\
+		(unsigned int)sizeof(field.name)
 
 static int trace_write_header(struct trace_seq *s)
 {
@@ -457,11 +458,11 @@ static int trace_write_header(struct trace_seq *s)
 
 	/* struct trace_entry */
 	return trace_seq_printf(s,
-				"\tfield:%s %s;\toffset:%lu;\tsize:%lu;\n"
-				"\tfield:%s %s;\toffset:%lu;\tsize:%lu;\n"
-				"\tfield:%s %s;\toffset:%lu;\tsize:%lu;\n"
-				"\tfield:%s %s;\toffset:%lu;\tsize:%lu;\n"
-				"\tfield:%s %s;\toffset:%lu;\tsize:%lu;\n"
+				"\tfield:%s %s;\toffset:%u;\tsize:%u;\n"
+				"\tfield:%s %s;\toffset:%u;\tsize:%u;\n"
+				"\tfield:%s %s;\toffset:%u;\tsize:%u;\n"
+				"\tfield:%s %s;\toffset:%u;\tsize:%u;\n"
+				"\tfield:%s %s;\toffset:%u;\tsize:%u;\n"
 				"\n",
 				FIELD(unsigned char, type),
 				FIELD(unsigned char, flags),
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 0fb7be73e31c..7162ab49d05d 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -18,11 +18,11 @@
 #include "trace_format.h"
 
 #undef TRACE_FIELD_ZERO_CHAR
-#define TRACE_FIELD_ZERO_CHAR(item)				\
-	ret = trace_seq_printf(s, "\tfield: char " #item ";\t"	\
-			       "offset:%lu;\tsize:0;\n",	\
-			       offsetof(typeof(field), item));	\
-	if (!ret)						\
+#define TRACE_FIELD_ZERO_CHAR(item)					\
+	ret = trace_seq_printf(s, "\tfield: char " #item ";\t"		\
+			       "offset:%u;\tsize:0;\n",			\
+			       (unsigned int)offsetof(typeof(field), item)); \
+	if (!ret)							\
 		return 0;
 
 
diff --git a/kernel/trace/trace_format.h b/kernel/trace/trace_format.h
index 03f9a4c165ca..97e59a9c82ea 100644
--- a/kernel/trace/trace_format.h
+++ b/kernel/trace/trace_format.h
@@ -22,9 +22,9 @@
 #undef TRACE_FIELD
 #define TRACE_FIELD(type, item, assign)					\
 	ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t"	\
-			       "offset:%lu;\tsize:%lu;\n",		\
-			       offsetof(typeof(field), item),		\
-			       sizeof(field.item));			\
+			       "offset:%u;\tsize:%u;\n",		\
+			       (unsigned int)offsetof(typeof(field), item), \
+			       (unsigned int)sizeof(field.item));	\
 	if (!ret)							\
 		return 0;
 
@@ -32,9 +32,9 @@
 #undef TRACE_FIELD_SPECIAL
 #define TRACE_FIELD_SPECIAL(type_item, item, cmd)			\
 	ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t"	\
-			       "offset:%lu;\tsize:%lu;\n",		\
-			       offsetof(typeof(field), item),		\
-			       sizeof(field.item));			\
+			       "offset:%u;\tsize:%u;\n",		\
+			       (unsigned int)offsetof(typeof(field), item), \
+			       (unsigned int)sizeof(field.item));	\
 	if (!ret)							\
 		return 0;
 
-- 
cgit v1.2.3


From 2939b0469d04ba9ac791aca9a81625d7eb50662b Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 9 Mar 2009 15:47:18 -0400
Subject: tracing: replace TP<var> with TP_<var>

Impact: clean up

The macros TPPROTO, TPARGS, TPFMT, TPRAWFMT, and TPCMD all look a bit
ugly. This patch adds an underscore to their names.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 Documentation/tracepoints.txt          |  8 +--
 include/linux/tracepoint.h             |  6 +--
 include/trace/block.h                  | 70 ++++++++++++------------
 include/trace/irq_event_types.h        | 16 +++---
 include/trace/lockdep_event_types.h    | 24 ++++-----
 include/trace/power.h                  | 12 ++---
 include/trace/sched_event_types.h      | 98 +++++++++++++++++-----------------
 include/trace/workqueue.h              | 16 +++---
 kernel/trace/trace_event_types.h       | 28 +++++-----
 kernel/trace/trace_events_stage_2.h    |  6 +--
 kernel/trace/trace_events_stage_3.h    |  8 +--
 kernel/trace/trace_export.c            |  8 +--
 kernel/trace/trace_format.h            | 55 -------------------
 samples/tracepoints/tp-samples-trace.h |  8 +--
 14 files changed, 154 insertions(+), 209 deletions(-)
 delete mode 100644 kernel/trace/trace_format.h

(limited to 'kernel')

diff --git a/Documentation/tracepoints.txt b/Documentation/tracepoints.txt
index 6f0a044f5b5e..4ff43c6de299 100644
--- a/Documentation/tracepoints.txt
+++ b/Documentation/tracepoints.txt
@@ -45,8 +45,8 @@ In include/trace/subsys.h :
 #include <linux/tracepoint.h>
 
 DECLARE_TRACE(subsys_eventname,
-	TPPROTO(int firstarg, struct task_struct *p),
-	TPARGS(firstarg, p));
+	TP_PROTO(int firstarg, struct task_struct *p),
+	TP_ARGS(firstarg, p));
 
 In subsys/file.c (where the tracing statement must be added) :
 
@@ -66,10 +66,10 @@ Where :
     - subsys is the name of your subsystem.
     - eventname is the name of the event to trace.
 
-- TPPROTO(int firstarg, struct task_struct *p) is the prototype of the
+- TP_PROTO(int firstarg, struct task_struct *p) is the prototype of the
   function called by this tracepoint.
 
-- TPARGS(firstarg, p) are the parameters names, same as found in the
+- TP_ARGS(firstarg, p) are the parameters names, same as found in the
   prototype.
 
 Connecting a function (probe) to a tracepoint is done by providing a
diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 152b2f03fb86..3bcc3e171443 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -31,8 +31,8 @@ struct tracepoint {
 					 * Keep in sync with vmlinux.lds.h.
 					 */
 
-#define TPPROTO(args...)	args
-#define TPARGS(args...)		args
+#define TP_PROTO(args...)	args
+#define TP_ARGS(args...)		args
 
 #ifdef CONFIG_TRACEPOINTS
 
@@ -65,7 +65,7 @@ struct tracepoint {
 	{								\
 		if (unlikely(__tracepoint_##name.state))		\
 			__DO_TRACE(&__tracepoint_##name,		\
-				TPPROTO(proto), TPARGS(args));		\
+				TP_PROTO(proto), TP_ARGS(args));	\
 	}								\
 	static inline int register_trace_##name(void (*probe)(proto))	\
 	{								\
diff --git a/include/trace/block.h b/include/trace/block.h
index 25c6a1fd5b77..25b7068b819e 100644
--- a/include/trace/block.h
+++ b/include/trace/block.h
@@ -5,72 +5,72 @@
 #include <linux/tracepoint.h>
 
 DECLARE_TRACE(block_rq_abort,
-	TPPROTO(struct request_queue *q, struct request *rq),
-		TPARGS(q, rq));
+	TP_PROTO(struct request_queue *q, struct request *rq),
+	      TP_ARGS(q, rq));
 
 DECLARE_TRACE(block_rq_insert,
-	TPPROTO(struct request_queue *q, struct request *rq),
-		TPARGS(q, rq));
+	TP_PROTO(struct request_queue *q, struct request *rq),
+	      TP_ARGS(q, rq));
 
 DECLARE_TRACE(block_rq_issue,
-	TPPROTO(struct request_queue *q, struct request *rq),
-		TPARGS(q, rq));
+	TP_PROTO(struct request_queue *q, struct request *rq),
+	      TP_ARGS(q, rq));
 
 DECLARE_TRACE(block_rq_requeue,
-	TPPROTO(struct request_queue *q, struct request *rq),
-		TPARGS(q, rq));
+	TP_PROTO(struct request_queue *q, struct request *rq),
+	      TP_ARGS(q, rq));
 
 DECLARE_TRACE(block_rq_complete,
-	TPPROTO(struct request_queue *q, struct request *rq),
-		TPARGS(q, rq));
+	TP_PROTO(struct request_queue *q, struct request *rq),
+	      TP_ARGS(q, rq));
 
 DECLARE_TRACE(block_bio_bounce,
-	TPPROTO(struct request_queue *q, struct bio *bio),
-		TPARGS(q, bio));
+	TP_PROTO(struct request_queue *q, struct bio *bio),
+	      TP_ARGS(q, bio));
 
 DECLARE_TRACE(block_bio_complete,
-	TPPROTO(struct request_queue *q, struct bio *bio),
-		TPARGS(q, bio));
+	TP_PROTO(struct request_queue *q, struct bio *bio),
+	      TP_ARGS(q, bio));
 
 DECLARE_TRACE(block_bio_backmerge,
-	TPPROTO(struct request_queue *q, struct bio *bio),
-		TPARGS(q, bio));
+	TP_PROTO(struct request_queue *q, struct bio *bio),
+	      TP_ARGS(q, bio));
 
 DECLARE_TRACE(block_bio_frontmerge,
-	TPPROTO(struct request_queue *q, struct bio *bio),
-		TPARGS(q, bio));
+	TP_PROTO(struct request_queue *q, struct bio *bio),
+	      TP_ARGS(q, bio));
 
 DECLARE_TRACE(block_bio_queue,
-	TPPROTO(struct request_queue *q, struct bio *bio),
-		TPARGS(q, bio));
+	TP_PROTO(struct request_queue *q, struct bio *bio),
+	      TP_ARGS(q, bio));
 
 DECLARE_TRACE(block_getrq,
-	TPPROTO(struct request_queue *q, struct bio *bio, int rw),
-		TPARGS(q, bio, rw));
+	TP_PROTO(struct request_queue *q, struct bio *bio, int rw),
+	      TP_ARGS(q, bio, rw));
 
 DECLARE_TRACE(block_sleeprq,
-	TPPROTO(struct request_queue *q, struct bio *bio, int rw),
-		TPARGS(q, bio, rw));
+	TP_PROTO(struct request_queue *q, struct bio *bio, int rw),
+	      TP_ARGS(q, bio, rw));
 
 DECLARE_TRACE(block_plug,
-	TPPROTO(struct request_queue *q),
-		TPARGS(q));
+	TP_PROTO(struct request_queue *q),
+	      TP_ARGS(q));
 
 DECLARE_TRACE(block_unplug_timer,
-	TPPROTO(struct request_queue *q),
-		TPARGS(q));
+	TP_PROTO(struct request_queue *q),
+	      TP_ARGS(q));
 
 DECLARE_TRACE(block_unplug_io,
-	TPPROTO(struct request_queue *q),
-		TPARGS(q));
+	TP_PROTO(struct request_queue *q),
+	      TP_ARGS(q));
 
 DECLARE_TRACE(block_split,
-	TPPROTO(struct request_queue *q, struct bio *bio, unsigned int pdu),
-		TPARGS(q, bio, pdu));
+	TP_PROTO(struct request_queue *q, struct bio *bio, unsigned int pdu),
+	      TP_ARGS(q, bio, pdu));
 
 DECLARE_TRACE(block_remap,
-	TPPROTO(struct request_queue *q, struct bio *bio, dev_t dev,
-		sector_t from, sector_t to),
-		TPARGS(q, bio, dev, from, to));
+	TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev,
+		 sector_t from, sector_t to),
+	      TP_ARGS(q, bio, dev, from, to));
 
 #endif
diff --git a/include/trace/irq_event_types.h b/include/trace/irq_event_types.h
index 65850bc5ea06..0147d9eef5f4 100644
--- a/include/trace/irq_event_types.h
+++ b/include/trace/irq_event_types.h
@@ -9,25 +9,25 @@
 #define TRACE_SYSTEM irq
 
 TRACE_EVENT_FORMAT(irq_handler_entry,
-	TPPROTO(int irq, struct irqaction *action),
-	TPARGS(irq, action),
-	TPFMT("irq=%d handler=%s", irq, action->name),
+	TP_PROTO(int irq, struct irqaction *action),
+	TP_ARGS(irq, action),
+	TP_FMT("irq=%d handler=%s", irq, action->name),
 	TRACE_STRUCT(
 		TRACE_FIELD(int, irq, irq)
 	),
-	TPRAWFMT("irq %d")
+	TP_RAW_FMT("irq %d")
 	);
 
 TRACE_EVENT_FORMAT(irq_handler_exit,
-	TPPROTO(int irq, struct irqaction *action, int ret),
-	TPARGS(irq, action, ret),
-	TPFMT("irq=%d handler=%s return=%s",
+	TP_PROTO(int irq, struct irqaction *action, int ret),
+	TP_ARGS(irq, action, ret),
+	TP_FMT("irq=%d handler=%s return=%s",
 		irq, action->name, ret ? "handled" : "unhandled"),
 	TRACE_STRUCT(
 		TRACE_FIELD(int, irq, irq)
 		TRACE_FIELD(int, ret, ret)
 	),
-	TPRAWFMT("irq %d ret %d")
+	TP_RAW_FMT("irq %d ret %d")
 	);
 
 #undef TRACE_SYSTEM
diff --git a/include/trace/lockdep_event_types.h b/include/trace/lockdep_event_types.h
index f713d74a82b4..1f00e8b3543e 100644
--- a/include/trace/lockdep_event_types.h
+++ b/include/trace/lockdep_event_types.h
@@ -10,32 +10,32 @@
 #ifdef CONFIG_LOCKDEP
 
 TRACE_FORMAT(lock_acquire,
-	TPPROTO(struct lockdep_map *lock, unsigned int subclass,
+	TP_PROTO(struct lockdep_map *lock, unsigned int subclass,
 		int trylock, int read, int check,
 		struct lockdep_map *next_lock, unsigned long ip),
-	TPARGS(lock, subclass, trylock, read, check, next_lock, ip),
-	TPFMT("%s%s%s", trylock ? "try " : "",
+	TP_ARGS(lock, subclass, trylock, read, check, next_lock, ip),
+	TP_FMT("%s%s%s", trylock ? "try " : "",
 		read ? "read " : "", lock->name)
 	);
 
 TRACE_FORMAT(lock_release,
-	TPPROTO(struct lockdep_map *lock, int nested, unsigned long ip),
-	TPARGS(lock, nested, ip),
-	TPFMT("%s", lock->name)
+	TP_PROTO(struct lockdep_map *lock, int nested, unsigned long ip),
+	TP_ARGS(lock, nested, ip),
+	TP_FMT("%s", lock->name)
 	);
 
 #ifdef CONFIG_LOCK_STAT
 
 TRACE_FORMAT(lock_contended,
-	TPPROTO(struct lockdep_map *lock, unsigned long ip),
-	TPARGS(lock, ip),
-	TPFMT("%s", lock->name)
+	TP_PROTO(struct lockdep_map *lock, unsigned long ip),
+	TP_ARGS(lock, ip),
+	TP_FMT("%s", lock->name)
 	);
 
 TRACE_FORMAT(lock_acquired,
-	TPPROTO(struct lockdep_map *lock, unsigned long ip),
-	TPARGS(lock, ip),
-	TPFMT("%s", lock->name)
+	TP_PROTO(struct lockdep_map *lock, unsigned long ip),
+	TP_ARGS(lock, ip),
+	TP_FMT("%s", lock->name)
 	);
 
 #endif
diff --git a/include/trace/power.h b/include/trace/power.h
index 38aca537e497..ef204666e983 100644
--- a/include/trace/power.h
+++ b/include/trace/power.h
@@ -18,15 +18,15 @@ struct power_trace {
 };
 
 DECLARE_TRACE(power_start,
-	TPPROTO(struct power_trace *it, unsigned int type, unsigned int state),
-		TPARGS(it, type, state));
+	TP_PROTO(struct power_trace *it, unsigned int type, unsigned int state),
+	      TP_ARGS(it, type, state));
 
 DECLARE_TRACE(power_mark,
-	TPPROTO(struct power_trace *it, unsigned int type, unsigned int state),
-		TPARGS(it, type, state));
+	TP_PROTO(struct power_trace *it, unsigned int type, unsigned int state),
+	      TP_ARGS(it, type, state));
 
 DECLARE_TRACE(power_end,
-	TPPROTO(struct power_trace *it),
-		TPARGS(it));
+	TP_PROTO(struct power_trace *it),
+	      TP_ARGS(it));
 
 #endif /* _TRACE_POWER_H */
diff --git a/include/trace/sched_event_types.h b/include/trace/sched_event_types.h
index a6de5c1601a0..71b14828a957 100644
--- a/include/trace/sched_event_types.h
+++ b/include/trace/sched_event_types.h
@@ -9,143 +9,143 @@
 #define TRACE_SYSTEM sched
 
 TRACE_EVENT_FORMAT(sched_kthread_stop,
-	TPPROTO(struct task_struct *t),
-	TPARGS(t),
-	TPFMT("task %s:%d", t->comm, t->pid),
+	TP_PROTO(struct task_struct *t),
+	TP_ARGS(t),
+	TP_FMT("task %s:%d", t->comm, t->pid),
 	TRACE_STRUCT(
 		TRACE_FIELD(pid_t, pid, t->pid)
 	),
-	TPRAWFMT("task %d")
+	TP_RAW_FMT("task %d")
 	);
 
 TRACE_EVENT_FORMAT(sched_kthread_stop_ret,
-	TPPROTO(int ret),
-	TPARGS(ret),
-	TPFMT("ret=%d", ret),
+	TP_PROTO(int ret),
+	TP_ARGS(ret),
+	TP_FMT("ret=%d", ret),
 	TRACE_STRUCT(
 		TRACE_FIELD(int, ret, ret)
 	),
-	TPRAWFMT("ret=%d")
+	TP_RAW_FMT("ret=%d")
 	);
 
 TRACE_EVENT_FORMAT(sched_wait_task,
-	TPPROTO(struct rq *rq, struct task_struct *p),
-	TPARGS(rq, p),
-	TPFMT("task %s:%d", p->comm, p->pid),
+	TP_PROTO(struct rq *rq, struct task_struct *p),
+	TP_ARGS(rq, p),
+	TP_FMT("task %s:%d", p->comm, p->pid),
 	TRACE_STRUCT(
 		TRACE_FIELD(pid_t, pid, p->pid)
 	),
-	TPRAWFMT("task %d")
+	TP_RAW_FMT("task %d")
 	);
 
 TRACE_EVENT_FORMAT(sched_wakeup,
-	TPPROTO(struct rq *rq, struct task_struct *p, int success),
-	TPARGS(rq, p, success),
-	TPFMT("task %s:%d %s",
+	TP_PROTO(struct rq *rq, struct task_struct *p, int success),
+	TP_ARGS(rq, p, success),
+	TP_FMT("task %s:%d %s",
 	      p->comm, p->pid, success ? "succeeded" : "failed"),
 	TRACE_STRUCT(
 		TRACE_FIELD(pid_t, pid, p->pid)
 		TRACE_FIELD(int, success, success)
 	),
-	TPRAWFMT("task %d success=%d")
+	TP_RAW_FMT("task %d success=%d")
 	);
 
 TRACE_EVENT_FORMAT(sched_wakeup_new,
-	TPPROTO(struct rq *rq, struct task_struct *p, int success),
-	TPARGS(rq, p, success),
-	TPFMT("task %s:%d",
+	TP_PROTO(struct rq *rq, struct task_struct *p, int success),
+	TP_ARGS(rq, p, success),
+	TP_FMT("task %s:%d",
 	      p->comm, p->pid, success ? "succeeded" : "failed"),
 	TRACE_STRUCT(
 		TRACE_FIELD(pid_t, pid, p->pid)
 		TRACE_FIELD(int, success, success)
 	),
-	TPRAWFMT("task %d success=%d")
+	TP_RAW_FMT("task %d success=%d")
 	);
 
 TRACE_EVENT_FORMAT(sched_switch,
-	TPPROTO(struct rq *rq, struct task_struct *prev,
+	TP_PROTO(struct rq *rq, struct task_struct *prev,
 		struct task_struct *next),
-	TPARGS(rq, prev, next),
-	TPFMT("task %s:%d ==> %s:%d",
+	TP_ARGS(rq, prev, next),
+	TP_FMT("task %s:%d ==> %s:%d",
 	      prev->comm, prev->pid, next->comm, next->pid),
 	TRACE_STRUCT(
 		TRACE_FIELD(pid_t, prev_pid, prev->pid)
 		TRACE_FIELD(int, prev_prio, prev->prio)
 		TRACE_FIELD_SPECIAL(char next_comm[TASK_COMM_LEN],
 				    next_comm,
-				    TPCMD(memcpy(TRACE_ENTRY->next_comm,
+				    TP_CMD(memcpy(TRACE_ENTRY->next_comm,
 						 next->comm,
 						 TASK_COMM_LEN)))
 		TRACE_FIELD(pid_t, next_pid, next->pid)
 		TRACE_FIELD(int, next_prio, next->prio)
 	),
-	TPRAWFMT("prev %d:%d ==> next %s:%d:%d")
+	TP_RAW_FMT("prev %d:%d ==> next %s:%d:%d")
 	);
 
 TRACE_EVENT_FORMAT(sched_migrate_task,
-	TPPROTO(struct task_struct *p, int orig_cpu, int dest_cpu),
-	TPARGS(p, orig_cpu, dest_cpu),
-	TPFMT("task %s:%d from: %d  to: %d",
+	TP_PROTO(struct task_struct *p, int orig_cpu, int dest_cpu),
+	TP_ARGS(p, orig_cpu, dest_cpu),
+	TP_FMT("task %s:%d from: %d  to: %d",
 	      p->comm, p->pid, orig_cpu, dest_cpu),
 	TRACE_STRUCT(
 		TRACE_FIELD(pid_t, pid, p->pid)
 		TRACE_FIELD(int, orig_cpu, orig_cpu)
 		TRACE_FIELD(int, dest_cpu, dest_cpu)
 	),
-	TPRAWFMT("task %d  from: %d to: %d")
+	TP_RAW_FMT("task %d  from: %d to: %d")
 	);
 
 TRACE_EVENT_FORMAT(sched_process_free,
-	TPPROTO(struct task_struct *p),
-	TPARGS(p),
-	TPFMT("task %s:%d", p->comm, p->pid),
+	TP_PROTO(struct task_struct *p),
+	TP_ARGS(p),
+	TP_FMT("task %s:%d", p->comm, p->pid),
 	TRACE_STRUCT(
 		TRACE_FIELD(pid_t, pid, p->pid)
 	),
-	TPRAWFMT("task %d")
+	TP_RAW_FMT("task %d")
 	);
 
 TRACE_EVENT_FORMAT(sched_process_exit,
-	TPPROTO(struct task_struct *p),
-	TPARGS(p),
-	TPFMT("task %s:%d", p->comm, p->pid),
+	TP_PROTO(struct task_struct *p),
+	TP_ARGS(p),
+	TP_FMT("task %s:%d", p->comm, p->pid),
 	TRACE_STRUCT(
 		TRACE_FIELD(pid_t, pid, p->pid)
 	),
-	TPRAWFMT("task %d")
+	TP_RAW_FMT("task %d")
 	);
 
 TRACE_EVENT_FORMAT(sched_process_wait,
-	TPPROTO(struct pid *pid),
-	TPARGS(pid),
-	TPFMT("pid %d", pid_nr(pid)),
+	TP_PROTO(struct pid *pid),
+	TP_ARGS(pid),
+	TP_FMT("pid %d", pid_nr(pid)),
 	TRACE_STRUCT(
 		TRACE_FIELD(pid_t, pid, pid_nr(pid))
 	),
-	TPRAWFMT("task %d")
+	TP_RAW_FMT("task %d")
 	);
 
 TRACE_EVENT_FORMAT(sched_process_fork,
-	TPPROTO(struct task_struct *parent, struct task_struct *child),
-	TPARGS(parent, child),
-	TPFMT("parent %s:%d  child %s:%d",
+	TP_PROTO(struct task_struct *parent, struct task_struct *child),
+	TP_ARGS(parent, child),
+	TP_FMT("parent %s:%d  child %s:%d",
 	      parent->comm, parent->pid, child->comm, child->pid),
 	TRACE_STRUCT(
 		TRACE_FIELD(pid_t, parent, parent->pid)
 		TRACE_FIELD(pid_t, child, child->pid)
 	),
-	TPRAWFMT("parent %d  child %d")
+	TP_RAW_FMT("parent %d  child %d")
 	);
 
 TRACE_EVENT_FORMAT(sched_signal_send,
-	TPPROTO(int sig, struct task_struct *p),
-	TPARGS(sig, p),
-	TPFMT("sig: %d   task %s:%d", sig, p->comm, p->pid),
+	TP_PROTO(int sig, struct task_struct *p),
+	TP_ARGS(sig, p),
+	TP_FMT("sig: %d   task %s:%d", sig, p->comm, p->pid),
 	TRACE_STRUCT(
 		TRACE_FIELD(int, sig, sig)
 		TRACE_FIELD(pid_t, pid, p->pid)
 	),
-	TPRAWFMT("sig: %d  task %d")
+	TP_RAW_FMT("sig: %d  task %d")
 	);
 
 #undef TRACE_SYSTEM
diff --git a/include/trace/workqueue.h b/include/trace/workqueue.h
index 867829df4571..7626523deeba 100644
--- a/include/trace/workqueue.h
+++ b/include/trace/workqueue.h
@@ -6,20 +6,20 @@
 #include <linux/sched.h>
 
 DECLARE_TRACE(workqueue_insertion,
-	   TPPROTO(struct task_struct *wq_thread, struct work_struct *work),
-	   TPARGS(wq_thread, work));
+	   TP_PROTO(struct task_struct *wq_thread, struct work_struct *work),
+	   TP_ARGS(wq_thread, work));
 
 DECLARE_TRACE(workqueue_execution,
-	   TPPROTO(struct task_struct *wq_thread, struct work_struct *work),
-	   TPARGS(wq_thread, work));
+	   TP_PROTO(struct task_struct *wq_thread, struct work_struct *work),
+	   TP_ARGS(wq_thread, work));
 
 /* Trace the creation of one workqueue thread on a cpu */
 DECLARE_TRACE(workqueue_creation,
-	   TPPROTO(struct task_struct *wq_thread, int cpu),
-	   TPARGS(wq_thread, cpu));
+	   TP_PROTO(struct task_struct *wq_thread, int cpu),
+	   TP_ARGS(wq_thread, cpu));
 
 DECLARE_TRACE(workqueue_destruction,
-	   TPPROTO(struct task_struct *wq_thread),
-	   TPARGS(wq_thread));
+	   TP_PROTO(struct task_struct *wq_thread),
+	   TP_ARGS(wq_thread));
 
 #endif /* __TRACE_WORKQUEUE_H */
diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h
index fb4eba166433..d94179aa1fc2 100644
--- a/kernel/trace/trace_event_types.h
+++ b/kernel/trace/trace_event_types.h
@@ -10,7 +10,7 @@ TRACE_EVENT_FORMAT(function, TRACE_FN, ftrace_entry, ignore,
 		TRACE_FIELD(unsigned long, ip, ip)
 		TRACE_FIELD(unsigned long, parent_ip, parent_ip)
 	),
-	TPRAWFMT(" %lx <-- %lx")
+	TP_RAW_FMT(" %lx <-- %lx")
 );
 
 TRACE_EVENT_FORMAT(funcgraph_entry, TRACE_GRAPH_ENT,
@@ -19,7 +19,7 @@ TRACE_EVENT_FORMAT(funcgraph_entry, TRACE_GRAPH_ENT,
 		TRACE_FIELD(unsigned long, graph_ent.func, func)
 		TRACE_FIELD(int, graph_ent.depth, depth)
 	),
-	TPRAWFMT("--> %lx (%d)")
+	TP_RAW_FMT("--> %lx (%d)")
 );
 
 TRACE_EVENT_FORMAT(funcgraph_exit, TRACE_GRAPH_RET,
@@ -28,7 +28,7 @@ TRACE_EVENT_FORMAT(funcgraph_exit, TRACE_GRAPH_RET,
 		TRACE_FIELD(unsigned long, ret.func, func)
 		TRACE_FIELD(int, ret.depth, depth)
 	),
-	TPRAWFMT("<-- %lx (%d)")
+	TP_RAW_FMT("<-- %lx (%d)")
 );
 
 TRACE_EVENT_FORMAT(wakeup, TRACE_WAKE, ctx_switch_entry, ignore,
@@ -41,7 +41,7 @@ TRACE_EVENT_FORMAT(wakeup, TRACE_WAKE, ctx_switch_entry, ignore,
 		TRACE_FIELD(unsigned char, next_state, next_state)
 		TRACE_FIELD(unsigned int, next_cpu, next_cpu)
 	),
-	TPRAWFMT("%u:%u:%u  ==+ %u:%u:%u [%03u]")
+	TP_RAW_FMT("%u:%u:%u  ==+ %u:%u:%u [%03u]")
 );
 
 TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore,
@@ -54,7 +54,7 @@ TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore,
 		TRACE_FIELD(unsigned char, next_state, next_state)
 		TRACE_FIELD(unsigned int, next_cpu, next_cpu)
 	),
-	TPRAWFMT("%u:%u:%u  ==+ %u:%u:%u [%03u]")
+	TP_RAW_FMT("%u:%u:%u  ==+ %u:%u:%u [%03u]")
 );
 
 TRACE_EVENT_FORMAT(special, TRACE_SPECIAL, special_entry, ignore,
@@ -63,7 +63,7 @@ TRACE_EVENT_FORMAT(special, TRACE_SPECIAL, special_entry, ignore,
 		TRACE_FIELD(unsigned long, arg2, arg2)
 		TRACE_FIELD(unsigned long, arg3, arg3)
 	),
-	TPRAWFMT("(%08lx) (%08lx) (%08lx)")
+	TP_RAW_FMT("(%08lx) (%08lx) (%08lx)")
 );
 
 /*
@@ -83,7 +83,7 @@ TRACE_EVENT_FORMAT(kernel_stack, TRACE_STACK, stack_entry, ignore,
 		TRACE_FIELD(unsigned long, caller[6], stack6)
 		TRACE_FIELD(unsigned long, caller[7], stack7)
 	),
-	TPRAWFMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
+	TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
 		 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n")
 );
 
@@ -98,7 +98,7 @@ TRACE_EVENT_FORMAT(user_stack, TRACE_USER_STACK, userstack_entry, ignore,
 		TRACE_FIELD(unsigned long, caller[6], stack6)
 		TRACE_FIELD(unsigned long, caller[7], stack7)
 	),
-	TPRAWFMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
+	TP_RAW_FMT("\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n"
 		 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n")
 );
 
@@ -108,7 +108,7 @@ TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore,
 		TRACE_FIELD(unsigned int, depth, depth)
 		TRACE_FIELD_ZERO_CHAR(buf)
 	),
-	TPRAWFMT("%08lx (%d) %s")
+	TP_RAW_FMT("%08lx (%d) %s")
 );
 
 TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore,
@@ -118,7 +118,7 @@ TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore,
 		TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file, file)
 		TRACE_FIELD(char, correct, correct)
 	),
-	TPRAWFMT("%u:%s:%s (%u)")
+	TP_RAW_FMT("%u:%s:%s (%u)")
 );
 
 TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore,
@@ -126,7 +126,7 @@ TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore,
 		TRACE_FIELD(u64, from, from)
 		TRACE_FIELD(u64, to, to)
 	),
-	TPRAWFMT("from: %llx to: %llx")
+	TP_RAW_FMT("from: %llx to: %llx")
 );
 
 TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore,
@@ -136,7 +136,7 @@ TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore,
 		TRACE_FIELD(int, state_data.type, type)
 		TRACE_FIELD(int, state_data.state, state)
 	),
-	TPRAWFMT("%llx->%llx type:%u state:%u")
+	TP_RAW_FMT("%llx->%llx type:%u state:%u")
 );
 
 TRACE_EVENT_FORMAT(kmem_alloc, TRACE_KMEM_ALLOC, kmemtrace_alloc_entry, ignore,
@@ -149,7 +149,7 @@ TRACE_EVENT_FORMAT(kmem_alloc, TRACE_KMEM_ALLOC, kmemtrace_alloc_entry, ignore,
 		TRACE_FIELD(gfp_t, gfp_flags, gfp_flags)
 		TRACE_FIELD(int, node, node)
 	),
-	TPRAWFMT("type:%u call_site:%lx ptr:%p req:%lu alloc:%lu"
+	TP_RAW_FMT("type:%u call_site:%lx ptr:%p req:%lu alloc:%lu"
 		 " flags:%x node:%d")
 );
 
@@ -159,7 +159,7 @@ TRACE_EVENT_FORMAT(kmem_free, TRACE_KMEM_FREE, kmemtrace_free_entry, ignore,
 		TRACE_FIELD(unsigned long, call_site, call_site)
 		TRACE_FIELD(const void *, ptr, ptr)
 	),
-	TPRAWFMT("type:%u call_site:%lx ptr:%p")
+	TP_RAW_FMT("type:%u call_site:%lx ptr:%p")
 );
 
 #undef TRACE_SYSTEM
diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h
index d24a97e74aea..8e2e0f56c2a8 100644
--- a/kernel/trace/trace_events_stage_2.h
+++ b/kernel/trace/trace_events_stage_2.h
@@ -20,7 +20,7 @@
  *
  *	field = (typeof(field))entry;
  *
- *	ret = trace_seq_printf(s, <TPRAWFMT> "%s", <ARGS> "\n");
+ *	ret = trace_seq_printf(s, <TP_RAW_FMT> "%s", <ARGS> "\n");
  *	if (!ret)
  *		return TRACE_TYPE_PARTIAL_LINE;
  *
@@ -44,8 +44,8 @@
 	field->item,
 
 
-#undef TPRAWFMT
-#define TPRAWFMT(args...)	args
+#undef TP_RAW_FMT
+#define TP_RAW_FMT(args...)	args
 
 #undef TRACE_EVENT_FORMAT
 #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)	\
diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h
index 2c8d76c7dbed..557ca52bcdcd 100644
--- a/kernel/trace/trace_events_stage_3.h
+++ b/kernel/trace/trace_events_stage_3.h
@@ -106,8 +106,8 @@
  *
  */
 
-#undef TPFMT
-#define TPFMT(fmt, args...)	fmt "\n", ##args
+#undef TP_FMT
+#define TP_FMT(fmt, args...)	fmt "\n", ##args
 
 #define _TRACE_FORMAT(call, proto, args, fmt)				\
 static void ftrace_event_##call(proto)					\
@@ -152,8 +152,8 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
 #define TRACE_FIELD(type, item, assign)\
 	entry->item = assign;
 
-#undef TPCMD
-#define TPCMD(cmd...)	cmd
+#undef TP_CMD
+#define TP_CMD(cmd...)	cmd
 
 #undef TRACE_ENTRY
 #define TRACE_ENTRY	entry
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 7162ab49d05d..e62bc10f8103 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -26,8 +26,8 @@
 		return 0;
 
 
-#undef TPRAWFMT
-#define TPRAWFMT(args...) args
+#undef TP_RAW_FMT
+#define TP_RAW_FMT(args...) args
 
 #undef TRACE_EVENT_FORMAT
 #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)	\
@@ -57,8 +57,8 @@ ftrace_format_##call(struct trace_seq *s)				\
 #define TRACE_FIELD(type, item, assign)\
 	entry->item = assign;
 
-#undef TPCMD
-#define TPCMD(cmd...)	cmd
+#undef TP_CMD
+#define TP_CMD(cmd...)	cmd
 
 #undef TRACE_ENTRY
 #define TRACE_ENTRY	entry
diff --git a/kernel/trace/trace_format.h b/kernel/trace/trace_format.h
deleted file mode 100644
index 97e59a9c82ea..000000000000
--- a/kernel/trace/trace_format.h
+++ /dev/null
@@ -1,55 +0,0 @@
-/*
- * Setup the showing format of trace point.
- *
- * int
- * ftrace_format_##call(struct trace_seq *s)
- * {
- *	struct ftrace_raw_##call field;
- *	int ret;
- *
- *	ret = trace_seq_printf(s, #type " " #item ";"
- *			       " size:%d; offset:%d;\n",
- *			       sizeof(field.type),
- *			       offsetof(struct ftrace_raw_##call,
- *					item));
- *
- * }
- */
-
-#undef TRACE_STRUCT
-#define TRACE_STRUCT(args...) args
-
-#undef TRACE_FIELD
-#define TRACE_FIELD(type, item, assign)					\
-	ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t"	\
-			       "offset:%u;\tsize:%u;\n",		\
-			       (unsigned int)offsetof(typeof(field), item), \
-			       (unsigned int)sizeof(field.item));	\
-	if (!ret)							\
-		return 0;
-
-
-#undef TRACE_FIELD_SPECIAL
-#define TRACE_FIELD_SPECIAL(type_item, item, cmd)			\
-	ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t"	\
-			       "offset:%u;\tsize:%u;\n",		\
-			       (unsigned int)offsetof(typeof(field), item), \
-			       (unsigned int)sizeof(field.item));	\
-	if (!ret)							\
-		return 0;
-
-#undef TRACE_EVENT_FORMAT
-#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)	\
-static int								\
-ftrace_format_##call(struct trace_seq *s)				\
-{									\
-	struct ftrace_raw_##call field;					\
-	int ret;							\
-									\
-	tstruct;							\
-									\
-	trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt);		\
-									\
-	return ret;							\
-}
-
diff --git a/samples/tracepoints/tp-samples-trace.h b/samples/tracepoints/tp-samples-trace.h
index 01724e04c556..dffdc49878af 100644
--- a/samples/tracepoints/tp-samples-trace.h
+++ b/samples/tracepoints/tp-samples-trace.h
@@ -5,9 +5,9 @@
 #include <linux/tracepoint.h>
 
 DECLARE_TRACE(subsys_event,
-	TPPROTO(struct inode *inode, struct file *file),
-	TPARGS(inode, file));
+	TP_PROTO(struct inode *inode, struct file *file),
+	TP_ARGS(inode, file));
 DECLARE_TRACE(subsys_eventb,
-	TPPROTO(void),
-	TPARGS());
+	TP_PROTO(void),
+	TP_ARGS());
 #endif
-- 
cgit v1.2.3


From 9cc26a261d43e5898287a1f5808132f8f05ceb1c Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 9 Mar 2009 16:00:22 -0400
Subject: tracing: use generic __stringify

Impact: clean up

This removes the custom made STR(x) macros in the tracer and uses
the generic __stringify macro instead.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/events.c               | 4 +---
 kernel/trace/trace_events_stage_3.h | 4 ++--
 kernel/trace/trace_selftest.c       | 6 ++----
 3 files changed, 5 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/events.c b/kernel/trace/events.c
index f2509cbaacea..9fc918da404f 100644
--- a/kernel/trace/events.c
+++ b/kernel/trace/events.c
@@ -2,9 +2,7 @@
  * This is the place to register all trace points as events.
  */
 
-/* someday this needs to go in a generic header */
-#define __STR(x) #x
-#define STR(x) __STR(x)
+#include <linux/stringify.h>
 
 #include <trace/trace_events.h>
 
diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h
index 557ca52bcdcd..41b82b93c9c7 100644
--- a/kernel/trace/trace_events_stage_3.h
+++ b/kernel/trace/trace_events_stage_3.h
@@ -139,7 +139,7 @@ static struct ftrace_event_call __used					\
 __attribute__((__aligned__(4)))						\
 __attribute__((section("_ftrace_events"))) event_##call = {		\
 	.name 			= #call,				\
-	.system			= STR(TRACE_SYSTEM),			\
+	.system			= __stringify(TRACE_SYSTEM),		\
 	.regfunc		= ftrace_reg_event_##call,		\
 	.unregfunc		= ftrace_unreg_event_##call,		\
 }
@@ -225,7 +225,7 @@ static struct ftrace_event_call __used					\
 __attribute__((__aligned__(4)))						\
 __attribute__((section("_ftrace_events"))) event_##call = {		\
 	.name 			= #call,				\
-	.system			= STR(TRACE_SYSTEM),			\
+	.system			= __stringify(TRACE_SYSTEM),		\
 	.regfunc		= ftrace_reg_event_##call,		\
 	.unregfunc		= ftrace_unreg_event_##call,		\
 	.raw_init		= ftrace_raw_init_event_##call,		\
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 7238646b8723..f907a2b29028 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -1,5 +1,6 @@
 /* Include in trace.c */
 
+#include <linux/stringify.h>
 #include <linux/kthread.h>
 #include <linux/delay.h>
 
@@ -100,9 +101,6 @@ static inline void warn_failed_init_tracer(struct tracer *trace, int init_ret)
 
 #ifdef CONFIG_DYNAMIC_FTRACE
 
-#define __STR(x) #x
-#define STR(x) __STR(x)
-
 /* Test dynamic code modification and ftrace filters */
 int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
 					   struct trace_array *tr,
@@ -130,7 +128,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace,
 	 * start of the function names. We simply put a '*' to
 	 * accommodate them.
 	 */
-	func_name = "*" STR(DYN_FTRACE_TEST_NAME);
+	func_name = "*" __stringify(DYN_FTRACE_TEST_NAME);
 
 	/* filter only on our function */
 	ftrace_set_filter(func_name, strlen(func_name), 1);
-- 
cgit v1.2.3


From da4d03020c2af32f73e8bfbab0a66620d85bb9bb Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 9 Mar 2009 17:14:30 -0400
Subject: tracing: new format for specialized trace points

Impact: clean up and enhancement

The TRACE_EVENT_FORMAT macro looks quite ugly and is limited in its
ability to save data as well as to print the record out. Working with
Ingo Molnar, we came up with a new format that is much more pleasing to
the eye of C developers. This new macro is more C style than the old
macro, and is more obvious to what it does.

Here's the example. The only updated macro in this patch is the
sched_switch trace point.

The old method looked like this:

 TRACE_EVENT_FORMAT(sched_switch,
        TP_PROTO(struct rq *rq, struct task_struct *prev,
                struct task_struct *next),
        TP_ARGS(rq, prev, next),
        TP_FMT("task %s:%d ==> %s:%d",
              prev->comm, prev->pid, next->comm, next->pid),
        TRACE_STRUCT(
                TRACE_FIELD(pid_t, prev_pid, prev->pid)
                TRACE_FIELD(int, prev_prio, prev->prio)
                TRACE_FIELD_SPECIAL(char next_comm[TASK_COMM_LEN],
                                    next_comm,
                                    TP_CMD(memcpy(TRACE_ENTRY->next_comm,
                                                 next->comm,
                                                 TASK_COMM_LEN)))
                TRACE_FIELD(pid_t, next_pid, next->pid)
                TRACE_FIELD(int, next_prio, next->prio)
        ),
        TP_RAW_FMT("prev %d:%d ==> next %s:%d:%d")
        );

The above method is hard to read and requires two format fields.

The new method:

 /*
  * Tracepoint for task switches, performed by the scheduler:
  *
  * (NOTE: the 'rq' argument is not used by generic trace events,
  *        but used by the latency tracer plugin. )
  */
 TRACE_EVENT(sched_switch,

	TP_PROTO(struct rq *rq, struct task_struct *prev,
		 struct task_struct *next),

	TP_ARGS(rq, prev, next),

	TP_STRUCT__entry(
		__array(	char,	prev_comm,	TASK_COMM_LEN	)
		__field(	pid_t,	prev_pid			)
		__field(	int,	prev_prio			)
		__array(	char,	next_comm,	TASK_COMM_LEN	)
		__field(	pid_t,	next_pid			)
		__field(	int,	next_prio			)
	),

	TP_printk("task %s:%d [%d] ==> %s:%d [%d]",
		__entry->prev_comm, __entry->prev_pid, __entry->prev_prio,
		__entry->next_comm, __entry->next_pid, __entry->next_prio),

	TP_fast_assign(
		memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
		__entry->prev_pid	= prev->pid;
		__entry->prev_prio	= prev->prio;
		memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
		__entry->next_pid	= next->pid;
		__entry->next_prio	= next->prio;
	)
 );

This macro is called TRACE_EVENT, it is broken up into 5 parts:

 TP_PROTO:        the proto type of the trace point
 TP_ARGS:         the arguments of the trace point
 TP_STRUCT_entry: the structure layout of the entry in the ring buffer
 TP_printk:       the printk format
 TP_fast_assign:  the method used to write the entry into the ring buffer

The structure is the definition of how the event will be saved in the
ring buffer. The printk is used by the internal tracing in case of
an oops, and the kernel needs to print out the format of the record
to the console. This the TP_printk gives a means to show the records
in a human readable format. It is also used to print out the data
from the trace file.

The TP_fast_assign is executed directly. It is basically like a C function,
where the __entry is the handle to the record.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 include/linux/tracepoint.h          |   3 +
 include/trace/sched_event_types.h   |  48 +++++++----
 kernel/trace/trace.h                |   5 --
 kernel/trace/trace_event_types.h    |   3 +-
 kernel/trace/trace_events.c         | 159 +-----------------------------------
 kernel/trace/trace_events_stage_1.h |  28 ++++---
 kernel/trace/trace_events_stage_2.h |  89 ++++++++++++++++----
 kernel/trace/trace_events_stage_3.h |  34 +++-----
 kernel/trace/trace_export.c         |  23 +++++-
 9 files changed, 159 insertions(+), 233 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 3bcc3e171443..6b4f1bb3701e 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -160,4 +160,7 @@ static inline void tracepoint_synchronize_unregister(void)
 #define TRACE_EVENT_FORMAT(name, proto, args, fmt, struct, tpfmt)	\
 	TRACE_FORMAT(name, PARAMS(proto), PARAMS(args), PARAMS(fmt))
 
+#define TRACE_EVENT(name, proto, args, struct, print, assign)	\
+	DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
+
 #endif
diff --git a/include/trace/sched_event_types.h b/include/trace/sched_event_types.h
index 71b14828a957..aa77fb754038 100644
--- a/include/trace/sched_event_types.h
+++ b/include/trace/sched_event_types.h
@@ -62,25 +62,41 @@ TRACE_EVENT_FORMAT(sched_wakeup_new,
 	TP_RAW_FMT("task %d success=%d")
 	);
 
-TRACE_EVENT_FORMAT(sched_switch,
+/*
+ * Tracepoint for task switches, performed by the scheduler:
+ *
+ * (NOTE: the 'rq' argument is not used by generic trace events,
+ *        but used by the latency tracer plugin. )
+ */
+TRACE_EVENT(sched_switch,
+
 	TP_PROTO(struct rq *rq, struct task_struct *prev,
-		struct task_struct *next),
+		 struct task_struct *next),
+
 	TP_ARGS(rq, prev, next),
-	TP_FMT("task %s:%d ==> %s:%d",
-	      prev->comm, prev->pid, next->comm, next->pid),
-	TRACE_STRUCT(
-		TRACE_FIELD(pid_t, prev_pid, prev->pid)
-		TRACE_FIELD(int, prev_prio, prev->prio)
-		TRACE_FIELD_SPECIAL(char next_comm[TASK_COMM_LEN],
-				    next_comm,
-				    TP_CMD(memcpy(TRACE_ENTRY->next_comm,
-						 next->comm,
-						 TASK_COMM_LEN)))
-		TRACE_FIELD(pid_t, next_pid, next->pid)
-		TRACE_FIELD(int, next_prio, next->prio)
+
+	TP_STRUCT__entry(
+		__array(	char,	prev_comm,	TASK_COMM_LEN	)
+		__field(	pid_t,	prev_pid			)
+		__field(	int,	prev_prio			)
+		__array(	char,	next_comm,	TASK_COMM_LEN	)
+		__field(	pid_t,	next_pid			)
+		__field(	int,	next_prio			)
 	),
-	TP_RAW_FMT("prev %d:%d ==> next %s:%d:%d")
-	);
+
+	TP_printk("task %s:%d [%d] ==> %s:%d [%d]",
+		__entry->prev_comm, __entry->prev_pid, __entry->prev_prio,
+		__entry->next_comm, __entry->next_pid, __entry->next_prio),
+
+	TP_fast_assign(
+		memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
+		__entry->prev_pid	= prev->pid;
+		__entry->prev_prio	= prev->prio;
+		memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
+		__entry->next_pid	= next->pid;
+		__entry->next_prio	= next->prio;
+	)
+);
 
 TRACE_EVENT_FORMAT(sched_migrate_task,
 	TP_PROTO(struct task_struct *p, int orig_cpu, int dest_cpu),
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 2bfb7d11fc17..c5e1d8865fe4 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -751,12 +751,7 @@ struct ftrace_event_call {
 	int		(*regfunc)(void);
 	void		(*unregfunc)(void);
 	int		id;
-	struct dentry	*raw_dir;
-	int		raw_enabled;
-	int		type;
 	int		(*raw_init)(void);
-	int		(*raw_reg)(void);
-	void		(*raw_unreg)(void);
 	int		(*show_format)(struct trace_seq *s);
 };
 
diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h
index d94179aa1fc2..5cca4c978bde 100644
--- a/kernel/trace/trace_event_types.h
+++ b/kernel/trace/trace_event_types.h
@@ -106,9 +106,10 @@ TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore,
 	TRACE_STRUCT(
 		TRACE_FIELD(unsigned long, ip, ip)
 		TRACE_FIELD(unsigned int, depth, depth)
+		TRACE_FIELD(char *, fmt, fmt)
 		TRACE_FIELD_ZERO_CHAR(buf)
 	),
-	TP_RAW_FMT("%08lx (%d) %s")
+	TP_RAW_FMT("%08lx (%d) fmt:%p %s")
 );
 
 TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore,
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index fa32ca320767..1880a6438097 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -59,22 +59,12 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call,
 			call->enabled = 0;
 			call->unregfunc();
 		}
-		if (call->raw_enabled) {
-			call->raw_enabled = 0;
-			call->raw_unreg();
-		}
 		break;
 	case 1:
-		if (!call->enabled &&
-		    (call->type & TRACE_EVENT_TYPE_PRINTF)) {
+		if (!call->enabled) {
 			call->enabled = 1;
 			call->regfunc();
 		}
-		if (!call->raw_enabled &&
-		    (call->type & TRACE_EVENT_TYPE_RAW)) {
-			call->raw_enabled = 1;
-			call->raw_reg();
-		}
 		break;
 	}
 }
@@ -300,7 +290,7 @@ event_enable_read(struct file *filp, char __user *ubuf, size_t cnt,
 	struct ftrace_event_call *call = filp->private_data;
 	char *buf;
 
-	if (call->enabled || call->raw_enabled)
+	if (call->enabled)
 		buf = "1\n";
 	else
 		buf = "0\n";
@@ -346,107 +336,6 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
 	return cnt;
 }
 
-static ssize_t
-event_type_read(struct file *filp, char __user *ubuf, size_t cnt,
-		loff_t *ppos)
-{
-	struct ftrace_event_call *call = filp->private_data;
-	char buf[16];
-	int r = 0;
-
-	if (call->type & TRACE_EVENT_TYPE_PRINTF)
-		r += sprintf(buf, "printf\n");
-
-	if (call->type & TRACE_EVENT_TYPE_RAW)
-		r += sprintf(buf+r, "raw\n");
-
-	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
-}
-
-static ssize_t
-event_type_write(struct file *filp, const char __user *ubuf, size_t cnt,
-		 loff_t *ppos)
-{
-	struct ftrace_event_call *call = filp->private_data;
-	char buf[64];
-
-	/*
-	 * If there's only one type, we can't change it.
-	 * And currently we always have printf type, and we
-	 * may or may not have raw type.
-	 *
-	 * This is a redundant check, the file should be read
-	 * only if this is the case anyway.
-	 */
-
-	if (!call->raw_init)
-		return -EPERM;
-
-	if (cnt >= sizeof(buf))
-		return -EINVAL;
-
-	if (copy_from_user(&buf, ubuf, cnt))
-		return -EFAULT;
-
-	buf[cnt] = 0;
-
-	if (!strncmp(buf, "printf", 6) &&
-	    (!buf[6] || isspace(buf[6]))) {
-
-		call->type = TRACE_EVENT_TYPE_PRINTF;
-
-		/*
-		 * If raw enabled, the disable it and enable
-		 * printf type.
-		 */
-		if (call->raw_enabled) {
-			call->raw_enabled = 0;
-			call->raw_unreg();
-
-			call->enabled = 1;
-			call->regfunc();
-		}
-
-	} else if (!strncmp(buf, "raw", 3) &&
-	    (!buf[3] || isspace(buf[3]))) {
-
-		call->type = TRACE_EVENT_TYPE_RAW;
-
-		/*
-		 * If printf enabled, the disable it and enable
-		 * raw type.
-		 */
-		if (call->enabled) {
-			call->enabled = 0;
-			call->unregfunc();
-
-			call->raw_enabled = 1;
-			call->raw_reg();
-		}
-	} else
-		return -EINVAL;
-
-	*ppos += cnt;
-
-	return cnt;
-}
-
-static ssize_t
-event_available_types_read(struct file *filp, char __user *ubuf, size_t cnt,
-			   loff_t *ppos)
-{
-	struct ftrace_event_call *call = filp->private_data;
-	char buf[16];
-	int r = 0;
-
-	r += sprintf(buf, "printf\n");
-
-	if (call->raw_init)
-		r += sprintf(buf+r, "raw\n");
-
-	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
-}
-
 #undef FIELD
 #define FIELD(type, name)						\
 	#type, #name, (unsigned int)offsetof(typeof(field), name),	\
@@ -470,6 +359,7 @@ static int trace_write_header(struct trace_seq *s)
 				FIELD(int, pid),
 				FIELD(int, tgid));
 }
+
 static ssize_t
 event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
 		  loff_t *ppos)
@@ -527,13 +417,6 @@ static const struct seq_operations show_set_event_seq_ops = {
 	.stop = t_stop,
 };
 
-static const struct file_operations ftrace_avail_fops = {
-	.open = ftrace_event_seq_open,
-	.read = seq_read,
-	.llseek = seq_lseek,
-	.release = seq_release,
-};
-
 static const struct file_operations ftrace_set_event_fops = {
 	.open = ftrace_event_seq_open,
 	.read = seq_read,
@@ -548,17 +431,6 @@ static const struct file_operations ftrace_enable_fops = {
 	.write = event_enable_write,
 };
 
-static const struct file_operations ftrace_type_fops = {
-	.open = tracing_open_generic,
-	.read = event_type_read,
-	.write = event_type_write,
-};
-
-static const struct file_operations ftrace_available_types_fops = {
-	.open = tracing_open_generic,
-	.read = event_available_types_read,
-};
-
 static const struct file_operations ftrace_event_format_fops = {
 	.open = tracing_open_generic,
 	.read = event_format_read,
@@ -647,9 +519,6 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
 		}
 	}
 
-	/* default the output to printf */
-	call->type = TRACE_EVENT_TYPE_PRINTF;
-
 	call->dir = debugfs_create_dir(call->name, d_events);
 	if (!call->dir) {
 		pr_warning("Could not create debugfs "
@@ -665,21 +534,6 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
 				   "'%s/enable' entry\n", call->name);
 	}
 
-	/* Only let type be writable, if we can change it */
-	entry = debugfs_create_file("type",
-				    call->raw_init ? 0644 : 0444,
-				    call->dir, call,
-				    &ftrace_type_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'%s/type' entry\n", call->name);
-
-	entry = debugfs_create_file("available_types", 0444, call->dir, call,
-				    &ftrace_available_types_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'%s/available_types' entry\n", call->name);
-
 	/* A trace may not want to export its format */
 	if (!call->show_format)
 		return 0;
@@ -704,13 +558,6 @@ static __init int event_trace_init(void)
 	if (!d_tracer)
 		return 0;
 
-	entry = debugfs_create_file("available_events", 0444, d_tracer,
-				    (void *)&show_event_seq_ops,
-				    &ftrace_avail_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'available_events' entry\n");
-
 	entry = debugfs_create_file("set_event", 0644, d_tracer,
 				    (void *)&show_set_event_seq_ops,
 				    &ftrace_set_event_fops);
diff --git a/kernel/trace/trace_events_stage_1.h b/kernel/trace/trace_events_stage_1.h
index 3830a731424c..edfcbd3a0d1b 100644
--- a/kernel/trace/trace_events_stage_1.h
+++ b/kernel/trace/trace_events_stage_1.h
@@ -18,19 +18,23 @@
 #define TRACE_FORMAT(call, proto, args, fmt)
 
 #undef TRACE_EVENT_FORMAT
-#define TRACE_EVENT_FORMAT(name, proto, args, fmt, tstruct, tpfmt)	\
-	struct ftrace_raw_##name {					\
-		struct trace_entry	ent;				\
-		tstruct							\
-	};								\
-	static struct ftrace_event_call event_##name
+#define TRACE_EVENT_FORMAT(name, proto, args, fmt, tstruct, tpfmt)
+
+#undef __array
+#define __array(type, item, len)	type	item[len];
 
-#undef TRACE_STRUCT
-#define TRACE_STRUCT(args...) args
+#undef __field
+#define __field(type, item)		type	item;
 
-#define TRACE_FIELD(type, item, assign) \
-	type item;
-#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \
-	type_item;
+#undef TP_STRUCT__entry
+#define TP_STRUCT__entry(args...) args
+
+#undef TRACE_EVENT
+#define TRACE_EVENT(name, proto, args, tstruct, print, assign)	\
+	struct ftrace_raw_##name {				\
+		struct trace_entry	ent;			\
+		tstruct						\
+	};							\
+	static struct ftrace_event_call event_##name
 
 #include <trace/trace_event_types.h>
diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h
index 8e2e0f56c2a8..d91bf4c56661 100644
--- a/kernel/trace/trace_events_stage_2.h
+++ b/kernel/trace/trace_events_stage_2.h
@@ -32,23 +32,14 @@
  * in binary.
  */
 
-#undef TRACE_STRUCT
-#define TRACE_STRUCT(args...) args
+#undef __entry
+#define __entry field
 
-#undef TRACE_FIELD
-#define TRACE_FIELD(type, item, assign) \
-	field->item,
+#undef TP_printk
+#define TP_printk(fmt, args...) fmt "\n", args
 
-#undef TRACE_FIELD_SPECIAL
-#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \
-	field->item,
-
-
-#undef TP_RAW_FMT
-#define TP_RAW_FMT(args...)	args
-
-#undef TRACE_EVENT_FORMAT
-#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)	\
+#undef TRACE_EVENT
+#define TRACE_EVENT(call, proto, args, tstruct, print, assign)		\
 enum print_line_t							\
 ftrace_raw_output_##call(struct trace_iterator *iter, int flags)	\
 {									\
@@ -66,14 +57,76 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags)	\
 									\
 	field = (typeof(field))entry;					\
 									\
-	ret = trace_seq_printf(s, tpfmt "%s", tstruct "\n");		\
+	ret = trace_seq_printf(s, print);				\
 	if (!ret)							\
 		return TRACE_TYPE_PARTIAL_LINE;				\
 									\
 	return TRACE_TYPE_HANDLED;					\
 }
-
+	
 #include <trace/trace_event_types.h>
 
-#include "trace_format.h"
+/*
+ * Setup the showing format of trace point.
+ *
+ * int
+ * ftrace_format_##call(struct trace_seq *s)
+ * {
+ *	struct ftrace_raw_##call field;
+ *	int ret;
+ *
+ *	ret = trace_seq_printf(s, #type " " #item ";"
+ *			       " size:%d; offset:%d;\n",
+ *			       sizeof(field.type),
+ *			       offsetof(struct ftrace_raw_##call,
+ *					item));
+ *
+ * }
+ */
+
+#undef TP_STRUCT__entry
+#define TP_STRUCT__entry(args...) args
+
+#undef __field
+#define __field(type, item)					\
+	ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t"	\
+			       "offset:%u;\tsize:%u;\n",		\
+			       (unsigned int)offsetof(typeof(field), item), \
+			       (unsigned int)sizeof(field.item));	\
+	if (!ret)							\
+		return 0;
+
+#undef __array
+#define __array(type, item, len)						\
+	ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t"	\
+			       "offset:%u;\tsize:%u;\n",		\
+			       (unsigned int)offsetof(typeof(field), item), \
+			       (unsigned int)sizeof(field.item));	\
+	if (!ret)							\
+		return 0;
+
+#undef __entry
+#define __entry "REC"
+
+#undef TP_printk
+#define TP_printk(fmt, args...) "%s, %s\n", #fmt, #args
+
+#undef TP_fast_assign
+#define TP_fast_assign(args...) args
+
+#undef TRACE_EVENT
+#define TRACE_EVENT(call, proto, args, tstruct, print, func)		\
+static int								\
+ftrace_format_##call(struct trace_seq *s)				\
+{									\
+	struct ftrace_raw_##call field;					\
+	int ret;							\
+									\
+	tstruct;							\
+									\
+	trace_seq_printf(s, "\nprint fmt: " print);			\
+									\
+	return ret;							\
+}
+
 #include <trace/trace_event_types.h>
diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h
index 41b82b93c9c7..8e398d864096 100644
--- a/kernel/trace/trace_events_stage_3.h
+++ b/kernel/trace/trace_events_stage_3.h
@@ -144,27 +144,15 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
 	.unregfunc		= ftrace_unreg_event_##call,		\
 }
 
-#undef TRACE_FIELD
-#define TRACE_FIELD(type, item, assign)\
-	entry->item = assign;
-
-#undef TRACE_FIELD
-#define TRACE_FIELD(type, item, assign)\
-	entry->item = assign;
-
-#undef TP_CMD
-#define TP_CMD(cmd...)	cmd
-
-#undef TRACE_ENTRY
-#define TRACE_ENTRY	entry
+#undef TRACE_EVENT_FORMAT
+#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, raw)	\
+	TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt))
 
-#undef TRACE_FIELD_SPECIAL
-#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \
-	cmd;
+#undef __entry
+#define __entry entry
 
-#undef TRACE_EVENT_FORMAT
-#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt)	\
-_TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt))		\
+#undef TRACE_EVENT
+#define TRACE_EVENT(call, proto, args, tstruct, print, assign)		\
 									\
 static struct ftrace_event_call event_##call;				\
 									\
@@ -185,7 +173,7 @@ static void ftrace_raw_event_##call(proto)				\
 		return;							\
 	entry	= ring_buffer_event_data(event);			\
 									\
-	tstruct;							\
+	assign;								\
 									\
 	trace_current_buffer_unlock_commit(event, irq_flags, pc);	\
 }									\
@@ -226,10 +214,8 @@ __attribute__((__aligned__(4)))						\
 __attribute__((section("_ftrace_events"))) event_##call = {		\
 	.name 			= #call,				\
 	.system			= __stringify(TRACE_SYSTEM),		\
-	.regfunc		= ftrace_reg_event_##call,		\
-	.unregfunc		= ftrace_unreg_event_##call,		\
 	.raw_init		= ftrace_raw_init_event_##call,		\
-	.raw_reg		= ftrace_raw_reg_event_##call,		\
-	.raw_unreg		= ftrace_raw_unreg_event_##call,	\
+	.regfunc		= ftrace_raw_reg_event_##call,		\
+	.unregfunc		= ftrace_raw_unreg_event_##call,	\
 	.show_format		= ftrace_format_##call,			\
 }
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index e62bc10f8103..23ae78430d58 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -15,7 +15,28 @@
 
 #include "trace_output.h"
 
-#include "trace_format.h"
+
+#undef TRACE_STRUCT
+#define TRACE_STRUCT(args...) args
+
+#undef TRACE_FIELD
+#define TRACE_FIELD(type, item, assign)					\
+	ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t"	\
+			       "offset:%u;\tsize:%u;\n",		\
+			       (unsigned int)offsetof(typeof(field), item), \
+			       (unsigned int)sizeof(field.item));	\
+	if (!ret)							\
+		return 0;
+
+
+#undef TRACE_FIELD_SPECIAL
+#define TRACE_FIELD_SPECIAL(type_item, item, cmd)			\
+	ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t"	\
+			       "offset:%u;\tsize:%u;\n",		\
+			       (unsigned int)offsetof(typeof(field), item), \
+			       (unsigned int)sizeof(field.item));	\
+	if (!ret)							\
+		return 0;
 
 #undef TRACE_FIELD_ZERO_CHAR
 #define TRACE_FIELD_ZERO_CHAR(item)					\
-- 
cgit v1.2.3


From 157587d7ac555458da9f682e3250135e468470a6 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 10 Mar 2009 00:15:34 -0400
Subject: tracing: remove obsolete TRACE_EVENT_FORMAT macro

Impact: clean up

The TRACE_EVENT_FORMAT macro is no longer used by trace points
and only the DECLARE_TRACE, TRACE_FORMAT or TRACE_EVENT macros should
be used by them. Although the TRACE_EVENT_FORMAT macro is still used
by the internal tracing utility, it should not be used in core
kernel code.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 include/linux/tracepoint.h          | 3 ---
 include/trace/lockdep_event_types.h | 2 +-
 include/trace/sched_event_types.h   | 2 +-
 kernel/trace/trace_events_stage_1.h | 3 ---
 kernel/trace/trace_events_stage_3.h | 6 +-----
 5 files changed, 3 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 6b4f1bb3701e..69b56988813d 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -157,9 +157,6 @@ static inline void tracepoint_synchronize_unregister(void)
 #define TRACE_FORMAT(name, proto, args, fmt)		\
 	DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
 
-#define TRACE_EVENT_FORMAT(name, proto, args, fmt, struct, tpfmt)	\
-	TRACE_FORMAT(name, PARAMS(proto), PARAMS(args), PARAMS(fmt))
-
 #define TRACE_EVENT(name, proto, args, struct, print, assign)	\
 	DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
 
diff --git a/include/trace/lockdep_event_types.h b/include/trace/lockdep_event_types.h
index 1f00e8b3543e..adccfcd2ec8f 100644
--- a/include/trace/lockdep_event_types.h
+++ b/include/trace/lockdep_event_types.h
@@ -1,5 +1,5 @@
 
-#ifndef TRACE_EVENT_FORMAT
+#ifndef TRACE_FORMAT
 # error Do not include this file directly.
 # error Unless you know what you are doing.
 #endif
diff --git a/include/trace/sched_event_types.h b/include/trace/sched_event_types.h
index 0bbbf410e01f..fb37af672c88 100644
--- a/include/trace/sched_event_types.h
+++ b/include/trace/sched_event_types.h
@@ -1,6 +1,6 @@
 
 /* use <trace/sched.h> instead */
-#ifndef TRACE_EVENT_FORMAT
+#ifndef TRACE_EVENT
 # error Do not include this file directly.
 # error Unless you know what you are doing.
 #endif
diff --git a/kernel/trace/trace_events_stage_1.h b/kernel/trace/trace_events_stage_1.h
index edfcbd3a0d1b..15e9bf965a18 100644
--- a/kernel/trace/trace_events_stage_1.h
+++ b/kernel/trace/trace_events_stage_1.h
@@ -17,9 +17,6 @@
 #undef TRACE_FORMAT
 #define TRACE_FORMAT(call, proto, args, fmt)
 
-#undef TRACE_EVENT_FORMAT
-#define TRACE_EVENT_FORMAT(name, proto, args, fmt, tstruct, tpfmt)
-
 #undef __array
 #define __array(type, item, len)	type	item[len];
 
diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h
index 8e398d864096..3ba55d4ab073 100644
--- a/kernel/trace/trace_events_stage_3.h
+++ b/kernel/trace/trace_events_stage_3.h
@@ -35,7 +35,7 @@
  * }
  *
  *
- * For those macros defined with TRACE_EVENT_FORMAT:
+ * For those macros defined with TRACE_EVENT:
  *
  * static struct ftrace_event_call event_<call>;
  *
@@ -144,10 +144,6 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
 	.unregfunc		= ftrace_unreg_event_##call,		\
 }
 
-#undef TRACE_EVENT_FORMAT
-#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, raw)	\
-	TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt))
-
 #undef __entry
 #define __entry entry
 
-- 
cgit v1.2.3


From bbcd3063597a3824357cd83c501c2a2aa21ef37b Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Tue, 10 Mar 2009 10:49:53 +0900
Subject: tracing: Don't assume possible cpu list have continuous numbers

"for (++cpu ; cpu < num_possible_cpus(); cpu++)" statement assumes
possible cpus have continuous number - but that's a wrong assumption.

Insted, cpumask_next() should be used.

Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Steven Rostedt <srostedt@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20090310104437.A480.A69D9226@jp.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_workqueue.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index 46c8dc896bd3..739fdacf873b 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -91,7 +91,7 @@ static void probe_workqueue_creation(struct task_struct *wq_thread, int cpu)
 	struct cpu_workqueue_stats *cws;
 	unsigned long flags;
 
-	WARN_ON(cpu < 0 || cpu >= num_possible_cpus());
+	WARN_ON(cpu < 0);
 
 	/* Workqueues are sometimes created in atomic context */
 	cws = kzalloc(sizeof(struct cpu_workqueue_stats), GFP_ATOMIC);
@@ -175,12 +175,12 @@ static void *workqueue_stat_next(void *prev, int idx)
 	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
 	if (list_is_last(&prev_cws->list, &workqueue_cpu_stat(cpu)->list)) {
 		spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
-		for (++cpu ; cpu < num_possible_cpus(); cpu++) {
-			ret = workqueue_stat_start_cpu(cpu);
-			if (ret)
-				return ret;
-		}
-		return NULL;
+		do {
+			cpu = cpumask_next(cpu, cpu_possible_mask);
+			if (cpu >= nr_cpu_ids)
+				return NULL;
+		} while (!(ret = workqueue_stat_start_cpu(cpu)));
+		return ret;
 	}
 	spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
 
-- 
cgit v1.2.3


From bbb76b552a9cef86777181c8577acc907b122b41 Mon Sep 17 00:00:00 2001
From: Américo Wang <xiyou.wangcong@gmail.com>
Date: Tue, 10 Mar 2009 17:34:47 +0800
Subject: ptrace: remove a useless goto

Impact: cleanup

Obviously, this goto is useless. Remove it.

Signed-off-by: WANG Cong <xiyou.wangcong@gmail.com>
Cc: Andrew Morton <akpm@osdl.org>
Cc: Roland McGrath <roland@redhat.com>
LKML-Reference: <20090310093447.GC3179@hack>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/ptrace.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index c9cf48b21f05..137955913e7e 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -612,8 +612,6 @@ SYSCALL_DEFINE4(ptrace, long, request, long, pid, long, addr, long, data)
 		goto out_put_task_struct;
 
 	ret = arch_ptrace(child, request, addr, data);
-	if (ret < 0)
-		goto out_put_task_struct;
 
  out_put_task_struct:
 	put_task_struct(child);
-- 
cgit v1.2.3


From ce8eb2bf05042452107e489782105d2e235cbdd0 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 10 Mar 2009 10:14:35 -0400
Subject: tracing: fix printk format specifier

Impact: clean up

The offsetof and sizeof are of type size_t, and instead of typecasting
them to unsigned int for printk formatting, one could just use %zu.

Reported-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_events.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 1880a6438097..a0b41cc26f26 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -338,8 +338,7 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
 
 #undef FIELD
 #define FIELD(type, name)						\
-	#type, #name, (unsigned int)offsetof(typeof(field), name),	\
-		(unsigned int)sizeof(field.name)
+	#type, #name, offsetof(typeof(field), name), sizeof(field.name)
 
 static int trace_write_header(struct trace_seq *s)
 {
@@ -347,11 +346,11 @@ static int trace_write_header(struct trace_seq *s)
 
 	/* struct trace_entry */
 	return trace_seq_printf(s,
-				"\tfield:%s %s;\toffset:%u;\tsize:%u;\n"
-				"\tfield:%s %s;\toffset:%u;\tsize:%u;\n"
-				"\tfield:%s %s;\toffset:%u;\tsize:%u;\n"
-				"\tfield:%s %s;\toffset:%u;\tsize:%u;\n"
-				"\tfield:%s %s;\toffset:%u;\tsize:%u;\n"
+				"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
+				"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
+				"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
+				"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
+				"\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n"
 				"\n",
 				FIELD(unsigned char, type),
 				FIELD(unsigned char, flags),
-- 
cgit v1.2.3


From 40e26815fafd3b8c4aced17b1f22e68ef33eb8db Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 10 Mar 2009 11:32:40 -0400
Subject: tracing: do not allow modifying the ftrace events via the event files

Impact: fix to prevent crash on calling NULL function pointer

The ftrace internal records have their format exported via the event
system under the ftrace subsystem. These are only for exporting the
format to allow binary readers to be able to parse them in a binary
output.

The ftrace subsystem events can only be enabled via the ftrace tracers
and do not have a registering function. The event files expect the
event record to have registering function and will call it directly.
Passing in a ftrace subsystem event will cause the kernel to crash
because it will execute a NULL pointer.

This patch prevents the ftrace subsystem from being viewable to the
event enabling files.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_events.c | 18 +++++++++++++++---
 1 file changed, 15 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index a0b41cc26f26..85ec10fbb38d 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -102,7 +102,7 @@ static int ftrace_set_clr_event(char *buf, int set)
 	mutex_lock(&event_mutex);
 	events_for_each(call) {
 
-		if (!call->name)
+		if (!call->name || !call->regfunc)
 			continue;
 
 		if (match &&
@@ -207,8 +207,20 @@ t_next(struct seq_file *m, void *v, loff_t *pos)
 
 	(*pos)++;
 
-	if ((unsigned long)call >= (unsigned long)__stop_ftrace_events)
-		return NULL;
+	for (;;) {
+		if ((unsigned long)call >= (unsigned long)__stop_ftrace_events)
+			return NULL;
+
+		/*
+		 * The ftrace subsystem is for showing formats only.
+		 * They can not be enabled or disabled via the event files.
+		 */
+		if (call->regfunc)
+			break;
+
+		call++;
+		next = call;
+	}
 
 	m->private = ++next;
 
-- 
cgit v1.2.3


From 57310a98a354e84279d7c8af2f48805a62372e53 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 9 Mar 2009 13:56:21 +0100
Subject: sched: optimize ttwu vs group scheduling

Impact: micro-optimization

We can avoid the sched domain walk on try_to_wake_up() when we know
there are no groups.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1236603381.8389.455.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index e0fa739a441b..af5cd1b2d03e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -331,6 +331,13 @@ static DEFINE_PER_CPU(struct rt_rq, init_rt_rq) ____cacheline_aligned_in_smp;
  */
 static DEFINE_SPINLOCK(task_group_lock);
 
+#ifdef CONFIG_SMP
+static int root_task_group_empty(void)
+{
+	return list_empty(&root_task_group.children);
+}
+#endif
+
 #ifdef CONFIG_FAIR_GROUP_SCHED
 #ifdef CONFIG_USER_SCHED
 # define INIT_TASK_GROUP_LOAD	(2*NICE_0_LOAD)
@@ -391,6 +398,13 @@ static inline void set_task_rq(struct task_struct *p, unsigned int cpu)
 
 #else
 
+#ifdef CONFIG_SMP
+static int root_task_group_empty(void)
+{
+	return 1;
+}
+#endif
+
 static inline void set_task_rq(struct task_struct *p, unsigned int cpu) { }
 static inline struct task_group *task_group(struct task_struct *p)
 {
@@ -2318,7 +2332,7 @@ static int try_to_wake_up(struct task_struct *p, unsigned int state, int sync)
 		sync = 0;
 
 #ifdef CONFIG_SMP
-	if (sched_feat(LB_WAKEUP_UPDATE)) {
+	if (sched_feat(LB_WAKEUP_UPDATE) && !root_task_group_empty()) {
 		struct sched_domain *sd;
 
 		this_cpu = raw_smp_processor_id();
-- 
cgit v1.2.3


From 2314c4ae1461c9e8b26cf8b9a851f280bc5769e1 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 10 Mar 2009 12:04:02 -0400
Subject: tracing: add back the available_events file

The event directory files type and available_types were no longer
needed with the new TRACE_EVENT_FORMAT macros, they were deleted.
But by accident the available_events file was also removed.
This patch brings it back.

Reported-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_events.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 85ec10fbb38d..769dfd00fc85 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -428,6 +428,13 @@ static const struct seq_operations show_set_event_seq_ops = {
 	.stop = t_stop,
 };
 
+static const struct file_operations ftrace_avail_fops = {
+	.open = ftrace_event_seq_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = seq_release,
+};
+
 static const struct file_operations ftrace_set_event_fops = {
 	.open = ftrace_event_seq_open,
 	.read = seq_read,
@@ -569,6 +576,13 @@ static __init int event_trace_init(void)
 	if (!d_tracer)
 		return 0;
 
+	entry = debugfs_create_file("available_events", 0444, d_tracer,
+				    (void *)&show_event_seq_ops,
+				    &ftrace_avail_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'available_events' entry\n");
+
 	entry = debugfs_create_file("set_event", 0644, d_tracer,
 				    (void *)&show_set_event_seq_ops,
 				    &ftrace_set_event_fops);
-- 
cgit v1.2.3


From 30a8fecc2d34f086df34fe2f2b926f080e002600 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 10 Mar 2009 12:41:38 -0400
Subject: tracing: flip the TP_printk and TP_fast_assign in the TRACE_EVENT
 macro

Impact: clean up

In trying to stay consistant with the C style format in the TRACE_EVENT
macro, it makes more sense to do the printk after the assigning of
the variables.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 include/linux/tracepoint.h          |   2 +-
 include/trace/irq_event_types.h     |   8 +--
 include/trace/sched_event_types.h   | 102 ++++++++++++++++++------------------
 kernel/trace/trace_events_stage_1.h |   2 +-
 kernel/trace/trace_events_stage_2.h |   4 +-
 kernel/trace/trace_events_stage_3.h |   2 +-
 6 files changed, 60 insertions(+), 60 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 69b56988813d..c7b09452514b 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -157,7 +157,7 @@ static inline void tracepoint_synchronize_unregister(void)
 #define TRACE_FORMAT(name, proto, args, fmt)		\
 	DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
 
-#define TRACE_EVENT(name, proto, args, struct, print, assign)	\
+#define TRACE_EVENT(name, proto, args, struct, assign, print)	\
 	DECLARE_TRACE(name, PARAMS(proto), PARAMS(args))
 
 #endif
diff --git a/include/trace/irq_event_types.h b/include/trace/irq_event_types.h
index 43bcb74dd49f..214bb928fe9e 100644
--- a/include/trace/irq_event_types.h
+++ b/include/trace/irq_event_types.h
@@ -31,13 +31,13 @@ TRACE_EVENT(irq_handler_exit,
 		__field(	int,	ret	)
 	),
 
-	TP_printk("irq=%d return=%s",
-		  __entry->irq, __entry->ret ? "handled" : "unhandled"),
-
 	TP_fast_assign(
 		__entry->irq	= irq;
 		__entry->ret	= ret;
-	)
+	),
+
+	TP_printk("irq=%d return=%s",
+		  __entry->irq, __entry->ret ? "handled" : "unhandled")
 );
 
 #undef TRACE_SYSTEM
diff --git a/include/trace/sched_event_types.h b/include/trace/sched_event_types.h
index fb37af672c88..63547dc1125f 100644
--- a/include/trace/sched_event_types.h
+++ b/include/trace/sched_event_types.h
@@ -22,12 +22,12 @@ TRACE_EVENT(sched_kthread_stop,
 		__field(	pid_t,	pid			)
 	),
 
-	TP_printk("task %s:%d", __entry->comm, __entry->pid),
-
 	TP_fast_assign(
 		memcpy(__entry->comm, t->comm, TASK_COMM_LEN);
 		__entry->pid	= t->pid;
-	)
+	),
+
+	TP_printk("task %s:%d", __entry->comm, __entry->pid)
 );
 
 /*
@@ -43,11 +43,11 @@ TRACE_EVENT(sched_kthread_stop_ret,
 		__field(	int,	ret	)
 	),
 
-	TP_printk("ret %d", __entry->ret),
-
 	TP_fast_assign(
 		__entry->ret	= ret;
-	)
+	),
+
+	TP_printk("ret %d", __entry->ret)
 );
 
 /*
@@ -68,14 +68,14 @@ TRACE_EVENT(sched_wait_task,
 		__field(	int,	prio			)
 	),
 
-	TP_printk("task %s:%d [%d]",
-		  __entry->comm, __entry->pid, __entry->prio),
-
 	TP_fast_assign(
 		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
 		__entry->pid	= p->pid;
 		__entry->prio	= p->prio;
-	)
+	),
+
+	TP_printk("task %s:%d [%d]",
+		  __entry->comm, __entry->pid, __entry->prio)
 );
 
 /*
@@ -97,16 +97,16 @@ TRACE_EVENT(sched_wakeup,
 		__field(	int,	success			)
 	),
 
-	TP_printk("task %s:%d [%d] success=%d",
-		  __entry->comm, __entry->pid, __entry->prio,
-		  __entry->success),
-
 	TP_fast_assign(
 		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
 		__entry->pid		= p->pid;
 		__entry->prio		= p->prio;
 		__entry->success	= success;
-	)
+	),
+
+	TP_printk("task %s:%d [%d] success=%d",
+		  __entry->comm, __entry->pid, __entry->prio,
+		  __entry->success)
 );
 
 /*
@@ -128,16 +128,16 @@ TRACE_EVENT(sched_wakeup_new,
 		__field(	int,	success			)
 	),
 
-	TP_printk("task %s:%d [%d] success=%d",
-		  __entry->comm, __entry->pid, __entry->prio,
-		  __entry->success),
-
 	TP_fast_assign(
 		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
 		__entry->pid		= p->pid;
 		__entry->prio		= p->prio;
 		__entry->success	= success;
-	)
+	),
+
+	TP_printk("task %s:%d [%d] success=%d",
+		  __entry->comm, __entry->pid, __entry->prio,
+		  __entry->success)
 );
 
 /*
@@ -162,10 +162,6 @@ TRACE_EVENT(sched_switch,
 		__field(	int,	next_prio			)
 	),
 
-	TP_printk("task %s:%d [%d] ==> %s:%d [%d]",
-		__entry->prev_comm, __entry->prev_pid, __entry->prev_prio,
-		__entry->next_comm, __entry->next_pid, __entry->next_prio),
-
 	TP_fast_assign(
 		memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
 		__entry->prev_pid	= prev->pid;
@@ -173,7 +169,11 @@ TRACE_EVENT(sched_switch,
 		memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
 		__entry->next_pid	= next->pid;
 		__entry->next_prio	= next->prio;
-	)
+	),
+
+	TP_printk("task %s:%d [%d] ==> %s:%d [%d]",
+		__entry->prev_comm, __entry->prev_pid, __entry->prev_prio,
+		__entry->next_comm, __entry->next_pid, __entry->next_prio)
 );
 
 /*
@@ -193,17 +193,17 @@ TRACE_EVENT(sched_migrate_task,
 		__field(	int,	dest_cpu		)
 	),
 
-	TP_printk("task %s:%d [%d] from: %d  to: %d",
-		  __entry->comm, __entry->pid, __entry->prio,
-		  __entry->orig_cpu, __entry->dest_cpu),
-
 	TP_fast_assign(
 		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
 		__entry->pid		= p->pid;
 		__entry->prio		= p->prio;
 		__entry->orig_cpu	= orig_cpu;
 		__entry->dest_cpu	= dest_cpu;
-	)
+	),
+
+	TP_printk("task %s:%d [%d] from: %d  to: %d",
+		  __entry->comm, __entry->pid, __entry->prio,
+		  __entry->orig_cpu, __entry->dest_cpu)
 );
 
 /*
@@ -221,14 +221,14 @@ TRACE_EVENT(sched_process_free,
 		__field(	int,	prio			)
 	),
 
-	TP_printk("task %s:%d [%d]",
-		  __entry->comm, __entry->pid, __entry->prio),
-
 	TP_fast_assign(
 		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
 		__entry->pid		= p->pid;
 		__entry->prio		= p->prio;
-	)
+	),
+
+	TP_printk("task %s:%d [%d]",
+		  __entry->comm, __entry->pid, __entry->prio)
 );
 
 /*
@@ -246,14 +246,14 @@ TRACE_EVENT(sched_process_exit,
 		__field(	int,	prio			)
 	),
 
-	TP_printk("task %s:%d [%d]",
-		  __entry->comm, __entry->pid, __entry->prio),
-
 	TP_fast_assign(
 		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
 		__entry->pid		= p->pid;
 		__entry->prio		= p->prio;
-	)
+	),
+
+	TP_printk("task %s:%d [%d]",
+		  __entry->comm, __entry->pid, __entry->prio)
 );
 
 /*
@@ -271,14 +271,14 @@ TRACE_EVENT(sched_process_wait,
 		__field(	int,	prio			)
 	),
 
-	TP_printk("task %s:%d [%d]",
-		  __entry->comm, __entry->pid, __entry->prio),
-
 	TP_fast_assign(
 		memcpy(__entry->comm, current->comm, TASK_COMM_LEN);
 		__entry->pid		= pid_nr(pid);
 		__entry->prio		= current->prio;
-	)
+	),
+
+	TP_printk("task %s:%d [%d]",
+		  __entry->comm, __entry->pid, __entry->prio)
 );
 
 /*
@@ -297,16 +297,16 @@ TRACE_EVENT(sched_process_fork,
 		__field(	pid_t,	child_pid			)
 	),
 
-	TP_printk("parent %s:%d  child %s:%d",
-		__entry->parent_comm, __entry->parent_pid,
-		__entry->child_comm, __entry->child_pid),
-
 	TP_fast_assign(
 		memcpy(__entry->parent_comm, parent->comm, TASK_COMM_LEN);
 		__entry->parent_pid	= parent->pid;
 		memcpy(__entry->child_comm, child->comm, TASK_COMM_LEN);
 		__entry->child_pid	= child->pid;
-	)
+	),
+
+	TP_printk("parent %s:%d  child %s:%d",
+		__entry->parent_comm, __entry->parent_pid,
+		__entry->child_comm, __entry->child_pid)
 );
 
 /*
@@ -324,14 +324,14 @@ TRACE_EVENT(sched_signal_send,
 		__field(	pid_t,	pid			)
 	),
 
-	TP_printk("sig: %d  task %s:%d",
-		  __entry->sig, __entry->comm, __entry->pid),
-
 	TP_fast_assign(
 		memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
 		__entry->pid	= p->pid;
 		__entry->sig	= sig;
-	)
+	),
+
+	TP_printk("sig: %d  task %s:%d",
+		  __entry->sig, __entry->comm, __entry->pid)
 );
 
 #undef TRACE_SYSTEM
diff --git a/kernel/trace/trace_events_stage_1.h b/kernel/trace/trace_events_stage_1.h
index 15e9bf965a18..82f68443c556 100644
--- a/kernel/trace/trace_events_stage_1.h
+++ b/kernel/trace/trace_events_stage_1.h
@@ -27,7 +27,7 @@
 #define TP_STRUCT__entry(args...) args
 
 #undef TRACE_EVENT
-#define TRACE_EVENT(name, proto, args, tstruct, print, assign)	\
+#define TRACE_EVENT(name, proto, args, tstruct, assign, print)	\
 	struct ftrace_raw_##name {				\
 		struct trace_entry	ent;			\
 		tstruct						\
diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h
index d91bf4c56661..1ad9f8d2fe45 100644
--- a/kernel/trace/trace_events_stage_2.h
+++ b/kernel/trace/trace_events_stage_2.h
@@ -39,7 +39,7 @@
 #define TP_printk(fmt, args...) fmt "\n", args
 
 #undef TRACE_EVENT
-#define TRACE_EVENT(call, proto, args, tstruct, print, assign)		\
+#define TRACE_EVENT(call, proto, args, tstruct, assign, print)		\
 enum print_line_t							\
 ftrace_raw_output_##call(struct trace_iterator *iter, int flags)	\
 {									\
@@ -115,7 +115,7 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags)	\
 #define TP_fast_assign(args...) args
 
 #undef TRACE_EVENT
-#define TRACE_EVENT(call, proto, args, tstruct, print, func)		\
+#define TRACE_EVENT(call, proto, args, tstruct, func, print)		\
 static int								\
 ftrace_format_##call(struct trace_seq *s)				\
 {									\
diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h
index 3ba55d4ab073..d6de06b9201a 100644
--- a/kernel/trace/trace_events_stage_3.h
+++ b/kernel/trace/trace_events_stage_3.h
@@ -148,7 +148,7 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
 #define __entry entry
 
 #undef TRACE_EVENT
-#define TRACE_EVENT(call, proto, args, tstruct, print, assign)		\
+#define TRACE_EVENT(call, proto, args, tstruct, assign, print)		\
 									\
 static struct ftrace_event_call event_##call;				\
 									\
-- 
cgit v1.2.3


From 0e3d0f0566f3fcf664782f597070bbc669d78454 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 10 Mar 2009 13:12:58 -0400
Subject: tracing: update comments to match event code macros

Impact: clean up / comments

The comments that described the ftrace macros to manipulate the
TRACE_EVENT and TRACE_FORMAT macros no longer match the code.
This patch updates them.

Reported-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_events_stage_1.h | 6 ++++--
 kernel/trace/trace_events_stage_2.h | 9 ++++-----
 kernel/trace/trace_events_stage_3.h | 8 ++++----
 3 files changed, 12 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_stage_1.h b/kernel/trace/trace_events_stage_1.h
index 82f68443c556..38985f9b379c 100644
--- a/kernel/trace/trace_events_stage_1.h
+++ b/kernel/trace/trace_events_stage_1.h
@@ -6,11 +6,13 @@
  * struct ftrace_raw_<call> {
  *	struct trace_entry		ent;
  *	<type>				<item>;
+ *	<type2>				<item2>[<len>];
  *	[...]
  * };
  *
- * The <type> <item> is created by the TRACE_FIELD(type, item, assign)
- * macro. We simply do "type item;", and that will create the fields
+ * The <type> <item> is created by the __field(type, item) macro or
+ * the __array(type2, item2, len) macro.
+ * We simply do "type item;", and that will create the fields
  * in the structure.
  */
 
diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h
index 1ad9f8d2fe45..ca347afd6aa0 100644
--- a/kernel/trace/trace_events_stage_2.h
+++ b/kernel/trace/trace_events_stage_2.h
@@ -20,7 +20,7 @@
  *
  *	field = (typeof(field))entry;
  *
- *	ret = trace_seq_printf(s, <TP_RAW_FMT> "%s", <ARGS> "\n");
+ *	ret = trace_seq_printf(s, <TP_printk> "\n");
  *	if (!ret)
  *		return TRACE_TYPE_PARTIAL_LINE;
  *
@@ -76,10 +76,9 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags)	\
  *	int ret;
  *
  *	ret = trace_seq_printf(s, #type " " #item ";"
- *			       " size:%d; offset:%d;\n",
- *			       sizeof(field.type),
- *			       offsetof(struct ftrace_raw_##call,
- *					item));
+ *			       " offset:%u; size:%u;\n",
+ *			       offsetof(struct ftrace_raw_##call, item),
+ *			       sizeof(field.type));
  *
  * }
  */
diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h
index d6de06b9201a..6ee1de59f19d 100644
--- a/kernel/trace/trace_events_stage_3.h
+++ b/kernel/trace/trace_events_stage_3.h
@@ -56,7 +56,8 @@
  * 		return;
  * 	entry	= ring_buffer_event_data(event);
  *
- * 	<tstruct>;  <-- Here we assign the entries by the TRACE_FIELD.
+ * 	<assign>;  <-- Here we assign the entries by the __field and
+ *			__array macros.
  *
  * 	trace_current_buffer_unlock_commit(event, irq_flags, pc);
  * }
@@ -96,11 +97,10 @@
  * __attribute__((__aligned__(4)))
  * __attribute__((section("_ftrace_events"))) event_<call> = {
  * 	.name 			= "<call>",
+ *	.system			= "<system>",
+ * 	.raw_init		= ftrace_raw_init_event_<call>,
  * 	.regfunc		= ftrace_reg_event_<call>,
  * 	.unregfunc		= ftrace_unreg_event_<call>,
- * 	.raw_init		= ftrace_raw_init_event_<call>,
- * 	.raw_reg		= ftrace_raw_reg_event_<call>,
- * 	.raw_unreg		= ftrace_raw_unreg_event_<call>,
  *	.show_format		= ftrace_format_<call>,
  * }
  *
-- 
cgit v1.2.3


From ef18012b248b47ec9a12c3a83ca5e99782d39c5d Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 10 Mar 2009 14:10:56 -0400
Subject: tracing: remove funky whitespace in the trace code

Impact: clean up

There existed a lot of <space><tab>'s in the tracing code. This
patch removes them.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 include/linux/tracepoint.h           | 16 +++---
 kernel/trace/blktrace.c              | 10 ++--
 kernel/trace/trace.c                 |  2 +-
 kernel/trace/trace_branch.c          |  2 +-
 kernel/trace/trace_events_stage_3.h  | 98 ++++++++++++++++++------------------
 kernel/trace/trace_export.c          |  2 +-
 kernel/trace/trace_functions_graph.c |  6 +--
 kernel/trace/trace_output.c          | 14 +++---
 kernel/trace/trace_workqueue.c       |  6 +--
 9 files changed, 78 insertions(+), 78 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h
index 119ece224c21..d35a7ee7611f 100644
--- a/include/linux/tracepoint.h
+++ b/include/linux/tracepoint.h
@@ -178,8 +178,8 @@ static inline void tracepoint_synchronize_unregister(void)
  *	* prototype, declare it via TP_PROTO():
  *	*
  *
- * 	TP_PROTO(struct rq *rq, struct task_struct *prev,
- * 		 struct task_struct *next),
+ *	TP_PROTO(struct rq *rq, struct task_struct *prev,
+ *		 struct task_struct *next),
  *
  *	*
  *	* Define the call signature of the 'function'.
@@ -187,7 +187,7 @@ static inline void tracepoint_synchronize_unregister(void)
  *	*  TP_PROTO1/TP_PROTO2/TP_PROTO3 ugliness.)
  *	*
  *
- * 	TP_ARGS(rq, prev, next),
+ *	TP_ARGS(rq, prev, next),
  *
  *	*
  *	* Fast binary tracing: define the trace record via
@@ -229,13 +229,13 @@ static inline void tracepoint_synchronize_unregister(void)
  *	* happens, on an active tracepoint.
  *	*
  *
- * 	TP_fast_assign(
- * 		memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
- * 		__entry->prev_pid	= prev->pid;
- * 		__entry->prev_prio	= prev->prio;
+ *	TP_fast_assign(
+ *		memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN);
+ *		__entry->prev_pid	= prev->pid;
+ *		__entry->prev_prio	= prev->prio;
  *		memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN);
  *		__entry->next_pid	= next->pid;
- * 		__entry->next_prio	= next->prio;
+ *		__entry->next_prio	= next->prio;
  *	)
  *
  *	*
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index e39679a72a3b..bec69d3678c1 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -33,7 +33,7 @@ static struct trace_array *blk_tr;
 static int __read_mostly  blk_tracer_enabled;
 
 /* Select an alternative, minimalistic output than the original one */
-#define TRACE_BLK_OPT_CLASSIC 	0x1
+#define TRACE_BLK_OPT_CLASSIC	0x1
 
 static struct tracer_opt blk_tracer_opts[] = {
 	/* Default disable the minimalistic output */
@@ -564,7 +564,7 @@ EXPORT_SYMBOL_GPL(blk_trace_startstop);
 /**
  * blk_trace_ioctl: - handle the ioctls associated with tracing
  * @bdev:	the block device
- * @cmd: 	the ioctl cmd
+ * @cmd:	the ioctl cmd
  * @arg:	the argument data, if any
  *
  **/
@@ -1128,9 +1128,9 @@ static void blk_tracer_reset(struct trace_array *tr)
 
 static struct {
 	const char *act[2];
-	int 	   (*print)(struct trace_seq *s, const struct trace_entry *ent);
+	int	   (*print)(struct trace_seq *s, const struct trace_entry *ent);
 } what2act[] __read_mostly = {
-	[__BLK_TA_QUEUE]	= {{  "Q", "queue" }, 	   blk_log_generic },
+	[__BLK_TA_QUEUE]	= {{  "Q", "queue" },	   blk_log_generic },
 	[__BLK_TA_BACKMERGE]	= {{  "M", "backmerge" },  blk_log_generic },
 	[__BLK_TA_FRONTMERGE]	= {{  "F", "frontmerge" }, blk_log_generic },
 	[__BLK_TA_GETRQ]	= {{  "G", "getrq" },	   blk_log_generic },
@@ -1229,7 +1229,7 @@ static struct tracer blk_tracer __read_mostly = {
 };
 
 static struct trace_event trace_blk_event = {
-	.type	 	= TRACE_BLK,
+	.type		= TRACE_BLK,
 	.trace		= blk_trace_event_print,
 	.binary		= blk_trace_event_print_binary,
 };
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index cc94f8642485..8c6a902db40a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -799,7 +799,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags,
 
 	entry->preempt_count		= pc & 0xff;
 	entry->pid			= (tsk) ? tsk->pid : 0;
-	entry->tgid               	= (tsk) ? tsk->tgid : 0;
+	entry->tgid			= (tsk) ? tsk->tgid : 0;
 	entry->flags =
 #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT
 		(irqs_disabled_flags(flags) ? TRACE_FLAG_IRQS_OFF : 0) |
diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c
index aaa0755268b9..ad8c22efff41 100644
--- a/kernel/trace/trace_branch.c
+++ b/kernel/trace/trace_branch.c
@@ -157,7 +157,7 @@ static enum print_line_t trace_branch_print(struct trace_iterator *iter,
 
 
 static struct trace_event trace_branch_event = {
-	.type	 	= TRACE_BRANCH,
+	.type		= TRACE_BRANCH,
 	.trace		= trace_branch_print,
 };
 
diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h
index 6ee1de59f19d..ae2e323df0c7 100644
--- a/kernel/trace/trace_events_stage_3.h
+++ b/kernel/trace/trace_events_stage_3.h
@@ -5,23 +5,23 @@
  *
  * static void ftrace_event_<call>(proto)
  * {
- * 	event_trace_printk(_RET_IP_, "<call>: " <fmt>);
+ *	event_trace_printk(_RET_IP_, "<call>: " <fmt>);
  * }
  *
  * static int ftrace_reg_event_<call>(void)
  * {
- * 	int ret;
+ *	int ret;
  *
- * 	ret = register_trace_<call>(ftrace_event_<call>);
- * 	if (!ret)
- * 		pr_info("event trace: Could not activate trace point "
- * 			"probe to  <call>");
- * 	return ret;
+ *	ret = register_trace_<call>(ftrace_event_<call>);
+ *	if (!ret)
+ *		pr_info("event trace: Could not activate trace point "
+ *			"probe to  <call>");
+ *	return ret;
  * }
  *
  * static void ftrace_unreg_event_<call>(void)
  * {
- * 	unregister_trace_<call>(ftrace_event_<call>);
+ *	unregister_trace_<call>(ftrace_event_<call>);
  * }
  *
  * For those macros defined with TRACE_FORMAT:
@@ -29,9 +29,9 @@
  * static struct ftrace_event_call __used
  * __attribute__((__aligned__(4)))
  * __attribute__((section("_ftrace_events"))) event_<call> = {
- * 	.name 			= "<call>",
- * 	.regfunc		= ftrace_reg_event_<call>,
- * 	.unregfunc		= ftrace_unreg_event_<call>,
+ *	.name			= "<call>",
+ *	.regfunc		= ftrace_reg_event_<call>,
+ *	.unregfunc		= ftrace_unreg_event_<call>,
  * }
  *
  *
@@ -41,66 +41,66 @@
  *
  * static void ftrace_raw_event_<call>(proto)
  * {
- * 	struct ring_buffer_event *event;
- * 	struct ftrace_raw_<call> *entry; <-- defined in stage 1
- * 	unsigned long irq_flags;
- * 	int pc;
- *
- * 	local_save_flags(irq_flags);
- * 	pc = preempt_count();
- *
- * 	event = trace_current_buffer_lock_reserve(event_<call>.id,
- * 				  sizeof(struct ftrace_raw_<call>),
- * 				  irq_flags, pc);
- * 	if (!event)
- * 		return;
- * 	entry	= ring_buffer_event_data(event);
- *
- * 	<assign>;  <-- Here we assign the entries by the __field and
+ *	struct ring_buffer_event *event;
+ *	struct ftrace_raw_<call> *entry; <-- defined in stage 1
+ *	unsigned long irq_flags;
+ *	int pc;
+ *
+ *	local_save_flags(irq_flags);
+ *	pc = preempt_count();
+ *
+ *	event = trace_current_buffer_lock_reserve(event_<call>.id,
+ *				  sizeof(struct ftrace_raw_<call>),
+ *				  irq_flags, pc);
+ *	if (!event)
+ *		return;
+ *	entry	= ring_buffer_event_data(event);
+ *
+ *	<assign>;  <-- Here we assign the entries by the __field and
  *			__array macros.
  *
- * 	trace_current_buffer_unlock_commit(event, irq_flags, pc);
+ *	trace_current_buffer_unlock_commit(event, irq_flags, pc);
  * }
  *
  * static int ftrace_raw_reg_event_<call>(void)
  * {
- * 	int ret;
+ *	int ret;
  *
- * 	ret = register_trace_<call>(ftrace_raw_event_<call>);
- * 	if (!ret)
- * 		pr_info("event trace: Could not activate trace point "
- * 			"probe to <call>");
- * 	return ret;
+ *	ret = register_trace_<call>(ftrace_raw_event_<call>);
+ *	if (!ret)
+ *		pr_info("event trace: Could not activate trace point "
+ *			"probe to <call>");
+ *	return ret;
  * }
  *
  * static void ftrace_unreg_event_<call>(void)
  * {
- * 	unregister_trace_<call>(ftrace_raw_event_<call>);
+ *	unregister_trace_<call>(ftrace_raw_event_<call>);
  * }
  *
  * static struct trace_event ftrace_event_type_<call> = {
- * 	.trace			= ftrace_raw_output_<call>, <-- stage 2
+ *	.trace			= ftrace_raw_output_<call>, <-- stage 2
  * };
  *
  * static int ftrace_raw_init_event_<call>(void)
  * {
- * 	int id;
+ *	int id;
  *
- * 	id = register_ftrace_event(&ftrace_event_type_<call>);
- * 	if (!id)
- * 		return -ENODEV;
- * 	event_<call>.id = id;
- * 	return 0;
+ *	id = register_ftrace_event(&ftrace_event_type_<call>);
+ *	if (!id)
+ *		return -ENODEV;
+ *	event_<call>.id = id;
+ *	return 0;
  * }
  *
  * static struct ftrace_event_call __used
  * __attribute__((__aligned__(4)))
  * __attribute__((section("_ftrace_events"))) event_<call> = {
- * 	.name 			= "<call>",
+ *	.name			= "<call>",
  *	.system			= "<system>",
- * 	.raw_init		= ftrace_raw_init_event_<call>,
- * 	.regfunc		= ftrace_reg_event_<call>,
- * 	.unregfunc		= ftrace_unreg_event_<call>,
+ *	.raw_init		= ftrace_raw_init_event_<call>,
+ *	.regfunc		= ftrace_reg_event_<call>,
+ *	.unregfunc		= ftrace_unreg_event_<call>,
  *	.show_format		= ftrace_format_<call>,
  * }
  *
@@ -138,7 +138,7 @@ _TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt))		\
 static struct ftrace_event_call __used					\
 __attribute__((__aligned__(4)))						\
 __attribute__((section("_ftrace_events"))) event_##call = {		\
-	.name 			= #call,				\
+	.name			= #call,				\
 	.system			= __stringify(TRACE_SYSTEM),		\
 	.regfunc		= ftrace_reg_event_##call,		\
 	.unregfunc		= ftrace_unreg_event_##call,		\
@@ -163,7 +163,7 @@ static void ftrace_raw_event_##call(proto)				\
 	pc = preempt_count();						\
 									\
 	event = trace_current_buffer_lock_reserve(event_##call.id,	\
-				  sizeof(struct ftrace_raw_##call), 	\
+				  sizeof(struct ftrace_raw_##call),	\
 				  irq_flags, pc);			\
 	if (!event)							\
 		return;							\
@@ -208,7 +208,7 @@ static int ftrace_raw_init_event_##call(void)				\
 static struct ftrace_event_call __used					\
 __attribute__((__aligned__(4)))						\
 __attribute__((section("_ftrace_events"))) event_##call = {		\
-	.name 			= #call,				\
+	.name			= #call,				\
 	.system			= __stringify(TRACE_SYSTEM),		\
 	.raw_init		= ftrace_raw_init_event_##call,		\
 	.regfunc		= ftrace_raw_reg_event_##call,		\
diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 23ae78430d58..4d9952d3df50 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -94,7 +94,7 @@ ftrace_format_##call(struct trace_seq *s)				\
 static struct ftrace_event_call __used					\
 __attribute__((__aligned__(4)))						\
 __attribute__((section("_ftrace_events"))) event_##call = {		\
-	.name 			= #call,				\
+	.name			= #call,				\
 	.id			= proto,				\
 	.system			= __stringify(TRACE_SYSTEM),		\
 	.show_format		= ftrace_format_##call,			\
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 453ebd3b636e..d1493b853e41 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -841,12 +841,12 @@ static void graph_trace_close(struct trace_iterator *iter)
 }
 
 static struct tracer graph_trace __read_mostly = {
-	.name	     	= "function_graph",
+	.name		= "function_graph",
 	.open		= graph_trace_open,
 	.close		= graph_trace_close,
 	.wait_pipe	= poll_wait_pipe,
-	.init	     	= graph_trace_init,
-	.reset	     	= graph_trace_reset,
+	.init		= graph_trace_init,
+	.reset		= graph_trace_reset,
 	.print_line	= print_graph_function,
 	.print_header	= print_graph_headers,
 	.flags		= &tracer_flags,
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index ef8fd661b217..491832af9ba1 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -565,7 +565,7 @@ static enum print_line_t trace_fn_bin(struct trace_iterator *iter, int flags)
 }
 
 static struct trace_event trace_fn_event = {
-	.type	 	= TRACE_FN,
+	.type		= TRACE_FN,
 	.trace		= trace_fn_trace,
 	.raw		= trace_fn_raw,
 	.hex		= trace_fn_hex,
@@ -696,7 +696,7 @@ static enum print_line_t trace_ctxwake_bin(struct trace_iterator *iter,
 }
 
 static struct trace_event trace_ctx_event = {
-	.type	 	= TRACE_CTX,
+	.type		= TRACE_CTX,
 	.trace		= trace_ctx_print,
 	.raw		= trace_ctx_raw,
 	.hex		= trace_ctx_hex,
@@ -704,7 +704,7 @@ static struct trace_event trace_ctx_event = {
 };
 
 static struct trace_event trace_wake_event = {
-	.type	 	= TRACE_WAKE,
+	.type		= TRACE_WAKE,
 	.trace		= trace_wake_print,
 	.raw		= trace_wake_raw,
 	.hex		= trace_wake_hex,
@@ -759,7 +759,7 @@ static enum print_line_t trace_special_bin(struct trace_iterator *iter,
 }
 
 static struct trace_event trace_special_event = {
-	.type	 	= TRACE_SPECIAL,
+	.type		= TRACE_SPECIAL,
 	.trace		= trace_special_print,
 	.raw		= trace_special_print,
 	.hex		= trace_special_hex,
@@ -796,7 +796,7 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter,
 }
 
 static struct trace_event trace_stack_event = {
-	.type	 	= TRACE_STACK,
+	.type		= TRACE_STACK,
 	.trace		= trace_stack_print,
 	.raw		= trace_special_print,
 	.hex		= trace_special_hex,
@@ -825,7 +825,7 @@ static enum print_line_t trace_user_stack_print(struct trace_iterator *iter,
 }
 
 static struct trace_event trace_user_stack_event = {
-	.type	 	= TRACE_USER_STACK,
+	.type		= TRACE_USER_STACK,
 	.trace		= trace_user_stack_print,
 	.raw		= trace_special_print,
 	.hex		= trace_special_hex,
@@ -879,7 +879,7 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags)
 
 
 static struct trace_event trace_print_event = {
-	.type	 	= TRACE_PRINT,
+	.type		= TRACE_PRINT,
 	.trace		= trace_print_print,
 	.raw		= trace_print_raw,
 };
diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index 4664990fe9c5..e542483df623 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -19,14 +19,14 @@ struct cpu_workqueue_stats {
 /* Useful to know if we print the cpu headers */
 	bool		            first_entry;
 	int		            cpu;
-	pid_t 			    pid;
+	pid_t			    pid;
 /* Can be inserted from interrupt or user context, need to be atomic */
-	atomic_t 	            inserted;
+	atomic_t	            inserted;
 /*
  *  Don't need to be atomic, works are serialized in a single workqueue thread
  *  on a single CPU.
  */
-	unsigned int 	 	    executed;
+	unsigned int		    executed;
 };
 
 /* List of workqueue threads on one cpu */
-- 
cgit v1.2.3


From 6cc3c6e12bb039047974ad2e7e2d46d15a1b762f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 10 Mar 2009 19:03:43 +0100
Subject: trace_clock: fix preemption bug

Using the function_graph tracer in recent kernels generates a spew of
preemption BUGs. Fix this by not requiring trace_clock_local() users
to disable preemption themselves.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_clock.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 2d4953f93560..05b176abfd30 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -27,12 +27,19 @@
  */
 u64 notrace trace_clock_local(void)
 {
+	unsigned long flags;
+	u64 clock;
+
 	/*
 	 * sched_clock() is an architecture implemented, fast, scalable,
 	 * lockless clock. It is not guaranteed to be coherent across
 	 * CPUs, nor across CPU idle events.
 	 */
-	return sched_clock();
+	raw_local_irq_save(flags);
+	clock = sched_clock();
+	raw_local_irq_restore(flags);
+
+	return clock;
 }
 
 /*
-- 
cgit v1.2.3


From 80370cb758e7ca2692cd9fb5e413d970b1f4b2b2 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 10 Mar 2009 17:16:35 -0400
Subject: tracing: use raw spinlocks for trace_vprintk

Impact: prevent locking up by lockdep tracer

The lockdep tracer uses trace_vprintk and thus trace_vprintk can not
call back into lockdep without locking up.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8c6a902db40a..4c97947650ae 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1176,7 +1176,8 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
  */
 int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
 {
-	static DEFINE_SPINLOCK(trace_buf_lock);
+	static raw_spinlock_t trace_buf_lock =
+		(raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
 	static u32 trace_buf[TRACE_BUF_SIZE];
 
 	struct ring_buffer_event *event;
@@ -1201,7 +1202,9 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
 	if (unlikely(atomic_read(&data->disabled)))
 		goto out;
 
-	spin_lock_irqsave(&trace_buf_lock, flags);
+	/* Lockdep uses trace_printk for lock tracing */
+	local_irq_save(flags);
+	__raw_spin_lock(&trace_buf_lock);
 	len = vbin_printf(trace_buf, TRACE_BUF_SIZE, fmt, args);
 
 	if (len > TRACE_BUF_SIZE || len < 0)
@@ -1220,7 +1223,8 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
 	ring_buffer_unlock_commit(tr->buffer, event);
 
 out_unlock:
-	spin_unlock_irqrestore(&trace_buf_lock, flags);
+	__raw_spin_unlock(&trace_buf_lock);
+	local_irq_restore(flags);
 
 out:
 	ftrace_preempt_enable(resched);
-- 
cgit v1.2.3


From be50b8342dead8cacf57d4839240106b225d31f5 Mon Sep 17 00:00:00 2001
From: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Date: Tue, 10 Mar 2009 12:55:56 -0700
Subject: kernel/user.c: fix a memory leak when freeing up non-init
 usernamespaces users

We were returning early in the sysfs directory cleanup function if the
user belonged to a non init usernamespace.  Due to this a lot of the
cleanup was not done and we were left with a leak.  Fix the leak.

Reported-by: Serge Hallyn <serue@linux.vnet.ibm.com>
Signed-off-by: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Tested-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/user.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/user.c b/kernel/user.c
index 6a9b696128c8..fbb300e6191f 100644
--- a/kernel/user.c
+++ b/kernel/user.c
@@ -286,14 +286,12 @@ int __init uids_sysfs_init(void)
 /* work function to remove sysfs directory for a user and free up
  * corresponding structures.
  */
-static void remove_user_sysfs_dir(struct work_struct *w)
+static void cleanup_user_struct(struct work_struct *w)
 {
 	struct user_struct *up = container_of(w, struct user_struct, work);
 	unsigned long flags;
 	int remove_user = 0;
 
-	if (up->user_ns != &init_user_ns)
-		return;
 	/* Make uid_hash_remove() + sysfs_remove_file() + kobject_del()
 	 * atomic.
 	 */
@@ -312,9 +310,11 @@ static void remove_user_sysfs_dir(struct work_struct *w)
 	if (!remove_user)
 		goto done;
 
-	kobject_uevent(&up->kobj, KOBJ_REMOVE);
-	kobject_del(&up->kobj);
-	kobject_put(&up->kobj);
+	if (up->user_ns == &init_user_ns) {
+		kobject_uevent(&up->kobj, KOBJ_REMOVE);
+		kobject_del(&up->kobj);
+		kobject_put(&up->kobj);
+	}
 
 	sched_destroy_user(up);
 	key_put(up->uid_keyring);
@@ -335,7 +335,7 @@ static void free_user(struct user_struct *up, unsigned long flags)
 	atomic_inc(&up->__count);
 	spin_unlock_irqrestore(&uidhash_lock, flags);
 
-	INIT_WORK(&up->work, remove_user_sysfs_dir);
+	INIT_WORK(&up->work, cleanup_user_struct);
 	schedule_work(&up->work);
 }
 
-- 
cgit v1.2.3


From df1c99d416500da8d26a4d78777467c53ee7689e Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Tue, 10 Mar 2009 19:08:11 +0100
Subject: sched: add avg_overlap decay

Impact: more precise avg_overlap metric - better load-balancing

avg_overlap is used to measure the runtime overlap of the waker and
wakee.

However, when a process changes behaviour, eg a pipe becomes
un-congested and we don't need to go to sleep after a wakeup
for a while, the avg_overlap value grows stale.

When running we use the avg runtime between preemption as a
measure for avg_overlap since the amount of runtime can be
correlated to cache footprint.

The longer we run, the less likely we'll be wanting to be
migrated to another CPU.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1236709131.25234.576.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 24 +++++++++++++++++++++++-
 1 file changed, 23 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index af5cd1b2d03e..2f28351892c9 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4620,6 +4620,28 @@ static inline void schedule_debug(struct task_struct *prev)
 #endif
 }
 
+static void put_prev_task(struct rq *rq, struct task_struct *prev)
+{
+	if (prev->state == TASK_RUNNING) {
+		u64 runtime = prev->se.sum_exec_runtime;
+
+		runtime -= prev->se.prev_sum_exec_runtime;
+		runtime = min_t(u64, runtime, 2*sysctl_sched_migration_cost);
+
+		/*
+		 * In order to avoid avg_overlap growing stale when we are
+		 * indeed overlapping and hence not getting put to sleep, grow
+		 * the avg_overlap on preemption.
+		 *
+		 * We use the average preemption runtime because that
+		 * correlates to the amount of cache footprint a task can
+		 * build up.
+		 */
+		update_avg(&prev->se.avg_overlap, runtime);
+	}
+	prev->sched_class->put_prev_task(rq, prev);
+}
+
 /*
  * Pick up the highest-prio task:
  */
@@ -4698,7 +4720,7 @@ need_resched_nonpreemptible:
 	if (unlikely(!rq->nr_running))
 		idle_balance(cpu, rq);
 
-	prev->sched_class->put_prev_task(rq, prev);
+	put_prev_task(rq, prev);
 	next = pick_next_task(rq);
 
 	if (likely(prev != next)) {
-- 
cgit v1.2.3


From 73c5162aa362a543793f4a957c6c536dcbaa89ce Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 11 Mar 2009 13:42:01 -0400
Subject: tracing: keep ring buffer to minimum size till used

Impact: less memory impact on systems not using tracer

When the kernel boots up that has tracing configured, it allocates
the default size of the ring buffer. This currently happens to be
1.4Megs per possible CPU. This is quite a bit of wasted memory if
the system is never using the tracer.

The current solution is to keep the ring buffers to a minimum size
until the user uses them. Once a tracer is piped into the current_tracer
the ring buffer will be expanded to the default size. If the user
changes the size of the ring buffer, it will take the size given
by the user immediately.

If the user adds a "ftrace=" to the kernel command line, then the ring
buffers will be set to the default size on initialization.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c | 79 +++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 59 insertions(+), 20 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 4c97947650ae..0c1dc1850858 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -44,6 +44,12 @@
 unsigned long __read_mostly	tracing_max_latency;
 unsigned long __read_mostly	tracing_thresh;
 
+/*
+ * On boot up, the ring buffer is set to the minimum size, so that
+ * we do not waste memory on systems that are not using tracing.
+ */
+static int ring_buffer_expanded;
+
 /*
  * We need to change this state when a selftest is running.
  * A selftest will lurk into the ring-buffer to count the
@@ -128,6 +134,8 @@ static int __init set_ftrace(char *str)
 {
 	strncpy(bootup_tracer_buf, str, BOOTUP_TRACER_SIZE);
 	default_bootup_tracer = bootup_tracer_buf;
+	/* We are using ftrace early, expand it */
+	ring_buffer_expanded = 1;
 	return 1;
 }
 __setup("ftrace=", set_ftrace);
@@ -2315,6 +2323,40 @@ int tracer_init(struct tracer *t, struct trace_array *tr)
 	return t->init(tr);
 }
 
+static int tracing_resize_ring_buffer(unsigned long size)
+{
+	int ret;
+
+	/*
+	 * If kernel or user changes the size of the ring buffer
+	 * it get completed.
+	 */
+	ring_buffer_expanded = 1;
+
+	ret = ring_buffer_resize(global_trace.buffer, size);
+	if (ret < 0)
+		return ret;
+
+	ret = ring_buffer_resize(max_tr.buffer, size);
+	if (ret < 0) {
+		int r;
+
+		r = ring_buffer_resize(global_trace.buffer,
+				       global_trace.entries);
+		if (r < 0) {
+			/* AARGH! We are left with different
+			 * size max buffer!!!! */
+			WARN_ON(1);
+			tracing_disabled = 1;
+		}
+		return ret;
+	}
+
+	global_trace.entries = size;
+
+	return ret;
+}
+
 struct trace_option_dentry;
 
 static struct trace_option_dentry *
@@ -2330,6 +2372,13 @@ static int tracing_set_tracer(const char *buf)
 	struct tracer *t;
 	int ret = 0;
 
+	if (!ring_buffer_expanded) {
+		ret = tracing_resize_ring_buffer(trace_buf_size);
+		if (ret < 0)
+			return ret;
+		ret = 0;
+	}
+
 	mutex_lock(&trace_types_lock);
 	for (t = trace_types; t; t = t->next) {
 		if (strcmp(t->name, buf) == 0)
@@ -2903,28 +2952,11 @@ tracing_entries_write(struct file *filp, const char __user *ubuf,
 	val <<= 10;
 
 	if (val != global_trace.entries) {
-		ret = ring_buffer_resize(global_trace.buffer, val);
+		ret = tracing_resize_ring_buffer(val);
 		if (ret < 0) {
 			cnt = ret;
 			goto out;
 		}
-
-		ret = ring_buffer_resize(max_tr.buffer, val);
-		if (ret < 0) {
-			int r;
-			cnt = ret;
-			r = ring_buffer_resize(global_trace.buffer,
-					       global_trace.entries);
-			if (r < 0) {
-				/* AARGH! We are left with different
-				 * size max buffer!!!! */
-				WARN_ON(1);
-				tracing_disabled = 1;
-			}
-			goto out;
-		}
-
-		global_trace.entries = val;
 	}
 
 	filp->f_pos += cnt;
@@ -3916,6 +3948,7 @@ void ftrace_dump(void)
 __init static int tracer_alloc_buffers(void)
 {
 	struct trace_array_cpu *data;
+	int ring_buf_size;
 	int i;
 	int ret = -ENOMEM;
 
@@ -3928,12 +3961,18 @@ __init static int tracer_alloc_buffers(void)
 	if (!alloc_cpumask_var(&tracing_reader_cpumask, GFP_KERNEL))
 		goto out_free_tracing_cpumask;
 
+	/* To save memory, keep the ring buffer size to its minimum */
+	if (ring_buffer_expanded)
+		ring_buf_size = trace_buf_size;
+	else
+		ring_buf_size = 1;
+
 	cpumask_copy(tracing_buffer_mask, cpu_possible_mask);
 	cpumask_copy(tracing_cpumask, cpu_all_mask);
 	cpumask_clear(tracing_reader_cpumask);
 
 	/* TODO: make the number of buffers hot pluggable with CPUS */
-	global_trace.buffer = ring_buffer_alloc(trace_buf_size,
+	global_trace.buffer = ring_buffer_alloc(ring_buf_size,
 						   TRACE_BUFFER_FLAGS);
 	if (!global_trace.buffer) {
 		printk(KERN_ERR "tracer: failed to allocate ring buffer!\n");
@@ -3944,7 +3983,7 @@ __init static int tracer_alloc_buffers(void)
 
 
 #ifdef CONFIG_TRACER_MAX_TRACE
-	max_tr.buffer = ring_buffer_alloc(trace_buf_size,
+	max_tr.buffer = ring_buffer_alloc(ring_buf_size,
 					     TRACE_BUFFER_FLAGS);
 	if (!max_tr.buffer) {
 		printk(KERN_ERR "tracer: failed to allocate max ring buffer!\n");
-- 
cgit v1.2.3


From 1852fcce181faa237c010a3dbedb473cf9d4555f Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 11 Mar 2009 14:33:00 -0400
Subject: tracing: expand the ring buffers when an event is activated

To save memory, the tracer ring buffers are set to a minimum.
The activating of a trace expands the ring buffer size. This patch
adds this expanding, when an event is activated.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c        | 20 ++++++++++++++++++++
 kernel/trace/trace.h        |  3 +++
 kernel/trace/trace_events.c |  8 ++++++++
 3 files changed, 31 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 0c1dc1850858..35ee63ae4122 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2357,6 +2357,26 @@ static int tracing_resize_ring_buffer(unsigned long size)
 	return ret;
 }
 
+/**
+ * tracing_update_buffers - used by tracing facility to expand ring buffers
+ *
+ * To save on memory when the tracing is never used on a system with it
+ * configured in. The ring buffers are set to a minimum size. But once
+ * a user starts to use the tracing facility, then they need to grow
+ * to their default size.
+ *
+ * This function is to be called when a tracer is about to be used.
+ */
+int tracing_update_buffers(void)
+{
+	int ret = 0;
+
+	if (!ring_buffer_expanded)
+		ret = tracing_resize_ring_buffer(trace_buf_size);
+
+	return ret;
+}
+
 struct trace_option_dentry;
 
 static struct trace_option_dentry *
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index c5e1d8865fe4..336324d717f8 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -737,6 +737,9 @@ static inline void trace_branch_disable(void)
 }
 #endif /* CONFIG_BRANCH_TRACER */
 
+/* set ring buffers to default size if not already done so */
+int tracing_update_buffers(void);
+
 /* trace event type bit fields, not numeric */
 enum {
 	TRACE_EVENT_TYPE_PRINTF		= 1,
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 769dfd00fc85..ca624df73591 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -141,6 +141,10 @@ ftrace_event_write(struct file *file, const char __user *ubuf,
 	if (!cnt || cnt < 0)
 		return 0;
 
+	ret = tracing_update_buffers();
+	if (ret < 0)
+		return ret;
+
 	ret = get_user(ch, ubuf++);
 	if (ret)
 		return ret;
@@ -331,6 +335,10 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
 	if (ret < 0)
 		return ret;
 
+	ret = tracing_update_buffers();
+	if (ret < 0)
+		return ret;
+
 	switch (val) {
 	case 0:
 	case 1:
-- 
cgit v1.2.3


From 9aba60fe6eb20453de53a572143bef22fa929fba Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 11 Mar 2009 19:52:30 -0400
Subject: tracing: fix trace_wait to know to wait on all cpus or just one

Impact: fix to task live locking on reading trace_pipe on one CPU

The same code is used for both trace_pipe (all CPUS) and the per_cpu
trace_pipe file. When there is no data to read, it will check for
signals and wait on the trace wait queue.

The problem happens with the per_cpu wait. The trace_wait code checks
all CPUs. Thus, if there's data in another CPU buffer, then it will
exit the wait, without checking for signals or waiting on the wait queue.

It would then try to read the empty buffer, and since that will just
return nothing, then it will try to wait again. Unfortunately, that will
again fail due to there still being data in the other buffers. This
ends up with a live lock for the task.

This patch fixes the trace_wait to be aware that the iterator may only
be waiting on a single buffer.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c | 13 +++++++++++++
 1 file changed, 13 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 35ee63ae4122..e60f4be10d64 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1666,6 +1666,19 @@ static int trace_empty(struct trace_iterator *iter)
 {
 	int cpu;
 
+	/* If we are looking at one CPU buffer, only check that one */
+	if (iter->cpu_file != TRACE_PIPE_ALL_CPU) {
+		cpu = iter->cpu_file;
+		if (iter->buffer_iter[cpu]) {
+			if (!ring_buffer_iter_empty(iter->buffer_iter[cpu]))
+				return 0;
+		} else {
+			if (!ring_buffer_empty_cpu(iter->tr->buffer, cpu))
+				return 0;
+		}
+		return 1;
+	}
+
 	for_each_tracing_cpu(cpu) {
 		if (iter->buffer_iter[cpu]) {
 			if (!ring_buffer_iter_empty(iter->buffer_iter[cpu]))
-- 
cgit v1.2.3


From 554f786e284a6ce859d51f62240d615603944c8e Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 11 Mar 2009 22:00:13 -0400
Subject: ring-buffer: only allocate buffers for online cpus

Impact: save on memory

Currently, a ring buffer was allocated for each "possible_cpus". On
some systems, this is the same as NR_CPUS. Thus, if a system defined
NR_CPUS = 64 but it only had 1 CPU, we could have possibly 63 useless
ring buffers taking up space. With a default buffer of 3 megs, this
could be quite drastic.

This patch changes the ring buffer code to only allocate ring buffers
for online CPUs.  If a CPU goes off line, we do not free the buffer.
This is because the user may still have trace data in that buffer
that they would like to look at.

Perhaps in the future we could add code to delete a ring buffer if
the CPU is offline and the ring buffer becomes empty.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ring_buffer.c | 266 ++++++++++++++++++++++++++++++++++++---------
 kernel/trace/trace.c       |   6 -
 2 files changed, 216 insertions(+), 56 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 178858492a89..d07c2888396f 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -16,6 +16,7 @@
 #include <linux/init.h>
 #include <linux/hash.h>
 #include <linux/list.h>
+#include <linux/cpu.h>
 #include <linux/fs.h>
 
 #include "trace.h"
@@ -301,6 +302,10 @@ struct ring_buffer {
 	struct mutex			mutex;
 
 	struct ring_buffer_per_cpu	**buffers;
+
+#ifdef CONFIG_HOTPLUG
+	struct notifier_block		cpu_notify;
+#endif
 };
 
 struct ring_buffer_iter {
@@ -459,6 +464,11 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
  */
 extern int ring_buffer_page_too_big(void);
 
+#ifdef CONFIG_HOTPLUG
+static int __cpuinit rb_cpu_notify(struct notifier_block *self,
+				   unsigned long action, void *hcpu);
+#endif
+
 /**
  * ring_buffer_alloc - allocate a new ring_buffer
  * @size: the size in bytes per cpu that is needed.
@@ -496,7 +506,8 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
 	if (buffer->pages == 1)
 		buffer->pages++;
 
-	cpumask_copy(buffer->cpumask, cpu_possible_mask);
+	get_online_cpus();
+	cpumask_copy(buffer->cpumask, cpu_online_mask);
 	buffer->cpus = nr_cpu_ids;
 
 	bsize = sizeof(void *) * nr_cpu_ids;
@@ -512,6 +523,13 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
 			goto fail_free_buffers;
 	}
 
+#ifdef CONFIG_HOTPLUG
+	buffer->cpu_notify.notifier_call = rb_cpu_notify;
+	buffer->cpu_notify.priority = 0;
+	register_cpu_notifier(&buffer->cpu_notify);
+#endif
+
+	put_online_cpus();
 	mutex_init(&buffer->mutex);
 
 	return buffer;
@@ -525,6 +543,7 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
 
  fail_free_cpumask:
 	free_cpumask_var(buffer->cpumask);
+	put_online_cpus();
 
  fail_free_buffer:
 	kfree(buffer);
@@ -541,9 +560,17 @@ ring_buffer_free(struct ring_buffer *buffer)
 {
 	int cpu;
 
+	get_online_cpus();
+
+#ifdef CONFIG_HOTPLUG
+	unregister_cpu_notifier(&buffer->cpu_notify);
+#endif
+
 	for_each_buffer_cpu(buffer, cpu)
 		rb_free_cpu_buffer(buffer->buffers[cpu]);
 
+	put_online_cpus();
+
 	free_cpumask_var(buffer->cpumask);
 
 	kfree(buffer);
@@ -649,16 +676,15 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
 		return size;
 
 	mutex_lock(&buffer->mutex);
+	get_online_cpus();
 
 	nr_pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
 
 	if (size < buffer_size) {
 
 		/* easy case, just free pages */
-		if (RB_WARN_ON(buffer, nr_pages >= buffer->pages)) {
-			mutex_unlock(&buffer->mutex);
-			return -1;
-		}
+		if (RB_WARN_ON(buffer, nr_pages >= buffer->pages))
+			goto out_fail;
 
 		rm_pages = buffer->pages - nr_pages;
 
@@ -677,10 +703,8 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
 	 * add these pages to the cpu_buffers. Otherwise we just free
 	 * them all and return -ENOMEM;
 	 */
-	if (RB_WARN_ON(buffer, nr_pages <= buffer->pages)) {
-		mutex_unlock(&buffer->mutex);
-		return -1;
-	}
+	if (RB_WARN_ON(buffer, nr_pages <= buffer->pages))
+		goto out_fail;
 
 	new_pages = nr_pages - buffer->pages;
 
@@ -705,13 +729,12 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
 		rb_insert_pages(cpu_buffer, &pages, new_pages);
 	}
 
-	if (RB_WARN_ON(buffer, !list_empty(&pages))) {
-		mutex_unlock(&buffer->mutex);
-		return -1;
-	}
+	if (RB_WARN_ON(buffer, !list_empty(&pages)))
+		goto out_fail;
 
  out:
 	buffer->pages = nr_pages;
+	put_online_cpus();
 	mutex_unlock(&buffer->mutex);
 
 	return size;
@@ -721,8 +744,18 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
 		list_del_init(&bpage->list);
 		free_buffer_page(bpage);
 	}
+	put_online_cpus();
 	mutex_unlock(&buffer->mutex);
 	return -ENOMEM;
+
+	/*
+	 * Something went totally wrong, and we are too paranoid
+	 * to even clean up the mess.
+	 */
+ out_fail:
+	put_online_cpus();
+	mutex_unlock(&buffer->mutex);
+	return -1;
 }
 EXPORT_SYMBOL_GPL(ring_buffer_resize);
 
@@ -1528,11 +1561,15 @@ void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
 
+	get_online_cpus();
+
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
-		return;
+		goto out;
 
 	cpu_buffer = buffer->buffers[cpu];
 	atomic_inc(&cpu_buffer->record_disabled);
+ out:
+	put_online_cpus();
 }
 EXPORT_SYMBOL_GPL(ring_buffer_record_disable_cpu);
 
@@ -1548,11 +1585,15 @@ void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
 
+	get_online_cpus();
+
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
-		return;
+		goto out;
 
 	cpu_buffer = buffer->buffers[cpu];
 	atomic_dec(&cpu_buffer->record_disabled);
+ out:
+	put_online_cpus();
 }
 EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu);
 
@@ -1564,12 +1605,19 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu);
 unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
+	unsigned long ret = 0;
+
+	get_online_cpus();
 
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
-		return 0;
+		goto out;
 
 	cpu_buffer = buffer->buffers[cpu];
-	return cpu_buffer->entries;
+	ret = cpu_buffer->entries;
+ out:
+	put_online_cpus();
+
+	return ret;
 }
 EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu);
 
@@ -1581,12 +1629,19 @@ EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu);
 unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
+	unsigned long ret = 0;
+
+	get_online_cpus();
 
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
-		return 0;
+		goto out;
 
 	cpu_buffer = buffer->buffers[cpu];
-	return cpu_buffer->overrun;
+	ret = cpu_buffer->overrun;
+ out:
+	put_online_cpus();
+
+	return ret;
 }
 EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu);
 
@@ -1603,12 +1658,16 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer)
 	unsigned long entries = 0;
 	int cpu;
 
+	get_online_cpus();
+
 	/* if you care about this being correct, lock the buffer */
 	for_each_buffer_cpu(buffer, cpu) {
 		cpu_buffer = buffer->buffers[cpu];
 		entries += cpu_buffer->entries;
 	}
 
+	put_online_cpus();
+
 	return entries;
 }
 EXPORT_SYMBOL_GPL(ring_buffer_entries);
@@ -1626,12 +1685,16 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer)
 	unsigned long overruns = 0;
 	int cpu;
 
+	get_online_cpus();
+
 	/* if you care about this being correct, lock the buffer */
 	for_each_buffer_cpu(buffer, cpu) {
 		cpu_buffer = buffer->buffers[cpu];
 		overruns += cpu_buffer->overrun;
 	}
 
+	put_online_cpus();
+
 	return overruns;
 }
 EXPORT_SYMBOL_GPL(ring_buffer_overruns);
@@ -1663,9 +1726,14 @@ static void rb_iter_reset(struct ring_buffer_iter *iter)
  */
 void ring_buffer_iter_reset(struct ring_buffer_iter *iter)
 {
-	struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
+	struct ring_buffer_per_cpu *cpu_buffer;
 	unsigned long flags;
 
+	if (!iter)
+		return;
+
+	cpu_buffer = iter->cpu_buffer;
+
 	spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
 	rb_iter_reset(iter);
 	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
@@ -1900,9 +1968,6 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
 	struct buffer_page *reader;
 	int nr_loops = 0;
 
-	if (!cpumask_test_cpu(cpu, buffer->cpumask))
-		return NULL;
-
 	cpu_buffer = buffer->buffers[cpu];
 
  again:
@@ -2028,13 +2093,21 @@ struct ring_buffer_event *
 ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
 {
 	struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
-	struct ring_buffer_event *event;
+	struct ring_buffer_event *event = NULL;
 	unsigned long flags;
 
+	get_online_cpus();
+
+	if (!cpumask_test_cpu(cpu, buffer->cpumask))
+		goto out;
+
 	spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
 	event = rb_buffer_peek(buffer, cpu, ts);
 	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
+ out:
+	put_online_cpus();
+
 	return event;
 }
 
@@ -2071,24 +2144,31 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
 struct ring_buffer_event *
 ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
 {
-	struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
-	struct ring_buffer_event *event;
+	struct ring_buffer_per_cpu *cpu_buffer;
+	struct ring_buffer_event *event = NULL;
 	unsigned long flags;
 
+	/* might be called in atomic */
+	preempt_disable();
+
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
-		return NULL;
+		goto out;
 
+	cpu_buffer = buffer->buffers[cpu];
 	spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
 
 	event = rb_buffer_peek(buffer, cpu, ts);
 	if (!event)
-		goto out;
+		goto out_unlock;
 
 	rb_advance_reader(cpu_buffer);
 
- out:
+ out_unlock:
 	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
+ out:
+	preempt_enable();
+
 	return event;
 }
 EXPORT_SYMBOL_GPL(ring_buffer_consume);
@@ -2109,15 +2189,17 @@ struct ring_buffer_iter *
 ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
-	struct ring_buffer_iter *iter;
+	struct ring_buffer_iter *iter = NULL;
 	unsigned long flags;
 
+	get_online_cpus();
+
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
-		return NULL;
+		goto out;
 
 	iter = kmalloc(sizeof(*iter), GFP_KERNEL);
 	if (!iter)
-		return NULL;
+		goto out;
 
 	cpu_buffer = buffer->buffers[cpu];
 
@@ -2132,6 +2214,9 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
 	__raw_spin_unlock(&cpu_buffer->lock);
 	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
+ out:
+	put_online_cpus();
+
 	return iter;
 }
 EXPORT_SYMBOL_GPL(ring_buffer_read_start);
@@ -2224,9 +2309,13 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
 {
 	struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
 	unsigned long flags;
+	int resched;
+
+	/* Can't use get_online_cpus because this can be in atomic */
+	resched = ftrace_preempt_disable();
 
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
-		return;
+		goto out;
 
 	spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
 
@@ -2237,6 +2326,8 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
 	__raw_spin_unlock(&cpu_buffer->lock);
 
 	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
+ out:
+	ftrace_preempt_enable(resched);
 }
 EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
 
@@ -2246,10 +2337,16 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
  */
 void ring_buffer_reset(struct ring_buffer *buffer)
 {
+	int resched;
 	int cpu;
 
+	/* Can't use get_online_cpus because this can be in atomic */
+	resched = ftrace_preempt_disable();
+
 	for_each_buffer_cpu(buffer, cpu)
 		ring_buffer_reset_cpu(buffer, cpu);
+
+	ftrace_preempt_enable(resched);
 }
 EXPORT_SYMBOL_GPL(ring_buffer_reset);
 
@@ -2262,12 +2359,17 @@ int ring_buffer_empty(struct ring_buffer *buffer)
 	struct ring_buffer_per_cpu *cpu_buffer;
 	int cpu;
 
+	get_online_cpus();
+
 	/* yes this is racy, but if you don't like the race, lock the buffer */
 	for_each_buffer_cpu(buffer, cpu) {
 		cpu_buffer = buffer->buffers[cpu];
 		if (!rb_per_cpu_empty(cpu_buffer))
 			return 0;
 	}
+
+	put_online_cpus();
+
 	return 1;
 }
 EXPORT_SYMBOL_GPL(ring_buffer_empty);
@@ -2280,12 +2382,20 @@ EXPORT_SYMBOL_GPL(ring_buffer_empty);
 int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
+	int ret = 1;
+
+	get_online_cpus();
 
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
-		return 1;
+		goto out;
 
 	cpu_buffer = buffer->buffers[cpu];
-	return rb_per_cpu_empty(cpu_buffer);
+	ret = rb_per_cpu_empty(cpu_buffer);
+
+ out:
+	put_online_cpus();
+
+	return ret;
 }
 EXPORT_SYMBOL_GPL(ring_buffer_empty_cpu);
 
@@ -2304,32 +2414,37 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
 {
 	struct ring_buffer_per_cpu *cpu_buffer_a;
 	struct ring_buffer_per_cpu *cpu_buffer_b;
+	int ret = -EINVAL;
+
+	get_online_cpus();
 
 	if (!cpumask_test_cpu(cpu, buffer_a->cpumask) ||
 	    !cpumask_test_cpu(cpu, buffer_b->cpumask))
-		return -EINVAL;
+		goto out;
 
 	/* At least make sure the two buffers are somewhat the same */
 	if (buffer_a->pages != buffer_b->pages)
-		return -EINVAL;
+		goto out;
+
+	ret = -EAGAIN;
 
 	if (ring_buffer_flags != RB_BUFFERS_ON)
-		return -EAGAIN;
+		goto out;
 
 	if (atomic_read(&buffer_a->record_disabled))
-		return -EAGAIN;
+		goto out;
 
 	if (atomic_read(&buffer_b->record_disabled))
-		return -EAGAIN;
+		goto out;
 
 	cpu_buffer_a = buffer_a->buffers[cpu];
 	cpu_buffer_b = buffer_b->buffers[cpu];
 
 	if (atomic_read(&cpu_buffer_a->record_disabled))
-		return -EAGAIN;
+		goto out;
 
 	if (atomic_read(&cpu_buffer_b->record_disabled))
-		return -EAGAIN;
+		goto out;
 
 	/*
 	 * We can't do a synchronize_sched here because this
@@ -2349,7 +2464,11 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
 	atomic_dec(&cpu_buffer_a->record_disabled);
 	atomic_dec(&cpu_buffer_b->record_disabled);
 
-	return 0;
+	ret = 0;
+out:
+	put_online_cpus();
+
+	return ret;
 }
 EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
 
@@ -2464,27 +2583,32 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 	u64 save_timestamp;
 	int ret = -1;
 
+	get_online_cpus();
+
+	if (!cpumask_test_cpu(cpu, buffer->cpumask))
+		goto out;
+
 	/*
 	 * If len is not big enough to hold the page header, then
 	 * we can not copy anything.
 	 */
 	if (len <= BUF_PAGE_HDR_SIZE)
-		return -1;
+		goto out;
 
 	len -= BUF_PAGE_HDR_SIZE;
 
 	if (!data_page)
-		return -1;
+		goto out;
 
 	bpage = *data_page;
 	if (!bpage)
-		return -1;
+		goto out;
 
 	spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
 
 	reader = rb_get_reader_page(cpu_buffer);
 	if (!reader)
-		goto out;
+		goto out_unlock;
 
 	event = rb_reader_event(cpu_buffer);
 
@@ -2506,7 +2630,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 		unsigned int size;
 
 		if (full)
-			goto out;
+			goto out_unlock;
 
 		if (len > (commit - read))
 			len = (commit - read);
@@ -2514,7 +2638,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 		size = rb_event_length(event);
 
 		if (len < size)
-			goto out;
+			goto out_unlock;
 
 		/* save the current timestamp, since the user will need it */
 		save_timestamp = cpu_buffer->read_stamp;
@@ -2553,9 +2677,12 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 	}
 	ret = read;
 
- out:
+ out_unlock:
 	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
+ out:
+	put_online_cpus();
+
 	return ret;
 }
 
@@ -2629,3 +2756,42 @@ static __init int rb_init_debugfs(void)
 }
 
 fs_initcall(rb_init_debugfs);
+
+#ifdef CONFIG_HOTPLUG
+static int __cpuinit rb_cpu_notify(struct notifier_block *self,
+				   unsigned long action, void *hcpu)
+{
+	struct ring_buffer *buffer =
+		container_of(self, struct ring_buffer, cpu_notify);
+	long cpu = (long)hcpu;
+
+	switch (action) {
+	case CPU_UP_PREPARE:
+	case CPU_UP_PREPARE_FROZEN:
+		if (cpu_isset(cpu, *buffer->cpumask))
+			return NOTIFY_OK;
+
+		buffer->buffers[cpu] =
+			rb_allocate_cpu_buffer(buffer, cpu);
+		if (!buffer->buffers[cpu]) {
+			WARN(1, "failed to allocate ring buffer on CPU %ld\n",
+			     cpu);
+			return NOTIFY_OK;
+		}
+		smp_wmb();
+		cpu_set(cpu, *buffer->cpumask);
+		break;
+	case CPU_DOWN_PREPARE:
+	case CPU_DOWN_PREPARE_FROZEN:
+		/*
+		 * Do nothing.
+		 *  If we were to free the buffer, then the user would
+		 *  lose any trace that was in the buffer.
+		 */
+		break;
+	default:
+		break;
+	}
+	return NOTIFY_OK;
+}
+#endif
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index e60f4be10d64..14c98f6a47bc 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1805,17 +1805,11 @@ __tracing_open(struct inode *inode, struct file *file)
 
 			iter->buffer_iter[cpu] =
 				ring_buffer_read_start(iter->tr->buffer, cpu);
-
-			if (!iter->buffer_iter[cpu])
-				goto fail_buffer;
 		}
 	} else {
 		cpu = iter->cpu_file;
 		iter->buffer_iter[cpu] =
 				ring_buffer_read_start(iter->tr->buffer, cpu);
-
-		if (!iter->buffer_iter[cpu])
-			goto fail;
 	}
 
 	/* TODO stop tracer */
-- 
cgit v1.2.3


From b2d0994b1301fc3a6a89e1889578dac9227840e3 Mon Sep 17 00:00:00 2001
From: Darren Hart <dvhltc@us.ibm.com>
Date: Thu, 12 Mar 2009 00:55:37 -0700
Subject: futex: update futex commentary

Impact: cleanup

The futex_hash_bucket can be a bit confusing when first looking
at the code as it is a shared queue (and futex_q isn't a queue
at all, but rather an element on the queue).

The mmap_sem is no longer held outside of the
futex_handle_fault() routine, yet numerous comments refer to it.
The fshared argument is no an integer.  I left some of these
comments along as they are simply removed in future patches.

Some of the commentary refering to futexes by virtual page
mappings was not very clear, and completely accurate (as for
shared futexes both the page and the offset are used to
determine the key).  For the purposes of the function
description, just referring to "the futex" seems sufficient.

With hashed futexes we now access the page after the hash-bucket
is locked, and not only after it is enqueued.

Signed-off-by: Darren Hart <dvhltc@us.ibm.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
LKML-Reference: <20090312075537.9856.29954.stgit@Aeon>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/futex.c | 33 ++++++++++++++-------------------
 1 file changed, 14 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index 438701adce23..e6a4d72bca3d 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -114,7 +114,9 @@ struct futex_q {
 };
 
 /*
- * Split the global futex_lock into every hash list lock.
+ * Hash buckets are shared by all the futex_keys that hash to the same
+ * location.  Each key may have multiple futex_q structures, one for each task
+ * waiting on a futex.
  */
 struct futex_hash_bucket {
 	spinlock_t lock;
@@ -189,8 +191,7 @@ static void drop_futex_key_refs(union futex_key *key)
 /**
  * get_futex_key - Get parameters which are the keys for a futex.
  * @uaddr: virtual address of the futex
- * @shared: NULL for a PROCESS_PRIVATE futex,
- *	&current->mm->mmap_sem for a PROCESS_SHARED futex
+ * @fshared: 0 for a PROCESS_PRIVATE futex, 1 for PROCESS_SHARED
  * @key: address where result is stored.
  *
  * Returns a negative error code or 0
@@ -200,9 +201,7 @@ static void drop_futex_key_refs(union futex_key *key)
  * offset_within_page).  For private mappings, it's (uaddr, current->mm).
  * We can usually work out the index without swapping in the page.
  *
- * fshared is NULL for PROCESS_PRIVATE futexes
- * For other futexes, it points to &current->mm->mmap_sem and
- * caller must have taken the reader lock. but NOT any spinlocks.
+ * lock_page() might sleep, the caller should not hold a spinlock.
  */
 static int get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key)
 {
@@ -589,10 +588,9 @@ static void wake_futex(struct futex_q *q)
 	 * The waiting task can free the futex_q as soon as this is written,
 	 * without taking any locks.  This must come last.
 	 *
-	 * A memory barrier is required here to prevent the following store
-	 * to lock_ptr from getting ahead of the wakeup. Clearing the lock
-	 * at the end of wake_up_all() does not prevent this store from
-	 * moving.
+	 * A memory barrier is required here to prevent the following store to
+	 * lock_ptr from getting ahead of the wakeup. Clearing the lock at the
+	 * end of wake_up() does not prevent this store from moving.
 	 */
 	smp_wmb();
 	q->lock_ptr = NULL;
@@ -693,8 +691,7 @@ double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
 }
 
 /*
- * Wake up all waiters hashed on the physical page that is mapped
- * to this virtual address:
+ * Wake up waiters matching bitset queued on this futex (uaddr).
  */
 static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset)
 {
@@ -1076,11 +1073,9 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
 	 * in the user space variable. This must be atomic as we have
 	 * to preserve the owner died bit here.
 	 *
-	 * Note: We write the user space value _before_ changing the
-	 * pi_state because we can fault here. Imagine swapped out
-	 * pages or a fork, which was running right before we acquired
-	 * mmap_sem, that marked all the anonymous memory readonly for
-	 * cow.
+	 * Note: We write the user space value _before_ changing the pi_state
+	 * because we can fault here. Imagine swapped out pages or a fork
+	 * that marked all the anonymous memory readonly for cow.
 	 *
 	 * Modifying pi_state _before_ the user space value would
 	 * leave the pi_state in an inconsistent state when we fault
@@ -1188,7 +1183,7 @@ retry:
 	hb = queue_lock(&q);
 
 	/*
-	 * Access the page AFTER the futex is queued.
+	 * Access the page AFTER the hash-bucket is locked.
 	 * Order is important:
 	 *
 	 *   Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val);
@@ -1204,7 +1199,7 @@ retry:
 	 * a wakeup when *uaddr != val on entry to the syscall.  This is
 	 * rare, but normal.
 	 *
-	 * for shared futexes, we hold the mmap semaphore, so the mapping
+	 * For shared futexes, we hold the mmap semaphore, so the mapping
 	 * cannot have changed since we looked it up in get_futex_key.
 	 */
 	ret = get_futex_value_locked(&uval, uaddr);
-- 
cgit v1.2.3


From de87fcc124a5d4a171aa32707b3265608ebda6e7 Mon Sep 17 00:00:00 2001
From: Darren Hart <dvhltc@us.ibm.com>
Date: Thu, 12 Mar 2009 00:55:46 -0700
Subject: futex: additional (get|put)_futex_key() fixes

Impact: fix races

futex_requeue and futex_lock_pi still had some bad
(get|put)_futex_key() usage. This patch adds the missing
put_futex_keys() and corrects a goto in futex_lock_pi() to avoid
a double get.

Build and boot tested on a 4 way Intel x86_64 workstation.
Passes basic pthread_mutex and PI tests out of
ltp/testcases/realtime.

Signed-off-by: Darren Hart <dvhltc@us.ibm.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
LKML-Reference: <20090312075545.9856.75152.stgit@Aeon>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/futex.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index e6a4d72bca3d..4000454e4d83 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -802,8 +802,10 @@ retry:
 
 		ret = get_user(dummy, uaddr2);
 		if (ret)
-			return ret;
+			goto out_put_keys;
 
+		put_futex_key(fshared, &key2);
+		put_futex_key(fshared, &key1);
 		goto retryfull;
 	}
 
@@ -878,6 +880,9 @@ retry:
 			if (hb1 != hb2)
 				spin_unlock(&hb2->lock);
 
+			put_futex_key(fshared, &key2);
+			put_futex_key(fshared, &key1);
+
 			ret = get_user(curval, uaddr1);
 
 			if (!ret)
@@ -1453,6 +1458,7 @@ retry_locked:
 			 * exit to complete.
 			 */
 			queue_unlock(&q, hb);
+			put_futex_key(fshared, &q.key);
 			cond_resched();
 			goto retry;
 
@@ -1595,13 +1601,12 @@ uaddr_faulted:
 
 	ret = get_user(uval, uaddr);
 	if (!ret)
-		goto retry;
+		goto retry_unlocked;
 
-	if (to)
-		destroy_hrtimer_on_stack(&to->timer);
-	return ret;
+	goto out_put_key;
 }
 
+
 /*
  * Userspace attempted a TID -> 0 atomic transition, and failed.
  * This is the in-kernel slowpath: we look up the PI state (if any),
@@ -1705,6 +1710,7 @@ pi_faulted:
 	}
 
 	ret = get_user(uval, uaddr);
+	put_futex_key(fshared, &key);
 	if (!ret)
 		goto retry;
 
-- 
cgit v1.2.3


From 5eb3dc62fc5986e85715041c23dcf3832812be4b Mon Sep 17 00:00:00 2001
From: Darren Hart <dvhltc@us.ibm.com>
Date: Thu, 12 Mar 2009 00:55:52 -0700
Subject: futex: add double_unlock_hb()

Impact: cleanup

The futex code uses double_lock_hb() which locks the hb->lock's
in pointer value order.  There is no parallel unlock routine,
and the code unlocks them in name order, ignoring pointer value.

This patch adds double_unlock_hb() to refactor the duplicated
code segments.

Build and boot tested on a 4 way Intel x86_64 workstation.
Passes basic pthread_mutex and PI tests out of
ltp/testcases/realtime.

Signed-off-by: Darren Hart <dvhltc@us.ibm.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
LKML-Reference: <20090312075552.9856.48021.stgit@Aeon>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/futex.c | 29 +++++++++++++++++------------
 1 file changed, 17 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index 4000454e4d83..e149545c5cea 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -690,6 +690,19 @@ double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
 	}
 }
 
+static inline void
+double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
+{
+	if (hb1 <= hb2) {
+		spin_unlock(&hb2->lock);
+		if (hb1 < hb2)
+			spin_unlock(&hb1->lock);
+	} else { /* hb1 > hb2 */
+		spin_unlock(&hb1->lock);
+		spin_unlock(&hb2->lock);
+	}
+}
+
 /*
  * Wake up waiters matching bitset queued on this futex (uaddr).
  */
@@ -767,9 +780,7 @@ retry:
 	if (unlikely(op_ret < 0)) {
 		u32 dummy;
 
-		spin_unlock(&hb1->lock);
-		if (hb1 != hb2)
-			spin_unlock(&hb2->lock);
+		double_unlock_hb(hb1, hb2);
 
 #ifndef CONFIG_MMU
 		/*
@@ -833,9 +844,7 @@ retry:
 		ret += op_ret;
 	}
 
-	spin_unlock(&hb1->lock);
-	if (hb1 != hb2)
-		spin_unlock(&hb2->lock);
+	double_unlock_hb(hb1, hb2);
 out_put_keys:
 	put_futex_key(fshared, &key2);
 out_put_key1:
@@ -876,9 +885,7 @@ retry:
 		ret = get_futex_value_locked(&curval, uaddr1);
 
 		if (unlikely(ret)) {
-			spin_unlock(&hb1->lock);
-			if (hb1 != hb2)
-				spin_unlock(&hb2->lock);
+			double_unlock_hb(hb1, hb2);
 
 			put_futex_key(fshared, &key2);
 			put_futex_key(fshared, &key1);
@@ -925,9 +932,7 @@ retry:
 	}
 
 out_unlock:
-	spin_unlock(&hb1->lock);
-	if (hb1 != hb2)
-		spin_unlock(&hb2->lock);
+	double_unlock_hb(hb1, hb2);
 
 	/* drop_futex_key_refs() must be called outside the spinlocks. */
 	while (--drop_count >= 0)
-- 
cgit v1.2.3


From 16f4993f4e9860715918efd4eeac928f8de1218b Mon Sep 17 00:00:00 2001
From: Darren Hart <dvhltc@us.ibm.com>
Date: Thu, 12 Mar 2009 00:55:59 -0700
Subject: futex: use current->time_slack_ns for rt tasks too

RT tasks should set their timer slack to 0 on their own.  This
patch removes the 'if (rt_task()) slack = 0;' block in
futex_wait.

Build and boot tested on a 4 way Intel x86_64 workstation.
Passes basic pthread_mutex and PI tests out of
ltp/testcases/realtime.

Signed-off-by: Darren Hart <dvhltc@us.ibm.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Arjan van de Ven <arjan@linux.intel.com>
LKML-Reference: <20090312075559.9856.28822.stgit@Aeon>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/futex.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index e149545c5cea..6579912ee70c 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1253,16 +1253,13 @@ retry:
 		if (!abs_time)
 			schedule();
 		else {
-			unsigned long slack;
-			slack = current->timer_slack_ns;
-			if (rt_task(current))
-				slack = 0;
 			hrtimer_init_on_stack(&t.timer,
 					      clockrt ? CLOCK_REALTIME :
 					      CLOCK_MONOTONIC,
 					      HRTIMER_MODE_ABS);
 			hrtimer_init_sleeper(&t, current);
-			hrtimer_set_expires_range_ns(&t.timer, *abs_time, slack);
+			hrtimer_set_expires_range_ns(&t.timer, *abs_time,
+						     current->timer_slack_ns);
 
 			hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS);
 			if (!hrtimer_active(&t.timer))
-- 
cgit v1.2.3


From e8f6386c01a5699c115bdad10271a24076364c97 Mon Sep 17 00:00:00 2001
From: Darren Hart <dvhltc@us.ibm.com>
Date: Thu, 12 Mar 2009 00:56:06 -0700
Subject: futex: unlock before returning -EFAULT

Impact: rt-mutex failure case fix

futex_lock_pi can potentially return -EFAULT with the rt_mutex
held.  This seems like the wrong thing to do as userspace should
assume -EFAULT means the lock was not taken.  Even if it could
figure this out, we'd be leaving the pi_state->owner in an
inconsistent state.  This patch unlocks the rt_mutex prior to
returning -EFAULT to userspace.

Build and boot tested on a 4 way Intel x86_64 workstation.
Passes basic pthread_mutex and PI tests out of
ltp/testcases/realtime.

Signed-off-by: Darren Hart <dvhltc@us.ibm.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
LKML-Reference: <20090312075606.9856.88729.stgit@Aeon>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/futex.c | 7 +++++++
 1 file changed, 7 insertions(+)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index 6579912ee70c..c980a556f82c 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -1567,6 +1567,13 @@ retry_locked:
 		}
 	}
 
+	/*
+	 * If fixup_pi_state_owner() faulted and was unable to handle the
+	 * fault, unlock it and return the fault to userspace.
+	 */
+	if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current))
+		rt_mutex_unlock(&q.pi_state->pi_mutex);
+
 	/* Unqueue and drop the lock */
 	unqueue_me_pi(&q);
 
-- 
cgit v1.2.3


From e4dc5b7a36a49eff97050894cf1b3a9a02523717 Mon Sep 17 00:00:00 2001
From: Darren Hart <dvhltc@us.ibm.com>
Date: Thu, 12 Mar 2009 00:56:13 -0700
Subject: futex: clean up fault logic

Impact: cleanup

Older versions of the futex code held the mmap_sem which had to
be dropped in order to call get_user(), so a two-pronged fault
handling mechanism was employed to handle faults of the atomic
operations.  The mmap_sem is no longer held, so get_user()
should be adequate.  This patch greatly simplifies the logic and
improves legibility.

Build and boot tested on a 4 way Intel x86_64 workstation.
Passes basic pthread_mutex and PI tests out of
ltp/testcases/realtime.

Signed-off-by: Darren Hart <dvhltc@us.ibm.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
LKML-Reference: <20090312075612.9856.48612.stgit@Aeon>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/futex.c | 126 +++++++++++++++++----------------------------------------
 1 file changed, 36 insertions(+), 90 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index c980a556f82c..9c97f67d298e 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -298,41 +298,6 @@ static int get_futex_value_locked(u32 *dest, u32 __user *from)
 	return ret ? -EFAULT : 0;
 }
 
-/*
- * Fault handling.
- */
-static int futex_handle_fault(unsigned long address, int attempt)
-{
-	struct vm_area_struct * vma;
-	struct mm_struct *mm = current->mm;
-	int ret = -EFAULT;
-
-	if (attempt > 2)
-		return ret;
-
-	down_read(&mm->mmap_sem);
-	vma = find_vma(mm, address);
-	if (vma && address >= vma->vm_start &&
-	    (vma->vm_flags & VM_WRITE)) {
-		int fault;
-		fault = handle_mm_fault(mm, vma, address, 1);
-		if (unlikely((fault & VM_FAULT_ERROR))) {
-#if 0
-			/* XXX: let's do this when we verify it is OK */
-			if (ret & VM_FAULT_OOM)
-				ret = -ENOMEM;
-#endif
-		} else {
-			ret = 0;
-			if (fault & VM_FAULT_MAJOR)
-				current->maj_flt++;
-			else
-				current->min_flt++;
-		}
-	}
-	up_read(&mm->mmap_sem);
-	return ret;
-}
 
 /*
  * PI code:
@@ -760,9 +725,9 @@ futex_wake_op(u32 __user *uaddr1, int fshared, u32 __user *uaddr2,
 	struct futex_hash_bucket *hb1, *hb2;
 	struct plist_head *head;
 	struct futex_q *this, *next;
-	int ret, op_ret, attempt = 0;
+	int ret, op_ret;
 
-retryfull:
+retry:
 	ret = get_futex_key(uaddr1, fshared, &key1);
 	if (unlikely(ret != 0))
 		goto out;
@@ -773,9 +738,8 @@ retryfull:
 	hb1 = hash_futex(&key1);
 	hb2 = hash_futex(&key2);
 
-retry:
 	double_lock_hb(hb1, hb2);
-
+retry_private:
 	op_ret = futex_atomic_op_inuser(op, uaddr2);
 	if (unlikely(op_ret < 0)) {
 		u32 dummy;
@@ -796,28 +760,16 @@ retry:
 			goto out_put_keys;
 		}
 
-		/*
-		 * futex_atomic_op_inuser needs to both read and write
-		 * *(int __user *)uaddr2, but we can't modify it
-		 * non-atomically.  Therefore, if get_user below is not
-		 * enough, we need to handle the fault ourselves, while
-		 * still holding the mmap_sem.
-		 */
-		if (attempt++) {
-			ret = futex_handle_fault((unsigned long)uaddr2,
-						 attempt);
-			if (ret)
-				goto out_put_keys;
-			goto retry;
-		}
-
 		ret = get_user(dummy, uaddr2);
 		if (ret)
 			goto out_put_keys;
 
+		if (!fshared)
+			goto retry_private;
+
 		put_futex_key(fshared, &key2);
 		put_futex_key(fshared, &key1);
-		goto retryfull;
+		goto retry;
 	}
 
 	head = &hb1->chain;
@@ -877,6 +829,7 @@ retry:
 	hb1 = hash_futex(&key1);
 	hb2 = hash_futex(&key2);
 
+retry_private:
 	double_lock_hb(hb1, hb2);
 
 	if (likely(cmpval != NULL)) {
@@ -887,15 +840,16 @@ retry:
 		if (unlikely(ret)) {
 			double_unlock_hb(hb1, hb2);
 
-			put_futex_key(fshared, &key2);
-			put_futex_key(fshared, &key1);
-
 			ret = get_user(curval, uaddr1);
+			if (ret)
+				goto out_put_keys;
 
-			if (!ret)
-				goto retry;
+			if (!fshared)
+				goto retry_private;
 
-			goto out_put_keys;
+			put_futex_key(fshared, &key2);
+			put_futex_key(fshared, &key1);
+			goto retry;
 		}
 		if (curval != *cmpval) {
 			ret = -EAGAIN;
@@ -1070,7 +1024,7 @@ static int fixup_pi_state_owner(u32 __user *uaddr, struct futex_q *q,
 	struct futex_pi_state *pi_state = q->pi_state;
 	struct task_struct *oldowner = pi_state->owner;
 	u32 uval, curval, newval;
-	int ret, attempt = 0;
+	int ret;
 
 	/* Owner died? */
 	if (!pi_state->owner)
@@ -1141,7 +1095,7 @@ retry:
 handle_fault:
 	spin_unlock(q->lock_ptr);
 
-	ret = futex_handle_fault((unsigned long)uaddr, attempt++);
+	ret = get_user(uval, uaddr);
 
 	spin_lock(q->lock_ptr);
 
@@ -1190,6 +1144,7 @@ retry:
 	if (unlikely(ret != 0))
 		goto out;
 
+retry_private:
 	hb = queue_lock(&q);
 
 	/*
@@ -1216,13 +1171,16 @@ retry:
 
 	if (unlikely(ret)) {
 		queue_unlock(&q, hb);
-		put_futex_key(fshared, &q.key);
 
 		ret = get_user(uval, uaddr);
+		if (ret)
+			goto out_put_key;
 
-		if (!ret)
-			goto retry;
-		goto out;
+		if (!fshared)
+			goto retry_private;
+
+		put_futex_key(fshared, &q.key);
+		goto retry;
 	}
 	ret = -EWOULDBLOCK;
 	if (unlikely(uval != val)) {
@@ -1356,7 +1314,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared,
 	struct futex_hash_bucket *hb;
 	u32 uval, newval, curval;
 	struct futex_q q;
-	int ret, lock_taken, ownerdied = 0, attempt = 0;
+	int ret, lock_taken, ownerdied = 0;
 
 	if (refill_pi_state_cache())
 		return -ENOMEM;
@@ -1376,7 +1334,7 @@ retry:
 	if (unlikely(ret != 0))
 		goto out;
 
-retry_unlocked:
+retry_private:
 	hb = queue_lock(&q);
 
 retry_locked:
@@ -1601,18 +1559,15 @@ uaddr_faulted:
 	 */
 	queue_unlock(&q, hb);
 
-	if (attempt++) {
-		ret = futex_handle_fault((unsigned long)uaddr, attempt);
-		if (ret)
-			goto out_put_key;
-		goto retry_unlocked;
-	}
-
 	ret = get_user(uval, uaddr);
-	if (!ret)
-		goto retry_unlocked;
+	if (ret)
+		goto out_put_key;
 
-	goto out_put_key;
+	if (!fshared)
+		goto retry_private;
+
+	put_futex_key(fshared, &q.key);
+	goto retry;
 }
 
 
@@ -1628,7 +1583,7 @@ static int futex_unlock_pi(u32 __user *uaddr, int fshared)
 	u32 uval;
 	struct plist_head *head;
 	union futex_key key = FUTEX_KEY_INIT;
-	int ret, attempt = 0;
+	int ret;
 
 retry:
 	if (get_user(uval, uaddr))
@@ -1644,7 +1599,6 @@ retry:
 		goto out;
 
 	hb = hash_futex(&key);
-retry_unlocked:
 	spin_lock(&hb->lock);
 
 	/*
@@ -1709,17 +1663,9 @@ pi_faulted:
 	 * we have to drop the mmap_sem in order to call get_user().
 	 */
 	spin_unlock(&hb->lock);
-
-	if (attempt++) {
-		ret = futex_handle_fault((unsigned long)uaddr, attempt);
-		if (ret)
-			goto out;
-		uval = 0;
-		goto retry_unlocked;
-	}
+	put_futex_key(fshared, &key);
 
 	ret = get_user(uval, uaddr);
-	put_futex_key(fshared, &key);
 	if (!ret)
 		goto retry;
 
-- 
cgit v1.2.3


From f21cfb258df6dd3ea0b3e56d75c7e994edb81b35 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Thu, 12 Mar 2009 21:05:42 +0900
Subject: irq: add remove_irq() for freeing of setup_irq() irqs

Impact: add new API

This patch adds a remove_irq() function for releasing
interrupts requested with setup_irq().

Without this patch we have no way of releasing such
interrupts since free_irq() today tries to kfree()
the irqaction passed with setup_irq().

Signed-off-by: Magnus Damm <damm@igel.co.jp>
LKML-Reference: <20090312120542.2926.56609.sendpatchset@rx1.opensource.se>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/irq.h |  1 +
 kernel/irq/manage.c | 39 ++++++++++++++++++++++++++-------------
 2 files changed, 27 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index f899b502f186..56f9988362ec 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -236,6 +236,7 @@ typedef struct irq_desc		irq_desc_t;
 #include <asm/hw_irq.h>
 
 extern int setup_irq(unsigned int irq, struct irqaction *new);
+extern struct irqaction *remove_irq(unsigned int irq, void *dev_id);
 
 #ifdef CONFIG_GENERIC_HARDIRQS
 
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 52ee17135092..8b069a7046e9 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -551,20 +551,14 @@ int setup_irq(unsigned int irq, struct irqaction *act)
 }
 
 /**
- *	free_irq - free an interrupt
+ *	remove_irq - free an interrupt
  *	@irq: Interrupt line to free
  *	@dev_id: Device identity to free
  *
- *	Remove an interrupt handler. The handler is removed and if the
- *	interrupt line is no longer in use by any driver it is disabled.
- *	On a shared IRQ the caller must ensure the interrupt is disabled
- *	on the card it drives before calling this function. The function
- *	does not return until any executing interrupts for this IRQ
- *	have completed.
- *
- *	This function must not be called from interrupt context.
+ * Used to remove interrupts statically setup by the early boot process.
  */
-void free_irq(unsigned int irq, void *dev_id)
+
+struct irqaction *remove_irq(unsigned int irq, void *dev_id)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 	struct irqaction *action, **action_ptr;
@@ -573,7 +567,7 @@ void free_irq(unsigned int irq, void *dev_id)
 	WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq);
 
 	if (!desc)
-		return;
+		return NULL;
 
 	spin_lock_irqsave(&desc->lock, flags);
 
@@ -589,7 +583,7 @@ void free_irq(unsigned int irq, void *dev_id)
 			WARN(1, "Trying to free already-free IRQ %d\n", irq);
 			spin_unlock_irqrestore(&desc->lock, flags);
 
-			return;
+			return NULL;
 		}
 
 		if (action->dev_id == dev_id)
@@ -636,7 +630,26 @@ void free_irq(unsigned int irq, void *dev_id)
 		local_irq_restore(flags);
 	}
 #endif
-	kfree(action);
+	return action;
+}
+
+/**
+ *	free_irq - free an interrupt allocated with request_irq
+ *	@irq: Interrupt line to free
+ *	@dev_id: Device identity to free
+ *
+ *	Remove an interrupt handler. The handler is removed and if the
+ *	interrupt line is no longer in use by any driver it is disabled.
+ *	On a shared IRQ the caller must ensure the interrupt is disabled
+ *	on the card it drives before calling this function. The function
+ *	does not return until any executing interrupts for this IRQ
+ *	have completed.
+ *
+ *	This function must not be called from interrupt context.
+ */
+void free_irq(unsigned int irq, void *dev_id)
+{
+	kfree(remove_irq(irq, dev_id));
 }
 EXPORT_SYMBOL(free_irq);
 
-- 
cgit v1.2.3


From cbf94f06824780183e4bba165c7c29d5c7bd9a51 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Thu, 12 Mar 2009 21:05:51 +0900
Subject: irq: match remove_irq() args with setup_irq()

Modify remove_irq() to match setup_irq().

Signed-off-by: Magnus Damm <damm@igel.co.jp>
LKML-Reference: <20090312120551.2926.43942.sendpatchset@rx1.opensource.se>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/irq.h |  2 +-
 kernel/irq/manage.c | 26 +++++++++++++++++---------
 2 files changed, 18 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/irq.h b/include/linux/irq.h
index 56f9988362ec..737eafbc1f3d 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -236,7 +236,7 @@ typedef struct irq_desc		irq_desc_t;
 #include <asm/hw_irq.h>
 
 extern int setup_irq(unsigned int irq, struct irqaction *new);
-extern struct irqaction *remove_irq(unsigned int irq, void *dev_id);
+extern void remove_irq(unsigned int irq, struct irqaction *act);
 
 #ifdef CONFIG_GENERIC_HARDIRQS
 
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 8b069a7046e9..fc16570c9b46 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -550,15 +550,11 @@ int setup_irq(unsigned int irq, struct irqaction *act)
 	return __setup_irq(irq, desc, act);
 }
 
-/**
- *	remove_irq - free an interrupt
- *	@irq: Interrupt line to free
- *	@dev_id: Device identity to free
- *
- * Used to remove interrupts statically setup by the early boot process.
+ /*
+ * Internal function to unregister an irqaction - used to free
+ * regular and special interrupts that are part of the architecture.
  */
-
-struct irqaction *remove_irq(unsigned int irq, void *dev_id)
+static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 	struct irqaction *action, **action_ptr;
@@ -633,6 +629,18 @@ struct irqaction *remove_irq(unsigned int irq, void *dev_id)
 	return action;
 }
 
+/**
+ *	remove_irq - free an interrupt
+ *	@irq: Interrupt line to free
+ *	@act: irqaction for the interrupt
+ *
+ * Used to remove interrupts statically setup by the early boot process.
+ */
+void remove_irq(unsigned int irq, struct irqaction *act)
+{
+	__free_irq(irq, act->dev_id);
+}
+
 /**
  *	free_irq - free an interrupt allocated with request_irq
  *	@irq: Interrupt line to free
@@ -649,7 +657,7 @@ struct irqaction *remove_irq(unsigned int irq, void *dev_id)
  */
 void free_irq(unsigned int irq, void *dev_id)
 {
-	kfree(remove_irq(irq, dev_id));
+	kfree(__free_irq(irq, dev_id));
 }
 EXPORT_SYMBOL(free_irq);
 
-- 
cgit v1.2.3


From eb53b4e8fef10ccccb49a6dbb5e19ca84ba5a305 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Thu, 12 Mar 2009 21:05:59 +0900
Subject: irq: export remove_irq() and setup_irq() symbols

Export the setup_irq() and remove_irq() symbols.

I'd like to export these functions since I have timer
code that needs to use setup_irq() early on (too early
for request_irq()), and the same code can also be
compiled as a module.

Signed-off-by: Magnus Damm <damm@igel.co.jp>
LKML-Reference: <20090312120559.2926.82371.sendpatchset@rx1.opensource.se>
[ changed to _GPL as these are special APIs deep inside the irq layer. ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/irq/manage.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index fc16570c9b46..e28db0f656ac 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -549,6 +549,7 @@ int setup_irq(unsigned int irq, struct irqaction *act)
 
 	return __setup_irq(irq, desc, act);
 }
+EXPORT_SYMBOL_GPL(setup_irq);
 
  /*
  * Internal function to unregister an irqaction - used to free
@@ -640,6 +641,7 @@ void remove_irq(unsigned int irq, struct irqaction *act)
 {
 	__free_irq(irq, act->dev_id);
 }
+EXPORT_SYMBOL_GPL(remove_irq);
 
 /**
  *	free_irq - free an interrupt allocated with request_irq
-- 
cgit v1.2.3


From f061d35150003b7fd5b133d14d66a74500fdaa60 Mon Sep 17 00:00:00 2001
From: Darren Hart <dvhltc@us.ibm.com>
Date: Thu, 12 Mar 2009 15:11:18 -0700
Subject: futex: remove the pointer math from double_unlock_hb

Impact: simplify code

I mistakenly included the pointer value ordering in the
double_unlock_hb() in my previous patch. It's only necessary
in the double_lock_hb() function. This patch removes it.

Signed-off-by: Darren Hart <dvhltc@us.ibm.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
LKML-Reference: <20090312221118.11146.68610.stgit@Aeon>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/futex.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index 9c97f67d298e..2331b73f6932 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -658,14 +658,8 @@ double_lock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
 static inline void
 double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
 {
-	if (hb1 <= hb2) {
-		spin_unlock(&hb2->lock);
-		if (hb1 < hb2)
-			spin_unlock(&hb1->lock);
-	} else { /* hb1 > hb2 */
-		spin_unlock(&hb1->lock);
-		spin_unlock(&hb2->lock);
-	}
+	spin_unlock(&hb1->lock);
+	spin_unlock(&hb2->lock);
 }
 
 /*
-- 
cgit v1.2.3


From d820ac4c2fa881079e6b689d2098adce337558ae Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 13 Mar 2009 01:30:40 +0100
Subject: locking: rename trace_softirq_[enter|exit] =>
 lockdep_softirq_[enter|exit]

Impact: cleanup

The naming clashes with upcoming softirq tracepoints, so rename the
APIs to lockdep_*().

Requested-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/irqflags.h | 8 ++++----
 kernel/softirq.c         | 4 ++--
 lib/locking-selftest.c   | 4 ++--
 3 files changed, 8 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h
index 74bde13224c9..b02a3f1d46a0 100644
--- a/include/linux/irqflags.h
+++ b/include/linux/irqflags.h
@@ -24,8 +24,8 @@
 # define trace_softirqs_enabled(p)	((p)->softirqs_enabled)
 # define trace_hardirq_enter()	do { current->hardirq_context++; } while (0)
 # define trace_hardirq_exit()	do { current->hardirq_context--; } while (0)
-# define trace_softirq_enter()	do { current->softirq_context++; } while (0)
-# define trace_softirq_exit()	do { current->softirq_context--; } while (0)
+# define lockdep_softirq_enter()	do { current->softirq_context++; } while (0)
+# define lockdep_softirq_exit()	do { current->softirq_context--; } while (0)
 # define INIT_TRACE_IRQFLAGS	.softirqs_enabled = 1,
 #else
 # define trace_hardirqs_on()		do { } while (0)
@@ -38,8 +38,8 @@
 # define trace_softirqs_enabled(p)	0
 # define trace_hardirq_enter()		do { } while (0)
 # define trace_hardirq_exit()		do { } while (0)
-# define trace_softirq_enter()		do { } while (0)
-# define trace_softirq_exit()		do { } while (0)
+# define lockdep_softirq_enter()	do { } while (0)
+# define lockdep_softirq_exit()		do { } while (0)
 # define INIT_TRACE_IRQFLAGS
 #endif
 
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 9041ea7948fe..08a030f85416 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -180,7 +180,7 @@ asmlinkage void __do_softirq(void)
 	account_system_vtime(current);
 
 	__local_bh_disable((unsigned long)__builtin_return_address(0));
-	trace_softirq_enter();
+	lockdep_softirq_enter();
 
 	cpu = smp_processor_id();
 restart:
@@ -220,7 +220,7 @@ restart:
 	if (pending)
 		wakeup_softirqd();
 
-	trace_softirq_exit();
+	lockdep_softirq_exit();
 
 	account_system_vtime(current);
 	_local_bh_enable();
diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c
index 280332c1827c..619313ed6c46 100644
--- a/lib/locking-selftest.c
+++ b/lib/locking-selftest.c
@@ -157,11 +157,11 @@ static void init_shared_classes(void)
 #define SOFTIRQ_ENTER()				\
 		local_bh_disable();		\
 		local_irq_disable();		\
-		trace_softirq_enter();		\
+		lockdep_softirq_enter();	\
 		WARN_ON(!in_softirq());
 
 #define SOFTIRQ_EXIT()				\
-		trace_softirq_exit();		\
+		lockdep_softirq_exit();		\
 		local_irq_enable();		\
 		local_bh_enable();
 
-- 
cgit v1.2.3


From a123c52b46a1f84bcec3dc963351896c6d6afaf7 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 12 Mar 2009 11:21:08 -0400
Subject: tracing: fix comments about trace buffer resizing

Impact: cleanup

Some of the comments about the trace buffer resizing is gobbledygook.
And I wonder why people question if I'm a native English speaker.

This patch makes the comments make a bit more sense.

Reported-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c3946a6df34e..c61ee85c50bb 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2336,7 +2336,8 @@ static int tracing_resize_ring_buffer(unsigned long size)
 
 	/*
 	 * If kernel or user changes the size of the ring buffer
-	 * it get completed.
+	 * we use the size that was given, and we can forget about
+	 * expanding it later.
 	 */
 	ring_buffer_expanded = 1;
 
@@ -2351,8 +2352,20 @@ static int tracing_resize_ring_buffer(unsigned long size)
 		r = ring_buffer_resize(global_trace.buffer,
 				       global_trace.entries);
 		if (r < 0) {
-			/* AARGH! We are left with different
-			 * size max buffer!!!! */
+			/*
+			 * AARGH! We are left with different
+			 * size max buffer!!!!
+			 * The max buffer is our "snapshot" buffer.
+			 * When a tracer needs a snapshot (one of the
+			 * latency tracers), it swaps the max buffer
+			 * with the saved snap shot. We succeeded to
+			 * update the size of the main buffer, but failed to
+			 * update the size of the max buffer. But when we tried
+			 * to reset the main buffer to the original size, we
+			 * failed there too. This is very unlikely to
+			 * happen, but if it does, warn and kill all
+			 * tracing.
+			 */
 			WARN_ON(1);
 			tracing_disabled = 1;
 		}
-- 
cgit v1.2.3


From 1027fcb206a0fb8348e63aff078c74bdee1c2698 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 12 Mar 2009 11:33:20 -0400
Subject: tracing: protect ring_buffer_expanded with trace_types_lock

Impact: prevent races with ring_buffer_expanded

This patch places the expanding of the tracing buffer under the
protection of the trace_types_lock mutex. It is highly unlikely
that there would be any contention, but better safe than sorry.

Reported-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c61ee85c50bb..04ab8243a13d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2391,8 +2391,10 @@ int tracing_update_buffers(void)
 {
 	int ret = 0;
 
+	mutex_lock(&trace_types_lock);
 	if (!ring_buffer_expanded)
 		ret = tracing_resize_ring_buffer(trace_buf_size);
+	mutex_unlock(&trace_types_lock);
 
 	return ret;
 }
@@ -2412,6 +2414,8 @@ static int tracing_set_tracer(const char *buf)
 	struct tracer *t;
 	int ret = 0;
 
+	mutex_lock(&trace_types_lock);
+
 	if (!ring_buffer_expanded) {
 		ret = tracing_resize_ring_buffer(trace_buf_size);
 		if (ret < 0)
@@ -2419,7 +2423,6 @@ static int tracing_set_tracer(const char *buf)
 		ret = 0;
 	}
 
-	mutex_lock(&trace_types_lock);
 	for (t = trace_types; t; t = t->next) {
 		if (strcmp(t->name, buf) == 0)
 			break;
-- 
cgit v1.2.3


From 59222efe2d184956464abe5b637bc842ff053b93 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 12 Mar 2009 11:46:03 -0400
Subject: ring-buffer: use CONFIG_HOTPLUG_CPU not CONFIG_HOTPLUG

The hotplug code in the ring buffers is for use with CPU hotplug,
not generic hotplug.

Reported-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ring_buffer.c | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index d07c2888396f..035b56c3a6c9 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -303,7 +303,7 @@ struct ring_buffer {
 
 	struct ring_buffer_per_cpu	**buffers;
 
-#ifdef CONFIG_HOTPLUG
+#ifdef CONFIG_HOTPLUG_CPU
 	struct notifier_block		cpu_notify;
 #endif
 };
@@ -464,7 +464,7 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
  */
 extern int ring_buffer_page_too_big(void);
 
-#ifdef CONFIG_HOTPLUG
+#ifdef CONFIG_HOTPLUG_CPU
 static int __cpuinit rb_cpu_notify(struct notifier_block *self,
 				   unsigned long action, void *hcpu);
 #endif
@@ -523,7 +523,7 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
 			goto fail_free_buffers;
 	}
 
-#ifdef CONFIG_HOTPLUG
+#ifdef CONFIG_HOTPLUG_CPU
 	buffer->cpu_notify.notifier_call = rb_cpu_notify;
 	buffer->cpu_notify.priority = 0;
 	register_cpu_notifier(&buffer->cpu_notify);
@@ -562,7 +562,7 @@ ring_buffer_free(struct ring_buffer *buffer)
 
 	get_online_cpus();
 
-#ifdef CONFIG_HOTPLUG
+#ifdef CONFIG_HOTPLUG_CPU
 	unregister_cpu_notifier(&buffer->cpu_notify);
 #endif
 
@@ -2757,7 +2757,7 @@ static __init int rb_init_debugfs(void)
 
 fs_initcall(rb_init_debugfs);
 
-#ifdef CONFIG_HOTPLUG
+#ifdef CONFIG_HOTPLUG_CPU
 static int __cpuinit rb_cpu_notify(struct notifier_block *self,
 				   unsigned long action, void *hcpu)
 {
-- 
cgit v1.2.3


From 8aabee573dff131a085c63de7667eacd94ba4ccb Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 12 Mar 2009 13:13:49 -0400
Subject: ring-buffer: remove unneeded get_online_cpus

Impact: speed up and remove possible races

The get_online_cpus was added to the ring buffer because the original
design would free the ring buffer on a CPU that was being taken
off line. The final design kept the ring buffer around even when the
CPU was taken off line. This is to allow a user to still read the
information on that ring buffer.

Most of the get_online_cpus are no longer needed since the ring buffer will
not disappear from the use cases.

Reported-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ring_buffer.c | 90 ++++++++--------------------------------------
 1 file changed, 14 insertions(+), 76 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 035b56c3a6c9..2c36be9fac2e 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -1561,15 +1561,11 @@ void ring_buffer_record_disable_cpu(struct ring_buffer *buffer, int cpu)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
 
-	get_online_cpus();
-
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
-		goto out;
+		return;
 
 	cpu_buffer = buffer->buffers[cpu];
 	atomic_inc(&cpu_buffer->record_disabled);
- out:
-	put_online_cpus();
 }
 EXPORT_SYMBOL_GPL(ring_buffer_record_disable_cpu);
 
@@ -1585,15 +1581,11 @@ void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
 
-	get_online_cpus();
-
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
-		goto out;
+		return;
 
 	cpu_buffer = buffer->buffers[cpu];
 	atomic_dec(&cpu_buffer->record_disabled);
- out:
-	put_online_cpus();
 }
 EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu);
 
@@ -1605,17 +1597,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_record_enable_cpu);
 unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
-	unsigned long ret = 0;
-
-	get_online_cpus();
+	unsigned long ret;
 
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
-		goto out;
+		return 0;
 
 	cpu_buffer = buffer->buffers[cpu];
 	ret = cpu_buffer->entries;
- out:
-	put_online_cpus();
 
 	return ret;
 }
@@ -1629,17 +1617,13 @@ EXPORT_SYMBOL_GPL(ring_buffer_entries_cpu);
 unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
-	unsigned long ret = 0;
-
-	get_online_cpus();
+	unsigned long ret;
 
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
-		goto out;
+		return 0;
 
 	cpu_buffer = buffer->buffers[cpu];
 	ret = cpu_buffer->overrun;
- out:
-	put_online_cpus();
 
 	return ret;
 }
@@ -1658,16 +1642,12 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer)
 	unsigned long entries = 0;
 	int cpu;
 
-	get_online_cpus();
-
 	/* if you care about this being correct, lock the buffer */
 	for_each_buffer_cpu(buffer, cpu) {
 		cpu_buffer = buffer->buffers[cpu];
 		entries += cpu_buffer->entries;
 	}
 
-	put_online_cpus();
-
 	return entries;
 }
 EXPORT_SYMBOL_GPL(ring_buffer_entries);
@@ -1685,16 +1665,12 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer)
 	unsigned long overruns = 0;
 	int cpu;
 
-	get_online_cpus();
-
 	/* if you care about this being correct, lock the buffer */
 	for_each_buffer_cpu(buffer, cpu) {
 		cpu_buffer = buffer->buffers[cpu];
 		overruns += cpu_buffer->overrun;
 	}
 
-	put_online_cpus();
-
 	return overruns;
 }
 EXPORT_SYMBOL_GPL(ring_buffer_overruns);
@@ -2093,21 +2069,16 @@ struct ring_buffer_event *
 ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
 {
 	struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
-	struct ring_buffer_event *event = NULL;
+	struct ring_buffer_event *event;
 	unsigned long flags;
 
-	get_online_cpus();
-
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
-		goto out;
+		return NULL;
 
 	spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
 	event = rb_buffer_peek(buffer, cpu, ts);
 	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
- out:
-	put_online_cpus();
-
 	return event;
 }
 
@@ -2189,17 +2160,15 @@ struct ring_buffer_iter *
 ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
-	struct ring_buffer_iter *iter = NULL;
+	struct ring_buffer_iter *iter;
 	unsigned long flags;
 
-	get_online_cpus();
-
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
-		goto out;
+		return NULL;
 
 	iter = kmalloc(sizeof(*iter), GFP_KERNEL);
 	if (!iter)
-		goto out;
+		return NULL;
 
 	cpu_buffer = buffer->buffers[cpu];
 
@@ -2214,9 +2183,6 @@ ring_buffer_read_start(struct ring_buffer *buffer, int cpu)
 	__raw_spin_unlock(&cpu_buffer->lock);
 	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
- out:
-	put_online_cpus();
-
 	return iter;
 }
 EXPORT_SYMBOL_GPL(ring_buffer_read_start);
@@ -2309,13 +2275,9 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
 {
 	struct ring_buffer_per_cpu *cpu_buffer = buffer->buffers[cpu];
 	unsigned long flags;
-	int resched;
-
-	/* Can't use get_online_cpus because this can be in atomic */
-	resched = ftrace_preempt_disable();
 
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
-		goto out;
+		return;
 
 	spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
 
@@ -2326,8 +2288,6 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu)
 	__raw_spin_unlock(&cpu_buffer->lock);
 
 	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
- out:
-	ftrace_preempt_enable(resched);
 }
 EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
 
@@ -2337,16 +2297,10 @@ EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu);
  */
 void ring_buffer_reset(struct ring_buffer *buffer)
 {
-	int resched;
 	int cpu;
 
-	/* Can't use get_online_cpus because this can be in atomic */
-	resched = ftrace_preempt_disable();
-
 	for_each_buffer_cpu(buffer, cpu)
 		ring_buffer_reset_cpu(buffer, cpu);
-
-	ftrace_preempt_enable(resched);
 }
 EXPORT_SYMBOL_GPL(ring_buffer_reset);
 
@@ -2359,8 +2313,6 @@ int ring_buffer_empty(struct ring_buffer *buffer)
 	struct ring_buffer_per_cpu *cpu_buffer;
 	int cpu;
 
-	get_online_cpus();
-
 	/* yes this is racy, but if you don't like the race, lock the buffer */
 	for_each_buffer_cpu(buffer, cpu) {
 		cpu_buffer = buffer->buffers[cpu];
@@ -2368,8 +2320,6 @@ int ring_buffer_empty(struct ring_buffer *buffer)
 			return 0;
 	}
 
-	put_online_cpus();
-
 	return 1;
 }
 EXPORT_SYMBOL_GPL(ring_buffer_empty);
@@ -2382,18 +2332,14 @@ EXPORT_SYMBOL_GPL(ring_buffer_empty);
 int ring_buffer_empty_cpu(struct ring_buffer *buffer, int cpu)
 {
 	struct ring_buffer_per_cpu *cpu_buffer;
-	int ret = 1;
-
-	get_online_cpus();
+	int ret;
 
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
-		goto out;
+		return 1;
 
 	cpu_buffer = buffer->buffers[cpu];
 	ret = rb_per_cpu_empty(cpu_buffer);
 
- out:
-	put_online_cpus();
 
 	return ret;
 }
@@ -2416,8 +2362,6 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
 	struct ring_buffer_per_cpu *cpu_buffer_b;
 	int ret = -EINVAL;
 
-	get_online_cpus();
-
 	if (!cpumask_test_cpu(cpu, buffer_a->cpumask) ||
 	    !cpumask_test_cpu(cpu, buffer_b->cpumask))
 		goto out;
@@ -2466,8 +2410,6 @@ int ring_buffer_swap_cpu(struct ring_buffer *buffer_a,
 
 	ret = 0;
 out:
-	put_online_cpus();
-
 	return ret;
 }
 EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu);
@@ -2583,8 +2525,6 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 	u64 save_timestamp;
 	int ret = -1;
 
-	get_online_cpus();
-
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
 		goto out;
 
@@ -2681,8 +2621,6 @@ int ring_buffer_read_page(struct ring_buffer *buffer,
 	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
  out:
-	put_online_cpus();
-
 	return ret;
 }
 
-- 
cgit v1.2.3


From db526ca329f855510e8ce672332eba3304aed590 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 12 Mar 2009 13:53:25 -0400
Subject: tracing: show that buffer size is not expanded

Impact: do not confuse user on small trace buffer sizes

When the system boots up, the trace buffer is small to conserve memory.
It is only two pages per online CPU. When the tracer is used, it expands
to the default value.

This can confuse the user if they look at the buffer size and see only
7, but then later they see 1408.

 # cat /debug/tracing/buffer_size_kb
7

 # echo sched_switch > /debug/tracing/current_tracer

 # cat /debug/tracing/buffer_size_kb
1408

This patch tries to help remove this confustion by showing that the
buffer has not been expanded.

 # cat /debug/tracing/buffer_size_kb
7 (expanded: 1408)

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 04ab8243a13d..62a63b2b33dd 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2948,10 +2948,18 @@ tracing_entries_read(struct file *filp, char __user *ubuf,
 		     size_t cnt, loff_t *ppos)
 {
 	struct trace_array *tr = filp->private_data;
-	char buf[64];
+	char buf[96];
 	int r;
 
-	r = sprintf(buf, "%lu\n", tr->entries >> 10);
+	mutex_lock(&trace_types_lock);
+	if (!ring_buffer_expanded)
+		r = sprintf(buf, "%lu (expanded: %lu)\n",
+			    tr->entries >> 10,
+			    trace_buf_size >> 10);
+	else
+		r = sprintf(buf, "%lu\n", tr->entries >> 10);
+	mutex_unlock(&trace_types_lock);
+
 	return simple_read_from_buffer(ubuf, cnt, ppos, buf, r);
 }
 
-- 
cgit v1.2.3


From 48ead02030f849d011259244bb4ea9b985479006 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 12 Mar 2009 18:24:49 +0100
Subject: tracing/core: bring back raw trace_printk for dynamic formats strings

Impact: fix callsites with dynamic format strings

Since its new binary implementation, trace_printk() internally uses static
containers for the format strings on each callsites. But the value is
assigned once at build time, which means that it can't take dynamic
formats.

So this patch unearthes the raw trace_printk implementation for the callers
that will need trace_printk to be able to carry these dynamic format
strings. The trace_printk() macro will use the appropriate implementation
for each callsite. Most of the time however, the binary implementation will
still be used.

The other impact of this patch is that mmiotrace_printk() will use the old
implementation because it calls the low level trace_vprintk and we can't
guess here whether the format passed in it is dynamic or not.

Some parts of this patch have been written by Steven Rostedt (most notably
the part that chooses the appropriate implementation for each callsites).

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 include/linux/kernel.h               | 40 +++++++++++------
 kernel/trace/trace.c                 | 85 +++++++++++++++++++++++++++++++++---
 kernel/trace/trace.h                 | 13 +++++-
 kernel/trace/trace_event_types.h     | 11 ++++-
 kernel/trace/trace_functions_graph.c |  6 +--
 kernel/trace/trace_mmiotrace.c       |  7 +--
 kernel/trace/trace_output.c          | 57 +++++++++++++++++++++---
 kernel/trace/trace_printk.c          | 33 +++++++++++---
 8 files changed, 213 insertions(+), 39 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 7742798c9208..1daca3b062bb 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -452,31 +452,45 @@ do {									\
 
 #define trace_printk(fmt, args...)					\
 do {									\
-	static const char *trace_printk_fmt				\
-	__attribute__((section("__trace_printk_fmt")));			\
-									\
-	if (!trace_printk_fmt)						\
-		trace_printk_fmt = fmt;					\
-									\
 	__trace_printk_check_format(fmt, ##args);			\
-	__trace_printk(_THIS_IP_, trace_printk_fmt, ##args);		\
+	if (__builtin_constant_p(fmt)) {				\
+		static const char *trace_printk_fmt			\
+		  __attribute__((section("__trace_printk_fmt"))) =	\
+			__builtin_constant_p(fmt) ? fmt : NULL;		\
+									\
+		__trace_bprintk(_THIS_IP_, trace_printk_fmt, ##args);	\
+	} else								\
+		__trace_printk(_THIS_IP_, fmt, ##args);		\
 } while (0)
 
+extern int
+__trace_bprintk(unsigned long ip, const char *fmt, ...)
+	__attribute__ ((format (printf, 2, 3)));
+
 extern int
 __trace_printk(unsigned long ip, const char *fmt, ...)
 	__attribute__ ((format (printf, 2, 3)));
 
+/*
+ * The double __builtin_constant_p is because gcc will give us an error
+ * if we try to allocate the static variable to fmt if it is not a
+ * constant. Even with the outer if statement.
+ */
 #define ftrace_vprintk(fmt, vargs)					\
 do {									\
-	static const char *trace_printk_fmt				\
-	__attribute__((section("__trace_printk_fmt")));			\
-									\
-	if (!trace_printk_fmt)						\
-		trace_printk_fmt = fmt;					\
+	if (__builtin_constant_p(fmt)) {				\
+		static const char *trace_printk_fmt			\
+		  __attribute__((section("__trace_printk_fmt"))) =	\
+			__builtin_constant_p(fmt) ? fmt : NULL;		\
 									\
-	__ftrace_vprintk(_THIS_IP_, trace_printk_fmt, vargs);		\
+		__ftrace_vbprintk(_THIS_IP_, trace_printk_fmt, vargs);	\
+	} else								\
+		__ftrace_vprintk(_THIS_IP_, fmt, vargs);		\
 } while (0)
 
+extern int
+__ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap);
+
 extern int
 __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap);
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 62a63b2b33dd..dbb077d8a172 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1179,10 +1179,10 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
 
 
 /**
- * trace_vprintk - write binary msg to tracing buffer
+ * trace_vbprintk - write binary msg to tracing buffer
  *
  */
-int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
+int trace_vbprintk(unsigned long ip, int depth, const char *fmt, va_list args)
 {
 	static raw_spinlock_t trace_buf_lock =
 		(raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
@@ -1191,7 +1191,7 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
 	struct ring_buffer_event *event;
 	struct trace_array *tr = &global_trace;
 	struct trace_array_cpu *data;
-	struct print_entry *entry;
+	struct bprint_entry *entry;
 	unsigned long flags;
 	int resched;
 	int cpu, len = 0, size, pc;
@@ -1219,7 +1219,7 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
 		goto out_unlock;
 
 	size = sizeof(*entry) + sizeof(u32) * len;
-	event = trace_buffer_lock_reserve(tr, TRACE_PRINT, size, flags, pc);
+	event = trace_buffer_lock_reserve(tr, TRACE_BPRINT, size, flags, pc);
 	if (!event)
 		goto out_unlock;
 	entry = ring_buffer_event_data(event);
@@ -1240,6 +1240,60 @@ out:
 
 	return len;
 }
+EXPORT_SYMBOL_GPL(trace_vbprintk);
+
+int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
+{
+	static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED;
+	static char trace_buf[TRACE_BUF_SIZE];
+
+	struct ring_buffer_event *event;
+	struct trace_array *tr = &global_trace;
+	struct trace_array_cpu *data;
+	int cpu, len = 0, size, pc;
+	struct print_entry *entry;
+	unsigned long irq_flags;
+
+	if (tracing_disabled || tracing_selftest_running)
+		return 0;
+
+	pc = preempt_count();
+	preempt_disable_notrace();
+	cpu = raw_smp_processor_id();
+	data = tr->data[cpu];
+
+	if (unlikely(atomic_read(&data->disabled)))
+		goto out;
+
+	pause_graph_tracing();
+	raw_local_irq_save(irq_flags);
+	__raw_spin_lock(&trace_buf_lock);
+	len = vsnprintf(trace_buf, TRACE_BUF_SIZE, fmt, args);
+
+	len = min(len, TRACE_BUF_SIZE-1);
+	trace_buf[len] = 0;
+
+	size = sizeof(*entry) + len + 1;
+	event = trace_buffer_lock_reserve(tr, TRACE_PRINT, size, irq_flags, pc);
+	if (!event)
+		goto out_unlock;
+	entry = ring_buffer_event_data(event);
+	entry->ip			= ip;
+	entry->depth			= depth;
+
+	memcpy(&entry->buf, trace_buf, len);
+	entry->buf[len] = 0;
+	ring_buffer_unlock_commit(tr->buffer, event);
+
+ out_unlock:
+	__raw_spin_unlock(&trace_buf_lock);
+	raw_local_irq_restore(irq_flags);
+	unpause_graph_tracing();
+ out:
+	preempt_enable_notrace();
+
+	return len;
+}
 EXPORT_SYMBOL_GPL(trace_vprintk);
 
 enum trace_file_type {
@@ -1628,6 +1682,22 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
 	return TRACE_TYPE_HANDLED;
 }
 
+static enum print_line_t print_bprintk_msg_only(struct trace_iterator *iter)
+{
+	struct trace_seq *s = &iter->seq;
+	struct trace_entry *entry = iter->ent;
+	struct bprint_entry *field;
+	int ret;
+
+	trace_assign_type(field, entry);
+
+	ret = trace_seq_bprintf(s, field->fmt, field->buf);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	return TRACE_TYPE_HANDLED;
+}
+
 static enum print_line_t print_printk_msg_only(struct trace_iterator *iter)
 {
 	struct trace_seq *s = &iter->seq;
@@ -1637,7 +1707,7 @@ static enum print_line_t print_printk_msg_only(struct trace_iterator *iter)
 
 	trace_assign_type(field, entry);
 
-	ret = trace_seq_bprintf(s, field->fmt, field->buf);
+	ret = trace_seq_printf(s, "%s", field->buf);
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
@@ -1702,6 +1772,11 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter)
 			return ret;
 	}
 
+	if (iter->ent->type == TRACE_BPRINT &&
+			trace_flags & TRACE_ITER_PRINTK &&
+			trace_flags & TRACE_ITER_PRINTK_MSGONLY)
+		return print_bprintk_msg_only(iter);
+
 	if (iter->ent->type == TRACE_PRINT &&
 			trace_flags & TRACE_ITER_PRINTK &&
 			trace_flags & TRACE_ITER_PRINTK_MSGONLY)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 336324d717f8..cede1ab49d07 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -20,6 +20,7 @@ enum trace_type {
 	TRACE_WAKE,
 	TRACE_STACK,
 	TRACE_PRINT,
+	TRACE_BPRINT,
 	TRACE_SPECIAL,
 	TRACE_MMIO_RW,
 	TRACE_MMIO_MAP,
@@ -117,7 +118,7 @@ struct userstack_entry {
 /*
  * trace_printk entry:
  */
-struct print_entry {
+struct bprint_entry {
 	struct trace_entry	ent;
 	unsigned long		ip;
 	int			depth;
@@ -125,6 +126,13 @@ struct print_entry {
 	u32			buf[];
 };
 
+struct print_entry {
+	struct trace_entry	ent;
+	unsigned long		ip;
+	int			depth;
+	char			buf[];
+};
+
 #define TRACE_OLD_SIZE		88
 
 struct trace_field_cont {
@@ -286,6 +294,7 @@ extern void __ftrace_bad_type(void);
 		IF_ASSIGN(var, ent, struct stack_entry, TRACE_STACK);	\
 		IF_ASSIGN(var, ent, struct userstack_entry, TRACE_USER_STACK);\
 		IF_ASSIGN(var, ent, struct print_entry, TRACE_PRINT);	\
+		IF_ASSIGN(var, ent, struct bprint_entry, TRACE_BPRINT);	\
 		IF_ASSIGN(var, ent, struct special_entry, 0);		\
 		IF_ASSIGN(var, ent, struct trace_mmiotrace_rw,		\
 			  TRACE_MMIO_RW);				\
@@ -570,6 +579,8 @@ extern int trace_selftest_startup_branch(struct tracer *trace,
 extern void *head_page(struct trace_array_cpu *data);
 extern long ns2usecs(cycle_t nsec);
 extern int
+trace_vbprintk(unsigned long ip, int depth, const char *fmt, va_list args);
+extern int
 trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args);
 
 extern unsigned long trace_flags;
diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h
index 5cca4c978bde..d0907d746425 100644
--- a/kernel/trace/trace_event_types.h
+++ b/kernel/trace/trace_event_types.h
@@ -102,7 +102,7 @@ TRACE_EVENT_FORMAT(user_stack, TRACE_USER_STACK, userstack_entry, ignore,
 		 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n")
 );
 
-TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore,
+TRACE_EVENT_FORMAT(bprint, TRACE_PRINT, bprint_entry, ignore,
 	TRACE_STRUCT(
 		TRACE_FIELD(unsigned long, ip, ip)
 		TRACE_FIELD(unsigned int, depth, depth)
@@ -112,6 +112,15 @@ TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore,
 	TP_RAW_FMT("%08lx (%d) fmt:%p %s")
 );
 
+TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore,
+	TRACE_STRUCT(
+		TRACE_FIELD(unsigned long, ip, ip)
+		TRACE_FIELD(unsigned int, depth, depth)
+		TRACE_FIELD_ZERO_CHAR(buf)
+	),
+	TP_RAW_FMT("%08lx (%d) fmt:%p %s")
+);
+
 TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore,
 	TRACE_STRUCT(
 		TRACE_FIELD(unsigned int, line, line)
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 8566c14b3e9a..4c388607ed67 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -684,7 +684,7 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
 }
 
 static enum print_line_t
-print_graph_comment(struct print_entry *trace, struct trace_seq *s,
+print_graph_comment(struct bprint_entry *trace, struct trace_seq *s,
 		   struct trace_entry *ent, struct trace_iterator *iter)
 {
 	int i;
@@ -781,8 +781,8 @@ print_graph_function(struct trace_iterator *iter)
 		trace_assign_type(field, entry);
 		return print_graph_return(&field->ret, s, entry, iter);
 	}
-	case TRACE_PRINT: {
-		struct print_entry *field;
+	case TRACE_BPRINT: {
+		struct bprint_entry *field;
 		trace_assign_type(field, entry);
 		return print_graph_comment(field, s, entry, iter);
 	}
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index 23e346a734ca..f095916e477f 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -254,6 +254,7 @@ static enum print_line_t mmio_print_mark(struct trace_iterator *iter)
 {
 	struct trace_entry *entry = iter->ent;
 	struct print_entry *print = (struct print_entry *)entry;
+	const char *msg		= print->buf;
 	struct trace_seq *s	= &iter->seq;
 	unsigned long long t	= ns2usecs(iter->ts);
 	unsigned long usec_rem	= do_div(t, USEC_PER_SEC);
@@ -261,11 +262,7 @@ static enum print_line_t mmio_print_mark(struct trace_iterator *iter)
 	int ret;
 
 	/* The trailing newline must be in the message. */
-	ret = trace_seq_printf(s, "MARK %u.%06lu ", secs, usec_rem);
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	ret = trace_seq_bprintf(s, print->fmt, print->buf);
+	ret = trace_seq_printf(s, "MARK %u.%06lu %s", secs, usec_rem, msg);
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 491832af9ba1..ea9d3b410c7a 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -832,13 +832,13 @@ static struct trace_event trace_user_stack_event = {
 	.binary		= trace_special_bin,
 };
 
-/* TRACE_PRINT */
+/* TRACE_BPRINT */
 static enum print_line_t
-trace_print_print(struct trace_iterator *iter, int flags)
+trace_bprint_print(struct trace_iterator *iter, int flags)
 {
 	struct trace_entry *entry = iter->ent;
 	struct trace_seq *s = &iter->seq;
-	struct print_entry *field;
+	struct bprint_entry *field;
 
 	trace_assign_type(field, entry);
 
@@ -858,9 +858,10 @@ trace_print_print(struct trace_iterator *iter, int flags)
 }
 
 
-static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags)
+static enum print_line_t
+trace_bprint_raw(struct trace_iterator *iter, int flags)
 {
-	struct print_entry *field;
+	struct bprint_entry *field;
 	struct trace_seq *s = &iter->seq;
 
 	trace_assign_type(field, iter->ent);
@@ -878,12 +879,55 @@ static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags)
 }
 
 
+static struct trace_event trace_bprint_event = {
+	.type		= TRACE_BPRINT,
+	.trace		= trace_bprint_print,
+	.raw		= trace_bprint_raw,
+};
+
+/* TRACE_PRINT */
+static enum print_line_t trace_print_print(struct trace_iterator *iter,
+					   int flags)
+{
+	struct print_entry *field;
+	struct trace_seq *s = &iter->seq;
+
+	trace_assign_type(field, iter->ent);
+
+	if (!seq_print_ip_sym(s, field->ip, flags))
+		goto partial;
+
+	if (!trace_seq_printf(s, ": %s", field->buf))
+		goto partial;
+
+	return TRACE_TYPE_HANDLED;
+
+ partial:
+	return TRACE_TYPE_PARTIAL_LINE;
+}
+
+static enum print_line_t trace_print_raw(struct trace_iterator *iter, int flags)
+{
+	struct print_entry *field;
+
+	trace_assign_type(field, iter->ent);
+
+	if (!trace_seq_printf(&iter->seq, "# %lx %s", field->ip, field->buf))
+		goto partial;
+
+	return TRACE_TYPE_HANDLED;
+
+ partial:
+	return TRACE_TYPE_PARTIAL_LINE;
+}
+
 static struct trace_event trace_print_event = {
-	.type		= TRACE_PRINT,
+	.type	 	= TRACE_PRINT,
 	.trace		= trace_print_print,
 	.raw		= trace_print_raw,
 };
 
+
 static struct trace_event *events[] __initdata = {
 	&trace_fn_event,
 	&trace_ctx_event,
@@ -891,6 +935,7 @@ static struct trace_event *events[] __initdata = {
 	&trace_special_event,
 	&trace_stack_event,
 	&trace_user_stack_event,
+	&trace_bprint_event,
 	&trace_print_event,
 	NULL
 };
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index a50aea22e929..f307a11e2332 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -99,7 +99,7 @@ struct notifier_block module_trace_bprintk_format_nb = {
 	.notifier_call = module_trace_bprintk_format_notify,
 };
 
-int __trace_printk(unsigned long ip, const char *fmt, ...)
+int __trace_bprintk(unsigned long ip, const char *fmt, ...)
  {
 	int ret;
 	va_list ap;
@@ -111,13 +111,13 @@ int __trace_printk(unsigned long ip, const char *fmt, ...)
 		return 0;
 
 	va_start(ap, fmt);
-	ret = trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap);
+	ret = trace_vbprintk(ip, task_curr_ret_stack(current), fmt, ap);
 	va_end(ap);
 	return ret;
 }
-EXPORT_SYMBOL_GPL(__trace_printk);
+EXPORT_SYMBOL_GPL(__trace_bprintk);
 
-int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap)
+int __ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap)
  {
 	if (unlikely(!fmt))
 		return 0;
@@ -125,11 +125,34 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap)
 	if (!(trace_flags & TRACE_ITER_PRINTK))
 		return 0;
 
+	return trace_vbprintk(ip, task_curr_ret_stack(current), fmt, ap);
+}
+EXPORT_SYMBOL_GPL(__ftrace_vbprintk);
+
+int __trace_printk(unsigned long ip, const char *fmt, ...)
+{
+	int ret;
+	va_list ap;
+
+	if (!(trace_flags & TRACE_ITER_PRINTK))
+		return 0;
+
+	va_start(ap, fmt);
+	ret = trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap);
+	va_end(ap);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(__trace_printk);
+
+int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap)
+{
+	if (!(trace_flags & TRACE_ITER_PRINTK))
+		return 0;
+
 	return trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap);
 }
 EXPORT_SYMBOL_GPL(__ftrace_vprintk);
 
-
 static __init int init_trace_printk(void)
 {
 	return register_module_notifier(&module_trace_bprintk_format_nb);
-- 
cgit v1.2.3


From 828275574e0161bdddb5817d4bd76a0265ef0470 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 12 Mar 2009 14:14:31 -0400
Subject: tracing: make bprint event use the proper event id

The bprint record is using TRACE_PRINT when it should be TRACE_BPRINT.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_event_types.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h
index d0907d746425..019915063fe6 100644
--- a/kernel/trace/trace_event_types.h
+++ b/kernel/trace/trace_event_types.h
@@ -102,7 +102,7 @@ TRACE_EVENT_FORMAT(user_stack, TRACE_USER_STACK, userstack_entry, ignore,
 		 "\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n\t=> (%08lx)\n")
 );
 
-TRACE_EVENT_FORMAT(bprint, TRACE_PRINT, bprint_entry, ignore,
+TRACE_EVENT_FORMAT(bprint, TRACE_BPRINT, bprint_entry, ignore,
 	TRACE_STRUCT(
 		TRACE_FIELD(unsigned long, ip, ip)
 		TRACE_FIELD(unsigned int, depth, depth)
-- 
cgit v1.2.3


From e9fb2b6d5845e24f104713591286b6f39761c027 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 12 Mar 2009 14:19:25 -0400
Subject: tracing: have event_trace_printk use static tracer

Impact: speed up on event tracing

The event_trace_printk is currently a wrapper function that calls
trace_vprintk. Because it uses a variable for the fmt it misses out
on the optimization of using the binary printk.

This patch makes event_trace_printk into a macro wrapper to use the
fmt as the same as the trace_printks.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.h        | 17 +++++++++++++++++
 kernel/trace/trace_events.c | 10 ----------
 2 files changed, 17 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index cede1ab49d07..35cfa7bbaf38 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -773,4 +773,21 @@ void event_trace_printk(unsigned long ip, const char *fmt, ...);
 extern struct ftrace_event_call __start_ftrace_events[];
 extern struct ftrace_event_call __stop_ftrace_events[];
 
+extern const char *__start___trace_bprintk_fmt[];
+extern const char *__stop___trace_bprintk_fmt[];
+
+#define event_trace_printk(ip, fmt, args...)				\
+do {									\
+	__trace_printk_check_format(fmt, ##args);			\
+	tracing_record_cmdline(current);				\
+	if (__builtin_constant_p(fmt)) {				\
+		static const char *trace_printk_fmt			\
+		  __attribute__((section("__trace_printk_fmt"))) =	\
+			__builtin_constant_p(fmt) ? fmt : NULL;		\
+									\
+		__trace_bprintk(ip, trace_printk_fmt, ##args);		\
+	} else								\
+		__trace_printk(ip, fmt, ##args);			\
+} while (0)
+
 #endif /* _LINUX_KERNEL_TRACE_H */
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index ca624df73591..238ea95a4115 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -24,16 +24,6 @@ static DEFINE_MUTEX(event_mutex);
 	     (unsigned long)event < (unsigned long)__stop_ftrace_events; \
 	     event++)
 
-void event_trace_printk(unsigned long ip, const char *fmt, ...)
-{
-	va_list ap;
-
-	va_start(ap, fmt);
-	tracing_record_cmdline(current);
-	trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap);
-	va_end(ap);
-}
-
 static void ftrace_clear_events(void)
 {
 	struct ftrace_event_call *call = (void *)__start_ftrace_events;
-- 
cgit v1.2.3


From 7975a2be16dd42df2cab80c80cb6ece382edb6ec Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 12 Mar 2009 14:23:17 -0400
Subject: tracing: export trace formats to user space

The binary printk saves a pointer to the format string in the ring buffer.
On output, the format is processed. But if the user is reading the
ring buffer through a binary interface, the pointer is meaningless.

This patch creates a file called printk_formats that maps the pointers
to the formats.

 # cat /debug/tracing/printk_formats
0xffffffff80713d40 : "irq_handler_entry: irq=%d handler=%s\n"
0xffffffff80713d48 : "lock_acquire: %s%s%s\n"
0xffffffff80713d50 : "lock_release: %s\n"

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_printk.c | 119 ++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 114 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index f307a11e2332..486785214e3e 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -4,18 +4,19 @@
  * Copyright (C) 2008 Lai Jiangshan <laijs@cn.fujitsu.com>
  *
  */
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
 #include <linux/kernel.h>
 #include <linux/ftrace.h>
 #include <linux/string.h>
+#include <linux/module.h>
+#include <linux/marker.h>
+#include <linux/mutex.h>
 #include <linux/ctype.h>
 #include <linux/list.h>
-#include <linux/mutex.h>
 #include <linux/slab.h>
-#include <linux/module.h>
-#include <linux/seq_file.h>
 #include <linux/fs.h>
-#include <linux/marker.h>
-#include <linux/uaccess.h>
 
 #include "trace.h"
 
@@ -153,6 +154,114 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap)
 }
 EXPORT_SYMBOL_GPL(__ftrace_vprintk);
 
+static void *
+t_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	const char **fmt = m->private;
+	const char **next = fmt;
+
+	(*pos)++;
+
+	if ((unsigned long)fmt >= (unsigned long)__stop___trace_bprintk_fmt)
+		return NULL;
+
+	next = fmt;
+	m->private = ++next;
+
+	return fmt;
+}
+
+static void *t_start(struct seq_file *m, loff_t *pos)
+{
+	return t_next(m, NULL, pos);
+}
+
+static int t_show(struct seq_file *m, void *v)
+{
+	const char **fmt = v;
+	const char *str = *fmt;
+	int i;
+
+	seq_printf(m, "0x%lx : \"", (unsigned long)fmt);
+
+	/*
+	 * Tabs and new lines need to be converted.
+	 */
+	for (i = 0; str[i]; i++) {
+		switch (str[i]) {
+		case '\n':
+			seq_puts(m, "\\n");
+			break;
+		case '\t':
+			seq_puts(m, "\\t");
+			break;
+		case '\\':
+			seq_puts(m, "\\");
+			break;
+		case '"':
+			seq_puts(m, "\\\"");
+			break;
+		default:
+			seq_putc(m, str[i]);
+		}
+	}
+	seq_puts(m, "\"\n");
+
+	return 0;
+}
+
+static void t_stop(struct seq_file *m, void *p)
+{
+}
+
+static const struct seq_operations show_format_seq_ops = {
+	.start = t_start,
+	.next = t_next,
+	.show = t_show,
+	.stop = t_stop,
+};
+
+static int
+ftrace_formats_open(struct inode *inode, struct file *file)
+{
+	int ret;
+
+	ret = seq_open(file, &show_format_seq_ops);
+	if (!ret) {
+		struct seq_file *m = file->private_data;
+
+		m->private = __start___trace_bprintk_fmt;
+	}
+	return ret;
+}
+
+static const struct file_operations ftrace_formats_fops = {
+	.open = ftrace_formats_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = seq_release,
+};
+
+static __init int init_trace_printk_function_export(void)
+{
+	struct dentry *d_tracer;
+	struct dentry *entry;
+
+	d_tracer = tracing_init_dentry();
+	if (!d_tracer)
+		return 0;
+
+	entry = debugfs_create_file("printk_formats", 0444, d_tracer,
+				    NULL, &ftrace_formats_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'printk_formats' entry\n");
+
+	return 0;
+}
+
+fs_initcall(init_trace_printk_function_export);
+
 static __init int init_trace_printk(void)
 {
 	return register_module_notifier(&module_trace_bprintk_format_nb);
-- 
cgit v1.2.3


From 2da03ecee6308ea174e8a02b92a3c4ec92e886c8 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 12 Mar 2009 18:57:51 -0400
Subject: tracing: fix stack tracer header

The stack tracer use to look like this:

 # cat /debug/tracing/stack_trace
         Depth  Size      Location    (57 entries)
         -----  ----      --------
  0)     5088      16   mempool_alloc_slab+0x16/0x18
  1)     5072     144   mempool_alloc+0x4d/0xfe
  2)     4928      16   scsi_sg_alloc+0x48/0x4a [scsi_mod]

Now it looks like this:

 # cat /debug/tracing/stack_trace

        Depth    Size      Location    (57 entries)
        -----    ----      --------
  0)     5088      16   mempool_alloc_slab+0x16/0x18
  1)     5072     144   mempool_alloc+0x4d/0xfe
  2)     4928      16   scsi_sg_alloc+0x48/0x4a [scsi_mod]

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_stack.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index d0871bc0aca5..4564fd94b0cd 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -251,9 +251,9 @@ static int t_show(struct seq_file *m, void *v)
 	int size;
 
 	if (v == SEQ_START_TOKEN) {
-		seq_printf(m, "        Depth   Size      Location"
+		seq_printf(m, "        Depth    Size      Location"
 			   "    (%d entries)\n"
-			   "        -----   ----      --------\n",
+			   "        -----    ----      --------\n",
 			   max_stack_trace.nr_entries);
 		return 0;
 	}
-- 
cgit v1.2.3


From e447e1df2e568cd43d1918963c9f09fae85aea57 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 12 Mar 2009 19:42:29 -0400
Subject: tracing: explain why stack tracer is empty

If the stack tracing is disabled (by default) the stack_trace file
will only contain the header:

 # cat /debug/tracing/stack_trace
        Depth    Size      Location    (0 entries)
        -----    ----      --------

This can be frustrating to a developer that does not realize that the
stack tracer is disabled. This patch adds the following text:

  # cat /debug/tracing/stack_trace
        Depth    Size      Location    (0 entries)
        -----    ----      --------
 #
 #  Stack tracer disabled
 #
 # To enable the stack tracer, either add 'stacktrace' to the
 # kernel command line
 # or 'echo 1 > /proc/sys/kernel/stack_tracer_enabled'
 #

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_stack.c | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 4564fd94b0cd..91ccbf396c9a 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -245,6 +245,17 @@ static int trace_lookup_stack(struct seq_file *m, long i)
 #endif
 }
 
+static void print_disabled(struct seq_file *m)
+{
+	seq_puts(m, "#\n"
+		 "#  Stack tracer disabled\n"
+		 "#\n"
+		 "# To enable the stack tracer, either add 'stacktrace' to the\n"
+		 "# kernel command line\n"
+		 "# or 'echo 1 > /proc/sys/kernel/stack_tracer_enabled'\n"
+		 "#\n");
+}
+
 static int t_show(struct seq_file *m, void *v)
 {
 	long i;
@@ -255,6 +266,10 @@ static int t_show(struct seq_file *m, void *v)
 			   "    (%d entries)\n"
 			   "        -----    ----      --------\n",
 			   max_stack_trace.nr_entries);
+
+		if (!stack_tracer_enabled && !max_stack_size)
+			print_disabled(m);
+
 		return 0;
 	}
 
-- 
cgit v1.2.3


From 5d592b44b29a1d73e13d5c9e3426eed843bdc359 Mon Sep 17 00:00:00 2001
From: Jason Baron <jbaron@redhat.com>
Date: Thu, 12 Mar 2009 14:33:36 -0400
Subject: tracing: tracepoints for softirq entry/exit - add softirq-to-name
 array

Create a 'softirq_to_name' array, which is indexed by softirq #, so
that we can easily convert between the softirq index # and its name, in
order to get more meaningful output messages.

LKML-Reference: <20090312183336.GB3352@redhat.com>
Signed-off-by: Jason Baron <jbaron@redhat.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 include/linux/interrupt.h | 5 +++++
 kernel/softirq.c          | 9 ++++++++-
 2 files changed, 13 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 472f11765f60..9b7e9d743476 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -258,6 +258,11 @@ enum
 	NR_SOFTIRQS
 };
 
+/* map softirq index to softirq name. update 'softirq_to_name' in
+ * kernel/softirq.c when adding a new softirq.
+ */
+extern char *softirq_to_name[NR_SOFTIRQS];
+
 /* softirq mask and active fields moved to irq_cpustat_t in
  * asm/hardirq.h to get better cache usage.  KAO
  */
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 7571bcb71be4..9f90fdc039f4 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -53,6 +53,12 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp
 
 static DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
 
+char *softirq_to_name[NR_SOFTIRQS] = {
+	"HI_SOFTIRQ", "TIMER_SOFTIRQ", "NET_TX_SOFTIRQ", "NET_RX_SOFTIRQ",
+	"BLOCK_SOFTIRQ", "TASKLET_SOFTIRQ", "SCHED_SOFTIRQ", "HRTIMER_SOFTIRQ",
+	"RCU_SOFTIRQ"
+};
+
 /*
  * we cannot loop indefinitely here to avoid userspace starvation,
  * but we also don't want to introduce a worst case 1/HZ latency
@@ -209,9 +215,10 @@ restart:
 			h->action(h);
 
 			if (unlikely(prev_count != preempt_count())) {
-				printk(KERN_ERR "huh, entered softirq %td %p"
+				printk(KERN_ERR "huh, entered softirq %td %s %p"
 				       "with preempt_count %08x,"
 				       " exited with %08x?\n", h - softirq_vec,
+				       softirq_to_name[h - softirq_vec],
 				       h->action, prev_count, preempt_count());
 				preempt_count() = prev_count;
 			}
-- 
cgit v1.2.3


From 39842323ceb368d2ea36ab7696aedbe296e13b61 Mon Sep 17 00:00:00 2001
From: Jason Baron <jbaron@redhat.com>
Date: Thu, 12 Mar 2009 14:36:03 -0400
Subject: tracing: tracepoints for softirq entry/exit - tracepoints

Introduce softirq entry/exit tracepoints. These are useful for
augmenting existing tracers, and to figure out softirq frequencies and
timings.

[
  s/irq_softirq_/softirq_/ for trace point names and
  Fixed printf format in TRACE_FORMAT macro
   - Steven Rostedt
]

LKML-Reference: <20090312183603.GC3352@redhat.com>
Signed-off-by: Jason Baron <jbaron@redhat.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 include/trace/irq_event_types.h | 12 ++++++++++++
 kernel/softirq.c                |  7 ++++++-
 2 files changed, 18 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/trace/irq_event_types.h b/include/trace/irq_event_types.h
index 214bb928fe9e..85964ebd47ec 100644
--- a/include/trace/irq_event_types.h
+++ b/include/trace/irq_event_types.h
@@ -40,4 +40,16 @@ TRACE_EVENT(irq_handler_exit,
 		  __entry->irq, __entry->ret ? "handled" : "unhandled")
 );
 
+TRACE_FORMAT(softirq_entry,
+	TP_PROTO(struct softirq_action *h, struct softirq_action *vec),
+	TP_ARGS(h, vec),
+	TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec])
+	);
+
+TRACE_FORMAT(softirq_exit,
+	TP_PROTO(struct softirq_action *h, struct softirq_action *vec),
+	TP_ARGS(h, vec),
+	TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec])
+	);
+
 #undef TRACE_SYSTEM
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 9f90fdc039f4..a5e81231ca7a 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -24,6 +24,7 @@
 #include <linux/ftrace.h>
 #include <linux/smp.h>
 #include <linux/tick.h>
+#include <trace/irq.h>
 
 #include <asm/irq.h>
 /*
@@ -186,6 +187,9 @@ EXPORT_SYMBOL(local_bh_enable_ip);
  */
 #define MAX_SOFTIRQ_RESTART 10
 
+DEFINE_TRACE(softirq_entry);
+DEFINE_TRACE(softirq_exit);
+
 asmlinkage void __do_softirq(void)
 {
 	struct softirq_action *h;
@@ -212,8 +216,9 @@ restart:
 		if (pending & 1) {
 			int prev_count = preempt_count();
 
+			trace_softirq_entry(h, softirq_vec);
 			h->action(h);
-
+			trace_softirq_exit(h, softirq_vec);
 			if (unlikely(prev_count != preempt_count())) {
 				printk(KERN_ERR "huh, entered softirq %td %s %p"
 				       "with preempt_count %08x,"
-- 
cgit v1.2.3


From 889a6c367283709a80dad9413488472596a1a1d2 Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Fri, 13 Mar 2009 09:03:04 +0900
Subject: tracing: Don't use tracing_record_cmdline() in workqueue tracer fix

commit c3ffc7a40b7e94b094efe1c8ab4e24370a782b65
"Don't use tracing_record_cmdline() in workqueue tracer"
has a race window.

find_task_by_vpid() requires task_list_lock().

LKML-Reference: <20090313090042.43CD.A69D9226@jp.fujitsu.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_workqueue.c | 20 ++++++++++++++------
 1 file changed, 14 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index fb5ccac8bbc0..9ab035b58cf1 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -193,12 +193,20 @@ static int workqueue_stat_show(struct seq_file *s, void *p)
 	struct cpu_workqueue_stats *cws = p;
 	unsigned long flags;
 	int cpu = cws->cpu;
-	struct task_struct *tsk = find_task_by_vpid(cws->pid);
-
-	seq_printf(s, "%3d %6d     %6u       %s\n", cws->cpu,
-		   atomic_read(&cws->inserted),
-		   cws->executed,
-		   tsk ? tsk->comm : "<...>");
+	struct pid *pid;
+	struct task_struct *tsk;
+
+	pid = find_get_pid(cws->pid);
+	if (pid) {
+		tsk = get_pid_task(pid, PIDTYPE_PID);
+		if (tsk) {
+			seq_printf(s, "%3d %6d     %6u       %s\n", cws->cpu,
+				   atomic_read(&cws->inserted), cws->executed,
+				   tsk->comm);
+			put_task_struct(tsk);
+		}
+		put_pid(pid);
+	}
 
 	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
 	if (&cws->list == workqueue_cpu_stat(cpu)->list.next)
-- 
cgit v1.2.3


From f28e55765e40450c127e44d00ae65d0cd1a4efec Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 12 Mar 2009 22:00:19 -0400
Subject: tracing: show event name in trace for TRACE_EVENT created events

Unlike TRACE_FORMAT() macros, the TRACE_EVENT() macros do not show
the event name in the trace file. Knowing the event type in the trace
output is very useful.

Instead of:

   task swapper:0 [140] ==> ntpd:3308 [120]

We now have:

   sched_switch: task swapper:0 [140] ==> ntpd:3308 [120]

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_events_stage_2.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h
index ca347afd6aa0..5117c43f5c67 100644
--- a/kernel/trace/trace_events_stage_2.h
+++ b/kernel/trace/trace_events_stage_2.h
@@ -57,7 +57,7 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags)	\
 									\
 	field = (typeof(field))entry;					\
 									\
-	ret = trace_seq_printf(s, print);				\
+	ret = trace_seq_printf(s, #call ": " print);			\
 	if (!ret)							\
 		return TRACE_TYPE_PARTIAL_LINE;				\
 									\
-- 
cgit v1.2.3


From 5cc985488845ec7227a2c5cfd2fd62cf57fb411a Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 12 Mar 2009 22:24:17 -0400
Subject: ring-buffer: document reader page design

In a private email conversation I explained how the ring buffer
page worked by using silly ASCII art. Ingo suggested that I add
that to the comments of the code.

Here it is.

Requested-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ring_buffer.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 68 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 2c36be9fac2e..58128ad2fde0 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -21,6 +21,74 @@
 
 #include "trace.h"
 
+/*
+ * The ring buffer is made up of a list of pages. A separate list of pages is
+ * allocated for each CPU. A writer may only write to a buffer that is
+ * associated with the CPU it is currently executing on.  A reader may read
+ * from any per cpu buffer.
+ *
+ * The reader is special. For each per cpu buffer, the reader has its own
+ * reader page. When a reader has read the entire reader page, this reader
+ * page is swapped with another page in the ring buffer.
+ *
+ * Now, as long as the writer is off the reader page, the reader can do what
+ * ever it wants with that page. The writer will never write to that page
+ * again (as long as it is out of the ring buffer).
+ *
+ * Here's some silly ASCII art.
+ *
+ *   +------+
+ *   |reader|          RING BUFFER
+ *   |page  |
+ *   +------+        +---+   +---+   +---+
+ *                   |   |-->|   |-->|   |
+ *                   +---+   +---+   +---+
+ *                     ^               |
+ *                     |               |
+ *                     +---------------+
+ *
+ *
+ *   +------+
+ *   |reader|          RING BUFFER
+ *   |page  |------------------v
+ *   +------+        +---+   +---+   +---+
+ *                   |   |-->|   |-->|   |
+ *                   +---+   +---+   +---+
+ *                     ^               |
+ *                     |               |
+ *                     +---------------+
+ *
+ *
+ *   +------+
+ *   |reader|          RING BUFFER
+ *   |page  |------------------v
+ *   +------+        +---+   +---+   +---+
+ *      ^            |   |-->|   |-->|   |
+ *      |            +---+   +---+   +---+
+ *      |                              |
+ *      |                              |
+ *      +------------------------------+
+ *
+ *
+ *   +------+
+ *   |buffer|          RING BUFFER
+ *   |page  |------------------v
+ *   +------+        +---+   +---+   +---+
+ *      ^            |   |   |   |-->|   |
+ *      |   New      +---+   +---+   +---+
+ *      |  Reader------^               |
+ *      |   page                       |
+ *      +------------------------------+
+ *
+ *
+ * After we make this swap, the reader can hand this page off to the splice
+ * code and be done with it. It can even allocate a new page if it needs to
+ * and swap that into the ring buffer.
+ *
+ * We will be using cmpxchg soon to make all this lockless.
+ *
+ */
+
 /*
  * A fast way to enable or disable all ring buffers is to
  * call tracing_on or tracing_off. Turning off the ring buffers
-- 
cgit v1.2.3


From eb1871f34358024acfa3523ef375ef14b7527173 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 13 Mar 2009 00:00:58 -0400
Subject: tracing: left align location header in stack_trace

Ingo Molnar suggested, instead of:

        Depth    Size      Location    (27 entries)
        -----    ----      --------
  0)     2880      48   lock_timer_base+0x2b/0x4f
  1)     2832      80   __mod_timer+0x33/0xe0
  2)     2752      16   __ide_set_handler+0x63/0x65

To have it be:

        Depth    Size   Location    (27 entries)
        -----    ----   --------
  0)     2880      48   lock_timer_base+0x2b/0x4f
  1)     2832      80   __mod_timer+0x33/0xe0
  2)     2752      16   __ide_set_handler+0x63/0x65

Requested-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_stack.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c
index 91ccbf396c9a..c750f65f9661 100644
--- a/kernel/trace/trace_stack.c
+++ b/kernel/trace/trace_stack.c
@@ -262,9 +262,9 @@ static int t_show(struct seq_file *m, void *v)
 	int size;
 
 	if (v == SEQ_START_TOKEN) {
-		seq_printf(m, "        Depth    Size      Location"
+		seq_printf(m, "        Depth    Size   Location"
 			   "    (%d entries)\n"
-			   "        -----    ----      --------\n",
+			   "        -----    ----   --------\n",
 			   max_stack_trace.nr_entries);
 
 		if (!stack_tracer_enabled && !max_stack_size)
-- 
cgit v1.2.3


From bdc067582b8b71c7771bab076bbc51569c594fb4 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 13 Mar 2009 00:12:52 -0400
Subject: tracing: add comment for use of double __builtin_consant_p

Impact: documentation

The use of the double __builtin_contant_p checks in the event_trace_printk
can be confusing to developers and reviewers. This patch adds a comment
to explain why it is there.

Requested-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
LKML-Reference: <20090313122235.43EB.A69D9226@jp.fujitsu.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.h | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 35cfa7bbaf38..67595b8f0f15 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -776,6 +776,11 @@ extern struct ftrace_event_call __stop_ftrace_events[];
 extern const char *__start___trace_bprintk_fmt[];
 extern const char *__stop___trace_bprintk_fmt[];
 
+/*
+ * The double __builtin_constant_p is because gcc will give us an error
+ * if we try to allocate the static variable to fmt if it is not a
+ * constant. Even with the outer if statement optimizing out.
+ */
 #define event_trace_printk(ip, fmt, args...)				\
 do {									\
 	__trace_printk_check_format(fmt, ##args);			\
-- 
cgit v1.2.3


From c69fc56de1df5769f2ec69c915c7ad5afe63804c Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Fri, 13 Mar 2009 14:49:46 +1030
Subject: cpumask: use topology_core_cpumask/topology_thread_cpumask instead of
 cpu_core_map/cpu_sibling_map

Impact: cleanup

This is presumably what those definitions are for, and while all archs
define cpu_core_map/cpu_sibling map, that's changing (eg. x86 wants to
change it to a pointer).

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 block/blk.h    | 2 +-
 kernel/sched.c | 8 ++++----
 2 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/block/blk.h b/block/blk.h
index 0dce92c37496..3ee94358b43d 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -102,7 +102,7 @@ static inline int blk_cpu_to_group(int cpu)
 	const struct cpumask *mask = cpu_coregroup_mask(cpu);
 	return cpumask_first(mask);
 #elif defined(CONFIG_SCHED_SMT)
-	return first_cpu(per_cpu(cpu_sibling_map, cpu));
+	return cpumask_first(topology_thread_cpumask(cpu));
 #else
 	return cpu;
 #endif
diff --git a/kernel/sched.c b/kernel/sched.c
index 0a76d0b6f215..5dabd80c3c15 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7249,7 +7249,7 @@ cpu_to_core_group(int cpu, const struct cpumask *cpu_map,
 {
 	int group;
 
-	cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
+	cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
 	group = cpumask_first(mask);
 	if (sg)
 		*sg = &per_cpu(sched_group_core, group).sg;
@@ -7278,7 +7278,7 @@ cpu_to_phys_group(int cpu, const struct cpumask *cpu_map,
 	cpumask_and(mask, cpu_coregroup_mask(cpu), cpu_map);
 	group = cpumask_first(mask);
 #elif defined(CONFIG_SCHED_SMT)
-	cpumask_and(mask, &per_cpu(cpu_sibling_map, cpu), cpu_map);
+	cpumask_and(mask, topology_thread_cpumask(cpu), cpu_map);
 	group = cpumask_first(mask);
 #else
 	group = cpu;
@@ -7621,7 +7621,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 		SD_INIT(sd, SIBLING);
 		set_domain_attribute(sd, attr);
 		cpumask_and(sched_domain_span(sd),
-			    &per_cpu(cpu_sibling_map, i), cpu_map);
+			    topology_thread_cpumask(i), cpu_map);
 		sd->parent = p;
 		p->child = sd;
 		cpu_to_cpu_group(i, cpu_map, &sd->groups, tmpmask);
@@ -7632,7 +7632,7 @@ static int __build_sched_domains(const struct cpumask *cpu_map,
 	/* Set up CPU (sibling) groups */
 	for_each_cpu(i, cpu_map) {
 		cpumask_and(this_sibling_map,
-			    &per_cpu(cpu_sibling_map, i), cpu_map);
+			    topology_thread_cpumask(i), cpu_map);
 		if (i != cpumask_first(this_sibling_map))
 			continue;
 
-- 
cgit v1.2.3


From 7f96f93f02b7637491a1637dee12dcdcd40b9802 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 13 Mar 2009 00:37:42 -0400
Subject: tracing: move binary buffers into per cpu directory

The binary_buffers directory in /debugfs/tracing held the files
to read the trace buffers in a binary format. This held one file
per CPU buffer. But we also have a per_cpu directory that holds
a way to read the pretty-print formats.

This patch moves the binary buffers into the per_cpu_directory:

 # ls /debug/tracing/per_cpu/cpu1/
trace  trace_pipe  trace_pipe_raw

The new name is called "trace_pipe_raw". The binary buffers always
acted similar to trace_pipe, except that they produce raw data.

Requested-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c | 26 +++++---------------------
 1 file changed, 5 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index dbb077d8a172..efe3202c0209 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3543,6 +3543,11 @@ static void tracing_init_debugfs_percpu(long cpu)
 				(void *) cpu, &tracing_fops);
 	if (!entry)
 		pr_warning("Could not create debugfs 'trace' entry\n");
+
+	entry = debugfs_create_file("trace_pipe_raw", 0444, d_cpu,
+				    (void *) cpu, &tracing_buffers_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs 'trace_pipe_raw' entry\n");
 }
 
 #ifdef CONFIG_FTRACE_SELFTEST
@@ -3826,7 +3831,6 @@ static __init void create_trace_options_dir(void)
 static __init int tracer_init_debugfs(void)
 {
 	struct dentry *d_tracer;
-	struct dentry *buffers;
 	struct dentry *entry;
 	int cpu;
 
@@ -3899,26 +3903,6 @@ static __init int tracer_init_debugfs(void)
 		pr_warning("Could not create debugfs "
 			   "'trace_marker' entry\n");
 
-	buffers = debugfs_create_dir("binary_buffers", d_tracer);
-
-	if (!buffers)
-		pr_warning("Could not create buffers directory\n");
-	else {
-		int cpu;
-		char buf[64];
-
-		for_each_tracing_cpu(cpu) {
-			sprintf(buf, "%d", cpu);
-
-			entry = debugfs_create_file(buf, 0444, buffers,
-						    (void *)(long)cpu,
-						    &tracing_buffers_fops);
-			if (!entry)
-				pr_warning("Could not create debugfs buffers "
-					   "'%s' entry\n", buf);
-		}
-	}
-
 #ifdef CONFIG_DYNAMIC_FTRACE
 	entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer,
 				    &ftrace_update_tot_cnt,
-- 
cgit v1.2.3


From 899039e8746bb9a09b6487ddb8ab2275ce9d0256 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 13 Mar 2009 00:43:33 -0400
Subject: softirq: no need to have SOFTIRQ in softirq name

Impact: clean up

It is redundant to have 'SOFTIRQ' in the softirq names.

Reported-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/softirq.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/softirq.c b/kernel/softirq.c
index a5e81231ca7a..65ff3e3961b4 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -55,9 +55,8 @@ static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp
 static DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
 
 char *softirq_to_name[NR_SOFTIRQS] = {
-	"HI_SOFTIRQ", "TIMER_SOFTIRQ", "NET_TX_SOFTIRQ", "NET_RX_SOFTIRQ",
-	"BLOCK_SOFTIRQ", "TASKLET_SOFTIRQ", "SCHED_SOFTIRQ", "HRTIMER_SOFTIRQ",
-	"RCU_SOFTIRQ"
+	"HI", "TIMER", "NET_TX", "NET_RX", "BLOCK",
+	"TASKLET", "SCHED", "HRTIMER",	"RCU"
 };
 
 /*
-- 
cgit v1.2.3


From ee08c6eccb7d1295516f7cf420fddf7b14e9146f Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 7 Mar 2009 05:52:59 +0100
Subject: tracing/ftrace: syscall tracing infrastructure, basics

Provide basic callbacks to do syscall tracing.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
LKML-Reference: <1236401580-5758-2-git-send-email-fweisbec@gmail.com>
[ simplified it to a trace_printk() for now. ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace.h        |  21 ++++++++
 kernel/trace/Kconfig          |  10 ++++
 kernel/trace/Makefile         |   1 +
 kernel/trace/trace.h          |   2 +
 kernel/trace/trace_syscalls.c | 113 ++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 147 insertions(+)
 create mode 100644 kernel/trace/trace_syscalls.c

(limited to 'kernel')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index e1583f2639b0..c146c1021a29 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -503,4 +503,25 @@ static inline void trace_hw_branch_oops(void) {}
 
 #endif /* CONFIG_HW_BRANCH_TRACER */
 
+/*
+ * A syscall entry in the ftrace syscalls array.
+ *
+ * @syscall_nr: syscall number
+ */
+struct syscall_trace_entry {
+	int		syscall_nr;
+};
+
+#ifdef CONFIG_FTRACE_SYSCALLS
+extern void start_ftrace_syscalls(void);
+extern void stop_ftrace_syscalls(void);
+extern void ftrace_syscall_enter(struct pt_regs *regs);
+extern void ftrace_syscall_exit(struct pt_regs *regs);
+#else
+static inline void start_ftrace_syscalls(void) { }
+static inline void stop_ftrace_syscalls(void) { }
+static inline void ftrace_syscall_enter(struct pt_regs *regs) { }
+static inline void ftrace_syscall_exit(struct pt_regs *regs) { }
+#endif
+
 #endif /* _LINUX_FTRACE_H */
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 8e4a2a61cd75..95a0ad191f19 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -34,6 +34,9 @@ config HAVE_FTRACE_MCOUNT_RECORD
 config HAVE_HW_BRANCH_TRACER
 	bool
 
+config HAVE_FTRACE_SYSCALLS
+	bool
+
 config TRACER_MAX_TRACE
 	bool
 
@@ -175,6 +178,13 @@ config EVENT_TRACER
 	  allowing the user to pick and choose which trace point they
 	  want to trace.
 
+config FTRACE_SYSCALLS
+	bool "Trace syscalls"
+	depends on HAVE_FTRACE_SYSCALLS
+	select TRACING
+	help
+	  Basic tracer to catch the syscall entry and exit events.
+
 config BOOT_TRACER
 	bool "Trace boot initcalls"
 	select TRACING
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index c7a2943796eb..c3feea01c3e0 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -43,5 +43,6 @@ obj-$(CONFIG_BLK_DEV_IO_TRACE)	+= blktrace.o
 obj-$(CONFIG_EVENT_TRACER) += trace_events.o
 obj-$(CONFIG_EVENT_TRACER) += events.o
 obj-$(CONFIG_EVENT_TRACER) += trace_export.o
+obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
 
 libftrace-y := ftrace.o
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index c5e1d8865fe4..3d49daae47dc 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -30,6 +30,8 @@ enum trace_type {
 	TRACE_GRAPH_ENT,
 	TRACE_USER_STACK,
 	TRACE_HW_BRANCHES,
+	TRACE_SYSCALL_ENTER,
+	TRACE_SYSCALL_EXIT,
 	TRACE_KMEM_ALLOC,
 	TRACE_KMEM_FREE,
 	TRACE_POWER,
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
new file mode 100644
index 000000000000..66cf97449af3
--- /dev/null
+++ b/kernel/trace/trace_syscalls.c
@@ -0,0 +1,113 @@
+#include <linux/ftrace.h>
+#include <linux/kernel.h>
+
+#include <asm/syscall.h>
+
+#include "trace_output.h"
+#include "trace.h"
+
+static atomic_t refcount;
+
+void start_ftrace_syscalls(void)
+{
+	unsigned long flags;
+	struct task_struct *g, *t;
+
+	if (atomic_inc_return(&refcount) != 1)
+		goto out;
+
+	read_lock_irqsave(&tasklist_lock, flags);
+
+	do_each_thread(g, t) {
+		set_tsk_thread_flag(t, TIF_SYSCALL_FTRACE);
+	} while_each_thread(g, t);
+
+	read_unlock_irqrestore(&tasklist_lock, flags);
+out:
+	atomic_dec(&refcount);
+}
+
+void stop_ftrace_syscalls(void)
+{
+	unsigned long flags;
+	struct task_struct *g, *t;
+
+	if (atomic_dec_return(&refcount))
+		goto out;
+
+	read_lock_irqsave(&tasklist_lock, flags);
+
+	do_each_thread(g, t) {
+		clear_tsk_thread_flag(t, TIF_SYSCALL_FTRACE);
+	} while_each_thread(g, t);
+
+	read_unlock_irqrestore(&tasklist_lock, flags);
+out:
+	atomic_inc(&refcount);
+}
+
+void ftrace_syscall_enter(struct pt_regs *regs)
+{
+	int syscall_nr;
+
+	syscall_nr = syscall_get_nr(current, regs);
+
+	trace_printk("syscall %d enter\n", syscall_nr);
+}
+
+void ftrace_syscall_exit(struct pt_regs *regs)
+{
+	int syscall_nr;
+
+	syscall_nr = syscall_get_nr(current, regs);
+
+	trace_printk("syscall %d exit\n", syscall_nr);
+}
+
+static int init_syscall_tracer(struct trace_array *tr)
+{
+	start_ftrace_syscalls();
+
+	return 0;
+}
+
+static void reset_syscall_tracer(struct trace_array *tr)
+{
+	stop_ftrace_syscalls();
+}
+
+static struct trace_event syscall_enter_event = {
+	.type		= TRACE_SYSCALL_ENTER,
+};
+
+static struct trace_event syscall_exit_event = {
+	.type		= TRACE_SYSCALL_EXIT,
+};
+
+static struct tracer syscall_tracer __read_mostly = {
+	.name		= "syscall",
+	.init		= init_syscall_tracer,
+	.reset		= reset_syscall_tracer
+};
+
+__init int register_ftrace_syscalls(void)
+{
+	int ret;
+
+	ret = register_ftrace_event(&syscall_enter_event);
+	if (!ret) {
+		printk(KERN_WARNING "event %d failed to register\n",
+		       syscall_enter_event.type);
+		WARN_ON_ONCE(1);
+	}
+
+	ret = register_ftrace_event(&syscall_exit_event);
+	if (!ret) {
+		printk(KERN_WARNING "event %d failed to register\n",
+		       syscall_exit_event.type);
+		WARN_ON_ONCE(1);
+	}
+
+	return register_tracer(&syscall_tracer);
+}
+device_initcall(register_ftrace_syscalls);
-- 
cgit v1.2.3


From b00f0b6dc1773b4c8f538503247da050b5ea631b Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Fri, 13 Mar 2009 17:14:01 +0800
Subject: ftrace: avoid double-free of dyn_ftrace

If dyn_ftrace is freed before ftrace_release(), ftrace_release()
will free it again and make ftrace_free_records wrong.

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Cc: "Steven Rostedt ;" <rostedt@goodmis.org>
LKML-Reference: <49BA23D9.1050900@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index d33d306bdcf4..26c45aaf6805 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -356,7 +356,8 @@ void ftrace_release(void *start, unsigned long size)
 
 	mutex_lock(&ftrace_lock);
 	do_for_each_ftrace_rec(pg, rec) {
-		if ((rec->ip >= s) && (rec->ip < e))
+		if ((rec->ip >= s) && (rec->ip < e) &&
+		    !(rec->flags & FTRACE_FL_FREE))
 			ftrace_free_rec(rec);
 	} while_for_each_ftrace_rec();
 	mutex_unlock(&ftrace_lock);
-- 
cgit v1.2.3


From fa9d13cf135efbd454453a53b6299976bea245a9 Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Fri, 13 Mar 2009 17:16:34 +0800
Subject: ftrace: don't try to __ftrace_replace_code on !FTRACE_FL_CONVERTED
 rec

Do __ftrace_replace_code for !FTRACE_FL_CONVERTED rec will always
fail, we should ignore this rec.

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Cc: "Steven Rostedt ;" <rostedt@goodmis.org>
LKML-Reference: <49BA2472.4060206@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 26c45aaf6805..08f4a624e31f 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -532,11 +532,12 @@ static void ftrace_replace_code(int enable)
 
 	do_for_each_ftrace_rec(pg, rec) {
 		/*
-		 * Skip over free records and records that have
-		 * failed.
+		 * Skip over free records, records that have
+		 * failed and not converted.
 		 */
 		if (rec->flags & FTRACE_FL_FREE ||
-		    rec->flags & FTRACE_FL_FAILED)
+		    rec->flags & FTRACE_FL_FAILED ||
+		    rec->flags & FTRACE_FL_CONVERTED)
 			continue;
 
 		/* ignore updates to this record's mcount site */
@@ -548,7 +549,7 @@ static void ftrace_replace_code(int enable)
 		}
 
 		failed = __ftrace_replace_code(rec, enable);
-		if (failed && (rec->flags & FTRACE_FL_CONVERTED)) {
+		if (failed) {
 			rec->flags |= FTRACE_FL_FAILED;
 			if ((system_state == SYSTEM_BOOTING) ||
 			    !core_kernel_text(rec->ip)) {
-- 
cgit v1.2.3


From 88f502fedba82eff252b6420e8b8328e4ae25c67 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 13 Mar 2009 10:32:07 +0100
Subject: futex: remove the pointer math from double_unlock_hb, fix

Impact: fix double unlock crash

Thomas Gleixner noticed that the simplified double_unlock_hb()
became ... too unsophisticated: in the hb1 == hb2 case it will
do a double unlock.

Reported-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Darren Hart <dvhltc@us.ibm.com>
LKML-Reference: <20090312221118.11146.68610.stgit@Aeon>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/futex.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/futex.c b/kernel/futex.c
index 2331b73f6932..6b50a024bca2 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -659,7 +659,8 @@ static inline void
 double_unlock_hb(struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2)
 {
 	spin_unlock(&hb1->lock);
-	spin_unlock(&hb2->lock);
+	if (hb1 != hb2)
+		spin_unlock(&hb2->lock);
 }
 
 /*
-- 
cgit v1.2.3


From 641cd4cfcdc71ce01535b31cc4d57d59a1fae1fc Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 13 Mar 2009 10:47:34 +0100
Subject: generic-ipi: eliminate WARN_ON()s during oops/panic

Do not output smp-call related warnings in the oops/panic codepath.

Reported-by: Jan Beulich <jbeulich@novell.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <49B91A7E.76E4.0078.0@novell.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/smp.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/smp.c b/kernel/smp.c
index 7ad2262d2eca..858baac568ee 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -5,6 +5,7 @@
  */
 #include <linux/rcupdate.h>
 #include <linux/rculist.h>
+#include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/percpu.h>
 #include <linux/init.h>
@@ -285,7 +286,7 @@ int smp_call_function_single(int cpu, void (*func) (void *info), void *info,
 	this_cpu = get_cpu();
 
 	/* Can deadlock when called with interrupts disabled */
-	WARN_ON(irqs_disabled());
+	WARN_ON_ONCE(irqs_disabled() && !oops_in_progress);
 
 	if (cpu == this_cpu) {
 		local_irq_save(flags);
@@ -329,7 +330,7 @@ void __smp_call_function_single(int cpu, struct call_single_data *data,
 	csd_lock(data);
 
 	/* Can deadlock when called with interrupts disabled */
-	WARN_ON(wait && irqs_disabled());
+	WARN_ON_ONCE(wait && irqs_disabled() && !oops_in_progress);
 
 	generic_exec_single(cpu, data, wait);
 }
@@ -365,7 +366,7 @@ void smp_call_function_many(const struct cpumask *mask,
 	int cpu, next_cpu, this_cpu = smp_processor_id();
 
 	/* Can deadlock when called with interrupts disabled */
-	WARN_ON(irqs_disabled());
+	WARN_ON_ONCE(irqs_disabled() && !oops_in_progress);
 
 	/* So, what's a CPU they want? Ignoring this one. */
 	cpu = cpumask_first_and(mask, cpu_online_mask);
-- 
cgit v1.2.3


From ffd71da4e3f323b7673b061e6f7e0d0c12dc2b49 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 13 Mar 2009 10:54:24 +0100
Subject: panic: decrease oops_in_progress only after having done the panic

Impact: eliminate secondary warnings during panic()

We can panic() in a number of difficult, atomic contexts, hence
we use bust_spinlocks(1) in panic() to increase oops_in_progress,
which prevents various debug checks we have in place.

But in practice this protection only covers the first few printk's
done by panic() - it does not cover the later attempt to stop all
other CPUs and kexec(). If a secondary warning triggers in one of
those facilities that can make the panic message scroll off.

So do bust_spinlocks(0) only much later in panic(). (which code
is only reached if panic policy is relaxed that it can return
after a warning message)

Reported-by: Jan Beulich <jbeulich@novell.com>
LKML-Reference: <49B91A7E.76E4.0078.0@novell.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/panic.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/panic.c b/kernel/panic.c
index 32fe4eff1b89..57fb005de546 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -77,7 +77,6 @@ NORET_TYPE void panic(const char * fmt, ...)
 #ifdef CONFIG_DEBUG_BUGVERBOSE
 	dump_stack();
 #endif
-	bust_spinlocks(0);
 
 	/*
 	 * If we have crashed and we have a crash kernel loaded let it handle
@@ -136,6 +135,7 @@ NORET_TYPE void panic(const char * fmt, ...)
 		mdelay(1);
 		i++;
 	}
+	bust_spinlocks(0);
 }
 
 EXPORT_SYMBOL(panic);
-- 
cgit v1.2.3


From d1dedb52acd98bd5e13e1ff4c4d045d58bbd16fe Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 13 Mar 2009 11:14:06 +0100
Subject: panic, smp: provide smp_send_stop() wrapper on UP too

Impact: cleanup, no code changed

Remove an ugly #ifdef CONFIG_SMP from panic(), by providing
an smp_send_stop() wrapper on UP too.

LKML-Reference: <49B91A7E.76E4.0078.0@novell.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/smp.h | 4 +++-
 kernel/panic.c      | 2 --
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/smp.h b/include/linux/smp.h
index 2d3bcb6b37ff..a69db820eed6 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -38,7 +38,7 @@ int smp_call_function_single(int cpuid, void (*func) (void *info), void *info,
 /*
  * main cross-CPU interfaces, handles INIT, TLB flush, STOP, etc.
  * (defined in asm header):
- */ 
+ */
 
 /*
  * stops all CPUs but the current one:
@@ -122,6 +122,8 @@ extern unsigned int setup_max_cpus;
 
 #else /* !SMP */
 
+static inline void smp_send_stop(void) { }
+
 /*
  *	These macros fold the SMP functionality into a single CPU system
  */
diff --git a/kernel/panic.c b/kernel/panic.c
index 57fb005de546..ca75e819d0ea 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -85,14 +85,12 @@ NORET_TYPE void panic(const char * fmt, ...)
 	 */
 	crash_kexec(NULL);
 
-#ifdef CONFIG_SMP
 	/*
 	 * Note smp_send_stop is the usual smp shutdown function, which
 	 * unfortunately means it may not be hardened to work in a panic
 	 * situation.
 	 */
 	smp_send_stop();
-#endif
 
 	atomic_notifier_call_chain(&panic_notifier_list, 0, buf);
 
-- 
cgit v1.2.3


From c95dbf27e201587b2a6cf63f8501a0313d9b4801 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 13 Mar 2009 11:14:06 +0100
Subject: panic: clean up kernel/panic.c

Impact: cleanup, no code changed

Clean up kernel/panic.c some more and make it more consistent.

LKML-Reference: <49B91A7E.76E4.0078.0@novell.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/panic.c | 111 ++++++++++++++++++++++++++++++---------------------------
 1 file changed, 59 insertions(+), 52 deletions(-)

(limited to 'kernel')

diff --git a/kernel/panic.c b/kernel/panic.c
index ca75e819d0ea..3fd8c5bf8b39 100644
--- a/kernel/panic.c
+++ b/kernel/panic.c
@@ -8,19 +8,19 @@
  * This function is used through-out the kernel (including mm and fs)
  * to indicate a major problem.
  */
+#include <linux/debug_locks.h>
+#include <linux/interrupt.h>
+#include <linux/kallsyms.h>
+#include <linux/notifier.h>
 #include <linux/module.h>
-#include <linux/sched.h>
-#include <linux/delay.h>
+#include <linux/random.h>
 #include <linux/reboot.h>
-#include <linux/notifier.h>
-#include <linux/init.h>
+#include <linux/delay.h>
+#include <linux/kexec.h>
+#include <linux/sched.h>
 #include <linux/sysrq.h>
-#include <linux/interrupt.h>
+#include <linux/init.h>
 #include <linux/nmi.h>
-#include <linux/kexec.h>
-#include <linux/debug_locks.h>
-#include <linux/random.h>
-#include <linux/kallsyms.h>
 #include <linux/dmi.h>
 
 int panic_on_oops;
@@ -52,19 +52,15 @@ EXPORT_SYMBOL(panic_blink);
  *
  *	This function never returns.
  */
-
 NORET_TYPE void panic(const char * fmt, ...)
 {
-	long i;
 	static char buf[1024];
 	va_list args;
-#if defined(CONFIG_S390)
-	unsigned long caller = (unsigned long) __builtin_return_address(0);
-#endif
+	long i;
 
 	/*
-	 * It's possible to come here directly from a panic-assertion and not
-	 * have preempt disabled. Some functions called from here want
+	 * It's possible to come here directly from a panic-assertion and
+	 * not have preempt disabled. Some functions called from here want
 	 * preempt to be disabled. No point enabling it later though...
 	 */
 	preempt_disable();
@@ -99,19 +95,21 @@ NORET_TYPE void panic(const char * fmt, ...)
 
 	if (panic_timeout > 0) {
 		/*
-	 	 * Delay timeout seconds before rebooting the machine. 
-		 * We can't use the "normal" timers since we just panicked..
-	 	 */
-		printk(KERN_EMERG "Rebooting in %d seconds..",panic_timeout);
+		 * Delay timeout seconds before rebooting the machine.
+		 * We can't use the "normal" timers since we just panicked.
+		 */
+		printk(KERN_EMERG "Rebooting in %d seconds..", panic_timeout);
+
 		for (i = 0; i < panic_timeout*1000; ) {
 			touch_nmi_watchdog();
 			i += panic_blink(i);
 			mdelay(1);
 			i++;
 		}
-		/*	This will not be a clean reboot, with everything
-		 *	shutting down.  But if there is a chance of
-		 *	rebooting the system it will be rebooted.
+		/*
+		 * This will not be a clean reboot, with everything
+		 * shutting down.  But if there is a chance of
+		 * rebooting the system it will be rebooted.
 		 */
 		emergency_restart();
 	}
@@ -124,10 +122,15 @@ NORET_TYPE void panic(const char * fmt, ...)
 	}
 #endif
 #if defined(CONFIG_S390)
-	disabled_wait(caller);
+	{
+		unsigned long caller;
+
+		caller = (unsigned long)__builtin_return_address(0);
+		disabled_wait(caller);
+	}
 #endif
 	local_irq_enable();
-	for (i = 0;;) {
+	for (i = 0; ; ) {
 		touch_softlockup_watchdog();
 		i += panic_blink(i);
 		mdelay(1);
@@ -140,23 +143,23 @@ EXPORT_SYMBOL(panic);
 
 
 struct tnt {
-	u8 bit;
-	char true;
-	char false;
+	u8	bit;
+	char	true;
+	char	false;
 };
 
 static const struct tnt tnts[] = {
-	{ TAINT_PROPRIETARY_MODULE, 'P', 'G' },
-	{ TAINT_FORCED_MODULE, 'F', ' ' },
-	{ TAINT_UNSAFE_SMP, 'S', ' ' },
-	{ TAINT_FORCED_RMMOD, 'R', ' ' },
-	{ TAINT_MACHINE_CHECK, 'M', ' ' },
-	{ TAINT_BAD_PAGE, 'B', ' ' },
-	{ TAINT_USER, 'U', ' ' },
-	{ TAINT_DIE, 'D', ' ' },
-	{ TAINT_OVERRIDDEN_ACPI_TABLE, 'A', ' ' },
-	{ TAINT_WARN, 'W', ' ' },
-	{ TAINT_CRAP, 'C', ' ' },
+	{ TAINT_PROPRIETARY_MODULE,	'P', 'G' },
+	{ TAINT_FORCED_MODULE,		'F', ' ' },
+	{ TAINT_UNSAFE_SMP,		'S', ' ' },
+	{ TAINT_FORCED_RMMOD,		'R', ' ' },
+	{ TAINT_MACHINE_CHECK,		'M', ' ' },
+	{ TAINT_BAD_PAGE,		'B', ' ' },
+	{ TAINT_USER,			'U', ' ' },
+	{ TAINT_DIE,			'D', ' ' },
+	{ TAINT_OVERRIDDEN_ACPI_TABLE,	'A', ' ' },
+	{ TAINT_WARN,			'W', ' ' },
+	{ TAINT_CRAP,			'C', ' ' },
 };
 
 /**
@@ -193,7 +196,8 @@ const char *print_tainted(void)
 		*s = 0;
 	} else
 		snprintf(buf, sizeof(buf), "Not tainted");
-	return(buf);
+
+	return buf;
 }
 
 int test_taint(unsigned flag)
@@ -209,7 +213,8 @@ unsigned long get_taint(void)
 
 void add_taint(unsigned flag)
 {
-	debug_locks = 0; /* can't trust the integrity of the kernel anymore */
+	/* can't trust the integrity of the kernel anymore: */
+	debug_locks = 0;
 	set_bit(flag, &tainted_mask);
 }
 EXPORT_SYMBOL(add_taint);
@@ -264,8 +269,8 @@ static void do_oops_enter_exit(void)
 }
 
 /*
- * Return true if the calling CPU is allowed to print oops-related info.  This
- * is a bit racy..
+ * Return true if the calling CPU is allowed to print oops-related info.
+ * This is a bit racy..
  */
 int oops_may_print(void)
 {
@@ -274,20 +279,22 @@ int oops_may_print(void)
 
 /*
  * Called when the architecture enters its oops handler, before it prints
- * anything.  If this is the first CPU to oops, and it's oopsing the first time
- * then let it proceed.
+ * anything.  If this is the first CPU to oops, and it's oopsing the first
+ * time then let it proceed.
  *
- * This is all enabled by the pause_on_oops kernel boot option.  We do all this
- * to ensure that oopses don't scroll off the screen.  It has the side-effect
- * of preventing later-oopsing CPUs from mucking up the display, too.
+ * This is all enabled by the pause_on_oops kernel boot option.  We do all
+ * this to ensure that oopses don't scroll off the screen.  It has the
+ * side-effect of preventing later-oopsing CPUs from mucking up the display,
+ * too.
  *
- * It turns out that the CPU which is allowed to print ends up pausing for the
- * right duration, whereas all the other CPUs pause for twice as long: once in
- * oops_enter(), once in oops_exit().
+ * It turns out that the CPU which is allowed to print ends up pausing for
+ * the right duration, whereas all the other CPUs pause for twice as long:
+ * once in oops_enter(), once in oops_exit().
  */
 void oops_enter(void)
 {
-	debug_locks_off(); /* can't trust the integrity of the kernel anymore */
+	/* can't trust the integrity of the kernel anymore: */
+	debug_locks_off();
 	do_oops_enter_exit();
 }
 
-- 
cgit v1.2.3


From 850a80cfaa5aec3e626eb3736eff890a80e4fa77 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Fri, 13 Mar 2009 17:47:23 +0800
Subject: ftrace: use seq_read

Impact: cleanup

VFS layer has tested the file mode, we do not need test it.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Steven Rostedt <srostedt@redhat.com>
LKML-Reference: <49BA2BAB.6010608@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 26 +++-----------------------
 1 file changed, 3 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 08f4a624e31f..bf78a4c75c67 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1120,16 +1120,6 @@ ftrace_notrace_open(struct inode *inode, struct file *file)
 	return ftrace_regex_open(inode, file, 0);
 }
 
-static ssize_t
-ftrace_regex_read(struct file *file, char __user *ubuf,
-		       size_t cnt, loff_t *ppos)
-{
-	if (file->f_mode & FMODE_READ)
-		return seq_read(file, ubuf, cnt, ppos);
-	else
-		return -EPERM;
-}
-
 static loff_t
 ftrace_regex_lseek(struct file *file, loff_t offset, int origin)
 {
@@ -1882,7 +1872,7 @@ static const struct file_operations ftrace_failures_fops = {
 
 static const struct file_operations ftrace_filter_fops = {
 	.open = ftrace_filter_open,
-	.read = ftrace_regex_read,
+	.read = seq_read,
 	.write = ftrace_filter_write,
 	.llseek = ftrace_regex_lseek,
 	.release = ftrace_filter_release,
@@ -1890,7 +1880,7 @@ static const struct file_operations ftrace_filter_fops = {
 
 static const struct file_operations ftrace_notrace_fops = {
 	.open = ftrace_notrace_open,
-	.read = ftrace_regex_read,
+	.read = seq_read,
 	.write = ftrace_notrace_write,
 	.llseek = ftrace_regex_lseek,
 	.release = ftrace_notrace_release,
@@ -1992,16 +1982,6 @@ ftrace_graph_open(struct inode *inode, struct file *file)
 	return ret;
 }
 
-static ssize_t
-ftrace_graph_read(struct file *file, char __user *ubuf,
-		       size_t cnt, loff_t *ppos)
-{
-	if (file->f_mode & FMODE_READ)
-		return seq_read(file, ubuf, cnt, ppos);
-	else
-		return -EPERM;
-}
-
 static int
 ftrace_set_func(unsigned long *array, int *idx, char *buffer)
 {
@@ -2132,7 +2112,7 @@ ftrace_graph_write(struct file *file, const char __user *ubuf,
 
 static const struct file_operations ftrace_graph_fops = {
 	.open = ftrace_graph_open,
-	.read = ftrace_graph_read,
+	.read = seq_read,
 	.write = ftrace_graph_write,
 };
 #endif /* CONFIG_FUNCTION_GRAPH_TRACER */
-- 
cgit v1.2.3


From e94142a67f8bad494c593f0a07c9fc2fbec98c0e Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Fri, 13 Mar 2009 17:51:27 +0800
Subject: ftrace: remove struct list_head from struct dyn_ftrace

Impact: save memory

The struct dyn_ftrace table is very large, this patch will save
about 50%.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Steven Rostedt <srostedt@redhat.com>
LKML-Reference: <49BA2C9F.8020009@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace.h |  1 -
 kernel/trace/ftrace.c  | 14 ++++++++------
 2 files changed, 8 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index c146c1021a29..9d598bbf28a6 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -145,7 +145,6 @@ enum {
 };
 
 struct dyn_ftrace {
-	struct list_head	list;
 	unsigned long		ip; /* address of mcount call-site */
 	unsigned long		flags;
 	struct dyn_arch_ftrace	arch;
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index bf78a4c75c67..90d5729afeff 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -272,7 +272,7 @@ enum {
 
 static int ftrace_filtered;
 
-static LIST_HEAD(ftrace_new_addrs);
+static struct dyn_ftrace *ftrace_new_addrs;
 
 static DEFINE_MUTEX(ftrace_regex_lock);
 
@@ -409,8 +409,8 @@ ftrace_record_ip(unsigned long ip)
 		return NULL;
 
 	rec->ip = ip;
-
-	list_add(&rec->list, &ftrace_new_addrs);
+	rec->flags = (unsigned long)ftrace_new_addrs;
+	ftrace_new_addrs = rec;
 
 	return rec;
 }
@@ -716,19 +716,21 @@ unsigned long		ftrace_update_tot_cnt;
 
 static int ftrace_update_code(struct module *mod)
 {
-	struct dyn_ftrace *p, *t;
+	struct dyn_ftrace *p;
 	cycle_t start, stop;
 
 	start = ftrace_now(raw_smp_processor_id());
 	ftrace_update_cnt = 0;
 
-	list_for_each_entry_safe(p, t, &ftrace_new_addrs, list) {
+	while (ftrace_new_addrs) {
 
 		/* If something went wrong, bail without enabling anything */
 		if (unlikely(ftrace_disabled))
 			return -1;
 
-		list_del_init(&p->list);
+		p = ftrace_new_addrs;
+		ftrace_new_addrs = (struct dyn_ftrace *)p->flags;
+		p->flags = 0L;
 
 		/* convert record (i.e, patch mcount-call with NOP) */
 		if (ftrace_code_disable(mod, p)) {
-- 
cgit v1.2.3


From c8e2aeef0b8ac9fb8821b8b3734c031579d0b77a Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 9 Mar 2009 20:26:23 +0100
Subject: genirq: remove redundant if condition

Impact: cleanup

The code is only compiled if CONFIG_GENERIC_HARDIRQS=y so another
check for this define in the code is redundant. Remove it.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/irq/manage.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index e28db0f656ac..4600f877c292 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -15,7 +15,7 @@
 
 #include "internals.h"
 
-#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
+#ifdef CONFIG_SMP
 cpumask_var_t irq_default_affinity;
 
 /**
-- 
cgit v1.2.3


From 4553573277906901f62f73c0432b332c53de5e2c Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Sun, 22 Feb 2009 23:00:32 +0100
Subject: genirq: use kzalloc instead of explicit zero initialization

Impact: simplification

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Peter Zijlstra <peterz@infradead.org>
---
 kernel/irq/manage.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 4600f877c292..8a22039a90ba 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -737,15 +737,13 @@ int request_irq(unsigned int irq, irq_handler_t handler,
 	if (!handler)
 		return -EINVAL;
 
-	action = kmalloc(sizeof(struct irqaction), GFP_KERNEL);
+	action = kzalloc(sizeof(struct irqaction), GFP_KERNEL);
 	if (!action)
 		return -ENOMEM;
 
 	action->handler = handler;
 	action->flags = irqflags;
-	cpus_clear(action->mask);
 	action->name = devname;
-	action->next = NULL;
 	action->dev_id = dev_id;
 
 	retval = __setup_irq(irq, desc, action);
-- 
cgit v1.2.3


From 0e57aa11abb15b70db53d1f95ae70b3c980ac885 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Fri, 13 Mar 2009 14:34:05 +0100
Subject: genirq: deprecate __do_IRQ

Two years migration time is enough. Remove the compability cruft.

Add the deprecated warning in kernel/irq/handle.c because marking
__do_IRQ itself is way too noisy.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 Documentation/feature-removal-schedule.txt | 8 ++++++++
 kernel/irq/handle.c                        | 5 +++++
 2 files changed, 13 insertions(+)

(limited to 'kernel')

diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index 20d3b94703a4..63b4550411be 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -344,3 +344,11 @@ Why:	See commits 129f8ae9b1b5be94517da76009ea956e89104ce8 and
 	Removal is subject to fixing any remaining bugs in ACPI which may
 	cause the thermal throttling not to happen at the right time.
 Who:	Dave Jones <davej@redhat.com>, Matthew Garrett <mjg@redhat.com>
+
+-----------------------------
+
+What:	__do_IRQ all in one fits nothing interrupt handler
+When:	2.6.32
+Why:	__do_IRQ was kept for easy migration to the type flow handlers.
+	More than two years of migration time is enough.
+Who:	Thomas Gleixner <tglx@linutronix.de>
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index a2ee682bca2e..6661704140c7 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -349,6 +349,11 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
 }
 
 #ifndef CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ
+
+#ifdef CONFIG_ENABLE_WARN_DEPRECATED
+# warning __do_IRQ is deprecated. Please convert to proper flow handlers
+#endif
+
 /**
  * __do_IRQ - original all in one highlevel IRQ handler
  * @irq:	the interrupt number
-- 
cgit v1.2.3


From bed1ffca022cc876fb83161d26670e9b5d3cf36b Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 13 Mar 2009 15:42:11 +0100
Subject: tracing/syscalls: core infrastructure for syscalls tracing,
 enhancements

Impact: new feature

This adds the generic support for syscalls tracing. This is
currently exploited through a devoted tracer but other tracing
engines can use it. (They just have to play with
{start,stop}_ftrace_syscalls() and use the display callbacks
unless they want to override them.)

The syscalls prototypes definitions are abused here to steal
some metadata informations:

- syscall name, param types, param names, number of params

The syscall addr is not directly saved during this definition
because we don't know if its prototype is available in the
namespace. But we don't really need it. The arch has just to
build a function able to resolve the syscall number to its
metadata struct.

The current tracer prints the syscall names, parameters names
and values (and their types optionally). Currently the value is
a raw hex but higher level values diplaying is on my TODO list.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1236955332-10133-2-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/asm-generic/vmlinux.lds.h |  11 ++-
 include/linux/ftrace.h            |  14 +++-
 include/linux/syscalls.h          |  60 +++++++++++++++-
 kernel/trace/trace.h              |  17 +++++
 kernel/trace/trace_syscalls.c     | 146 +++++++++++++++++++++++++++++++++++---
 5 files changed, 234 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index 0e0f39be6c8b..d3bc3c86df6a 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -77,6 +77,14 @@
 #define TRACE_PRINTKS()
 #endif
 
+#ifdef CONFIG_FTRACE_SYSCALLS
+#define TRACE_SYSCALLS() VMLINUX_SYMBOL(__start_syscalls_metadata) = .;	\
+			 *(__syscalls_metadata)				\
+			 VMLINUX_SYMBOL(__stop_syscalls_metadata) = .;
+#else
+#define TRACE_SYSCALLS()
+#endif
+
 /* .data section */
 #define DATA_DATA							\
 	*(.data)							\
@@ -99,7 +107,8 @@
 	LIKELY_PROFILE()		       				\
 	BRANCH_PROFILE()						\
 	TRACE_PRINTKS()							\
-	FTRACE_EVENTS()
+	FTRACE_EVENTS()							\
+	TRACE_SYSCALLS()
 
 #define RO_DATA(align)							\
 	. = ALIGN((align));						\
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index c146c1021a29..6dc1c652447e 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -506,13 +506,21 @@ static inline void trace_hw_branch_oops(void) {}
 /*
  * A syscall entry in the ftrace syscalls array.
  *
- * @syscall_nr: syscall number
+ * @name: name of the syscall
+ * @nb_args: number of parameters it takes
+ * @types: list of types as strings
+ * @args: list of args as strings (args[i] matches types[i])
  */
-struct syscall_trace_entry {
-	int		syscall_nr;
+struct syscall_metadata {
+	const char	*name;
+	int		nb_args;
+	const char	**types;
+	const char	**args;
 };
 
 #ifdef CONFIG_FTRACE_SYSCALLS
+extern void arch_init_ftrace_syscalls(void);
+extern struct syscall_metadata *syscall_nr_to_meta(int nr);
 extern void start_ftrace_syscalls(void);
 extern void stop_ftrace_syscalls(void);
 extern void ftrace_syscall_enter(struct pt_regs *regs);
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index f9f900cfd066..0cff9bb80b02 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -65,6 +65,7 @@ struct old_linux_dirent;
 #include <asm/signal.h>
 #include <linux/quota.h>
 #include <linux/key.h>
+#include <linux/ftrace.h>
 
 #define __SC_DECL1(t1, a1)	t1 a1
 #define __SC_DECL2(t2, a2, ...) t2 a2, __SC_DECL1(__VA_ARGS__)
@@ -95,7 +96,46 @@ struct old_linux_dirent;
 #define __SC_TEST5(t5, a5, ...)	__SC_TEST(t5); __SC_TEST4(__VA_ARGS__)
 #define __SC_TEST6(t6, a6, ...)	__SC_TEST(t6); __SC_TEST5(__VA_ARGS__)
 
+#ifdef CONFIG_FTRACE_SYSCALLS
+#define __SC_STR_ADECL1(t, a)		#a
+#define __SC_STR_ADECL2(t, a, ...)	#a, __SC_STR_ADECL1(__VA_ARGS__)
+#define __SC_STR_ADECL3(t, a, ...)	#a, __SC_STR_ADECL2(__VA_ARGS__)
+#define __SC_STR_ADECL4(t, a, ...)	#a, __SC_STR_ADECL3(__VA_ARGS__)
+#define __SC_STR_ADECL5(t, a, ...)	#a, __SC_STR_ADECL4(__VA_ARGS__)
+#define __SC_STR_ADECL6(t, a, ...)	#a, __SC_STR_ADECL5(__VA_ARGS__)
+
+#define __SC_STR_TDECL1(t, a)		#t
+#define __SC_STR_TDECL2(t, a, ...)	#t, __SC_STR_TDECL1(__VA_ARGS__)
+#define __SC_STR_TDECL3(t, a, ...)	#t, __SC_STR_TDECL2(__VA_ARGS__)
+#define __SC_STR_TDECL4(t, a, ...)	#t, __SC_STR_TDECL3(__VA_ARGS__)
+#define __SC_STR_TDECL5(t, a, ...)	#t, __SC_STR_TDECL4(__VA_ARGS__)
+#define __SC_STR_TDECL6(t, a, ...)	#t, __SC_STR_TDECL5(__VA_ARGS__)
+
+#define SYSCALL_METADATA(sname, nb)				\
+	static const struct syscall_metadata __used		\
+	  __attribute__((__aligned__(4)))			\
+	  __attribute__((section("__syscalls_metadata")))	\
+	  __syscall_meta_##sname = {				\
+		.name 		= "sys"#sname,			\
+		.nb_args 	= nb,				\
+		.types		= types_##sname,		\
+		.args		= args_##sname,			\
+	}
+
+#define SYSCALL_DEFINE0(sname)					\
+	static const struct syscall_metadata __used		\
+	  __attribute__((__aligned__(4)))			\
+	  __attribute__((section("__syscalls_metadata")))	\
+	  __syscall_meta_##sname = {				\
+		.name 		= "sys_"#sname,			\
+		.nb_args 	= 0,				\
+	};							\
+	asmlinkage long sys_##sname(void)
+
+#else
 #define SYSCALL_DEFINE0(name)	   asmlinkage long sys_##name(void)
+#endif
+
 #define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__)
 #define SYSCALL_DEFINE2(name, ...) SYSCALL_DEFINEx(2, _##name, __VA_ARGS__)
 #define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)
@@ -117,10 +157,26 @@ struct old_linux_dirent;
 #endif
 #endif
 
+#ifdef CONFIG_FTRACE_SYSCALLS
+#define SYSCALL_DEFINEx(x, sname, ...)				\
+	static const char *types_##sname[] = {			\
+		__SC_STR_TDECL##x(__VA_ARGS__)			\
+	};							\
+	static const char *args_##sname[] = {			\
+		__SC_STR_ADECL##x(__VA_ARGS__)			\
+	};							\
+	SYSCALL_METADATA(sname, x);				\
+	__SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
+#else
+#define SYSCALL_DEFINEx(x, sname, ...)				\
+	__SYSCALL_DEFINEx(x, sname, __VA_ARGS__)
+#endif
+
 #ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
 
 #define SYSCALL_DEFINE(name) static inline long SYSC_##name
-#define SYSCALL_DEFINEx(x, name, ...)					\
+
+#define __SYSCALL_DEFINEx(x, name, ...)					\
 	asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__));		\
 	static inline long SYSC##name(__SC_DECL##x(__VA_ARGS__));	\
 	asmlinkage long SyS##name(__SC_LONG##x(__VA_ARGS__))		\
@@ -134,7 +190,7 @@ struct old_linux_dirent;
 #else /* CONFIG_HAVE_SYSCALL_WRAPPERS */
 
 #define SYSCALL_DEFINE(name) asmlinkage long sys_##name
-#define SYSCALL_DEFINEx(x, name, ...)					\
+#define __SYSCALL_DEFINEx(x, name, ...)					\
 	asmlinkage long sys##name(__SC_DECL##x(__VA_ARGS__))
 
 #endif /* CONFIG_HAVE_SYSCALL_WRAPPERS */
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 3d49daae47dc..d80ca0d464d9 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -194,6 +194,19 @@ struct kmemtrace_free_entry {
 	const void *ptr;
 };
 
+struct syscall_trace_enter {
+	struct trace_entry	ent;
+	int			nr;
+	unsigned long		args[];
+};
+
+struct syscall_trace_exit {
+	struct trace_entry	ent;
+	int			nr;
+	unsigned long		ret;
+};
+
+
 /*
  * trace_flag_type is an enumeration that holds different
  * states when a trace occurs. These are:
@@ -306,6 +319,10 @@ extern void __ftrace_bad_type(void);
 			  TRACE_KMEM_ALLOC);	\
 		IF_ASSIGN(var, ent, struct kmemtrace_free_entry,	\
 			  TRACE_KMEM_FREE);	\
+		IF_ASSIGN(var, ent, struct syscall_trace_enter,		\
+			  TRACE_SYSCALL_ENTER);				\
+		IF_ASSIGN(var, ent, struct syscall_trace_exit,		\
+			  TRACE_SYSCALL_EXIT);				\
 		__ftrace_bad_type();					\
 	} while (0)
 
diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 66cf97449af3..c72e599230ff 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -1,6 +1,5 @@
-#include <linux/ftrace.h>
 #include <linux/kernel.h>
-
+#include <linux/ftrace.h>
 #include <asm/syscall.h>
 
 #include "trace_output.h"
@@ -8,6 +7,90 @@
 
 static atomic_t refcount;
 
+/* Our two options */
+enum {
+	TRACE_SYSCALLS_OPT_TYPES = 0x1,
+};
+
+static struct tracer_opt syscalls_opts[] = {
+	{ TRACER_OPT(syscall_arg_type, TRACE_SYSCALLS_OPT_TYPES) },
+	{ }
+};
+
+static struct tracer_flags syscalls_flags = {
+	.val = 0, /* By default: no args types */
+	.opts = syscalls_opts
+};
+
+enum print_line_t
+print_syscall_enter(struct trace_iterator *iter, int flags)
+{
+	struct trace_seq *s = &iter->seq;
+	struct trace_entry *ent = iter->ent;
+	struct syscall_trace_enter *trace;
+	struct syscall_metadata *entry;
+	int i, ret, syscall;
+
+	trace_assign_type(trace, ent);
+
+	syscall = trace->nr;
+
+	entry = syscall_nr_to_meta(syscall);
+	if (!entry)
+		goto end;
+
+	ret = trace_seq_printf(s, "%s(", entry->name);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	for (i = 0; i < entry->nb_args; i++) {
+		/* parameter types */
+		if (syscalls_flags.val & TRACE_SYSCALLS_OPT_TYPES) {
+			ret = trace_seq_printf(s, "%s ", entry->types[i]);
+			if (!ret)
+				return TRACE_TYPE_PARTIAL_LINE;
+		}
+		/* parameter values */
+		ret = trace_seq_printf(s, "%s: %lx%s ", entry->args[i],
+				       trace->args[i],
+				       i == entry->nb_args - 1 ? ")" : ",");
+		if (!ret)
+			return TRACE_TYPE_PARTIAL_LINE;
+	}
+
+end:
+	trace_seq_printf(s, "\n");
+	return TRACE_TYPE_HANDLED;
+}
+
+enum print_line_t
+print_syscall_exit(struct trace_iterator *iter, int flags)
+{
+	struct trace_seq *s = &iter->seq;
+	struct trace_entry *ent = iter->ent;
+	struct syscall_trace_exit *trace;
+	int syscall;
+	struct syscall_metadata *entry;
+	int ret;
+
+	trace_assign_type(trace, ent);
+
+	syscall = trace->nr;
+
+	entry = syscall_nr_to_meta(syscall);
+	if (!entry) {
+		trace_seq_printf(s, "\n");
+		return TRACE_TYPE_HANDLED;
+	}
+
+	ret = trace_seq_printf(s, "%s -> 0x%lx\n", entry->name,
+				trace->ret);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	return TRACE_TYPE_HANDLED;
+}
+
 void start_ftrace_syscalls(void)
 {
 	unsigned long flags;
@@ -16,6 +99,7 @@ void start_ftrace_syscalls(void)
 	if (atomic_inc_return(&refcount) != 1)
 		goto out;
 
+	arch_init_ftrace_syscalls();
 	read_lock_irqsave(&tasklist_lock, flags);
 
 	do_each_thread(g, t) {
@@ -48,20 +132,63 @@ out:
 
 void ftrace_syscall_enter(struct pt_regs *regs)
 {
+	struct syscall_trace_enter *entry;
+	struct syscall_metadata *sys_data;
+	struct ring_buffer_event *event;
+	int size;
 	int syscall_nr;
+	int cpu;
 
 	syscall_nr = syscall_get_nr(current, regs);
 
-	trace_printk("syscall %d enter\n", syscall_nr);
+	cpu = raw_smp_processor_id();
+
+	sys_data = syscall_nr_to_meta(syscall_nr);
+	if (!sys_data)
+		return;
+
+	size = sizeof(*entry) + sizeof(unsigned long) * sys_data->nb_args;
+
+	event = trace_current_buffer_lock_reserve(TRACE_SYSCALL_ENTER, size,
+							0, 0);
+	if (!event)
+		return;
+
+	entry = ring_buffer_event_data(event);
+	entry->nr = syscall_nr;
+	syscall_get_arguments(current, regs, 0, sys_data->nb_args, entry->args);
+
+	trace_current_buffer_unlock_commit(event, 0, 0);
+	trace_wake_up();
 }
 
 void ftrace_syscall_exit(struct pt_regs *regs)
 {
+	struct syscall_trace_exit *entry;
+	struct syscall_metadata *sys_data;
+	struct ring_buffer_event *event;
 	int syscall_nr;
+	int cpu;
 
 	syscall_nr = syscall_get_nr(current, regs);
 
-	trace_printk("syscall %d exit\n", syscall_nr);
+	cpu = raw_smp_processor_id();
+
+	sys_data = syscall_nr_to_meta(syscall_nr);
+	if (!sys_data)
+		return;
+
+	event = trace_current_buffer_lock_reserve(TRACE_SYSCALL_EXIT,
+				sizeof(*entry), 0, 0);
+	if (!event)
+		return;
+
+	entry = ring_buffer_event_data(event);
+	entry->nr = syscall_nr;
+	entry->ret = syscall_get_return_value(current, regs);
+
+	trace_current_buffer_unlock_commit(event, 0, 0);
+	trace_wake_up();
 }
 
 static int init_syscall_tracer(struct trace_array *tr)
@@ -77,17 +204,20 @@ static void reset_syscall_tracer(struct trace_array *tr)
 }
 
 static struct trace_event syscall_enter_event = {
-	.type		= TRACE_SYSCALL_ENTER,
+	.type	 	= TRACE_SYSCALL_ENTER,
+	.trace		= print_syscall_enter,
 };
 
 static struct trace_event syscall_exit_event = {
-	.type		= TRACE_SYSCALL_EXIT,
+	.type	 	= TRACE_SYSCALL_EXIT,
+	.trace		= print_syscall_exit,
 };
 
 static struct tracer syscall_tracer __read_mostly = {
-	.name		= "syscall",
+	.name	     	= "syscall",
 	.init		= init_syscall_tracer,
-	.reset		= reset_syscall_tracer
+	.reset		= reset_syscall_tracer,
+	.flags		= &syscalls_flags,
 };
 
 __init int register_ftrace_syscalls(void)
-- 
cgit v1.2.3


From ac99c58c9e56967037382e31f865b72b10127965 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 15 Mar 2009 22:10:35 +0100
Subject: tracing/syscalls: fix missing release of tracing

Impact: fix 'stuck' syscall tracer

The syscall tracer uses a refcounter to enable several users
simultaneously.

But the refcounter did not behave correctly and always restored
its value to 0 after calling start_syscall_tracing(). Therefore,
stop_syscall_tracing() couldn't release correctly the tasks from
tracing.

Also the tracer forgot to reset the buffer when it is released.

Drop the pointless refcount decrement on start_syscall_tracing()
and reset the buffer when we release the tracer.

This fixes two reported issue:

- when we switch from syscall tracer to another tracer, syscall
  tracing continued.

- incorrect use of the refcount.

Reported-by: Andrew Morton <akpm@linux-foundation.org>
Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1237151439-6755-1-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_syscalls.c | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index c72e599230ff..c5fc1d8880f6 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -96,8 +96,9 @@ void start_ftrace_syscalls(void)
 	unsigned long flags;
 	struct task_struct *g, *t;
 
+	/* Don't enable the flag on the tasks twice */
 	if (atomic_inc_return(&refcount) != 1)
-		goto out;
+		return;
 
 	arch_init_ftrace_syscalls();
 	read_lock_irqsave(&tasklist_lock, flags);
@@ -107,8 +108,6 @@ void start_ftrace_syscalls(void)
 	} while_each_thread(g, t);
 
 	read_unlock_irqrestore(&tasklist_lock, flags);
-out:
-	atomic_dec(&refcount);
 }
 
 void stop_ftrace_syscalls(void)
@@ -116,8 +115,9 @@ void stop_ftrace_syscalls(void)
 	unsigned long flags;
 	struct task_struct *g, *t;
 
+	/* There are perhaps still some users */
 	if (atomic_dec_return(&refcount))
-		goto out;
+		return;
 
 	read_lock_irqsave(&tasklist_lock, flags);
 
@@ -126,8 +126,6 @@ void stop_ftrace_syscalls(void)
 	} while_each_thread(g, t);
 
 	read_unlock_irqrestore(&tasklist_lock, flags);
-out:
-	atomic_inc(&refcount);
 }
 
 void ftrace_syscall_enter(struct pt_regs *regs)
@@ -201,6 +199,7 @@ static int init_syscall_tracer(struct trace_array *tr)
 static void reset_syscall_tracer(struct trace_array *tr)
 {
 	stop_ftrace_syscalls();
+	tracing_reset_online_cpus(tr);
 }
 
 static struct trace_event syscall_enter_event = {
-- 
cgit v1.2.3


From 6404434525bb9f8f2239998f30fd7c93f2efa5b3 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 15 Mar 2009 22:10:36 +0100
Subject: tracing/syscalls: various cleanups

Impact: cleanup

- Drop unused cpu variable
- Fix some errors on comments

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1237151439-6755-2-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_syscalls.c | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index c5fc1d8880f6..26f9a8679d3d 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -7,7 +7,7 @@
 
 static atomic_t refcount;
 
-/* Our two options */
+/* Option to display the parameters types */
 enum {
 	TRACE_SYSCALLS_OPT_TYPES = 0x1,
 };
@@ -18,7 +18,7 @@ static struct tracer_opt syscalls_opts[] = {
 };
 
 static struct tracer_flags syscalls_flags = {
-	.val = 0, /* By default: no args types */
+	.val = 0, /* By default: no parameters types */
 	.opts = syscalls_opts
 };
 
@@ -135,12 +135,9 @@ void ftrace_syscall_enter(struct pt_regs *regs)
 	struct ring_buffer_event *event;
 	int size;
 	int syscall_nr;
-	int cpu;
 
 	syscall_nr = syscall_get_nr(current, regs);
 
-	cpu = raw_smp_processor_id();
-
 	sys_data = syscall_nr_to_meta(syscall_nr);
 	if (!sys_data)
 		return;
@@ -166,12 +163,9 @@ void ftrace_syscall_exit(struct pt_regs *regs)
 	struct syscall_metadata *sys_data;
 	struct ring_buffer_event *event;
 	int syscall_nr;
-	int cpu;
 
 	syscall_nr = syscall_get_nr(current, regs);
 
-	cpu = raw_smp_processor_id();
-
 	sys_data = syscall_nr_to_meta(syscall_nr);
 	if (!sys_data)
 		return;
-- 
cgit v1.2.3


From 5be71b61f17b0e3bc8ad0b1a1b7b53ab7d574ebb Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 15 Mar 2009 22:10:37 +0100
Subject: tracing/syscalls: protect thread flag toggling from races

Impact: fix syscall tracer enable/disable race

The current thread flag toggling is racy as shown in the following
scenario:

- task A is the last user of syscall tracing, it releases the
  TIF_SYSCALL_FTRACE on each tasks

- at the same time task B start syscall tracing. refcount == 0 so
  it sets up TIF_SYSCALL_FTRACE on each tasks.

The effect of the mixup is unpredictable.
So this fix adds a mutex on {start,stop}_syscall_tracing().

Reported-by: Andrew Morton <akpm@linux-foundation.org>
Reported-by: Ingo Molnar <mingo@elte.hu>
LKML-Reference: <1237151439-6755-3-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_syscalls.c | 24 +++++++++++++++++++-----
 1 file changed, 19 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_syscalls.c b/kernel/trace/trace_syscalls.c
index 26f9a8679d3d..a2a3af29c943 100644
--- a/kernel/trace/trace_syscalls.c
+++ b/kernel/trace/trace_syscalls.c
@@ -5,7 +5,11 @@
 #include "trace_output.h"
 #include "trace.h"
 
-static atomic_t refcount;
+/* Keep a counter of the syscall tracing users */
+static int refcount;
+
+/* Prevent from races on thread flags toggling */
+static DEFINE_MUTEX(syscall_trace_lock);
 
 /* Option to display the parameters types */
 enum {
@@ -96,9 +100,11 @@ void start_ftrace_syscalls(void)
 	unsigned long flags;
 	struct task_struct *g, *t;
 
+	mutex_lock(&syscall_trace_lock);
+
 	/* Don't enable the flag on the tasks twice */
-	if (atomic_inc_return(&refcount) != 1)
-		return;
+	if (++refcount != 1)
+		goto unlock;
 
 	arch_init_ftrace_syscalls();
 	read_lock_irqsave(&tasklist_lock, flags);
@@ -108,6 +114,9 @@ void start_ftrace_syscalls(void)
 	} while_each_thread(g, t);
 
 	read_unlock_irqrestore(&tasklist_lock, flags);
+
+unlock:
+	mutex_unlock(&syscall_trace_lock);
 }
 
 void stop_ftrace_syscalls(void)
@@ -115,9 +124,11 @@ void stop_ftrace_syscalls(void)
 	unsigned long flags;
 	struct task_struct *g, *t;
 
+	mutex_lock(&syscall_trace_lock);
+
 	/* There are perhaps still some users */
-	if (atomic_dec_return(&refcount))
-		return;
+	if (--refcount)
+		goto unlock;
 
 	read_lock_irqsave(&tasklist_lock, flags);
 
@@ -126,6 +137,9 @@ void stop_ftrace_syscalls(void)
 	} while_each_thread(g, t);
 
 	read_unlock_irqrestore(&tasklist_lock, flags);
+
+unlock:
+	mutex_unlock(&syscall_trace_lock);
 }
 
 void ftrace_syscall_enter(struct pt_regs *regs)
-- 
cgit v1.2.3


From 0ea1c4156bf9e2eb370cc5c6fa6eb112bd844dec Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 15 Mar 2009 22:10:38 +0100
Subject: tracing/syscalls: select kallsysms

Syscall tracing must select kallsysms.

The arch code builds a table to find the syscall metadata by syscall
number. It needs the syscalls names resolution from the symbol table
to know which name found on the syscalls metadatas match a function
pointer from the arch sys_call_table.

Reported-by: Andrew Morton <akpm@linux-foundation.org>
Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1237151439-6755-4-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/Kconfig | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 95a0ad191f19..b0a46f889659 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -182,6 +182,7 @@ config FTRACE_SYSCALLS
 	bool "Trace syscalls"
 	depends on HAVE_FTRACE_SYSCALLS
 	select TRACING
+	select KALLSYMS
 	help
 	  Basic tracer to catch the syscall entry and exit events.
 
-- 
cgit v1.2.3


From 59f586db98919d7d9c43527b26c8de1cdf9ed912 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 15 Mar 2009 22:10:39 +0100
Subject: tracing/core: fix missing mutex unlock on tracing_set_tracer()

Impact: fix possible locking imbalance

In case of ring buffer resize failure, tracing_set_tracer forgot to
release trace_types_lock. Fix it.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1237151439-6755-5-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index efe3202c0209..c0cf946d42f5 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2494,7 +2494,7 @@ static int tracing_set_tracer(const char *buf)
 	if (!ring_buffer_expanded) {
 		ret = tracing_resize_ring_buffer(trace_buf_size);
 		if (ret < 0)
-			return ret;
+			goto out;
 		ret = 0;
 	}
 
-- 
cgit v1.2.3


From ac1d52d0b85854958c7e78c8006e39aadb6ce4b8 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Mon, 16 Mar 2009 00:32:41 +0100
Subject: tracing/ftrace: fix double calls to tracing_start()

Impact: fix a warning during preemptirqsoff selftests

When the preemptirqsoff selftest fails, we see the following
warning:

[    6.050000] Testing tracer preemptirqsoff: .. no entries found ..
------------[ cut here ]------------
[    6.060000] WARNING: at kernel/trace/trace.c:688 tracing_start+0x67/0xd3()
[    6.060000] Modules linked in:
[    6.060000] Pid: 1, comm: swapper Tainted: G
[    6.060000] Call Trace:
[    6.060000]  [<ffffffff802460ff>] warn_slowpath+0xb1/0x100
[    6.060000]  [<ffffffff802a8f5b>] ? trace_preempt_on+0x35/0x4b
[    6.060000]  [<ffffffff802a37fb>] ? tracing_start+0x31/0xd3
[    6.060000]  [<ffffffff802a37fb>] ? tracing_start+0x31/0xd3
[    6.060000]  [<ffffffff80271e0b>] ? __lock_acquired+0xe6/0x1f2
[    6.060000]  [<ffffffff802a37fb>] ? tracing_start+0x31/0xd3
[    6.060000]  [<ffffffff802a3831>] tracing_start+0x67/0xd3
[    6.060000]  [<ffffffff802a8ace>] ? irqsoff_tracer_reset+0x2d/0x57
[    6.060000]  [<ffffffff802a4d1c>] trace_selftest_startup_preemptirqsoff+0x1c8/0x1f1
[    6.060000]  [<ffffffff802a4798>] register_tracer+0x12f/0x241
[    6.060000]  [<ffffffff810250d0>] ? init_irqsoff_tracer+0x0/0x53
[    6.060000]  [<ffffffff8102510b>] init_irqsoff_tracer+0x3b/0x53

This is because in fail case, the preemptirqsoff tracer selftest calls twice
the tracing_start() function:

int
trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *tr)
{
        if (!ret && !count) {
                printk(KERN_CONT ".. no entries found ..");
                ret = -1;
                tracing_start(); <-----
                goto out;
        }
        [...]
out:
        trace->reset(tr);
        tracing_start(); <------
        tracing_max_latency = save_max;

        return ret;
}

Since it is well handled in the out path, we don't need the conditional one.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1237159961-7447-1-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_selftest.c | 16 ++++++----------
 1 file changed, 6 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index f907a2b29028..a2ca6f0fef9b 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -414,7 +414,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
 	ret = tracer_init(trace, tr);
 	if (ret) {
 		warn_failed_init_tracer(trace, ret);
-		goto out;
+		goto out_no_start;
 	}
 
 	/* reset the max latency */
@@ -432,21 +432,16 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
 	tracing_stop();
 	/* check both trace buffers */
 	ret = trace_test_buffer(tr, NULL);
-	if (ret) {
-		tracing_start();
+	if (ret)
 		goto out;
-	}
 
 	ret = trace_test_buffer(&max_tr, &count);
-	if (ret) {
-		tracing_start();
+	if (ret)
 		goto out;
-	}
 
 	if (!ret && !count) {
 		printk(KERN_CONT ".. no entries found ..");
 		ret = -1;
-		tracing_start();
 		goto out;
 	}
 
@@ -475,9 +470,10 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
 		goto out;
 	}
 
- out:
-	trace->reset(tr);
+out:
 	tracing_start();
+out_no_start:
+	trace->reset(tr);
 	tracing_max_latency = save_max;
 
 	return ret;
-- 
cgit v1.2.3


From 2fc1dfbe17e7705c55b7a99da995fa565e26f151 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Mon, 16 Mar 2009 01:45:03 +0100
Subject: tracing/core: fix early free of cpumasks

Impact: fix crashes when tracing cpumasks

While ring-buffer allocation, the cpumasks are allocated too,
including the tracing cpumask and the per-cpu file mask handler.
But these cpumasks are freed accidentally just after.
Fix it.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1237164303-11476-1-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c0cf946d42f5..ae32d3b99b4b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4125,7 +4125,8 @@ __init static int tracer_alloc_buffers(void)
 				       &trace_panic_notifier);
 
 	register_die_notifier(&trace_die_notifier);
-	ret = 0;
+
+	return 0;
 
 out_free_cpumask:
 	free_cpumask_var(tracing_reader_cpumask);
-- 
cgit v1.2.3


From 03303549b1695dc024d4a653cc16bd79f78f9750 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Mon, 16 Mar 2009 22:41:00 +0100
Subject: tracing/ftrace: fix the check on nopped sites

Impact: fix a dynamic tracing failure

Recently, the function and function graph tracers failed to use dynamic
tracing after the following commit:

fa9d13cf135efbd454453a53b6299976bea245a9
(ftrace: don't try to __ftrace_replace_code on !FTRACE_FL_CONVERTED rec)

The patch is right except a mistake on the check for the FTRACE_FL_CONVERTED
flag. The code patching is aborted in case of successfully nopped sites.
What we want is the opposite: ignore the callsites that haven't been nopped.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ftrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 90d5729afeff..7847806eefef 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -537,7 +537,7 @@ static void ftrace_replace_code(int enable)
 		 */
 		if (rec->flags & FTRACE_FL_FREE ||
 		    rec->flags & FTRACE_FL_FAILED ||
-		    rec->flags & FTRACE_FL_CONVERTED)
+		    !(rec->flags & FTRACE_FL_CONVERTED))
 			continue;
 
 		/* ignore updates to this record's mcount site */
-- 
cgit v1.2.3


From 4ca530852346be239b7c19e7bec5d2b78855bebe Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 16 Mar 2009 19:20:15 -0400
Subject: tracing: protect reader of cmdline output

Impact: fix to one cause of incorrect comm outputs in trace

The spinlock only protected the creation of a comm <=> pid pair.
But it was possible that a reader could look up a pid, and get the
wrong comm because it had no locking.

This also required changing trace_find_cmdline to copy the comm cache
and not just send back a pointer to it.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/blktrace.c              | 23 ++++++++++++++++++-----
 kernel/trace/trace.c                 | 20 ++++++++++++--------
 kernel/trace/trace.h                 |  2 +-
 kernel/trace/trace_functions_graph.c | 12 ++++++------
 kernel/trace/trace_output.c          | 18 ++++++++++++------
 5 files changed, 49 insertions(+), 26 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 1f32e4edf490..b171778e3863 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1027,7 +1027,9 @@ static int blk_log_action_seq(struct trace_seq *s, const struct blk_io_trace *t,
 
 static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent)
 {
-	const char *cmd = trace_find_cmdline(ent->pid);
+	char cmd[TASK_COMM_LEN];
+
+	trace_find_cmdline(ent->pid, cmd);
 
 	if (t_sec(ent))
 		return trace_seq_printf(s, "%llu + %u [%s]\n",
@@ -1057,19 +1059,30 @@ static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent)
 
 static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent)
 {
-	return trace_seq_printf(s, "[%s]\n", trace_find_cmdline(ent->pid));
+	char cmd[TASK_COMM_LEN];
+
+	trace_find_cmdline(ent->pid, cmd);
+
+	return trace_seq_printf(s, "[%s]\n", cmd);
 }
 
 static int blk_log_unplug(struct trace_seq *s, const struct trace_entry *ent)
 {
-	return trace_seq_printf(s, "[%s] %llu\n", trace_find_cmdline(ent->pid),
-				get_pdu_int(ent));
+	char cmd[TASK_COMM_LEN];
+
+	trace_find_cmdline(ent->pid, cmd);
+
+	return trace_seq_printf(s, "[%s] %llu\n", cmd, get_pdu_int(ent));
 }
 
 static int blk_log_split(struct trace_seq *s, const struct trace_entry *ent)
 {
+	char cmd[TASK_COMM_LEN];
+
+	trace_find_cmdline(ent->pid, cmd);
+
 	return trace_seq_printf(s, "%llu / %llu [%s]\n", t_sector(ent),
-				get_pdu_int(ent), trace_find_cmdline(ent->pid));
+				get_pdu_int(ent), cmd);
 }
 
 /*
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index efe3202c0209..2796bd2b17e4 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -770,25 +770,29 @@ static void trace_save_cmdline(struct task_struct *tsk)
 	__raw_spin_unlock(&trace_cmdline_lock);
 }
 
-char *trace_find_cmdline(int pid)
+void trace_find_cmdline(int pid, char comm[])
 {
-	char *cmdline = "<...>";
 	unsigned map;
 
-	if (!pid)
-		return "<idle>";
+	if (!pid) {
+		strcpy(comm, "<idle>");
+		return;
+	}
 
-	if (pid > PID_MAX_DEFAULT)
-		goto out;
+	if (pid > PID_MAX_DEFAULT) {
+		strcpy(comm, "<...>");
+		return;
+	}
 
+	__raw_spin_lock(&trace_cmdline_lock);
 	map = map_pid_to_cmdline[pid];
 	if (map >= SAVED_CMDLINES)
 		goto out;
 
-	cmdline = saved_cmdlines[map];
+	strcpy(comm, saved_cmdlines[map]);
 
  out:
-	return cmdline;
+	__raw_spin_unlock(&trace_cmdline_lock);
 }
 
 void tracing_record_cmdline(struct task_struct *tsk)
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 56ce34d90b03..b0ecad8ecc34 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -547,7 +547,7 @@ struct tracer_switch_ops {
 };
 #endif /* CONFIG_CONTEXT_SWITCH_TRACER */
 
-extern char *trace_find_cmdline(int pid);
+extern void trace_find_cmdline(int pid, char comm[]);
 
 #ifdef CONFIG_DYNAMIC_FTRACE
 extern unsigned long ftrace_update_tot_cnt;
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 4c388607ed67..6004ccac2dd7 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -190,15 +190,15 @@ print_graph_cpu(struct trace_seq *s, int cpu)
 static enum print_line_t
 print_graph_proc(struct trace_seq *s, pid_t pid)
 {
-	int i;
-	int ret;
-	int len;
-	char comm[8];
-	int spaces = 0;
+	char comm[TASK_COMM_LEN];
 	/* sign + log10(MAX_INT) + '\0' */
 	char pid_str[11];
+	int spaces = 0;
+	int ret;
+	int len;
+	int i;
 
-	strncpy(comm, trace_find_cmdline(pid), 7);
+	trace_find_cmdline(pid, comm);
 	comm[7] = '\0';
 	sprintf(pid_str, "%d", pid);
 
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index ea9d3b410c7a..6a4c9dea191e 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -309,9 +309,9 @@ static int
 lat_print_generic(struct trace_seq *s, struct trace_entry *entry, int cpu)
 {
 	int hardirq, softirq;
-	char *comm;
+	char comm[TASK_COMM_LEN];
 
-	comm = trace_find_cmdline(entry->pid);
+	trace_find_cmdline(entry->pid, comm);
 	hardirq = entry->flags & TRACE_FLAG_HARDIRQ;
 	softirq = entry->flags & TRACE_FLAG_SOFTIRQ;
 
@@ -346,10 +346,12 @@ int trace_print_context(struct trace_iterator *iter)
 {
 	struct trace_seq *s = &iter->seq;
 	struct trace_entry *entry = iter->ent;
-	char *comm = trace_find_cmdline(entry->pid);
 	unsigned long long t = ns2usecs(iter->ts);
 	unsigned long usec_rem = do_div(t, USEC_PER_SEC);
 	unsigned long secs = (unsigned long)t;
+	char comm[TASK_COMM_LEN];
+
+	trace_find_cmdline(entry->pid, comm);
 
 	return trace_seq_printf(s, "%16s-%-5d [%03d] %5lu.%06lu: ",
 				comm, entry->pid, iter->cpu, secs, usec_rem);
@@ -372,7 +374,10 @@ int trace_print_lat_context(struct trace_iterator *iter)
 	rel_usecs = ns2usecs(next_ts - iter->ts);
 
 	if (verbose) {
-		char *comm = trace_find_cmdline(entry->pid);
+		char comm[TASK_COMM_LEN];
+
+		trace_find_cmdline(entry->pid, comm);
+
 		ret = trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx [%08lx]"
 				       " %ld.%03ldms (+%ld.%03ldms): ", comm,
 				       entry->pid, iter->cpu, entry->flags,
@@ -577,14 +582,15 @@ static enum print_line_t trace_ctxwake_print(struct trace_iterator *iter,
 					     char *delim)
 {
 	struct ctx_switch_entry *field;
-	char *comm;
+	char comm[TASK_COMM_LEN];
 	int S, T;
 
+
 	trace_assign_type(field, iter->ent);
 
 	T = task_state_char(field->next_state);
 	S = task_state_char(field->prev_state);
-	comm = trace_find_cmdline(field->next_pid);
+	trace_find_cmdline(field->next_pid, comm);
 	if (!trace_seq_printf(&iter->seq,
 			      " %5d:%3d:%c %s [%03d] %5d:%3d:%c %s\n",
 			      field->prev_pid,
-- 
cgit v1.2.3


From 6adaad14d7d4d3ef31b4e2dc992b18b5da7c4eb3 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 16 Mar 2009 21:57:17 -0400
Subject: tracing: stop comm recording on tracing off

Impact: fix for losing comms in trace

The command lines of tasks are cached at sched switch to not need
to record them at every trace point.  Disabling the tracing on stops
the recording of traces, but does not stop the caching of command lines.
When the tracing is off the cache may overflow and cause the tracing
to show incorrect tasks matching the PIDs.

This patch disables prevents updates to the comm cache when the ring buffer
is off.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 2796bd2b17e4..8f89690230e6 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -797,7 +797,7 @@ void trace_find_cmdline(int pid, char comm[])
 
 void tracing_record_cmdline(struct task_struct *tsk)
 {
-	if (atomic_read(&trace_record_cmdline_disabled))
+	if (atomic_read(&trace_record_cmdline_disabled) || !tracing_is_on())
 		return;
 
 	trace_save_cmdline(tsk);
-- 
cgit v1.2.3


From c269fc8c537d761f36cb98e637ae934d9331a9d5 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Tue, 17 Mar 2009 01:20:59 -0500
Subject: tracing: fix leak in event_format_read()

Impact: fix memory leak

If event_format_read() exits early due to nonzero ppos, the
previous kmalloc doesn't get freed - might as well do the
check before the kmalloc and avoid the problem.

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: =?ISO-8859-1?Q?Fr=E9d=E9ric?= Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1237270859.8033.141.camel@charm-linux>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_events.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 238ea95a4115..c88227b3b9db 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -378,15 +378,15 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
 	char *buf;
 	int r;
 
+	if (*ppos)
+		return 0;
+
 	s = kmalloc(sizeof(*s), GFP_KERNEL);
 	if (!s)
 		return -ENOMEM;
 
 	trace_seq_init(s);
 
-	if (*ppos)
-		return 0;
-
 	/* If any of the first writes fail, so will the show_format. */
 
 	trace_seq_printf(s, "name: %s\n", call->name);
-- 
cgit v1.2.3


From 80dd99b368cf6501be88ab517bbbb5bf352b75b8 Mon Sep 17 00:00:00 2001
From: Luis Henriques <henrix@sapo.pt>
Date: Mon, 16 Mar 2009 19:58:09 +0000
Subject: sched: fix typos in documentation

Fixed typos in function documentation.

Signed-off-by: Luis Henriques <henrix@sapo.pt>
LKML-Reference: <20090316195809.GA6073@hades.domain.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 2f28351892c9..489e7d926408 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -2082,7 +2082,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state)
 		 * it must be off the runqueue _entirely_, and not
 		 * preempted!
 		 *
-		 * So if it wa still runnable (but just not actively
+		 * So if it was still runnable (but just not actively
 		 * running right now), it's preempted, and we should
 		 * yield - it could be a while.
 		 */
@@ -2574,7 +2574,7 @@ void wake_up_new_task(struct task_struct *p, unsigned long clone_flags)
 #ifdef CONFIG_PREEMPT_NOTIFIERS
 
 /**
- * preempt_notifier_register - tell me when current is being being preempted & rescheduled
+ * preempt_notifier_register - tell me when current is being preempted & rescheduled
  * @notifier: notifier struct to register
  */
 void preempt_notifier_register(struct preempt_notifier *notifier)
-- 
cgit v1.2.3


From 708dc5125309cd33c5daaad3026cc4ae6ef39c8b Mon Sep 17 00:00:00 2001
From: Luis Henriques <henrix@sapo.pt>
Date: Mon, 16 Mar 2009 19:59:02 +0000
Subject: sched: small optimisation of can_migrate_task()

There were 3 invocations of task_hot() in can_migrate_task().

Replace these 3 invocations by only one invocation, cached in
a local variable.

Signed-off-by: Luis Henriques <henrix@sapo.pt>
LKML-Reference: <20090316195902.GA6197@hades.domain.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 489e7d926408..d2dfe4c1a225 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3002,6 +3002,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
 		     struct sched_domain *sd, enum cpu_idle_type idle,
 		     int *all_pinned)
 {
+	int tsk_cache_hot = 0;
 	/*
 	 * We do not migrate tasks that are:
 	 * 1) running (obviously), or
@@ -3025,10 +3026,11 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
 	 * 2) too many balance attempts have failed.
 	 */
 
-	if (!task_hot(p, rq->clock, sd) ||
-			sd->nr_balance_failed > sd->cache_nice_tries) {
+	tsk_cache_hot = task_hot(p, rq->clock, sd);
+	if (!tsk_cache_hot ||
+		sd->nr_balance_failed > sd->cache_nice_tries) {
 #ifdef CONFIG_SCHEDSTATS
-		if (task_hot(p, rq->clock, sd)) {
+		if (tsk_cache_hot) {
 			schedstat_inc(sd, lb_hot_gained[idle]);
 			schedstat_inc(p, se.nr_forced_migrations);
 		}
@@ -3036,7 +3038,7 @@ int can_migrate_task(struct task_struct *p, struct rq *rq, int this_cpu,
 		return 1;
 	}
 
-	if (task_hot(p, rq->clock, sd)) {
+	if (tsk_cache_hot) {
 		schedstat_inc(p, se.nr_failed_migrations_hot);
 		return 0;
 	}
-- 
cgit v1.2.3


From f2d28a2ebcb525a6ec7e2152106ddb385ef52b73 Mon Sep 17 00:00:00 2001
From: Guillaume Knispel <gknispel@proformatique.com>
Date: Tue, 17 Mar 2009 16:18:42 +0100
Subject: printk: correct the behavior of printk_timed_ratelimit()

Impact: fix jiffies-comparison sign-wrap behavior

The behavior provided by printk_timed_ratelimit() is, in some
situations, probably not what a caller would reasonably expect:

bool printk_timed_ratelimit(unsigned long *caller_jiffies,
			unsigned int interval_msecs)
{
	if (*caller_jiffies == 0 || time_after(jiffies, *caller_jiffies)) {
		*caller_jiffies = jiffies + msecs_to_jiffies(interval_msecs);
		return true;
	}
	return false;
}

On a 32 bit computer, if printk_timed_ratelimit() is initially called at
time jiffies == Ja, *caller_jiffies is set to
Ja + msecs_to_jiffies(interval_msecs): let's say Ja + 42 for this
example.

If this caller then doesn't call printk_timed_ratelimit() until
jiffies == Ja + (1 << 31) + 42 (which can happen as soon as ~ 25 days
later on a 1000 HZ system), printk_timed_ratelimit() will then always
return false to this caller until jiffies loops completely (1 << 31 more
ticks).

Ths change makes it only return false if jiffies is in the small
time window starting at the previous call when true was returned and
ending interval_msecs later.  Note that if jiffies loops completely
between two calls to printk_timed_ratelimit(), it will obviously still
wrongly return false, but this is something with a low probability.

If something completely reliable is needed I guess jiffies_64 must be
used (which this change does not do).

Signed-off-by: Guillaume Knispel <gknispel@proformatique.com>
Cc: Ulrich Drepper <drepper@redhat.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Cc: Andrew Morton <akpm@osdl.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
LKML-Reference: <20090317161842.0059096b@xilun.lan.proformatique.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/printk.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/printk.c b/kernel/printk.c
index e3602d0755b0..2be719908d1f 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -1292,8 +1292,11 @@ EXPORT_SYMBOL(printk_ratelimit);
 bool printk_timed_ratelimit(unsigned long *caller_jiffies,
 			unsigned int interval_msecs)
 {
-	if (*caller_jiffies == 0 || time_after(jiffies, *caller_jiffies)) {
-		*caller_jiffies = jiffies + msecs_to_jiffies(interval_msecs);
+	if (*caller_jiffies == 0
+			|| !time_in_range(jiffies, *caller_jiffies,
+					*caller_jiffies
+					+ msecs_to_jiffies(interval_msecs))) {
+		*caller_jiffies = jiffies;
 		return true;
 	}
 	return false;
-- 
cgit v1.2.3


From 6e2b75740bed35df98b8113300579e13ed2ce848 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Mon, 16 Mar 2009 18:13:36 -0400
Subject: module: fix refptr allocation and release order

Impact: fix ref-after-free crash on failed module load

Fix refptr bug: Change refptr allocation and release order not to access a module
data structure pointed by 'mod' after freeing mod->module_core.
This bug will cause kernel panic(e.g. failed to find undefined symbols).

This bug was reported on systemtap bugzilla.
http://sources.redhat.com/bugzilla/show_bug.cgi?id=9927

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/module.c | 26 ++++++++++++++------------
 1 file changed, 14 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index ba22484a987e..1196f5d11700 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2015,14 +2015,6 @@ static noinline struct module *load_module(void __user *umod,
 	if (err < 0)
 		goto free_mod;
 
-#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
-	mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t),
-				      mod->name);
-	if (!mod->refptr) {
-		err = -ENOMEM;
-		goto free_mod;
-	}
-#endif
 	if (pcpuindex) {
 		/* We have a special allocation for this section. */
 		percpu = percpu_modalloc(sechdrs[pcpuindex].sh_size,
@@ -2030,7 +2022,7 @@ static noinline struct module *load_module(void __user *umod,
 					 mod->name);
 		if (!percpu) {
 			err = -ENOMEM;
-			goto free_percpu;
+			goto free_mod;
 		}
 		sechdrs[pcpuindex].sh_flags &= ~(unsigned long)SHF_ALLOC;
 		mod->percpu = percpu;
@@ -2082,6 +2074,14 @@ static noinline struct module *load_module(void __user *umod,
 	/* Module has been moved. */
 	mod = (void *)sechdrs[modindex].sh_addr;
 
+#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
+	mod->refptr = percpu_modalloc(sizeof(local_t), __alignof__(local_t),
+				      mod->name);
+	if (!mod->refptr) {
+		err = -ENOMEM;
+		goto free_init;
+	}
+#endif
 	/* Now we've moved module, initialize linked lists, etc. */
 	module_unload_init(mod);
 
@@ -2288,15 +2288,17 @@ static noinline struct module *load_module(void __user *umod,
 	ftrace_release(mod->module_core, mod->core_size);
  free_unload:
 	module_unload_free(mod);
+ free_init:
+#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
+	percpu_modfree(mod->refptr);
+#endif
 	module_free(mod, mod->module_init);
  free_core:
 	module_free(mod, mod->module_core);
+	/* mod will be freed with core. Don't access it beyond this line! */
  free_percpu:
 	if (percpu)
 		percpu_modfree(percpu);
-#if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
-	percpu_modfree(mod->refptr);
-#endif
  free_mod:
 	kfree(args);
  free_hdr:
-- 
cgit v1.2.3


From 37886f6a9f62d22530ffee8d3f9215c8345b6969 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 17 Mar 2009 17:22:06 -0400
Subject: ring-buffer: add api to allow a tracer to change clock source

This patch adds a new function called ring_buffer_set_clock that
allows a tracer to assign its own clock source to the buffer.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 include/linux/ring_buffer.h |  7 +++--
 kernel/trace/ring_buffer.c  | 65 ++++++++++++++++++++++++++-------------------
 kernel/trace/trace.c        | 21 ++++++++++-----
 3 files changed, 57 insertions(+), 36 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index b1a0068a5557..9e6052bd1a1c 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -118,8 +118,11 @@ unsigned long ring_buffer_overruns(struct ring_buffer *buffer);
 unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu);
 unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu);
 
-u64 ring_buffer_time_stamp(int cpu);
-void ring_buffer_normalize_time_stamp(int cpu, u64 *ts);
+u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu);
+void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer,
+				      int cpu, u64 *ts);
+void ring_buffer_set_clock(struct ring_buffer *buffer,
+			   u64 (*clock)(void));
 
 size_t ring_buffer_page_len(void *page);
 
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 58128ad2fde0..bbf51922a8ca 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -180,29 +180,6 @@ EXPORT_SYMBOL_GPL(tracing_is_on);
 
 #include "trace.h"
 
-/* Up this if you want to test the TIME_EXTENTS and normalization */
-#define DEBUG_SHIFT 0
-
-u64 ring_buffer_time_stamp(int cpu)
-{
-	u64 time;
-
-	preempt_disable_notrace();
-	/* shift to debug/test normalization and TIME_EXTENTS */
-	time = trace_clock_local() << DEBUG_SHIFT;
-	preempt_enable_no_resched_notrace();
-
-	return time;
-}
-EXPORT_SYMBOL_GPL(ring_buffer_time_stamp);
-
-void ring_buffer_normalize_time_stamp(int cpu, u64 *ts)
-{
-	/* Just stupid testing the normalize function and deltas */
-	*ts >>= DEBUG_SHIFT;
-}
-EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp);
-
 #define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array))
 #define RB_ALIGNMENT		4U
 #define RB_MAX_SMALL_DATA	28
@@ -374,6 +351,7 @@ struct ring_buffer {
 #ifdef CONFIG_HOTPLUG_CPU
 	struct notifier_block		cpu_notify;
 #endif
+	u64				(*clock)(void);
 };
 
 struct ring_buffer_iter {
@@ -394,6 +372,30 @@ struct ring_buffer_iter {
 		_____ret;					\
 	})
 
+/* Up this if you want to test the TIME_EXTENTS and normalization */
+#define DEBUG_SHIFT 0
+
+u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu)
+{
+	u64 time;
+
+	preempt_disable_notrace();
+	/* shift to debug/test normalization and TIME_EXTENTS */
+	time = buffer->clock() << DEBUG_SHIFT;
+	preempt_enable_no_resched_notrace();
+
+	return time;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_time_stamp);
+
+void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer,
+				      int cpu, u64 *ts)
+{
+	/* Just stupid testing the normalize function and deltas */
+	*ts >>= DEBUG_SHIFT;
+}
+EXPORT_SYMBOL_GPL(ring_buffer_normalize_time_stamp);
+
 /**
  * check_pages - integrity check of buffer pages
  * @cpu_buffer: CPU buffer with pages to test
@@ -569,6 +571,7 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
 
 	buffer->pages = DIV_ROUND_UP(size, BUF_PAGE_SIZE);
 	buffer->flags = flags;
+	buffer->clock = trace_clock_local;
 
 	/* need at least two pages */
 	if (buffer->pages == 1)
@@ -645,6 +648,12 @@ ring_buffer_free(struct ring_buffer *buffer)
 }
 EXPORT_SYMBOL_GPL(ring_buffer_free);
 
+void ring_buffer_set_clock(struct ring_buffer *buffer,
+			   u64 (*clock)(void))
+{
+	buffer->clock = clock;
+}
+
 static void rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer);
 
 static void
@@ -1191,7 +1200,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 			cpu_buffer->tail_page = next_page;
 
 			/* reread the time stamp */
-			*ts = ring_buffer_time_stamp(cpu_buffer->cpu);
+			*ts = ring_buffer_time_stamp(buffer, cpu_buffer->cpu);
 			cpu_buffer->tail_page->page->time_stamp = *ts;
 		}
 
@@ -1334,7 +1343,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer,
 	if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000))
 		return NULL;
 
-	ts = ring_buffer_time_stamp(cpu_buffer->cpu);
+	ts = ring_buffer_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu);
 
 	/*
 	 * Only the first commit can update the timestamp.
@@ -2051,7 +2060,8 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
 	case RINGBUF_TYPE_DATA:
 		if (ts) {
 			*ts = cpu_buffer->read_stamp + event->time_delta;
-			ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts);
+			ring_buffer_normalize_time_stamp(buffer,
+							 cpu_buffer->cpu, ts);
 		}
 		return event;
 
@@ -2112,7 +2122,8 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
 	case RINGBUF_TYPE_DATA:
 		if (ts) {
 			*ts = iter->read_stamp + event->time_delta;
-			ring_buffer_normalize_time_stamp(cpu_buffer->cpu, ts);
+			ring_buffer_normalize_time_stamp(buffer,
+							 cpu_buffer->cpu, ts);
 		}
 		return event;
 
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8f89690230e6..3be2f788e10d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -155,13 +155,6 @@ ns2usecs(cycle_t nsec)
 	return nsec;
 }
 
-cycle_t ftrace_now(int cpu)
-{
-	u64 ts = ring_buffer_time_stamp(cpu);
-	ring_buffer_normalize_time_stamp(cpu, &ts);
-	return ts;
-}
-
 /*
  * The global_trace is the descriptor that holds the tracing
  * buffers for the live tracing. For each CPU, it contains
@@ -178,6 +171,20 @@ static struct trace_array	global_trace;
 
 static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu);
 
+cycle_t ftrace_now(int cpu)
+{
+	u64 ts;
+
+	/* Early boot up does not have a buffer yet */
+	if (!global_trace.buffer)
+		return trace_clock_local();
+
+	ts = ring_buffer_time_stamp(global_trace.buffer, cpu);
+	ring_buffer_normalize_time_stamp(global_trace.buffer, cpu, &ts);
+
+	return ts;
+}
+
 /*
  * The max_tr is used to snapshot the global_trace when a maximum
  * latency is reached. Some tracers will use this to store a maximum
-- 
cgit v1.2.3


From af4617bdba34aa556272b34c3986b0a4d588f568 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 17 Mar 2009 18:09:55 -0400
Subject: tracing: add global-clock option to provide cross CPU clock to traces

Impact: feature to allow better serialized clock

This patch adds an option called "global-clock" that will allow
the tracer to switch to a slower but more accurate (across CPUs)
clock.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c | 34 ++++++++++++++++++++++++++++++----
 kernel/trace/trace.h |  1 +
 2 files changed, 31 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 3be2f788e10d..2f994caab0b7 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -315,6 +315,7 @@ static const char *trace_options[] = {
 	"printk-msg-only",
 	"context-info",
 	"latency-format",
+	"global-clock",
 	NULL
 };
 
@@ -2251,6 +2252,34 @@ static int set_tracer_option(struct tracer *trace, char *cmp, int neg)
 	return 0;
 }
 
+static void set_tracer_flags(unsigned int mask, int enabled)
+{
+	/* do nothing if flag is already set */
+	if (!!(trace_flags & mask) == !!enabled)
+		return;
+
+	if (enabled)
+		trace_flags |= mask;
+	else
+		trace_flags &= ~mask;
+
+	if (mask == TRACE_ITER_GLOBAL_CLK) {
+		u64 (*func)(void);
+
+		if (enabled)
+			func = trace_clock_global;
+		else
+			func = trace_clock_local;
+
+		mutex_lock(&trace_types_lock);
+		ring_buffer_set_clock(global_trace.buffer, func);
+
+		if (max_tr.buffer)
+			ring_buffer_set_clock(max_tr.buffer, func);
+		mutex_unlock(&trace_types_lock);
+	}
+}
+
 static ssize_t
 tracing_trace_options_write(struct file *filp, const char __user *ubuf,
 			size_t cnt, loff_t *ppos)
@@ -2278,10 +2307,7 @@ tracing_trace_options_write(struct file *filp, const char __user *ubuf,
 		int len = strlen(trace_options[i]);
 
 		if (strncmp(cmp, trace_options[i], len) == 0) {
-			if (neg)
-				trace_flags &= ~(1 << i);
-			else
-				trace_flags |= (1 << i);
+			set_tracer_flags(1 << i, !neg);
 			break;
 		}
 	}
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index b0ecad8ecc34..26a7a28ca110 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -667,6 +667,7 @@ enum trace_iterator_flags {
 	TRACE_ITER_PRINTK_MSGONLY	= 0x10000,
 	TRACE_ITER_CONTEXT_INFO		= 0x20000, /* Print pid/cpu/time */
 	TRACE_ITER_LATENCY_FMT		= 0x40000,
+	TRACE_ITER_GLOBAL_CLK		= 0x80000,
 };
 
 /*
-- 
cgit v1.2.3


From 5fec6ddcb43a91aa9a254c8ecf174c803de6f07e Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 17 Mar 2009 19:59:53 -0400
Subject: tracing: make sched_switch stop/start light weight

The stopping and starting of a tracer should be light weight and
be able to be called in all contexts. The sched_switch grabbed
mutexes in the start/stop functions. This patch changes it to a
simple variable, on/off.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_sched_switch.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index 77132c2cf3d9..de35f200abd3 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -18,6 +18,7 @@ static struct trace_array	*ctx_trace;
 static int __read_mostly	tracer_enabled;
 static int			sched_ref;
 static DEFINE_MUTEX(sched_register_mutex);
+static int			sched_stopped;
 
 static void
 probe_sched_switch(struct rq *__rq, struct task_struct *prev,
@@ -28,7 +29,7 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev,
 	int cpu;
 	int pc;
 
-	if (!sched_ref)
+	if (!sched_ref || sched_stopped)
 		return;
 
 	tracing_record_cmdline(prev);
@@ -193,6 +194,7 @@ static void stop_sched_trace(struct trace_array *tr)
 static int sched_switch_trace_init(struct trace_array *tr)
 {
 	ctx_trace = tr;
+	tracing_reset_online_cpus(tr);
 	tracing_start_sched_switch_record();
 	return 0;
 }
@@ -205,13 +207,12 @@ static void sched_switch_trace_reset(struct trace_array *tr)
 
 static void sched_switch_trace_start(struct trace_array *tr)
 {
-	tracing_reset_online_cpus(tr);
-	tracing_start_sched_switch();
+	sched_stopped = 0;
 }
 
 static void sched_switch_trace_stop(struct trace_array *tr)
 {
-	tracing_stop_sched_switch();
+	sched_stopped = 1;
 }
 
 static struct tracer sched_switch_trace __read_mostly =
-- 
cgit v1.2.3


From 62524d55e5b9ffe36e3bf3dd7a594114f150b449 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 17 Mar 2009 20:58:00 -0400
Subject: tracing: make power tracer start/stop methods lighter weight

The start/stop methods of a tracer should be able to be executed
in all contexts. This patch converts the power tracer to do so.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_power.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
index 91ce672fb037..bae791ebcc51 100644
--- a/kernel/trace/trace_power.c
+++ b/kernel/trace/trace_power.c
@@ -122,10 +122,14 @@ fail_start:
 static void start_power_trace(struct trace_array *tr)
 {
 	trace_power_enabled = 1;
-	tracing_power_register();
 }
 
 static void stop_power_trace(struct trace_array *tr)
+{
+	trace_power_enabled = 0;
+}
+
+static void power_trace_reset(struct trace_array *tr)
 {
 	trace_power_enabled = 0;
 	unregister_trace_power_start(probe_power_start);
@@ -188,7 +192,7 @@ static struct tracer power_tracer __read_mostly =
 	.init		= power_trace_init,
 	.start		= start_power_trace,
 	.stop		= stop_power_trace,
-	.reset		= stop_power_trace,
+	.reset		= power_trace_reset,
 	.print_line	= power_print_line,
 };
 
-- 
cgit v1.2.3


From af66df5ecf9c9e2d2ff86e8203510c1c4519d64c Mon Sep 17 00:00:00 2001
From: Luis Henriques <henrix@sapo.pt>
Date: Wed, 18 Mar 2009 00:04:25 +0000
Subject: sched: jiffies not printed per CPU

The jiffies value was being printed for each CPU, which does not seem to make
sense.  Moved jiffies to system section.

Signed-off-by: Luis Henriques <henrix@sapo.pt>
Acked-by: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <20090318000425.GA2228@hades.domain.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_debug.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 2b1260f0e800..4daebffa0565 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -272,7 +272,6 @@ static void print_cpu(struct seq_file *m, int cpu)
 	P(nr_switches);
 	P(nr_load_updates);
 	P(nr_uninterruptible);
-	SEQ_printf(m, "  .%-30s: %lu\n", "jiffies", jiffies);
 	PN(next_balance);
 	P(curr->pid);
 	PN(clock);
@@ -325,6 +324,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
 	SEQ_printf(m, "  .%-40s: %Ld\n", #x, (long long)(x))
 #define PN(x) \
 	SEQ_printf(m, "  .%-40s: %Ld.%06ld\n", #x, SPLIT_NS(x))
+	P(jiffies);
 	PN(sysctl_sched_latency);
 	PN(sysctl_sched_min_granularity);
 	PN(sysctl_sched_wakeup_granularity);
-- 
cgit v1.2.3


From 18aecd362a1c991fbf5f7919ae051a77532ba2f8 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 18 Mar 2009 08:56:58 +0100
Subject: tracing: stop command line recording when tracing is disabled

Impact: prevent overwrite of command line entries

When the tracer is stopped the command line recording continues to
record. The check for tracing_is_on() is not sufficient here as the
ringbuffer status is not affected by setting
debug/tracing/tracing_enabled to 0. On a non idle system this can
result in the loss of the command line information for the stopped
trace, which makes the trace harder to read and analyse.

Check tracer_enabled to allow further recording.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Steven Rostedt <srostedt@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 1ce6208fd727..7b6043ea256e 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -797,7 +797,8 @@ void trace_find_cmdline(int pid, char comm[])
 
 void tracing_record_cmdline(struct task_struct *tsk)
 {
-	if (atomic_read(&trace_record_cmdline_disabled) || !tracing_is_on())
+	if (atomic_read(&trace_record_cmdline_disabled) || !tracer_enabled ||
+	    !tracing_is_on())
 		return;
 
 	trace_save_cmdline(tsk);
-- 
cgit v1.2.3


From 2c7eea4c62ba090b7f4583c3d7337ea0019be900 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 18 Mar 2009 09:03:19 +0100
Subject: tracing: replace the crude (unsigned) -1 hackery

Impact: cleanup

The command line recorder uses (unsigned) -1 to mark non mapped
entries in the pid to command line maps. The validity check is
completely unintuitive: idx >= SAVED_CMDLINES

There is no need for such casting games. Use a constant to mark
unmapped entries and check for that constant to make the code readable
and understandable.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Steven Rostedt <srostedt@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 7b6043ea256e..ca673c475687 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -633,6 +633,7 @@ void tracing_reset_online_cpus(struct trace_array *tr)
 }
 
 #define SAVED_CMDLINES 128
+#define NO_CMDLINE_MAP UINT_MAX
 static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1];
 static unsigned map_cmdline_to_pid[SAVED_CMDLINES];
 static char saved_cmdlines[SAVED_CMDLINES][TASK_COMM_LEN];
@@ -644,8 +645,8 @@ static atomic_t trace_record_cmdline_disabled __read_mostly;
 
 static void trace_init_cmdlines(void)
 {
-	memset(&map_pid_to_cmdline, -1, sizeof(map_pid_to_cmdline));
-	memset(&map_cmdline_to_pid, -1, sizeof(map_cmdline_to_pid));
+	memset(&map_pid_to_cmdline, NO_CMDLINE_MAP, sizeof(map_pid_to_cmdline));
+	memset(&map_cmdline_to_pid, NO_CMDLINE_MAP, sizeof(map_cmdline_to_pid));
 	cmdline_idx = 0;
 }
 
@@ -753,12 +754,12 @@ static void trace_save_cmdline(struct task_struct *tsk)
 		return;
 
 	idx = map_pid_to_cmdline[tsk->pid];
-	if (idx >= SAVED_CMDLINES) {
+	if (idx == NO_CMDLINE_MAP) {
 		idx = (cmdline_idx + 1) % SAVED_CMDLINES;
 
 		map = map_cmdline_to_pid[idx];
-		if (map <= PID_MAX_DEFAULT)
-			map_pid_to_cmdline[map] = (unsigned)-1;
+		if (map != NO_CMDLINE_MAP)
+			map_pid_to_cmdline[map] = NO_CMDLINE_MAP;
 
 		map_pid_to_cmdline[tsk->pid] = idx;
 
@@ -786,7 +787,7 @@ void trace_find_cmdline(int pid, char comm[])
 
 	__raw_spin_lock(&trace_cmdline_lock);
 	map = map_pid_to_cmdline[pid];
-	if (map >= SAVED_CMDLINES)
+	if (map == NO_CMDLINE_MAP)
 		goto out;
 
 	strcpy(comm, saved_cmdlines[map]);
-- 
cgit v1.2.3


From 50d88758a3f9787cbdbdbc030560b815721eab4b Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Wed, 18 Mar 2009 08:58:44 +0100
Subject: tracing: fix trace_find_cmdline()

Impact: prevent stale command line output

In case there is no valid command line mapping for a pid
trace_find_cmdline() returns without updating the comm buffer. The
trace dump keeps the previous entry which results in confusing trace
output:

     <idle>-0     [000]   280.702056 ....
     <idle>-23456 [000]   280.702080 ....

Update the comm buffer with "<...>" when no mapping is found.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Steven Rostedt <srostedt@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ca673c475687..06c69a260328 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -787,12 +787,11 @@ void trace_find_cmdline(int pid, char comm[])
 
 	__raw_spin_lock(&trace_cmdline_lock);
 	map = map_pid_to_cmdline[pid];
-	if (map == NO_CMDLINE_MAP)
-		goto out;
-
-	strcpy(comm, saved_cmdlines[map]);
+	if (map != NO_CMDLINE_MAP)
+		strcpy(comm, saved_cmdlines[map]);
+	else
+		strcpy(comm, "<...>");
 
- out:
 	__raw_spin_unlock(&trace_cmdline_lock);
 }
 
-- 
cgit v1.2.3


From a635cf0497342978d417cae19d4a4823932977ff Mon Sep 17 00:00:00 2001
From: Carsten Emde <Carsten.Emde@osadl.org>
Date: Wed, 18 Mar 2009 09:00:41 +0100
Subject: tracing: fix command line to pid reverse map

Impact: fix command line to pid mapping

map_cmdline_to_pid[] is checked in trace_save_cmdline(), but never
updated. This results in stale pid to command line mappings and the
tracer output will associate the wrong comm string.

Signed-off-by: Carsten Emde <Carsten.Emde@osadl.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Cc: Steven Rostedt <srostedt@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 06c69a260328..305c562dae2a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -738,8 +738,7 @@ void trace_stop_cmdline_recording(void);
 
 static void trace_save_cmdline(struct task_struct *tsk)
 {
-	unsigned map;
-	unsigned idx;
+	unsigned pid, idx;
 
 	if (!tsk->pid || unlikely(tsk->pid > PID_MAX_DEFAULT))
 		return;
@@ -757,10 +756,17 @@ static void trace_save_cmdline(struct task_struct *tsk)
 	if (idx == NO_CMDLINE_MAP) {
 		idx = (cmdline_idx + 1) % SAVED_CMDLINES;
 
-		map = map_cmdline_to_pid[idx];
-		if (map != NO_CMDLINE_MAP)
-			map_pid_to_cmdline[map] = NO_CMDLINE_MAP;
+		/*
+		 * Check whether the cmdline buffer at idx has a pid
+		 * mapped. We are going to overwrite that entry so we
+		 * need to clear the map_pid_to_cmdline. Otherwise we
+		 * would read the new comm for the old pid.
+		 */
+		pid = map_cmdline_to_pid[idx];
+		if (pid != NO_CMDLINE_MAP)
+			map_pid_to_cmdline[pid] = NO_CMDLINE_MAP;
 
+		map_cmdline_to_pid[idx] = tsk->pid;
 		map_pid_to_cmdline[tsk->pid] = idx;
 
 		cmdline_idx = idx;
-- 
cgit v1.2.3


From 490362003457f8d387f6f6e73e3a7efbf56c3314 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Tue, 17 Mar 2009 22:38:58 +0100
Subject: tracing/ftrace: stop {irqs, preempt}soff tracers when tracing is
 stopped

Impact: fix a selftest warning

In some cases, it's possible to see the following warning on irqsoff
tracer selftest:

[    4.640003] Testing tracer irqsoff: <4>------------[ cut here ]------------
[    4.653562] WARNING: at kernel/trace/trace.c:458 update_max_tr_single+0x9a/0xc4()
[    4.660000] Hardware name: System Product Name
[    4.660000] Modules linked in:
[    4.660000] Pid: 301, comm: kstop/1 Not tainted 2.6.29-rc8-tip #35837
[    4.660000] Call Trace:
[    4.660000]  [<4014b588>] warn_slowpath+0x79/0x8f
[    4.660000]  [<402d6949>] ? put_dec+0x64/0x6b
[    4.660000]  [<40162b56>] ? getnstimeofday+0x58/0xdd
[    4.660000]  [<40162210>] ? clocksource_read+0x3/0xf
[    4.660000]  [<4015eb44>] ? ktime_set+0x8/0x34
[    4.660000]  [<4014101a>] ? balance_runtime+0x8/0x56
[    4.660000]  [<405f6f11>] ? _spin_lock+0x3/0x10
[    4.660000]  [<4011f643>] ? ftrace_call+0x5/0x8
[    4.660000]  [<4015d0f1>] ? task_cputime_zero+0x3/0x27
[    4.660000]  [<40190ee7>] ? cpupri_set+0x90/0xcb
[    4.660000]  [<405f7208>] ? _spin_lock_irqsave+0x22/0x34
[    4.660000]  [<40190f12>] ? cpupri_set+0xbb/0xcb
[    4.660000]  [<405f7151>] ? _spin_unlock_irqrestore+0x23/0x35
[    4.660000]  [<4018493f>] ? ring_buffer_reset_cpu+0x27/0x51
[    4.660000]  [<405f7208>] ? _spin_lock_irqsave+0x22/0x34
[    4.660000]  [<40184962>] ? ring_buffer_reset_cpu+0x4a/0x51
[    4.660000]  [<405f7151>] ? _spin_unlock_irqrestore+0x23/0x35
[    4.660000]  [<4018cc29>] ? trace_hardirqs_off+0x1a/0x1c
[    4.660000]  [<405f7151>] ? _spin_unlock_irqrestore+0x23/0x35
[    4.660000]  [<40184962>] ? ring_buffer_reset_cpu+0x4a/0x51
[    4.660000]  [<401850f3>] ? cpumask_next+0x15/0x18
[    4.660000]  [<4018a41f>] update_max_tr_single+0x9a/0xc4
[    4.660000]  [<4014e5fe>] ? exit_notify+0x16/0xf2
[    4.660000]  [<4018cd13>] check_critical_timing+0xcc/0x11e
[    4.660000]  [<4014e5fe>] ? exit_notify+0x16/0xf2
[    4.660000]  [<4014e5fe>] ? exit_notify+0x16/0xf2
[    4.660000]  [<4018cdf1>] stop_critical_timing+0x8c/0x9f
[    4.660000]  [<4014e5c4>] ? forget_original_parent+0xac/0xd0
[    4.660000]  [<4018ce3a>] trace_hardirqs_on+0x1a/0x1c
[    4.660000]  [<4014e5c4>] forget_original_parent+0xac/0xd0
[    4.660000]  [<4014e5fe>] exit_notify+0x16/0xf2
[    4.660000]  [<4014e8a5>] do_exit+0x1cb/0x225
[    4.660000]  [<4015c72b>] ? kthread+0x0/0x69
[    4.660000]  [<4011f61d>] kernel_thread_helper+0xd/0x10
[    4.660000] ---[ end trace a7919e7f17c0a725 ]---
[    4.660164] .. no entries found ..FAILED!

During the selftest of irqsoff tracer, we do that:

	/* disable interrupts for a bit */
	local_irq_disable();
	udelay(100);
	local_irq_enable();
	/* stop the tracing. */
	tracing_stop();
	/* check both trace buffers */
	ret = trace_test_buffer(tr, NULL);

If a callsite performs a new max delay with irqs off just after
tracing_stop, update_max_tr_single() -> ring_buffer_swap_cpu()
will be called with the buffers disabled by tracing_stop(), hence
the warning, then ring_buffer_swap_cpu() return -EAGAIN and
update_max_tr_single() complains.

Fix it by also stopping the tracer before stopping the tracing globally.
A similar situation can happen with preemptoff and preemptirqsoff tracers
where we apply the same fix.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <1237325938-5240-1-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_selftest.c | 26 ++++++++++++++++++++++++++
 1 file changed, 26 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index a2ca6f0fef9b..38856ba78a92 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -315,6 +315,14 @@ trace_selftest_startup_irqsoff(struct tracer *trace, struct trace_array *tr)
 	local_irq_disable();
 	udelay(100);
 	local_irq_enable();
+
+	/*
+	 * Stop the tracer to avoid a warning subsequent
+	 * to buffer flipping failure because tracing_stop()
+	 * disables the tr and max buffers, making flipping impossible
+	 * in case of parallels max irqs off latencies.
+	 */
+	trace->stop(tr);
 	/* stop the tracing. */
 	tracing_stop();
 	/* check both trace buffers */
@@ -369,6 +377,14 @@ trace_selftest_startup_preemptoff(struct tracer *trace, struct trace_array *tr)
 	preempt_disable();
 	udelay(100);
 	preempt_enable();
+
+	/*
+	 * Stop the tracer to avoid a warning subsequent
+	 * to buffer flipping failure because tracing_stop()
+	 * disables the tr and max buffers, making flipping impossible
+	 * in case of parallels max preempt off latencies.
+	 */
+	trace->stop(tr);
 	/* stop the tracing. */
 	tracing_stop();
 	/* check both trace buffers */
@@ -428,6 +444,13 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
 	/* reverse the order of preempt vs irqs */
 	local_irq_enable();
 
+	/*
+	 * Stop the tracer to avoid a warning subsequent
+	 * to buffer flipping failure because tracing_stop()
+	 * disables the tr and max buffers, making flipping impossible
+	 * in case of parallels max irqs/preempt off latencies.
+	 */
+	trace->stop(tr);
 	/* stop the tracing. */
 	tracing_stop();
 	/* check both trace buffers */
@@ -448,6 +471,8 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
 	/* do the test by disabling interrupts first this time */
 	tracing_max_latency = 0;
 	tracing_start();
+	trace->start(tr);
+
 	preempt_disable();
 	local_irq_disable();
 	udelay(100);
@@ -455,6 +480,7 @@ trace_selftest_startup_preemptirqsoff(struct tracer *trace, struct trace_array *
 	/* reverse the order of preempt vs irqs */
 	local_irq_enable();
 
+	trace->stop(tr);
 	/* stop the tracing. */
 	tracing_stop();
 	/* check both trace buffers */
-- 
cgit v1.2.3


From f02b8624fedca39886b0eef770dca70c2f0749b3 Mon Sep 17 00:00:00 2001
From: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Date: Wed, 18 Mar 2009 17:06:21 +0530
Subject: kprobes: Fix locking imbalance in kretprobes

Fix locking imbalance in kretprobes:

=====================================
[ BUG: bad unlock balance detected! ]
-------------------------------------
kthreadd/2 is trying to release lock (&rp->lock) at:
[<c06b3080>] pre_handler_kretprobe+0xea/0xf4
but there are no more locks to release!

other info that might help us debug this:
1 lock held by kthreadd/2:
 #0:  (rcu_read_lock){..--}, at: [<c06b2b24>] __atomic_notifier_call_chain+0x0/0x5a

stack backtrace:
Pid: 2, comm: kthreadd Not tainted 2.6.29-rc8 #1
Call Trace:
 [<c06ae498>] ? printk+0xf/0x17
 [<c06b3080>] ? pre_handler_kretprobe+0xea/0xf4
 [<c044ce6c>] print_unlock_inbalance_bug+0xc3/0xce
 [<c0444d4b>] ? clocksource_read+0x7/0xa
 [<c04450a4>] ? getnstimeofday+0x5f/0xf6
 [<c044a9ca>] ? register_lock_class+0x17/0x293
 [<c044b72c>] ? mark_lock+0x1e/0x30b
 [<c0448956>] ? tick_dev_program_event+0x4a/0xbc
 [<c0498100>] ? __slab_alloc+0xa5/0x415
 [<c06b2fbe>] ? pre_handler_kretprobe+0x28/0xf4
 [<c06b3080>] ? pre_handler_kretprobe+0xea/0xf4
 [<c044cf1b>] lock_release_non_nested+0xa4/0x1a5
 [<c06b3080>] ? pre_handler_kretprobe+0xea/0xf4
 [<c044d15d>] lock_release+0x141/0x166
 [<c06b07dd>] _spin_unlock_irqrestore+0x19/0x50
 [<c06b3080>] pre_handler_kretprobe+0xea/0xf4
 [<c06b20b5>] kprobe_exceptions_notify+0x1c9/0x43e
 [<c06b2b02>] notifier_call_chain+0x26/0x48
 [<c06b2b5b>] __atomic_notifier_call_chain+0x37/0x5a
 [<c06b2b24>] ? __atomic_notifier_call_chain+0x0/0x5a
 [<c06b2b8a>] atomic_notifier_call_chain+0xc/0xe
 [<c0442d0d>] notify_die+0x2d/0x2f
 [<c06b0f9c>] do_int3+0x1f/0x71
 [<c06b0e84>] int3+0x2c/0x34
 [<c042d476>] ? do_fork+0x1/0x288
 [<c040221b>] ? kernel_thread+0x71/0x79
 [<c043ed1b>] ? kthread+0x0/0x60
 [<c043ed1b>] ? kthread+0x0/0x60
 [<c04040b8>] ? kernel_thread_helper+0x0/0x10
 [<c043ec7f>] kthreadd+0xac/0x148
 [<c043ebd3>] ? kthreadd+0x0/0x148
 [<c04040bf>] kernel_thread_helper+0x7/0x10

Signed-off-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Tested-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
Cc: Masami Hiramatsu <mhiramat@redhat.com>
Cc: Jim Keniston <jkenisto@us.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: <stable@kernel.org> [2.6.29.x, 2.6.28.x, 2.6.27.x]
LKML-Reference: <20090318113621.GB4129@in.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/kprobes.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 479d4d5672f9..5016bfb682b9 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -919,10 +919,8 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
 		ri->rp = rp;
 		ri->task = current;
 
-		if (rp->entry_handler && rp->entry_handler(ri, regs)) {
-			spin_unlock_irqrestore(&rp->lock, flags);
+		if (rp->entry_handler && rp->entry_handler(ri, regs))
 			return 0;
-		}
 
 		arch_prepare_kretprobe(ri, regs);
 
-- 
cgit v1.2.3


From 4acd4d00f716873e27e7b60ae292cbdbfae674dd Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Wed, 18 Mar 2009 10:40:24 -0400
Subject: tracing: give easy way to clear trace buffer

There is currently no easy way to clear the trace buffer. Currently
the only way is to change the current tracer.

This patch lets the user clear the trace buffer by simply writing
into the trace files.

 echo > /debug/tracing/trace

or to clear a single cpu (i.e. for CPU 1):

 echo > /debug/tracing/per_cpu/cpu1/trace

Requested-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c | 41 +++++++++++++++++++++++++++++++++--------
 1 file changed, 33 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a2d13e8c8fd8..8d981ababc45 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1941,9 +1941,14 @@ int tracing_open_generic(struct inode *inode, struct file *filp)
 static int tracing_release(struct inode *inode, struct file *file)
 {
 	struct seq_file *m = (struct seq_file *)file->private_data;
-	struct trace_iterator *iter = m->private;
+	struct trace_iterator *iter;
 	int cpu;
 
+	if (!(file->f_mode & FMODE_READ))
+		return 0;
+
+	iter = m->private;
+
 	mutex_lock(&trace_types_lock);
 	for_each_tracing_cpu(cpu) {
 		if (iter->buffer_iter[cpu])
@@ -1969,12 +1974,24 @@ static int tracing_open(struct inode *inode, struct file *file)
 	struct trace_iterator *iter;
 	int ret = 0;
 
-	iter = __tracing_open(inode, file);
-	if (IS_ERR(iter))
-		ret = PTR_ERR(iter);
-	else if (trace_flags & TRACE_ITER_LATENCY_FMT)
-		iter->iter_flags |= TRACE_FILE_LAT_FMT;
+	/* If this file was open for write, then erase contents */
+	if ((file->f_mode & FMODE_WRITE) &&
+	    !(file->f_flags & O_APPEND)) {
+		long cpu = (long) inode->i_private;
+
+		if (cpu == TRACE_PIPE_ALL_CPU)
+			tracing_reset_online_cpus(&global_trace);
+		else
+			tracing_reset(&global_trace, cpu);
+	}
 
+	if (file->f_mode & FMODE_READ) {
+		iter = __tracing_open(inode, file);
+		if (IS_ERR(iter))
+			ret = PTR_ERR(iter);
+		else if (trace_flags & TRACE_ITER_LATENCY_FMT)
+			iter->iter_flags |= TRACE_FILE_LAT_FMT;
+	}
 	return ret;
 }
 
@@ -2049,9 +2066,17 @@ static int show_traces_open(struct inode *inode, struct file *file)
 	return ret;
 }
 
+static ssize_t
+tracing_write_stub(struct file *filp, const char __user *ubuf,
+		   size_t count, loff_t *ppos)
+{
+	return count;
+}
+
 static const struct file_operations tracing_fops = {
 	.open		= tracing_open,
 	.read		= seq_read,
+	.write		= tracing_write_stub,
 	.llseek		= seq_lseek,
 	.release	= tracing_release,
 };
@@ -3576,7 +3601,7 @@ static void tracing_init_debugfs_percpu(long cpu)
 		pr_warning("Could not create debugfs 'trace_pipe' entry\n");
 
 	/* per cpu trace */
-	entry = debugfs_create_file("trace", 0444, d_cpu,
+	entry = debugfs_create_file("trace", 0644, d_cpu,
 				(void *) cpu, &tracing_fops);
 	if (!entry)
 		pr_warning("Could not create debugfs 'trace' entry\n");
@@ -3890,7 +3915,7 @@ static __init int tracer_init_debugfs(void)
 	if (!entry)
 		pr_warning("Could not create debugfs 'tracing_cpumask' entry\n");
 
-	entry = debugfs_create_file("trace", 0444, d_tracer,
+	entry = debugfs_create_file("trace", 0644, d_tracer,
 				 (void *) TRACE_PIPE_ALL_CPU, &tracing_fops);
 	if (!entry)
 		pr_warning("Could not create debugfs 'trace' entry\n");
-- 
cgit v1.2.3


From 09933a108e6730a464a1ab676c9decc11aee0edc Mon Sep 17 00:00:00 2001
From: Jaswinder Singh Rajput <jaswinder@kernel.org>
Date: Wed, 18 Mar 2009 22:18:56 +0530
Subject: tracing: fix oops in tracepoint_update_probe_range()

Change this crash:

 BUG: unable to handle kernel NULL pointer dereference at (null)
 IP: [<ffffffff8107d4de>] tracepoint_update_probe_range+0x1f/0x9b
 PGD 13d5fb067 PUD 13d688067 PMD 0
 Oops: 0000 [#1] SMP

To a more debuggable WARN_ONCE().

Signed-off-by: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <1237394936.3132.1.camel@localhost.localdomain>
[ moved the check outside the lock and added a WARN_ON(). ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/tracepoint.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index 79602740bbb5..adf28734fe2d 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -272,12 +272,17 @@ static void disable_tracepoint(struct tracepoint *elem)
  *
  * Updates the probe callback corresponding to a range of tracepoints.
  */
-void tracepoint_update_probe_range(struct tracepoint *begin,
-	struct tracepoint *end)
+void
+tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end)
 {
 	struct tracepoint *iter;
 	struct tracepoint_entry *mark_entry;
 
+	if (!begin) {
+		WARN_ON_ONCE(1);
+		return;
+	}
+
 	mutex_lock(&tracepoints_mutex);
 	for (iter = begin; iter < end; iter++) {
 		mark_entry = get_tracepoint(iter->name);
-- 
cgit v1.2.3


From ec625cb29e66824f7ce41082617aeb93fa4e42e2 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 18 Mar 2009 19:54:04 +0100
Subject: tracepoints: dont update zero-sized tracepoint sections

Zero-sized tracepoint sections can occur if tracing is enabled but
no tracepoint is defined. Do not emit a warning in that case.

Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Jaswinder Singh Rajput <jaswinderrajput@gmail.com>
LKML-Reference: <1237394936.3132.1.camel@localhost.localdomain>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/tracepoint.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/tracepoint.c b/kernel/tracepoint.c
index adf28734fe2d..1ef5d3a601c7 100644
--- a/kernel/tracepoint.c
+++ b/kernel/tracepoint.c
@@ -278,10 +278,8 @@ tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end)
 	struct tracepoint *iter;
 	struct tracepoint_entry *mark_entry;
 
-	if (!begin) {
-		WARN_ON_ONCE(1);
+	if (!begin)
 		return;
-	}
 
 	mutex_lock(&tracepoints_mutex);
 	for (iter = begin; iter < end; iter++) {
-- 
cgit v1.2.3


From df7c8e845e8e2030e8ae947e0ace56d184d0e9a0 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 19 Mar 2009 15:22:20 +1030
Subject: cpumask: remove cpumask allocation from idle_balance

Impact: fix circular locking

Steven reports a circular locking from alloc_cpumask_var doing
a wakeup. We get rid of this using the tried-and-true technique
of using a per-cpu cpumask_var_t rather than doing an alloc
every time.

Simpler and more robust than a rare, implicit allocation within
an atomic codepath.

Reported-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
LKML-Reference: <alpine.DEB.2.00.0903181729360.31583@gandalf.stny.rr.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 35 ++++++++++++++++++-----------------
 1 file changed, 18 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 5dabd80c3c15..48862d418bed 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3448,19 +3448,23 @@ find_busiest_queue(struct sched_group *group, enum cpu_idle_type idle,
  */
 #define MAX_PINNED_INTERVAL	512
 
+/* Working cpumask for load_balance and load_balance_newidle. */
+static DEFINE_PER_CPU(cpumask_var_t, load_balance_tmpmask);
+
 /*
  * Check this_cpu to ensure it is balanced within domain. Attempt to move
  * tasks if there is an imbalance.
  */
 static int load_balance(int this_cpu, struct rq *this_rq,
 			struct sched_domain *sd, enum cpu_idle_type idle,
-			int *balance, struct cpumask *cpus)
+			int *balance)
 {
 	int ld_moved, all_pinned = 0, active_balance = 0, sd_idle = 0;
 	struct sched_group *group;
 	unsigned long imbalance;
 	struct rq *busiest;
 	unsigned long flags;
+	struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
 
 	cpumask_setall(cpus);
 
@@ -3615,8 +3619,7 @@ out:
  * this_rq is locked.
  */
 static int
-load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
-			struct cpumask *cpus)
+load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd)
 {
 	struct sched_group *group;
 	struct rq *busiest = NULL;
@@ -3624,6 +3627,7 @@ load_balance_newidle(int this_cpu, struct rq *this_rq, struct sched_domain *sd,
 	int ld_moved = 0;
 	int sd_idle = 0;
 	int all_pinned = 0;
+	struct cpumask *cpus = __get_cpu_var(load_balance_tmpmask);
 
 	cpumask_setall(cpus);
 
@@ -3764,10 +3768,6 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
 	struct sched_domain *sd;
 	int pulled_task = 0;
 	unsigned long next_balance = jiffies + HZ;
-	cpumask_var_t tmpmask;
-
-	if (!alloc_cpumask_var(&tmpmask, GFP_ATOMIC))
-		return;
 
 	for_each_domain(this_cpu, sd) {
 		unsigned long interval;
@@ -3778,7 +3778,7 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
 		if (sd->flags & SD_BALANCE_NEWIDLE)
 			/* If we've pulled tasks over stop searching: */
 			pulled_task = load_balance_newidle(this_cpu, this_rq,
-							   sd, tmpmask);
+							   sd);
 
 		interval = msecs_to_jiffies(sd->balance_interval);
 		if (time_after(next_balance, sd->last_balance + interval))
@@ -3793,7 +3793,6 @@ static void idle_balance(int this_cpu, struct rq *this_rq)
 		 */
 		this_rq->next_balance = next_balance;
 	}
-	free_cpumask_var(tmpmask);
 }
 
 /*
@@ -3943,11 +3942,6 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
 	unsigned long next_balance = jiffies + 60*HZ;
 	int update_next_balance = 0;
 	int need_serialize;
-	cpumask_var_t tmp;
-
-	/* Fails alloc?  Rebalancing probably not a priority right now. */
-	if (!alloc_cpumask_var(&tmp, GFP_ATOMIC))
-		return;
 
 	for_each_domain(cpu, sd) {
 		if (!(sd->flags & SD_LOAD_BALANCE))
@@ -3972,7 +3966,7 @@ static void rebalance_domains(int cpu, enum cpu_idle_type idle)
 		}
 
 		if (time_after_eq(jiffies, sd->last_balance + interval)) {
-			if (load_balance(cpu, rq, sd, idle, &balance, tmp)) {
+			if (load_balance(cpu, rq, sd, idle, &balance)) {
 				/*
 				 * We've pulled tasks over so either we're no
 				 * longer idle, or one of our SMT siblings is
@@ -4006,8 +4000,6 @@ out:
 	 */
 	if (likely(update_next_balance))
 		rq->next_balance = next_balance;
-
-	free_cpumask_var(tmp);
 }
 
 /*
@@ -8303,6 +8295,9 @@ void __init sched_init(void)
 #endif
 #ifdef CONFIG_USER_SCHED
 	alloc_size *= 2;
+#endif
+#ifdef CONFIG_CPUMASK_OFFSTACK
+	alloc_size *= num_possible_cpus() * cpumask_size();
 #endif
 	/*
 	 * As sched_init() is called before page_alloc is setup,
@@ -8341,6 +8336,12 @@ void __init sched_init(void)
 		ptr += nr_cpu_ids * sizeof(void **);
 #endif /* CONFIG_USER_SCHED */
 #endif /* CONFIG_RT_GROUP_SCHED */
+#ifdef CONFIG_CPUMASK_OFFSTACK
+		for_each_possible_cpu(i) {
+			per_cpu(load_balance_tmpmask, i) = (void *)ptr;
+			ptr += cpumask_size();
+		}
+#endif /* CONFIG_CPUMASK_OFFSTACK */
 	}
 
 #ifdef CONFIG_SMP
-- 
cgit v1.2.3


From 4a44bac1f98223ed77e47bf3b42fcfd10cddd85f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 19 Mar 2009 13:21:44 +0100
Subject: symbols, stacktrace: look up init symbols after module symbols

Impact: fix incomplete stacktraces

I noticed such weird stacktrace entries in lockdep dumps:

[    0.285956] {HARDIRQ-ON-W} state was registered at:
[    0.285956]   [<ffffffff802bce90>] mark_irqflags+0xbe/0x125
[    0.285956]   [<ffffffff802bf2fd>] __lock_acquire+0x674/0x82d
[    0.285956]   [<ffffffff802bf5b2>] lock_acquire+0xfc/0x128
[    0.285956]   [<ffffffff8135b636>] rt_spin_lock+0xc8/0xd0
[    0.285956]   [<ffffffffffffffff>] 0xffffffffffffffff

The stacktrace entry is cut off after rt_spin_lock.

After much debugging i found out that stacktrace entries that
belong to init symbols dont get printed out, due to commit:

  a2da405: module: Don't report discarded init pages as kernel text.

The reason is this check added to core_kernel_text():

-       if (addr >= (unsigned long)_sinittext &&
+       if (system_state == SYSTEM_BOOTING &&
+           addr >= (unsigned long)_sinittext &&
            addr <= (unsigned long)_einittext)
                return 1;

This will discard inittext symbols even though their symbol table
is still present and even though stacktraces done while the system
was booting up might still be relevant.

To not reintroduce the (not well-specified) bug addressed in that
commit, first do a module symbols lookup, then a final init-symbols
lookup.

This will work fine on architectures that have separate address
spaces for modules (such as x86) - and should not crash any other
architectures either.

Acked-by: Peter Zijlstra <peterz@infradead.org>
Cc: Rusty Russell <rusty@rustcorp.com.au>
LKML-Reference: <new-discussion>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/extable.c | 25 ++++++++++++++++++++++---
 1 file changed, 22 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/extable.c b/kernel/extable.c
index e136ed8d82ba..c46da6a47036 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -41,6 +41,14 @@ const struct exception_table_entry *search_exception_tables(unsigned long addr)
 	return e;
 }
 
+static inline int init_kernel_text(unsigned long addr)
+{
+	if (addr >= (unsigned long)_sinittext &&
+	    addr <= (unsigned long)_einittext)
+		return 1;
+	return 0;
+}
+
 __notrace_funcgraph int core_kernel_text(unsigned long addr)
 {
 	if (addr >= (unsigned long)_stext &&
@@ -48,8 +56,7 @@ __notrace_funcgraph int core_kernel_text(unsigned long addr)
 		return 1;
 
 	if (system_state == SYSTEM_BOOTING &&
-	    addr >= (unsigned long)_sinittext &&
-	    addr <= (unsigned long)_einittext)
+	    init_kernel_text(addr))
 		return 1;
 	return 0;
 }
@@ -58,7 +65,19 @@ __notrace_funcgraph int __kernel_text_address(unsigned long addr)
 {
 	if (core_kernel_text(addr))
 		return 1;
-	return __module_text_address(addr) != NULL;
+	if (__module_text_address(addr))
+		return 1;
+	/*
+	 * There might be init symbols in saved stacktraces.
+	 * Give those symbols a chance to be printed in
+	 * backtraces (such as lockdep traces).
+	 *
+	 * Since we are after the module-symbols check, there's
+	 * no danger of address overlap:
+	 */
+	if (init_kernel_text(addr))
+		return 1;
+	return 0;
 }
 
 int kernel_text_address(unsigned long addr)
-- 
cgit v1.2.3


From 8c083f081d0014057901c68a0a3e0f8ca7ac8d23 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Thu, 19 Mar 2009 15:22:20 +1030
Subject: cpumask: remove cpumask allocation from idle_balance, fix

Impact: fix boot crash

Fix typo in the size calculation.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
LKML-Reference: <alpine.DEB.2.00.0903181729360.31583@gandalf.stny.rr.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 48862d418bed..11dd52780adb 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -8297,7 +8297,7 @@ void __init sched_init(void)
 	alloc_size *= 2;
 #endif
 #ifdef CONFIG_CPUMASK_OFFSTACK
-	alloc_size *= num_possible_cpus() * cpumask_size();
+	alloc_size += num_possible_cpus() * cpumask_size();
 #endif
 	/*
 	 * As sched_init() is called before page_alloc is setup,
-- 
cgit v1.2.3


From ac5f6c96859e9a664ac05b04bc96ed1caad5fe29 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 19 Mar 2009 11:29:23 -0400
Subject: function-graph: consolidate prologues for output

Impact: clean up

The prologue of the function graph entry, return and comments all
start out pretty much the same. Each of these duplicate code and
do so slightly differently.

This patch consolidates the printing of the pid, absolute time,
cpu and proc (and for entry, the interrupt).

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_functions_graph.c | 98 +++++++++++-------------------------
 1 file changed, 29 insertions(+), 69 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 6004ccac2dd7..2d4d185cd0d8 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -554,24 +554,24 @@ print_graph_entry_nested(struct ftrace_graph_ent_entry *entry,
 }
 
 static enum print_line_t
-print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
-			struct trace_iterator *iter)
+print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
+		     int type, unsigned long addr)
 {
-	int ret;
-	int cpu = iter->cpu;
-	pid_t *last_entry = iter->private;
 	struct trace_entry *ent = iter->ent;
-	struct ftrace_graph_ent *call = &field->graph_ent;
-	struct ftrace_graph_ret_entry *leaf_ret;
+	pid_t *last_pid = iter->private;
+	int cpu = iter->cpu;
+	int ret;
 
 	/* Pid */
-	if (verif_pid(s, ent->pid, cpu, last_entry) == TRACE_TYPE_PARTIAL_LINE)
+	if (verif_pid(s, ent->pid, cpu, last_pid) == TRACE_TYPE_PARTIAL_LINE)
 		return TRACE_TYPE_PARTIAL_LINE;
 
-	/* Interrupt */
-	ret = print_graph_irq(iter, call->func, TRACE_GRAPH_ENT, cpu, ent->pid);
-	if (ret == TRACE_TYPE_PARTIAL_LINE)
-		return TRACE_TYPE_PARTIAL_LINE;
+	if (type) {
+		/* Interrupt */
+		ret = print_graph_irq(iter, addr, type, cpu, ent->pid);
+		if (ret == TRACE_TYPE_PARTIAL_LINE)
+			return TRACE_TYPE_PARTIAL_LINE;
+	}
 
 	/* Absolute time */
 	if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) {
@@ -598,6 +598,20 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
 			return TRACE_TYPE_PARTIAL_LINE;
 	}
 
+	return 0;
+}
+
+static enum print_line_t
+print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
+			struct trace_iterator *iter)
+{
+	int cpu = iter->cpu;
+	struct ftrace_graph_ent *call = &field->graph_ent;
+	struct ftrace_graph_ret_entry *leaf_ret;
+
+	if (print_graph_prologue(iter, s, TRACE_GRAPH_ENT, call->func))
+		return TRACE_TYPE_PARTIAL_LINE;
+
 	leaf_ret = get_return_for_leaf(iter, field);
 	if (leaf_ret)
 		return print_graph_entry_leaf(iter, field, leaf_ret, s);
@@ -613,38 +627,12 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
 	int i;
 	int ret;
 	int cpu = iter->cpu;
-	pid_t *last_pid = iter->private, pid = ent->pid;
+	pid_t pid = ent->pid;
 	unsigned long long duration = trace->rettime - trace->calltime;
 
-	/* Pid */
-	if (verif_pid(s, pid, cpu, last_pid) == TRACE_TYPE_PARTIAL_LINE)
+	if (print_graph_prologue(iter, s, 0, 0))
 		return TRACE_TYPE_PARTIAL_LINE;
 
-	/* Absolute time */
-	if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) {
-		ret = print_graph_abs_time(iter->ts, s);
-		if (!ret)
-			return TRACE_TYPE_PARTIAL_LINE;
-	}
-
-	/* Cpu */
-	if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
-		ret = print_graph_cpu(s, cpu);
-		if (ret == TRACE_TYPE_PARTIAL_LINE)
-			return TRACE_TYPE_PARTIAL_LINE;
-	}
-
-	/* Proc */
-	if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
-		ret = print_graph_proc(s, ent->pid);
-		if (ret == TRACE_TYPE_PARTIAL_LINE)
-			return TRACE_TYPE_PARTIAL_LINE;
-
-		ret = trace_seq_printf(s, " | ");
-		if (!ret)
-			return TRACE_TYPE_PARTIAL_LINE;
-	}
-
 	/* Overhead */
 	ret = print_graph_overhead(duration, s);
 	if (!ret)
@@ -689,38 +677,10 @@ print_graph_comment(struct bprint_entry *trace, struct trace_seq *s,
 {
 	int i;
 	int ret;
-	int cpu = iter->cpu;
-	pid_t *last_pid = iter->private;
 
-	/* Pid */
-	if (verif_pid(s, ent->pid, cpu, last_pid) == TRACE_TYPE_PARTIAL_LINE)
+	if (print_graph_prologue(iter, s, 0, 0))
 		return TRACE_TYPE_PARTIAL_LINE;
 
-	/* Absolute time */
-	if (tracer_flags.val & TRACE_GRAPH_PRINT_ABS_TIME) {
-		ret = print_graph_abs_time(iter->ts, s);
-		if (!ret)
-			return TRACE_TYPE_PARTIAL_LINE;
-	}
-
-	/* Cpu */
-	if (tracer_flags.val & TRACE_GRAPH_PRINT_CPU) {
-		ret = print_graph_cpu(s, cpu);
-		if (ret == TRACE_TYPE_PARTIAL_LINE)
-			return TRACE_TYPE_PARTIAL_LINE;
-	}
-
-	/* Proc */
-	if (tracer_flags.val & TRACE_GRAPH_PRINT_PROC) {
-		ret = print_graph_proc(s, ent->pid);
-		if (ret == TRACE_TYPE_PARTIAL_LINE)
-			return TRACE_TYPE_PARTIAL_LINE;
-
-		ret = trace_seq_printf(s, " | ");
-		if (!ret)
-			return TRACE_TYPE_PARTIAL_LINE;
-	}
-
 	/* No overhead */
 	ret = print_graph_overhead(-1, s);
 	if (!ret)
-- 
cgit v1.2.3


From 3bf832ce1fe6988148d392599f34ca0c6a34427d Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Thu, 19 Mar 2009 14:47:33 +0100
Subject: tracing/ring-buffer: fix non cpu hotplug case

Impact: fix warning with irqsoff tracer

The ring buffer allocates its buffers on pre-smp time (early_initcall).
It means that, at first, only the boot cpu buffer is allocated and
the ring-buffer cpumask only has the boot cpu set (cpu_online_mask).

Later, the secondary cpu will show up and the ring-buffer will be notified
about this event: the appropriate buffer will be allocated and the cpumask
will be updated.

Unfortunately, if !CONFIG_CPU_HOTPLUG, the ring-buffer will not be
notified about the secondary cpus, meaning that the cpumask will have
only the cpu boot set, and only one cpu buffer allocated.

We fix that by using cpu_possible_mask if !CONFIG_CPU_HOTPLUG.

This patch fixes the following warning with irqsoff tracer running:

[  169.317794] WARNING: at kernel/trace/trace.c:466 update_max_tr_single+0xcc/0xf3()
[  169.318002] Hardware name: AMILO Li 2727
[  169.318002] Modules linked in:
[  169.318002] Pid: 5624, comm: bash Not tainted 2.6.29-rc8-tip-02636-g6aafa6c #11
[  169.318002] Call Trace:
[  169.318002]  [<ffffffff81036182>] warn_slowpath+0xea/0x13d
[  169.318002]  [<ffffffff8100b9d6>] ? ftrace_call+0x5/0x2b
[  169.318002]  [<ffffffff8100b9d6>] ? ftrace_call+0x5/0x2b
[  169.318002]  [<ffffffff8100b9d1>] ? ftrace_call+0x0/0x2b
[  169.318002]  [<ffffffff8101ef10>] ? ftrace_modify_code+0xa9/0x108
[  169.318002]  [<ffffffff8106e27f>] ? trace_hardirqs_off+0x25/0x27
[  169.318002]  [<ffffffff8149afe7>] ? _spin_unlock_irqrestore+0x1f/0x2d
[  169.318002]  [<ffffffff81064f52>] ? ring_buffer_reset_cpu+0xf6/0xfb
[  169.318002]  [<ffffffff8106637c>] ? ring_buffer_reset+0x36/0x48
[  169.318002]  [<ffffffff8106aeda>] update_max_tr_single+0xcc/0xf3
[  169.318002]  [<ffffffff8100bc17>] ? sysret_check+0x22/0x5d
[  169.318002]  [<ffffffff8106e3ea>] stop_critical_timing+0x142/0x204
[  169.318002]  [<ffffffff8106e4cf>] trace_hardirqs_on_caller+0x23/0x25
[  169.318002]  [<ffffffff8149ac28>] trace_hardirqs_on_thunk+0x3a/0x3c
[  169.318002]  [<ffffffff8100bc17>] ? sysret_check+0x22/0x5d
[  169.318002] ---[ end trace db76cbf775a750cf ]---

Because this tracer may try to swap two cpu ring buffers for an
unregistered cpu on the ring buffer.

This patch might also fix a fair loss of traces due to unallocated buffers
for secondary cpus.

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-b: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <1237470453-5427-1-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ring_buffer.c | 9 +++++++++
 1 file changed, 9 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index bbf51922a8ca..384ca5d9d729 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -577,8 +577,17 @@ struct ring_buffer *ring_buffer_alloc(unsigned long size, unsigned flags)
 	if (buffer->pages == 1)
 		buffer->pages++;
 
+	/*
+	 * In case of non-hotplug cpu, if the ring-buffer is allocated
+	 * in early initcall, it will not be notified of secondary cpus.
+	 * In that off case, we need to allocate for all possible cpus.
+	 */
+#ifdef CONFIG_HOTPLUG_CPU
 	get_online_cpus();
 	cpumask_copy(buffer->cpumask, cpu_online_mask);
+#else
+	cpumask_copy(buffer->cpumask, cpu_possible_mask);
+#endif
 	buffer->cpus = nr_cpu_ids;
 
 	bsize = sizeof(void *) * nr_cpu_ids;
-- 
cgit v1.2.3


From 5ef841f6f32dce0b752a4fa0622781ee67a0e874 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 19 Mar 2009 12:20:38 -0400
Subject: tracing: make print_(b)printk_msg_only global

This patch makes print_printk_msg_only and print_bprintk_msg_only
global for other functions to use. It also renames them by adding
a "trace_" to the beginning to avoid namespace collisions.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c        | 36 ++----------------------------------
 kernel/trace/trace_output.c | 32 ++++++++++++++++++++++++++++++++
 kernel/trace/trace_output.h |  5 +++++
 3 files changed, 39 insertions(+), 34 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 8d981ababc45..c637cb687cf2 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1694,38 +1694,6 @@ static enum print_line_t print_hex_fmt(struct trace_iterator *iter)
 	return TRACE_TYPE_HANDLED;
 }
 
-static enum print_line_t print_bprintk_msg_only(struct trace_iterator *iter)
-{
-	struct trace_seq *s = &iter->seq;
-	struct trace_entry *entry = iter->ent;
-	struct bprint_entry *field;
-	int ret;
-
-	trace_assign_type(field, entry);
-
-	ret = trace_seq_bprintf(s, field->fmt, field->buf);
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	return TRACE_TYPE_HANDLED;
-}
-
-static enum print_line_t print_printk_msg_only(struct trace_iterator *iter)
-{
-	struct trace_seq *s = &iter->seq;
-	struct trace_entry *entry = iter->ent;
-	struct print_entry *field;
-	int ret;
-
-	trace_assign_type(field, entry);
-
-	ret = trace_seq_printf(s, "%s", field->buf);
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
-
-	return TRACE_TYPE_HANDLED;
-}
-
 static enum print_line_t print_bin_fmt(struct trace_iterator *iter)
 {
 	struct trace_seq *s = &iter->seq;
@@ -1787,12 +1755,12 @@ static enum print_line_t print_trace_line(struct trace_iterator *iter)
 	if (iter->ent->type == TRACE_BPRINT &&
 			trace_flags & TRACE_ITER_PRINTK &&
 			trace_flags & TRACE_ITER_PRINTK_MSGONLY)
-		return print_bprintk_msg_only(iter);
+		return trace_print_bprintk_msg_only(iter);
 
 	if (iter->ent->type == TRACE_PRINT &&
 			trace_flags & TRACE_ITER_PRINTK &&
 			trace_flags & TRACE_ITER_PRINTK_MSGONLY)
-		return print_printk_msg_only(iter);
+		return trace_print_printk_msg_only(iter);
 
 	if (trace_flags & TRACE_ITER_BIN)
 		return print_bin_fmt(iter);
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 6a4c9dea191e..b45141748af5 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -19,6 +19,38 @@ static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly;
 
 static int next_event_type = __TRACE_LAST_TYPE + 1;
 
+enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter)
+{
+	struct trace_seq *s = &iter->seq;
+	struct trace_entry *entry = iter->ent;
+	struct bprint_entry *field;
+	int ret;
+
+	trace_assign_type(field, entry);
+
+	ret = trace_seq_bprintf(s, field->fmt, field->buf);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	return TRACE_TYPE_HANDLED;
+}
+
+enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter)
+{
+	struct trace_seq *s = &iter->seq;
+	struct trace_entry *entry = iter->ent;
+	struct print_entry *field;
+	int ret;
+
+	trace_assign_type(field, entry);
+
+	ret = trace_seq_printf(s, "%s", field->buf);
+	if (!ret)
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	return TRACE_TYPE_HANDLED;
+}
+
 /**
  * trace_seq_printf - sequence printing of trace information
  * @s: trace sequence descriptor
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 3b90e6ade1aa..35c422fb51a9 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -15,6 +15,11 @@ struct trace_event {
 	trace_print_func	binary;
 };
 
+extern enum print_line_t
+trace_print_bprintk_msg_only(struct trace_iterator *iter);
+extern enum print_line_t
+trace_print_printk_msg_only(struct trace_iterator *iter);
+
 extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...)
 	__attribute__ ((format (printf, 2, 3)));
 extern int
-- 
cgit v1.2.3


From 2fbcdb35aca614f9529a0e7d340146cf0b71684f Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 19 Mar 2009 13:24:42 -0400
Subject: function-graph: calculate function depth within function graph tracer

Currently, the function graph tracer depends on the trace_printk
to record the depth. All the information is already there in the trace
to calculate function depth, with the exception of having the printk
be the first item. But as soon as a entry or exit is reached, then
we know the depth.

This patch changes the iter->private data from recording a per cpu
last_pid, to a structure that holds both the last_pid and the current
depth. This data is used to determine the function depth for the
printks.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_functions_graph.c | 91 +++++++++++++++++++++++++++---------
 1 file changed, 69 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 2d4d185cd0d8..66ea23b64fe6 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -14,6 +14,11 @@
 #include "trace.h"
 #include "trace_output.h"
 
+struct fgraph_data {
+	pid_t		last_pid;
+	int		depth;
+};
+
 #define TRACE_GRAPH_INDENT	2
 
 /* Flag options */
@@ -231,16 +236,16 @@ print_graph_proc(struct trace_seq *s, pid_t pid)
 
 /* If the pid changed since the last trace, output this event */
 static enum print_line_t
-verif_pid(struct trace_seq *s, pid_t pid, int cpu, pid_t *last_pids_cpu)
+verif_pid(struct trace_seq *s, pid_t pid, int cpu, struct fgraph_data *data)
 {
 	pid_t prev_pid;
 	pid_t *last_pid;
 	int ret;
 
-	if (!last_pids_cpu)
+	if (!data)
 		return TRACE_TYPE_HANDLED;
 
-	last_pid = per_cpu_ptr(last_pids_cpu, cpu);
+	last_pid = &(per_cpu_ptr(data, cpu)->last_pid);
 
 	if (*last_pid == pid)
 		return TRACE_TYPE_HANDLED;
@@ -471,6 +476,7 @@ print_graph_entry_leaf(struct trace_iterator *iter,
 		struct ftrace_graph_ent_entry *entry,
 		struct ftrace_graph_ret_entry *ret_entry, struct trace_seq *s)
 {
+	struct fgraph_data *data = iter->private;
 	struct ftrace_graph_ret *graph_ret;
 	struct ftrace_graph_ent *call;
 	unsigned long long duration;
@@ -481,6 +487,18 @@ print_graph_entry_leaf(struct trace_iterator *iter,
 	call = &entry->graph_ent;
 	duration = graph_ret->rettime - graph_ret->calltime;
 
+	if (data) {
+		int cpu = iter->cpu;
+		int *depth = &(per_cpu_ptr(data, cpu)->depth);
+
+		/*
+		 * Comments display at + 1 to depth. Since
+		 * this is a leaf function, keep the comments
+		 * equal to this depth.
+		 */
+		*depth = call->depth - 1;
+	}
+
 	/* Overhead */
 	ret = print_graph_overhead(duration, s);
 	if (!ret)
@@ -512,12 +530,21 @@ print_graph_entry_leaf(struct trace_iterator *iter,
 }
 
 static enum print_line_t
-print_graph_entry_nested(struct ftrace_graph_ent_entry *entry,
-			struct trace_seq *s, pid_t pid, int cpu)
+print_graph_entry_nested(struct trace_iterator *iter,
+			 struct ftrace_graph_ent_entry *entry,
+			 struct trace_seq *s, int cpu)
 {
-	int i;
-	int ret;
 	struct ftrace_graph_ent *call = &entry->graph_ent;
+	struct fgraph_data *data = iter->private;
+	int ret;
+	int i;
+
+	if (data) {
+		int cpu = iter->cpu;
+		int *depth = &(per_cpu_ptr(data, cpu)->depth);
+
+		*depth = call->depth;
+	}
 
 	/* No overhead */
 	ret = print_graph_overhead(-1, s);
@@ -557,13 +584,13 @@ static enum print_line_t
 print_graph_prologue(struct trace_iterator *iter, struct trace_seq *s,
 		     int type, unsigned long addr)
 {
+	struct fgraph_data *data = iter->private;
 	struct trace_entry *ent = iter->ent;
-	pid_t *last_pid = iter->private;
 	int cpu = iter->cpu;
 	int ret;
 
 	/* Pid */
-	if (verif_pid(s, ent->pid, cpu, last_pid) == TRACE_TYPE_PARTIAL_LINE)
+	if (verif_pid(s, ent->pid, cpu, data) == TRACE_TYPE_PARTIAL_LINE)
 		return TRACE_TYPE_PARTIAL_LINE;
 
 	if (type) {
@@ -616,7 +643,7 @@ print_graph_entry(struct ftrace_graph_ent_entry *field, struct trace_seq *s,
 	if (leaf_ret)
 		return print_graph_entry_leaf(iter, field, leaf_ret, s);
 	else
-		return print_graph_entry_nested(field, s, iter->ent->pid, cpu);
+		return print_graph_entry_nested(iter, field, s, cpu);
 
 }
 
@@ -624,11 +651,24 @@ static enum print_line_t
 print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
 		   struct trace_entry *ent, struct trace_iterator *iter)
 {
-	int i;
-	int ret;
-	int cpu = iter->cpu;
-	pid_t pid = ent->pid;
 	unsigned long long duration = trace->rettime - trace->calltime;
+	struct fgraph_data *data = iter->private;
+	pid_t pid = ent->pid;
+	int cpu = iter->cpu;
+	int ret;
+	int i;
+
+	if (data) {
+		int cpu = iter->cpu;
+		int *depth = &(per_cpu_ptr(data, cpu)->depth);
+
+		/*
+		 * Comments display at + 1 to depth. This is the
+		 * return from a function, we now want the comments
+		 * to display at the same level of the bracket.
+		 */
+		*depth = trace->depth - 1;
+	}
 
 	if (print_graph_prologue(iter, s, 0, 0))
 		return TRACE_TYPE_PARTIAL_LINE;
@@ -675,8 +715,13 @@ static enum print_line_t
 print_graph_comment(struct bprint_entry *trace, struct trace_seq *s,
 		   struct trace_entry *ent, struct trace_iterator *iter)
 {
-	int i;
+	struct fgraph_data *data = iter->private;
+	int depth = 0;
 	int ret;
+	int i;
+
+	if (data)
+		depth = per_cpu_ptr(data, iter->cpu)->depth;
 
 	if (print_graph_prologue(iter, s, 0, 0))
 		return TRACE_TYPE_PARTIAL_LINE;
@@ -694,8 +739,8 @@ print_graph_comment(struct bprint_entry *trace, struct trace_seq *s,
 	}
 
 	/* Indentation */
-	if (trace->depth > 0)
-		for (i = 0; i < (trace->depth + 1) * TRACE_GRAPH_INDENT; i++) {
+	if (depth > 0)
+		for (i = 0; i < (depth + 1) * TRACE_GRAPH_INDENT; i++) {
 			ret = trace_seq_printf(s, " ");
 			if (!ret)
 				return TRACE_TYPE_PARTIAL_LINE;
@@ -780,19 +825,21 @@ static void print_graph_headers(struct seq_file *s)
 
 static void graph_trace_open(struct trace_iterator *iter)
 {
-	/* pid on the last trace processed */
-	pid_t *last_pid = alloc_percpu(pid_t);
+	/* pid and depth on the last trace processed */
+	struct fgraph_data *data = alloc_percpu(struct fgraph_data);
 	int cpu;
 
-	if (!last_pid)
+	if (!data)
 		pr_warning("function graph tracer: not enough memory\n");
 	else
 		for_each_possible_cpu(cpu) {
-			pid_t *pid = per_cpu_ptr(last_pid, cpu);
+			pid_t *pid = &(per_cpu_ptr(data, cpu)->last_pid);
+			int *depth = &(per_cpu_ptr(data, cpu)->depth);
 			*pid = -1;
+			*depth = 0;
 		}
 
-	iter->private = last_pid;
+	iter->private = data;
 }
 
 static void graph_trace_close(struct trace_iterator *iter)
-- 
cgit v1.2.3


From 40ce74f19c28077550646c76d96a075bf312e461 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 19 Mar 2009 14:03:53 -0400
Subject: tracing: remove recording function depth from trace_printk

The function depth in trace_printk was to facilitate the function
graph output. Now that the function graph calculates the depth within
the trace output, we no longer need to record the depth when the
trace_printk is called.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace.c             | 8 +++-----
 kernel/trace/trace.h             | 6 ++----
 kernel/trace/trace_event_types.h | 2 --
 kernel/trace/trace_mmiotrace.c   | 2 +-
 kernel/trace/trace_printk.c      | 8 ++++----
 5 files changed, 10 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index c637cb687cf2..f7f359d45823 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1194,7 +1194,7 @@ void trace_graph_return(struct ftrace_graph_ret *trace)
  * trace_vbprintk - write binary msg to tracing buffer
  *
  */
-int trace_vbprintk(unsigned long ip, int depth, const char *fmt, va_list args)
+int trace_vbprintk(unsigned long ip, const char *fmt, va_list args)
 {
 	static raw_spinlock_t trace_buf_lock =
 		(raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
@@ -1236,7 +1236,6 @@ int trace_vbprintk(unsigned long ip, int depth, const char *fmt, va_list args)
 		goto out_unlock;
 	entry = ring_buffer_event_data(event);
 	entry->ip			= ip;
-	entry->depth			= depth;
 	entry->fmt			= fmt;
 
 	memcpy(entry->buf, trace_buf, sizeof(u32) * len);
@@ -1254,7 +1253,7 @@ out:
 }
 EXPORT_SYMBOL_GPL(trace_vbprintk);
 
-int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
+int trace_vprintk(unsigned long ip, const char *fmt, va_list args)
 {
 	static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED;
 	static char trace_buf[TRACE_BUF_SIZE];
@@ -1291,7 +1290,6 @@ int trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args)
 		goto out_unlock;
 	entry = ring_buffer_event_data(event);
 	entry->ip			= ip;
-	entry->depth			= depth;
 
 	memcpy(&entry->buf, trace_buf, len);
 	entry->buf[len] = 0;
@@ -3140,7 +3138,7 @@ static int mark_printk(const char *fmt, ...)
 	int ret;
 	va_list args;
 	va_start(args, fmt);
-	ret = trace_vprintk(0, -1, fmt, args);
+	ret = trace_vprintk(0, fmt, args);
 	va_end(args);
 	return ret;
 }
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 38276d1638e3..7c9a0cbf5dca 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -123,7 +123,6 @@ struct userstack_entry {
 struct bprint_entry {
 	struct trace_entry	ent;
 	unsigned long		ip;
-	int			depth;
 	const char		*fmt;
 	u32			buf[];
 };
@@ -131,7 +130,6 @@ struct bprint_entry {
 struct print_entry {
 	struct trace_entry	ent;
 	unsigned long		ip;
-	int			depth;
 	char			buf[];
 };
 
@@ -598,9 +596,9 @@ extern int trace_selftest_startup_branch(struct tracer *trace,
 extern void *head_page(struct trace_array_cpu *data);
 extern long ns2usecs(cycle_t nsec);
 extern int
-trace_vbprintk(unsigned long ip, int depth, const char *fmt, va_list args);
+trace_vbprintk(unsigned long ip, const char *fmt, va_list args);
 extern int
-trace_vprintk(unsigned long ip, int depth, const char *fmt, va_list args);
+trace_vprintk(unsigned long ip, const char *fmt, va_list args);
 
 extern unsigned long trace_flags;
 
diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h
index 019915063fe6..fd78bee71dd7 100644
--- a/kernel/trace/trace_event_types.h
+++ b/kernel/trace/trace_event_types.h
@@ -105,7 +105,6 @@ TRACE_EVENT_FORMAT(user_stack, TRACE_USER_STACK, userstack_entry, ignore,
 TRACE_EVENT_FORMAT(bprint, TRACE_BPRINT, bprint_entry, ignore,
 	TRACE_STRUCT(
 		TRACE_FIELD(unsigned long, ip, ip)
-		TRACE_FIELD(unsigned int, depth, depth)
 		TRACE_FIELD(char *, fmt, fmt)
 		TRACE_FIELD_ZERO_CHAR(buf)
 	),
@@ -115,7 +114,6 @@ TRACE_EVENT_FORMAT(bprint, TRACE_BPRINT, bprint_entry, ignore,
 TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore,
 	TRACE_STRUCT(
 		TRACE_FIELD(unsigned long, ip, ip)
-		TRACE_FIELD(unsigned int, depth, depth)
 		TRACE_FIELD_ZERO_CHAR(buf)
 	),
 	TP_RAW_FMT("%08lx (%d) fmt:%p %s")
diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c
index f095916e477f..8e37fcddd8b4 100644
--- a/kernel/trace/trace_mmiotrace.c
+++ b/kernel/trace/trace_mmiotrace.c
@@ -359,5 +359,5 @@ void mmio_trace_mapping(struct mmiotrace_map *map)
 
 int mmio_trace_printk(const char *fmt, va_list args)
 {
-	return trace_vprintk(0, -1, fmt, args);
+	return trace_vprintk(0, fmt, args);
 }
diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c
index 486785214e3e..eb81556107fe 100644
--- a/kernel/trace/trace_printk.c
+++ b/kernel/trace/trace_printk.c
@@ -112,7 +112,7 @@ int __trace_bprintk(unsigned long ip, const char *fmt, ...)
 		return 0;
 
 	va_start(ap, fmt);
-	ret = trace_vbprintk(ip, task_curr_ret_stack(current), fmt, ap);
+	ret = trace_vbprintk(ip, fmt, ap);
 	va_end(ap);
 	return ret;
 }
@@ -126,7 +126,7 @@ int __ftrace_vbprintk(unsigned long ip, const char *fmt, va_list ap)
 	if (!(trace_flags & TRACE_ITER_PRINTK))
 		return 0;
 
-	return trace_vbprintk(ip, task_curr_ret_stack(current), fmt, ap);
+	return trace_vbprintk(ip, fmt, ap);
 }
 EXPORT_SYMBOL_GPL(__ftrace_vbprintk);
 
@@ -139,7 +139,7 @@ int __trace_printk(unsigned long ip, const char *fmt, ...)
 		return 0;
 
 	va_start(ap, fmt);
-	ret = trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap);
+	ret = trace_vprintk(ip, fmt, ap);
 	va_end(ap);
 	return ret;
 }
@@ -150,7 +150,7 @@ int __ftrace_vprintk(unsigned long ip, const char *fmt, va_list ap)
 	if (!(trace_flags & TRACE_ITER_PRINTK))
 		return 0;
 
-	return trace_vprintk(ip, task_curr_ret_stack(current), fmt, ap);
+	return trace_vprintk(ip, fmt, ap);
 }
 EXPORT_SYMBOL_GPL(__ftrace_vprintk);
 
-- 
cgit v1.2.3


From 5087f8d2a2f2daff5a913d72d8ea3ad601948e10 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Thu, 19 Mar 2009 15:14:46 -0400
Subject: function-graph: show binary events as comments

With the added TRACE_EVENT macro, the events no longer appear in
the function graph tracer. This was because the function graph
did not know how to display the entries. The graph tracer was
only aware of its own entries and the printk entries.

By using the event call back feature, the graph tracer can now display
the events.

 # echo irq > /debug/tracing/set_event

Which can show:

 0)               |          handle_IRQ_event() {
 0)               |            /* irq_handler_entry: irq=48 handler=eth0 */
 0)               |            e1000_intr() {
 0)   0.926 us    |              __napi_schedule();
 0)   3.888 us    |            }
 0)               |            /* irq_handler_exit: irq=48 return=handled */
 0)   0.655 us    |            runqueue_is_locked();
 0)               |            __wake_up() {
 0)   0.831 us    |              _spin_lock_irqsave();

The irq entry and exit events show up as comments.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_functions_graph.c | 40 +++++++++++++++++++++++++-----------
 1 file changed, 28 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index 66ea23b64fe6..e876816fa8e7 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -712,10 +712,12 @@ print_graph_return(struct ftrace_graph_ret *trace, struct trace_seq *s,
 }
 
 static enum print_line_t
-print_graph_comment(struct bprint_entry *trace, struct trace_seq *s,
-		   struct trace_entry *ent, struct trace_iterator *iter)
+print_graph_comment(struct trace_seq *s,  struct trace_entry *ent,
+		    struct trace_iterator *iter)
 {
+	unsigned long sym_flags = (trace_flags & TRACE_ITER_SYM_MASK);
 	struct fgraph_data *data = iter->private;
+	struct trace_event *event;
 	int depth = 0;
 	int ret;
 	int i;
@@ -751,9 +753,26 @@ print_graph_comment(struct bprint_entry *trace, struct trace_seq *s,
 	if (!ret)
 		return TRACE_TYPE_PARTIAL_LINE;
 
-	ret = trace_seq_bprintf(s, trace->fmt, trace->buf);
-	if (!ret)
-		return TRACE_TYPE_PARTIAL_LINE;
+	switch (iter->ent->type) {
+	case TRACE_BPRINT:
+		ret = trace_print_bprintk_msg_only(iter);
+		if (ret != TRACE_TYPE_HANDLED)
+			return ret;
+		break;
+	case TRACE_PRINT:
+		ret = trace_print_printk_msg_only(iter);
+		if (ret != TRACE_TYPE_HANDLED)
+			return ret;
+		break;
+	default:
+		event = ftrace_find_event(ent->type);
+		if (!event)
+			return TRACE_TYPE_UNHANDLED;
+
+		ret = event->trace(iter, sym_flags);
+		if (ret != TRACE_TYPE_HANDLED)
+			return ret;
+	}
 
 	/* Strip ending newline */
 	if (s->buffer[s->len - 1] == '\n') {
@@ -772,8 +791,8 @@ print_graph_comment(struct bprint_entry *trace, struct trace_seq *s,
 enum print_line_t
 print_graph_function(struct trace_iterator *iter)
 {
-	struct trace_seq *s = &iter->seq;
 	struct trace_entry *entry = iter->ent;
+	struct trace_seq *s = &iter->seq;
 
 	switch (entry->type) {
 	case TRACE_GRAPH_ENT: {
@@ -786,14 +805,11 @@ print_graph_function(struct trace_iterator *iter)
 		trace_assign_type(field, entry);
 		return print_graph_return(&field->ret, s, entry, iter);
 	}
-	case TRACE_BPRINT: {
-		struct bprint_entry *field;
-		trace_assign_type(field, entry);
-		return print_graph_comment(field, s, entry, iter);
-	}
 	default:
-		return TRACE_TYPE_UNHANDLED;
+		return print_graph_comment(s, entry, iter);
 	}
+
+	return TRACE_TYPE_HANDLED;
 }
 
 static void print_graph_headers(struct seq_file *s)
-- 
cgit v1.2.3


From 23725aeeab10ba02bcf10ec49ad73146b54cb52f Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 19 Mar 2009 20:26:13 +0100
Subject: ftrace: provide an id file for each event

Since not every event has a format file to read the id from,
expose it explicitly in a separate file.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <20090319194233.372534033@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_events.c | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index c88227b3b9db..7763db8fd0b3 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -412,6 +412,29 @@ event_format_read(struct file *filp, char __user *ubuf, size_t cnt,
 	return r;
 }
 
+static ssize_t
+event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
+{
+	struct ftrace_event_call *call = filp->private_data;
+	struct trace_seq *s;
+	int r;
+
+	if (*ppos)
+		return 0;
+
+	s = kmalloc(sizeof(*s), GFP_KERNEL);
+	if (!s)
+		return -ENOMEM;
+
+	trace_seq_init(s);
+	trace_seq_printf(s, "%d\n", call->id);
+
+	r = simple_read_from_buffer(ubuf, cnt, ppos,
+				    s->buffer, s->len);
+	kfree(s);
+	return r;
+}
+
 static const struct seq_operations show_event_seq_ops = {
 	.start = t_start,
 	.next = t_next,
@@ -452,6 +475,11 @@ static const struct file_operations ftrace_event_format_fops = {
 	.read = event_format_read,
 };
 
+static const struct file_operations ftrace_event_id_fops = {
+	.open = tracing_open_generic,
+	.read = event_id_read,
+};
+
 static struct dentry *event_trace_events_dir(void)
 {
 	static struct dentry *d_tracer;
@@ -550,6 +578,14 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
 				   "'%s/enable' entry\n", call->name);
 	}
 
+	if (call->id) {
+		entry = debugfs_create_file("id", 0444, call->dir, call,
+				&ftrace_event_id_fops);
+		if (!entry)
+			pr_warning("Could not create debugfs '%s/id' entry\n",
+					call->name);
+	}
+
 	/* A trace may not want to export its format */
 	if (!call->show_format)
 		return 0;
-- 
cgit v1.2.3


From 28bea271e58e429eccfad3d7ee2ad12d6ee015bf Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 19 Mar 2009 20:26:14 +0100
Subject: ftrace: ensure every event gets an id

Impact: widen user-space visibe event IDs to all events

Previously only TRACE_EVENT events got ids, because only they
generated raw output which needs to be demuxed from the trace.

In order to provide a unique ID for each event, register everybody,
regardless.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <20090319194233.464914218@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_events_stage_3.h | 15 ++++++++++++++-
 kernel/trace/trace_output.c         |  5 +++++
 2 files changed, 19 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h
index ae2e323df0c7..4c26d97b4508 100644
--- a/kernel/trace/trace_events_stage_3.h
+++ b/kernel/trace/trace_events_stage_3.h
@@ -130,7 +130,19 @@ static void ftrace_unreg_event_##call(void)				\
 {									\
 	unregister_trace_##call(ftrace_event_##call);			\
 }									\
-
+									\
+static struct ftrace_event_call event_##call;				\
+									\
+static int ftrace_init_event_##call(void)				\
+{									\
+	int id;								\
+									\
+	id = register_ftrace_event(NULL);				\
+	if (!id)							\
+		return -ENODEV;						\
+	event_##call.id = id;						\
+	return 0;							\
+}
 
 #undef TRACE_FORMAT
 #define TRACE_FORMAT(call, proto, args, fmt)				\
@@ -140,6 +152,7 @@ __attribute__((__aligned__(4)))						\
 __attribute__((section("_ftrace_events"))) event_##call = {		\
 	.name			= #call,				\
 	.system			= __stringify(TRACE_SYSTEM),		\
+	.raw_init		= ftrace_init_event_##call,		\
 	.regfunc		= ftrace_reg_event_##call,		\
 	.unregfunc		= ftrace_unreg_event_##call,		\
 }
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index b45141748af5..19261fdd2455 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -481,6 +481,11 @@ int register_ftrace_event(struct trace_event *event)
 
 	mutex_lock(&trace_event_mutex);
 
+	if (!event) {
+		ret = next_event_type++;
+		goto out;
+	}
+
 	if (!event->type)
 		event->type = next_event_type++;
 	else if (event->type > __TRACE_LAST_TYPE) {
-- 
cgit v1.2.3


From ac199db0189c091f2863312061c0575937f68810 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 19 Mar 2009 20:26:15 +0100
Subject: ftrace: event profile hooks

Impact: new tracing infrastructure feature

Provide infrastructure to generate software perf counter events
from tracepoints.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <20090319194233.557364871@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/Makefile               |  1 +
 kernel/trace/events.c               |  1 -
 kernel/trace/trace.h                | 11 ++++++++++
 kernel/trace/trace_event_profile.c  | 31 ++++++++++++++++++++++++++
 kernel/trace/trace_events.c         |  9 ++------
 kernel/trace/trace_events_stage_3.h | 44 +++++++++++++++++++++++++++++++++++++
 6 files changed, 89 insertions(+), 8 deletions(-)
 create mode 100644 kernel/trace/trace_event_profile.c

(limited to 'kernel')

diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index c3feea01c3e0..0e45c206c2f9 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -44,5 +44,6 @@ obj-$(CONFIG_EVENT_TRACER) += trace_events.o
 obj-$(CONFIG_EVENT_TRACER) += events.o
 obj-$(CONFIG_EVENT_TRACER) += trace_export.o
 obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
+obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
 
 libftrace-y := ftrace.o
diff --git a/kernel/trace/events.c b/kernel/trace/events.c
index 9fc918da404f..246f2aa6dc46 100644
--- a/kernel/trace/events.c
+++ b/kernel/trace/events.c
@@ -12,4 +12,3 @@
 #include "trace_events_stage_2.h"
 #include "trace_events_stage_3.h"
 
-#include <trace/trace_event_types.h>
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 7c9a0cbf5dca..7cfb741be200 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -785,12 +785,23 @@ struct ftrace_event_call {
 	int		id;
 	int		(*raw_init)(void);
 	int		(*show_format)(struct trace_seq *s);
+
+#ifdef CONFIG_EVENT_PROFILE
+	atomic_t	profile_count;
+	int		(*profile_enable)(struct ftrace_event_call *);
+	void		(*profile_disable)(struct ftrace_event_call *);
+#endif
 };
 
 void event_trace_printk(unsigned long ip, const char *fmt, ...);
 extern struct ftrace_event_call __start_ftrace_events[];
 extern struct ftrace_event_call __stop_ftrace_events[];
 
+#define for_each_event(event)						\
+	for (event = __start_ftrace_events;				\
+	     (unsigned long)event < (unsigned long)__stop_ftrace_events; \
+	     event++)
+
 extern const char *__start___trace_bprintk_fmt[];
 extern const char *__stop___trace_bprintk_fmt[];
 
diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c
new file mode 100644
index 000000000000..22cba9970776
--- /dev/null
+++ b/kernel/trace/trace_event_profile.c
@@ -0,0 +1,31 @@
+/*
+ * trace event based perf counter profiling
+ *
+ * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra <pzijlstr@redhat.com>
+ *
+ */
+
+#include "trace.h"
+
+int ftrace_profile_enable(int event_id)
+{
+	struct ftrace_event_call *event;
+
+	for_each_event(event) {
+		if (event->id == event_id)
+			return event->profile_enable(event);
+	}
+
+	return -EINVAL;
+}
+
+void ftrace_profile_disable(int event_id)
+{
+	struct ftrace_event_call *event;
+
+	for_each_event(event) {
+		if (event->id == event_id)
+			return event->profile_disable(event);
+	}
+}
+
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 7763db8fd0b3..3047b56f6637 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -19,11 +19,6 @@
 
 static DEFINE_MUTEX(event_mutex);
 
-#define events_for_each(event)						\
-	for (event = __start_ftrace_events;				\
-	     (unsigned long)event < (unsigned long)__stop_ftrace_events; \
-	     event++)
-
 static void ftrace_clear_events(void)
 {
 	struct ftrace_event_call *call = (void *)__start_ftrace_events;
@@ -90,7 +85,7 @@ static int ftrace_set_clr_event(char *buf, int set)
 	}
 
 	mutex_lock(&event_mutex);
-	events_for_each(call) {
+	for_each_event(call) {
 
 		if (!call->name || !call->regfunc)
 			continue;
@@ -628,7 +623,7 @@ static __init int event_trace_init(void)
 	if (!d_events)
 		return 0;
 
-	events_for_each(call) {
+	for_each_event(call) {
 		/* The linker may leave blanks */
 		if (!call->name)
 			continue;
diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h
index 4c26d97b4508..6b3261ca988c 100644
--- a/kernel/trace/trace_events_stage_3.h
+++ b/kernel/trace/trace_events_stage_3.h
@@ -109,6 +109,40 @@
 #undef TP_FMT
 #define TP_FMT(fmt, args...)	fmt "\n", ##args
 
+#ifdef CONFIG_EVENT_PROFILE
+#define _TRACE_PROFILE(call, proto, args)				\
+static void ftrace_profile_##call(proto)				\
+{									\
+	extern void perf_tpcounter_event(int);				\
+	perf_tpcounter_event(event_##call.id);				\
+}									\
+									\
+static int ftrace_profile_enable_##call(struct ftrace_event_call *call) \
+{									\
+	int ret = 0;							\
+									\
+	if (!atomic_inc_return(&call->profile_count))			\
+		ret = register_trace_##call(ftrace_profile_##call);	\
+									\
+	return ret;							\
+}									\
+									\
+static void ftrace_profile_disable_##call(struct ftrace_event_call *call) \
+{									\
+	if (atomic_add_negative(-1, &call->profile_count))		\
+		unregister_trace_##call(ftrace_profile_##call);		\
+}
+
+#define _TRACE_PROFILE_INIT(call)					\
+	.profile_count = ATOMIC_INIT(-1),				\
+	.profile_enable = ftrace_profile_enable_##call,			\
+	.profile_disable = ftrace_profile_disable_##call,
+
+#else
+#define _TRACE_PROFILE(call, proto, args)
+#define _TRACE_PROFILE_INIT(call)
+#endif
+
 #define _TRACE_FORMAT(call, proto, args, fmt)				\
 static void ftrace_event_##call(proto)					\
 {									\
@@ -147,6 +181,7 @@ static int ftrace_init_event_##call(void)				\
 #undef TRACE_FORMAT
 #define TRACE_FORMAT(call, proto, args, fmt)				\
 _TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt))		\
+_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args))			\
 static struct ftrace_event_call __used					\
 __attribute__((__aligned__(4)))						\
 __attribute__((section("_ftrace_events"))) event_##call = {		\
@@ -155,6 +190,7 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
 	.raw_init		= ftrace_init_event_##call,		\
 	.regfunc		= ftrace_reg_event_##call,		\
 	.unregfunc		= ftrace_unreg_event_##call,		\
+	_TRACE_PROFILE_INIT(call)					\
 }
 
 #undef __entry
@@ -162,6 +198,7 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
 
 #undef TRACE_EVENT
 #define TRACE_EVENT(call, proto, args, tstruct, assign, print)		\
+_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args))			\
 									\
 static struct ftrace_event_call event_##call;				\
 									\
@@ -227,4 +264,11 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
 	.regfunc		= ftrace_raw_reg_event_##call,		\
 	.unregfunc		= ftrace_raw_unreg_event_##call,	\
 	.show_format		= ftrace_format_##call,			\
+	_TRACE_PROFILE_INIT(call)					\
 }
+
+#include <trace/trace_event_types.h>
+
+#undef _TRACE_PROFILE
+#undef _TRACE_PROFILE_INIT
+
-- 
cgit v1.2.3


From 505f2b970b2269ce4cb669b3ff4f6479d379cec2 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 20 Mar 2009 11:05:04 +0100
Subject: tracing, Text Edit Lock - kprobes architecture independent support,
 nommu fix

Impact: build fix on SH !CONFIG_MMU

Stephen Rothwell reported this linux-next build failure on the SH
architecture:

  kernel/built-in.o: In function `disable_all_kprobes':
  kernel/kprobes.c:1382: undefined reference to `text_mutex'
  [...]

And observed:

| Introduced by commit 4460fdad85becd569f11501ad5b91814814335ff ("tracing,
| Text Edit Lock - kprobes architecture independent support") from the
| tracing tree.  text_mutex is defined in mm/memory.c which is only built
| if CONFIG_MMU is defined, which is not true for sh allmodconfig.

Move this lock to kernel/extable.c (which is already home to various
kernel text related routines), which file is always built-in.

Reported-by: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Paul Mundt <lethal@linux-sh.org>
Cc: Mathieu Desnoyers <mathieu.desnoyers@polymtl.ca>
LKML-Reference: <20090320110602.86351a91.sfr@canb.auug.org.au>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/extable.c | 14 ++++++++++++--
 mm/memory.c      |  8 --------
 2 files changed, 12 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/extable.c b/kernel/extable.c
index 0df6253730be..25d39b0c3a1b 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -15,11 +15,21 @@
     along with this program; if not, write to the Free Software
     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
+#include <linux/ftrace.h>
 #include <linux/module.h>
+#include <linux/mutex.h>
 #include <linux/init.h>
-#include <linux/ftrace.h>
-#include <asm/uaccess.h>
+
 #include <asm/sections.h>
+#include <asm/uaccess.h>
+
+/*
+ * mutex protecting text section modification (dynamic code patching).
+ * some users need to sleep (allocating memory...) while they hold this lock.
+ *
+ * NOT exported to modules - patching kernel text is a really delicate matter.
+ */
+DEFINE_MUTEX(text_mutex);
 
 extern struct exception_table_entry __start___ex_table[];
 extern struct exception_table_entry __stop___ex_table[];
diff --git a/mm/memory.c b/mm/memory.c
index 05fab3bc5b4b..dfc9e4ea4e8b 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -101,14 +101,6 @@ int randomize_va_space __read_mostly =
 					2;
 #endif
 
-/*
- * mutex protecting text section modification (dynamic code patching).
- * some users need to sleep (allocating memory...) while they hold this lock.
- *
- * NOT exported to modules - patching kernel text is a really delicate matter.
- */
-DEFINE_MUTEX(text_mutex);
-
 static int __init disable_randmaps(char *s)
 {
 	randomize_va_space = 0;
-- 
cgit v1.2.3


From 09c9e84d474d917d9de5b9011ed2064b03a19677 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 21 Mar 2009 04:33:36 +0100
Subject: tracing/ring-buffer: don't annotate rb_cpu_notify with __cpuinit

Impact: remove a section warning

CONFIG_DEBUG_SECTION_MISMATCH raises the following warning on -tip:

  WARNING: kernel/trace/built-in.o(.text+0x5bc5): Section mismatch in
  reference from the function ring_buffer_alloc() to the function
  .cpuinit.text:rb_cpu_notify()
  The function ring_buffer_alloc() references
  the function __cpuinit rb_cpu_notify().

This is actually harmless. The code in the ring buffer don't build
rb_cpu_notify and other cpu hotplug stuffs when !CONFIG_HOTPLUG_CPU
so we have no risk to reference freed memory here (it would even
be harmless if we unconditionally build it because register_cpu_notifier
would do nothing when !CONFIG_HOTPLUG_CPU.

But since ring_buffer_alloc() can be called everytime, we don't want it
to be annotated with __cpuinit so we drop the __cpuinit from
rb_cpu_notify.

This is not a waste of memory because it is only defined and used on
CONFIG_HOTPLUG_CPU.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <1237606416-22268-1-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ring_buffer.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 384ca5d9d729..808b14bbf076 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -535,8 +535,8 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
 extern int ring_buffer_page_too_big(void);
 
 #ifdef CONFIG_HOTPLUG_CPU
-static int __cpuinit rb_cpu_notify(struct notifier_block *self,
-				   unsigned long action, void *hcpu);
+static int rb_cpu_notify(struct notifier_block *self,
+			 unsigned long action, void *hcpu);
 #endif
 
 /**
@@ -2784,8 +2784,8 @@ static __init int rb_init_debugfs(void)
 fs_initcall(rb_init_debugfs);
 
 #ifdef CONFIG_HOTPLUG_CPU
-static int __cpuinit rb_cpu_notify(struct notifier_block *self,
-				   unsigned long action, void *hcpu)
+static int rb_cpu_notify(struct notifier_block *self,
+			 unsigned long action, void *hcpu)
 {
 	struct ring_buffer *buffer =
 		container_of(self, struct ring_buffer, cpu_notify);
-- 
cgit v1.2.3


From 1a17662ea033674a58bad3603531b0b5d42572f6 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 20 Mar 2009 09:47:30 +0800
Subject: blktrace: fix possible memory leak

When we failed to create "block" debugfs dir, we should do some
cleanups.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <49C2F5B2.8000800@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index b171778e3863..fb3bc53835dd 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -432,7 +432,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 	if (!blk_tree_root) {
 		blk_tree_root = debugfs_create_dir("block", NULL);
 		if (!blk_tree_root)
-			return -ENOMEM;
+			goto err;
 	}
 
 	dir = debugfs_create_dir(buts->name, blk_tree_root);
-- 
cgit v1.2.3


From 5006ea73f38caef6065d1136808413813271633f Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 20 Mar 2009 09:48:03 +0800
Subject: blktrace: make blk_tracer_enabled a bool flag

It doesn't have to be a counter, and it can be a bool flag instead.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <49C2F5D3.8090104@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index fb3bc53835dd..73845b7968bb 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -30,7 +30,7 @@
 static unsigned int blktrace_seq __read_mostly = 1;
 
 static struct trace_array *blk_tr;
-static int __read_mostly  blk_tracer_enabled;
+static bool blk_tracer_enabled __read_mostly;
 
 /* Select an alternative, minimalistic output than the original one */
 #define TRACE_BLK_OPT_CLASSIC	0x1
@@ -1111,9 +1111,7 @@ static int blk_tracer_init(struct trace_array *tr)
 {
 	blk_tr = tr;
 	blk_tracer_start(tr);
-	mutex_lock(&blk_probe_mutex);
-	blk_tracer_enabled++;
-	mutex_unlock(&blk_probe_mutex);
+	blk_tracer_enabled = true;
 	return 0;
 }
 
@@ -1131,11 +1129,7 @@ static void blk_tracer_reset(struct trace_array *tr)
 	if (!atomic_read(&blk_probes_ref))
 		return;
 
-	mutex_lock(&blk_probe_mutex);
-	blk_tracer_enabled--;
-	WARN_ON(blk_tracer_enabled < 0);
-	mutex_unlock(&blk_probe_mutex);
-
+	blk_tracer_enabled = false;
 	blk_tracer_stop(tr);
 }
 
-- 
cgit v1.2.3


From 3c289ba7c320560ee74979a8895141c829046a2d Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 20 Mar 2009 09:48:26 +0800
Subject: blktrace: remove blk_probe_mutex

blk_register_tracepoints() always returns 0, so make it return void,
thus we don't need to use blk_probe_mutex to protect blk_probes_ref.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <49C2F5EA.8060606@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 27 +++++----------------------
 1 file changed, 5 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 73845b7968bb..223b92e77b3f 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -47,10 +47,9 @@ static struct tracer_flags blk_tracer_flags = {
 };
 
 /* Global reference count of probes */
-static DEFINE_MUTEX(blk_probe_mutex);
 static atomic_t blk_probes_ref = ATOMIC_INIT(0);
 
-static int blk_register_tracepoints(void);
+static void blk_register_tracepoints(void);
 static void blk_unregister_tracepoints(void);
 
 /*
@@ -256,10 +255,8 @@ static void blk_trace_cleanup(struct blk_trace *bt)
 	free_percpu(bt->sequence);
 	free_percpu(bt->msg_data);
 	kfree(bt);
-	mutex_lock(&blk_probe_mutex);
 	if (atomic_dec_and_test(&blk_probes_ref))
 		blk_unregister_tracepoints();
-	mutex_unlock(&blk_probe_mutex);
 }
 
 int blk_trace_remove(struct request_queue *q)
@@ -471,13 +468,8 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 	bt->pid = buts->pid;
 	bt->trace_state = Blktrace_setup;
 
-	mutex_lock(&blk_probe_mutex);
-	if (atomic_add_return(1, &blk_probes_ref) == 1) {
-		ret = blk_register_tracepoints();
-		if (ret)
-			goto probe_err;
-	}
-	mutex_unlock(&blk_probe_mutex);
+	if (atomic_add_return(1, &blk_probes_ref) == 1)
+		blk_register_tracepoints();
 
 	ret = -EBUSY;
 	old_bt = xchg(&q->blk_trace, bt);
@@ -487,9 +479,6 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 	}
 
 	return 0;
-probe_err:
-	atomic_dec(&blk_probes_ref);
-	mutex_unlock(&blk_probe_mutex);
 err:
 	if (bt) {
 		if (bt->msg_file)
@@ -863,7 +852,7 @@ void blk_add_driver_data(struct request_queue *q,
 }
 EXPORT_SYMBOL_GPL(blk_add_driver_data);
 
-static int blk_register_tracepoints(void)
+static void blk_register_tracepoints(void)
 {
 	int ret;
 
@@ -901,7 +890,6 @@ static int blk_register_tracepoints(void)
 	WARN_ON(ret);
 	ret = register_trace_block_remap(blk_add_trace_remap);
 	WARN_ON(ret);
-	return 0;
 }
 
 static void blk_unregister_tracepoints(void)
@@ -1099,11 +1087,8 @@ static void blk_tracer_print_header(struct seq_file *m)
 
 static void blk_tracer_start(struct trace_array *tr)
 {
-	mutex_lock(&blk_probe_mutex);
 	if (atomic_add_return(1, &blk_probes_ref) == 1)
-		if (blk_register_tracepoints())
-			atomic_dec(&blk_probes_ref);
-	mutex_unlock(&blk_probe_mutex);
+		blk_register_tracepoints();
 	trace_flags &= ~TRACE_ITER_CONTEXT_INFO;
 }
 
@@ -1118,10 +1103,8 @@ static int blk_tracer_init(struct trace_array *tr)
 static void blk_tracer_stop(struct trace_array *tr)
 {
 	trace_flags |= TRACE_ITER_CONTEXT_INFO;
-	mutex_lock(&blk_probe_mutex);
 	if (atomic_dec_and_test(&blk_probes_ref))
 		blk_unregister_tracepoints();
-	mutex_unlock(&blk_probe_mutex);
 }
 
 static void blk_tracer_reset(struct trace_array *tr)
-- 
cgit v1.2.3


From cbe28296eb1ac441b35cf45804d0ae808add7dd1 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 20 Mar 2009 09:48:47 +0800
Subject: blktrace: don't increase blk_probes_ref if failed to setup blk trace

do_blk_trace_setup() may return EBUSY, but the current code
doesn't decrease blk_probes_ref in this case.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <49C2F5FF.80002@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 223b92e77b3f..11e7c8d9d222 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -468,9 +468,6 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 	bt->pid = buts->pid;
 	bt->trace_state = Blktrace_setup;
 
-	if (atomic_add_return(1, &blk_probes_ref) == 1)
-		blk_register_tracepoints();
-
 	ret = -EBUSY;
 	old_bt = xchg(&q->blk_trace, bt);
 	if (old_bt) {
@@ -478,6 +475,9 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 		goto err;
 	}
 
+	if (atomic_add_return(1, &blk_probes_ref) == 1)
+		blk_register_tracepoints();
+
 	return 0;
 err:
 	if (bt) {
-- 
cgit v1.2.3


From 15152e448b693fa41de40f1e40ffbe717a3aab88 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 20 Mar 2009 09:49:08 +0800
Subject: blktrace: report EBUSY correctly

blk_trace_remove_queue() returns EINVAL if q->blk_trace == NULL,
but blk_trace_setup_queue() doesn't return EBUSY if
q->blk_trace != NULL.

 # echo 0 > sdaX/trace/enable
 # echo 0 > sdaX/trace/enable
 bash: echo: write error: Invalid argument
 # echo 1 > sdaX/trace/enable
 # echo 1 > sdaX/trace/enable
 (should return EBUSY)

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <49C2F614.2010101@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 11e7c8d9d222..14986afdbc1c 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1260,12 +1260,10 @@ static int blk_trace_remove_queue(struct request_queue *q)
 static int blk_trace_setup_queue(struct request_queue *q, dev_t dev)
 {
 	struct blk_trace *old_bt, *bt = NULL;
-	int ret;
 
-	ret = -ENOMEM;
 	bt = kzalloc(sizeof(*bt), GFP_KERNEL);
 	if (!bt)
-		goto err;
+		return -ENOMEM;
 
 	bt->dev = dev;
 	bt->act_mask = (u16)-1;
@@ -1276,11 +1274,10 @@ static int blk_trace_setup_queue(struct request_queue *q, dev_t dev)
 	if (old_bt != NULL) {
 		(void)xchg(&q->blk_trace, old_bt);
 		kfree(bt);
-		ret = -EBUSY;
+		return -EBUSY;
 	}
+
 	return 0;
-err:
-	return ret;
 }
 
 /*
-- 
cgit v1.2.3


From cd649b8bb830d65c57c3c8b98d57b5402256d8bd Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 20 Mar 2009 11:33:55 +0800
Subject: blktrace: remove sysfs_blk_trace_enable_show/store()

sysfs_blk_trace_enable_show()/store() share most of code with
sysfs_blk_trace_attr_show()/store().

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <49C30EA3.1060004@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 92 ++++++++++++-------------------------------------
 1 file changed, 22 insertions(+), 70 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 14986afdbc1c..dfee6f915179 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1284,72 +1284,6 @@ static int blk_trace_setup_queue(struct request_queue *q, dev_t dev)
  * sysfs interface to enable and configure tracing
  */
 
-static ssize_t sysfs_blk_trace_enable_show(struct device *dev,
-					   struct device_attribute *attr,
-					   char *buf)
-{
-	struct hd_struct *p = dev_to_part(dev);
-	struct block_device *bdev;
-	ssize_t ret = -ENXIO;
-
-	lock_kernel();
-	bdev = bdget(part_devt(p));
-	if (bdev != NULL) {
-		struct request_queue *q = bdev_get_queue(bdev);
-
-		if (q != NULL) {
-			mutex_lock(&bdev->bd_mutex);
-			ret = sprintf(buf, "%u\n", !!q->blk_trace);
-			mutex_unlock(&bdev->bd_mutex);
-		}
-
-		bdput(bdev);
-	}
-
-	unlock_kernel();
-	return ret;
-}
-
-static ssize_t sysfs_blk_trace_enable_store(struct device *dev,
-					    struct device_attribute *attr,
-					    const char *buf, size_t count)
-{
-	struct block_device *bdev;
-	struct request_queue *q;
-	struct hd_struct *p;
-	int value;
-	ssize_t ret = -ENXIO;
-
-	if (count == 0 || sscanf(buf, "%d", &value) != 1)
-		goto out;
-
-	lock_kernel();
-	p = dev_to_part(dev);
-	bdev = bdget(part_devt(p));
-	if (bdev == NULL)
-		goto out_unlock_kernel;
-
-	q = bdev_get_queue(bdev);
-	if (q == NULL)
-		goto out_bdput;
-
-	mutex_lock(&bdev->bd_mutex);
-	if (value)
-		ret = blk_trace_setup_queue(q, bdev->bd_dev);
-	else
-		ret = blk_trace_remove_queue(q);
-	mutex_unlock(&bdev->bd_mutex);
-
-	if (ret == 0)
-		ret = count;
-out_bdput:
-	bdput(bdev);
-out_unlock_kernel:
-	unlock_kernel();
-out:
-	return ret;
-}
-
 static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
 					 struct device_attribute *attr,
 					 char *buf);
@@ -1361,8 +1295,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
 		    sysfs_blk_trace_attr_show, \
 		    sysfs_blk_trace_attr_store)
 
-static DEVICE_ATTR(enable, S_IRUGO | S_IWUSR,
-		   sysfs_blk_trace_enable_show, sysfs_blk_trace_enable_store);
+static BLK_TRACE_DEVICE_ATTR(enable);
 static BLK_TRACE_DEVICE_ATTR(act_mask);
 static BLK_TRACE_DEVICE_ATTR(pid);
 static BLK_TRACE_DEVICE_ATTR(start_lba);
@@ -1447,6 +1380,12 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
 	if (q == NULL)
 		goto out_bdput;
 	mutex_lock(&bdev->bd_mutex);
+
+	if (attr == &dev_attr_enable) {
+		ret = sprintf(buf, "%u\n", !!q->blk_trace);
+		goto out_unlock_bdev;
+	}
+
 	if (q->blk_trace == NULL)
 		ret = sprintf(buf, "disabled\n");
 	else if (attr == &dev_attr_act_mask)
@@ -1457,6 +1396,8 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
 		ret = sprintf(buf, "%llu\n", q->blk_trace->start_lba);
 	else if (attr == &dev_attr_end_lba)
 		ret = sprintf(buf, "%llu\n", q->blk_trace->end_lba);
+
+out_unlock_bdev:
 	mutex_unlock(&bdev->bd_mutex);
 out_bdput:
 	bdput(bdev);
@@ -1499,6 +1440,15 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
 		goto out_bdput;
 
 	mutex_lock(&bdev->bd_mutex);
+
+	if (attr == &dev_attr_enable) {
+		if (value)
+			ret = blk_trace_setup_queue(q, bdev->bd_dev);
+		else
+			ret = blk_trace_remove_queue(q);
+		goto out_unlock_bdev;
+	}
+
 	ret = 0;
 	if (q->blk_trace == NULL)
 		ret = blk_trace_setup_queue(q, bdev->bd_dev);
@@ -1512,13 +1462,15 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
 			q->blk_trace->start_lba = value;
 		else if (attr == &dev_attr_end_lba)
 			q->blk_trace->end_lba = value;
-		ret = count;
 	}
+
+out_unlock_bdev:
 	mutex_unlock(&bdev->bd_mutex);
 out_bdput:
 	bdput(bdev);
 out_unlock_kernel:
 	unlock_kernel();
 out:
-	return ret;
+	return ret ? ret : count;
 }
+
-- 
cgit v1.2.3


From b125130b22d67f249beba10b71a254558b5279d0 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 20 Mar 2009 10:34:00 +0800
Subject: blktrace: avoid accessing NULL bdev->bd_disk

bdev->bd_disk can be NULL, if the block device is not opened.

Try this against an unmounted partition, and you'll see NULL dereference:

  # echo 1 > /sys/block/sda/sda5/enable

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <49C30098.6080107@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 13 +++++++++++--
 1 file changed, 11 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index dfee6f915179..108f4f7715a5 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1362,6 +1362,14 @@ static int blk_str2act_mask(const char *str)
 	return mask;
 }
 
+static struct request_queue *blk_trace_get_queue(struct block_device *bdev)
+{
+	if (bdev->bd_disk == NULL)
+		return NULL;
+
+	return bdev_get_queue(bdev);
+}
+
 static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
 					 struct device_attribute *attr,
 					 char *buf)
@@ -1376,9 +1384,10 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
 	if (bdev == NULL)
 		goto out_unlock_kernel;
 
-	q = bdev_get_queue(bdev);
+	q = blk_trace_get_queue(bdev);
 	if (q == NULL)
 		goto out_bdput;
+
 	mutex_lock(&bdev->bd_mutex);
 
 	if (attr == &dev_attr_enable) {
@@ -1435,7 +1444,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
 	if (bdev == NULL)
 		goto out_unlock_kernel;
 
-	q = bdev_get_queue(bdev);
+	q = blk_trace_get_queue(bdev);
 	if (q == NULL)
 		goto out_bdput;
 
-- 
cgit v1.2.3


From cf586b61f80229491127d3c57c06ed93c9f530d3 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 22 Mar 2009 05:04:35 +0100
Subject: tracing/function-graph-tracer: prevent hangs during self-tests

Impact: detect tracing related hangs

Sometimes, with some configs, the function graph tracer can make
the timer interrupt too much slow, hanging the kernel in an endless
loop of timer interrupts servicing.

As suggested by Ingo, this patch brings a watchdog which stops the
selftest after a defined number of functions traced, definitely
disabling this tracer.

For those who want to debug the cause of the function graph trace
hang, you can pass the ftrace_dump_on_oops kernel parameter to dump
the traces after this hang detection.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <1237694675-23509-1-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c          | 26 +++++++++++++++++++++++---
 kernel/trace/trace_selftest.c | 38 +++++++++++++++++++++++++++++++++++++-
 2 files changed, 60 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index e3dfefe69348..e6fac0ffe6f0 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -4018,11 +4018,12 @@ trace_printk_seq(struct trace_seq *s)
 	trace_seq_init(s);
 }
 
-void ftrace_dump(void)
+static void __ftrace_dump(bool disable_tracing)
 {
 	static DEFINE_SPINLOCK(ftrace_dump_lock);
 	/* use static because iter can be a bit big for the stack */
 	static struct trace_iterator iter;
+	unsigned int old_userobj;
 	static int dump_ran;
 	unsigned long flags;
 	int cnt = 0, cpu;
@@ -4034,14 +4035,17 @@ void ftrace_dump(void)
 
 	dump_ran = 1;
 
-	/* No turning back! */
 	tracing_off();
-	ftrace_kill();
+
+	if (disable_tracing)
+		ftrace_kill();
 
 	for_each_tracing_cpu(cpu) {
 		atomic_inc(&global_trace.data[cpu]->disabled);
 	}
 
+	old_userobj = trace_flags & TRACE_ITER_SYM_USEROBJ;
+
 	/* don't look at user memory in panic mode */
 	trace_flags &= ~TRACE_ITER_SYM_USEROBJ;
 
@@ -4086,10 +4090,26 @@ void ftrace_dump(void)
 	else
 		printk(KERN_TRACE "---------------------------------\n");
 
+	/* Re-enable tracing if requested */
+	if (!disable_tracing) {
+		trace_flags |= old_userobj;
+
+		for_each_tracing_cpu(cpu) {
+			atomic_dec(&global_trace.data[cpu]->disabled);
+		}
+		tracing_on();
+	}
+
  out:
 	spin_unlock_irqrestore(&ftrace_dump_lock, flags);
 }
 
+/* By default: disable tracing after the dump */
+void ftrace_dump(void)
+{
+	__ftrace_dump(true);
+}
+
 __init static int tracer_alloc_buffers(void)
 {
 	struct trace_array_cpu *data;
diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index 38856ba78a92..b56dcf7d3566 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -248,6 +248,28 @@ trace_selftest_startup_function(struct tracer *trace, struct trace_array *tr)
 
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
+
+/* Maximum number of functions to trace before diagnosing a hang */
+#define GRAPH_MAX_FUNC_TEST	100000000
+
+static void __ftrace_dump(bool disable_tracing);
+static unsigned int graph_hang_thresh;
+
+/* Wrap the real function entry probe to avoid possible hanging */
+static int trace_graph_entry_watchdog(struct ftrace_graph_ent *trace)
+{
+	/* This is harmlessly racy, we want to approximately detect a hang */
+	if (unlikely(++graph_hang_thresh > GRAPH_MAX_FUNC_TEST)) {
+		ftrace_graph_stop();
+		printk(KERN_WARNING "BUG: Function graph tracer hang!\n");
+		if (ftrace_dump_on_oops)
+			__ftrace_dump(false);
+		return 0;
+	}
+
+	return trace_graph_entry(trace);
+}
+
 /*
  * Pretty much the same than for the function tracer from which the selftest
  * has been borrowed.
@@ -259,15 +281,29 @@ trace_selftest_startup_function_graph(struct tracer *trace,
 	int ret;
 	unsigned long count;
 
-	ret = tracer_init(trace, tr);
+	/*
+	 * Simulate the init() callback but we attach a watchdog callback
+	 * to detect and recover from possible hangs
+	 */
+	tracing_reset_online_cpus(tr);
+	ret = register_ftrace_graph(&trace_graph_return,
+				    &trace_graph_entry_watchdog);
 	if (ret) {
 		warn_failed_init_tracer(trace, ret);
 		goto out;
 	}
+	tracing_start_cmdline_record();
 
 	/* Sleep for a 1/10 of a second */
 	msleep(100);
 
+	/* Have we just recovered from a hang? */
+	if (graph_hang_thresh > GRAPH_MAX_FUNC_TEST) {
+		trace->reset(tr);
+		ret = -1;
+		goto out;
+	}
+
 	tracing_stop();
 
 	/* check the trace buffer */
-- 
cgit v1.2.3


From 0cf53ff62b3e9e491ff5e5f05b193fb6ce643047 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 22 Mar 2009 15:13:07 +0100
Subject: tracing: keep the tracing buffer after self-test failure

Instead of using ftrace_dump_on_oops, it's far more convenient
to have the trace leading up to a self-test failure available
in /debug/tracing/trace.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <1237694675-23509-1-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_selftest.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c
index b56dcf7d3566..08f4eb2763d1 100644
--- a/kernel/trace/trace_selftest.c
+++ b/kernel/trace/trace_selftest.c
@@ -299,7 +299,7 @@ trace_selftest_startup_function_graph(struct tracer *trace,
 
 	/* Have we just recovered from a hang? */
 	if (graph_hang_thresh > GRAPH_MAX_FUNC_TEST) {
-		trace->reset(tr);
+		tracing_selftest_disabled = true;
 		ret = -1;
 		goto out;
 	}
-- 
cgit v1.2.3


From cf027f645e6aee4f0ca6197a6b6a57f327fdb13f Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Sun, 22 Mar 2009 03:30:39 -0500
Subject: tracing: add run-time field descriptions for event filtering

This patch makes the field descriptions defined for event tracing
available at run-time, for the event-filtering mechanism introduced
in a subsequent patch.

The common event fields are prepended with 'common_' in the format
display, allowing them to be distinguished from the other fields
that might internally have same name and can therefore be
unambiguously used in filters.

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1237710639.7703.46.camel@charm-linux>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.h                | 30 +++++++++++++++++--------
 kernel/trace/trace_events.c         | 40 ++++++++++++++++++++++++++++++++-
 kernel/trace/trace_events_stage_2.h | 45 +++++++++++++++++++++++++++++++++++++
 kernel/trace/trace_events_stage_3.h |  2 ++
 4 files changed, 107 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 7cfb741be200..9288dc7ad14f 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -775,16 +775,26 @@ enum {
 	TRACE_EVENT_TYPE_RAW		= 2,
 };
 
+struct ftrace_event_field {
+	struct list_head	link;
+	char			*name;
+	char			*type;
+	int			offset;
+	int			size;
+};
+
 struct ftrace_event_call {
-	char		*name;
-	char		*system;
-	struct dentry	*dir;
-	int		enabled;
-	int		(*regfunc)(void);
-	void		(*unregfunc)(void);
-	int		id;
-	int		(*raw_init)(void);
-	int		(*show_format)(struct trace_seq *s);
+	char			*name;
+	char			*system;
+	struct dentry		*dir;
+	int			enabled;
+	int			(*regfunc)(void);
+	void			(*unregfunc)(void);
+	int			id;
+	int			(*raw_init)(void);
+	int			(*show_format)(struct trace_seq *s);
+	int			(*define_fields)(void);
+	struct list_head	fields;
 
 #ifdef CONFIG_EVENT_PROFILE
 	atomic_t	profile_count;
@@ -793,6 +803,8 @@ struct ftrace_event_call {
 #endif
 };
 
+int trace_define_field(struct ftrace_event_call *call, char *type,
+		       char *name, int offset, int size);
 void event_trace_printk(unsigned long ip, const char *fmt, ...);
 extern struct ftrace_event_call __start_ftrace_events[];
 extern struct ftrace_event_call __stop_ftrace_events[];
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 3047b56f6637..961b057da28b 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -19,6 +19,34 @@
 
 static DEFINE_MUTEX(event_mutex);
 
+int trace_define_field(struct ftrace_event_call *call, char *type,
+		       char *name, int offset, int size)
+{
+	struct ftrace_event_field *field;
+
+	field = kmalloc(sizeof(*field), GFP_KERNEL);
+	if (!field)
+		goto err;
+	field->name = kstrdup(name, GFP_KERNEL);
+	if (!field->name)
+		goto err;
+	field->type = kstrdup(type, GFP_KERNEL);
+	if (!field->type)
+		goto err;
+	field->offset = offset;
+	field->size = size;
+	list_add(&field->link, &call->fields);
+
+	return 0;
+err:
+	if (field) {
+		kfree(field->name);
+		kfree(field->type);
+	}
+	kfree(field);
+	return -ENOMEM;
+}
+
 static void ftrace_clear_events(void)
 {
 	struct ftrace_event_call *call = (void *)__start_ftrace_events;
@@ -343,7 +371,8 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt,
 
 #undef FIELD
 #define FIELD(type, name)						\
-	#type, #name, offsetof(typeof(field), name), sizeof(field.name)
+	#type, "common_" #name, offsetof(typeof(field), name),		\
+		sizeof(field.name)
 
 static int trace_write_header(struct trace_seq *s)
 {
@@ -581,6 +610,15 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
 					call->name);
 	}
 
+	if (call->define_fields) {
+		ret = call->define_fields();
+		if (ret < 0) {
+			pr_warning("Could not initialize trace point"
+				   " events/%s\n", call->name);
+			return ret;
+		}
+	}
+
 	/* A trace may not want to export its format */
 	if (!call->show_format)
 		return 0;
diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h
index 5117c43f5c67..30743f7d4110 100644
--- a/kernel/trace/trace_events_stage_2.h
+++ b/kernel/trace/trace_events_stage_2.h
@@ -129,3 +129,48 @@ ftrace_format_##call(struct trace_seq *s)				\
 }
 
 #include <trace/trace_event_types.h>
+
+#undef __field
+#define __field(type, item)						\
+	ret = trace_define_field(event_call, #type, #item,		\
+				 offsetof(typeof(field), item),		\
+				 sizeof(field.item));			\
+	if (ret)							\
+		return ret;
+
+#undef __array
+#define __array(type, item, len)					\
+	ret = trace_define_field(event_call, #type "[" #len "]", #item,	\
+				 offsetof(typeof(field), item),		\
+				 sizeof(field.item));			\
+	if (ret)							\
+		return ret;
+
+#define __common_field(type, item)					\
+	ret = trace_define_field(event_call, #type, "common_" #item,	\
+				 offsetof(typeof(field.ent), item),	\
+				 sizeof(field.ent.item));		\
+	if (ret)							\
+		return ret;
+
+#undef TRACE_EVENT
+#define TRACE_EVENT(call, proto, args, tstruct, func, print)		\
+int									\
+ftrace_define_fields_##call(void)					\
+{									\
+	struct ftrace_raw_##call field;					\
+	struct ftrace_event_call *event_call = &event_##call;		\
+	int ret;							\
+									\
+	__common_field(unsigned char, type);				\
+	__common_field(unsigned char, flags);				\
+	__common_field(unsigned char, preempt_count);			\
+	__common_field(int, pid);					\
+	__common_field(int, tgid);					\
+									\
+	tstruct;							\
+									\
+	return ret;							\
+}
+
+#include <trace/trace_event_types.h>
diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h
index 6b3261ca988c..468938f70141 100644
--- a/kernel/trace/trace_events_stage_3.h
+++ b/kernel/trace/trace_events_stage_3.h
@@ -252,6 +252,7 @@ static int ftrace_raw_init_event_##call(void)				\
 	if (!id)							\
 		return -ENODEV;						\
 	event_##call.id = id;						\
+	INIT_LIST_HEAD(&event_##call.fields);				\
 	return 0;							\
 }									\
 									\
@@ -264,6 +265,7 @@ __attribute__((section("_ftrace_events"))) event_##call = {		\
 	.regfunc		= ftrace_raw_reg_event_##call,		\
 	.unregfunc		= ftrace_raw_unreg_event_##call,	\
 	.show_format		= ftrace_format_##call,			\
+	.define_fields		= ftrace_define_fields_##call,		\
 	_TRACE_PROFILE_INIT(call)					\
 }
 
-- 
cgit v1.2.3


From f80d2d7725b04f8225b11b55e43bb2c77c819926 Mon Sep 17 00:00:00 2001
From: Dmitri Vorobiev <dmitri.vorobiev@movial.com>
Date: Sun, 22 Mar 2009 19:11:10 +0200
Subject: tracing, Text Edit Lock: Fix one sparse warning in kernel/extable.c

Impact: cleanup.

The global mutex text_mutex if declared in linux/memory.h, so
this file needs to be included into kernel/extable.c, where the
same mutex is defined. This fixes the following sparse warning:

 kernel/extable.c:32:1: warning: symbol 'text_mutex' was not declared.
 Should it be static?

Signed-off-by: Dmitri Vorobiev <dmitri.vorobiev@movial.com>
LKML-Reference: <1237741871-5827-3-git-send-email-dmitri.vorobiev@movial.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/extable.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/extable.c b/kernel/extable.c
index 25d39b0c3a1b..b54a6017b6b5 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -16,6 +16,7 @@
     Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */
 #include <linux/ftrace.h>
+#include <linux/memory.h>
 #include <linux/module.h>
 #include <linux/mutex.h>
 #include <linux/init.h>
-- 
cgit v1.2.3


From b8b94265337f83b7db9c5f429b1769d463d7da8c Mon Sep 17 00:00:00 2001
From: Dmitri Vorobiev <dmitri.vorobiev@movial.com>
Date: Sun, 22 Mar 2009 19:11:11 +0200
Subject: tracing: fix four sparse warnings

Impact: cleanup.

This patch fixes the following sparse warnings:

 kernel/trace/trace.c:385:9: warning: symbol 'trace_seq_to_buffer' was
 not declared. Should it be static?

 kernel/trace/trace_clock.c:29:13: warning: symbol 'trace_clock_local'
 was not declared. Should it be static?

 kernel/trace/trace_clock.c:54:13: warning: symbol 'trace_clock' was not
 declared. Should it be static?

 kernel/trace/trace_clock.c:74:13: warning: symbol 'trace_clock_global'
 was not declared. Should it be static?

Signed-off-by: Dmitri Vorobiev <dmitri.vorobiev@movial.com>
LKML-Reference: <1237741871-5827-4-git-send-email-dmitri.vorobiev@movial.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c       | 2 +-
 kernel/trace/trace_clock.c | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index e6fac0ffe6f0..ace685c70186 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -382,7 +382,7 @@ ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, size_t cnt)
 	return cnt;
 }
 
-ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
+static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt)
 {
 	int len;
 	void *ret;
diff --git a/kernel/trace/trace_clock.c b/kernel/trace/trace_clock.c
index 05b176abfd30..b588fd81f7f9 100644
--- a/kernel/trace/trace_clock.c
+++ b/kernel/trace/trace_clock.c
@@ -18,6 +18,7 @@
 #include <linux/percpu.h>
 #include <linux/sched.h>
 #include <linux/ktime.h>
+#include <linux/trace_clock.h>
 
 /*
  * trace_clock_local(): the simplest and least coherent tracing clock.
-- 
cgit v1.2.3


From 2d622719f1572ef31e0616444a515eba3094d050 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Sun, 22 Mar 2009 03:30:49 -0500
Subject: tracing: add ring_buffer_event_discard() to ring buffer

This patch overloads RINGBUF_TYPE_PADDING to provide a way to discard
events from the ring buffer, for the event-filtering mechanism
introduced in a subsequent patch.

I did the initial version but thanks to Steven Rostedt for adding
the parts that actually made it work. ;-)

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ring_buffer.h |  11 +++--
 kernel/trace/ring_buffer.c  | 117 ++++++++++++++++++++++++++++++++++++--------
 2 files changed, 105 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h
index 9e6052bd1a1c..e1b7b2173885 100644
--- a/include/linux/ring_buffer.h
+++ b/include/linux/ring_buffer.h
@@ -18,10 +18,13 @@ struct ring_buffer_event {
 /**
  * enum ring_buffer_type - internal ring buffer types
  *
- * @RINGBUF_TYPE_PADDING:	Left over page padding
- *				 array is ignored
- *				 size is variable depending on how much
+ * @RINGBUF_TYPE_PADDING:	Left over page padding or discarded event
+ *				 If time_delta is 0:
+ *				  array is ignored
+ *				  size is variable depending on how much
  *				  padding is needed
+ *				 If time_delta is non zero:
+ *				  everything else same as RINGBUF_TYPE_DATA
  *
  * @RINGBUF_TYPE_TIME_EXTEND:	Extend the time delta
  *				 array[0] = time delta (28 .. 59)
@@ -65,6 +68,8 @@ ring_buffer_event_time_delta(struct ring_buffer_event *event)
 	return event->time_delta;
 }
 
+void ring_buffer_event_discard(struct ring_buffer_event *event);
+
 /*
  * size is in bytes for each per CPU buffer.
  */
diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 384ca5d9d729..a09027ec1714 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -189,16 +189,65 @@ enum {
 	RB_LEN_TIME_STAMP = 16,
 };
 
-/* inline for ring buffer fast paths */
+static inline int rb_null_event(struct ring_buffer_event *event)
+{
+	return event->type == RINGBUF_TYPE_PADDING && event->time_delta == 0;
+}
+
+static inline int rb_discarded_event(struct ring_buffer_event *event)
+{
+	return event->type == RINGBUF_TYPE_PADDING && event->time_delta;
+}
+
+static void rb_event_set_padding(struct ring_buffer_event *event)
+{
+	event->type = RINGBUF_TYPE_PADDING;
+	event->time_delta = 0;
+}
+
+/**
+ * ring_buffer_event_discard - discard an event in the ring buffer
+ * @buffer: the ring buffer
+ * @event: the event to discard
+ *
+ * Sometimes a event that is in the ring buffer needs to be ignored.
+ * This function lets the user discard an event in the ring buffer
+ * and then that event will not be read later.
+ *
+ * Note, it is up to the user to be careful with this, and protect
+ * against races. If the user discards an event that has been consumed
+ * it is possible that it could corrupt the ring buffer.
+ */
+void ring_buffer_event_discard(struct ring_buffer_event *event)
+{
+	event->type = RINGBUF_TYPE_PADDING;
+	/* time delta must be non zero */
+	if (!event->time_delta)
+		event->time_delta = 1;
+}
+
 static unsigned
-rb_event_length(struct ring_buffer_event *event)
+rb_event_data_length(struct ring_buffer_event *event)
 {
 	unsigned length;
 
+	if (event->len)
+		length = event->len * RB_ALIGNMENT;
+	else
+		length = event->array[0];
+	return length + RB_EVNT_HDR_SIZE;
+}
+
+/* inline for ring buffer fast paths */
+static unsigned
+rb_event_length(struct ring_buffer_event *event)
+{
 	switch (event->type) {
 	case RINGBUF_TYPE_PADDING:
-		/* undefined */
-		return -1;
+		if (rb_null_event(event))
+			/* undefined */
+			return -1;
+		return rb_event_data_length(event);
 
 	case RINGBUF_TYPE_TIME_EXTEND:
 		return RB_LEN_TIME_EXTEND;
@@ -207,11 +256,7 @@ rb_event_length(struct ring_buffer_event *event)
 		return RB_LEN_TIME_STAMP;
 
 	case RINGBUF_TYPE_DATA:
-		if (event->len)
-			length = event->len * RB_ALIGNMENT;
-		else
-			length = event->array[0];
-		return length + RB_EVNT_HDR_SIZE;
+		return rb_event_data_length(event);
 	default:
 		BUG();
 	}
@@ -845,11 +890,6 @@ int ring_buffer_resize(struct ring_buffer *buffer, unsigned long size)
 }
 EXPORT_SYMBOL_GPL(ring_buffer_resize);
 
-static inline int rb_null_event(struct ring_buffer_event *event)
-{
-	return event->type == RINGBUF_TYPE_PADDING;
-}
-
 static inline void *
 __rb_data_page_index(struct buffer_data_page *bpage, unsigned index)
 {
@@ -1219,7 +1259,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer,
 		if (tail < BUF_PAGE_SIZE) {
 			/* Mark the rest of the page with padding */
 			event = __rb_page_index(tail_page, tail);
-			event->type = RINGBUF_TYPE_PADDING;
+			rb_event_set_padding(event);
 		}
 
 		if (tail <= BUF_PAGE_SIZE)
@@ -1969,7 +2009,7 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer)
 
 	event = rb_reader_event(cpu_buffer);
 
-	if (event->type == RINGBUF_TYPE_DATA)
+	if (event->type == RINGBUF_TYPE_DATA || rb_discarded_event(event))
 		cpu_buffer->entries--;
 
 	rb_update_read_stamp(cpu_buffer, event);
@@ -2052,9 +2092,18 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
 
 	switch (event->type) {
 	case RINGBUF_TYPE_PADDING:
-		RB_WARN_ON(cpu_buffer, 1);
+		if (rb_null_event(event))
+			RB_WARN_ON(cpu_buffer, 1);
+		/*
+		 * Because the writer could be discarding every
+		 * event it creates (which would probably be bad)
+		 * if we were to go back to "again" then we may never
+		 * catch up, and will trigger the warn on, or lock
+		 * the box. Return the padding, and we will release
+		 * the current locks, and try again.
+		 */
 		rb_advance_reader(cpu_buffer);
-		return NULL;
+		return event;
 
 	case RINGBUF_TYPE_TIME_EXTEND:
 		/* Internal data, OK to advance */
@@ -2115,8 +2164,12 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
 
 	switch (event->type) {
 	case RINGBUF_TYPE_PADDING:
-		rb_inc_iter(iter);
-		goto again;
+		if (rb_null_event(event)) {
+			rb_inc_iter(iter);
+			goto again;
+		}
+		rb_advance_iter(iter);
+		return event;
 
 	case RINGBUF_TYPE_TIME_EXTEND:
 		/* Internal data, OK to advance */
@@ -2163,10 +2216,16 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts)
 	if (!cpumask_test_cpu(cpu, buffer->cpumask))
 		return NULL;
 
+ again:
 	spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
 	event = rb_buffer_peek(buffer, cpu, ts);
 	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
+	if (event && event->type == RINGBUF_TYPE_PADDING) {
+		cpu_relax();
+		goto again;
+	}
+
 	return event;
 }
 
@@ -2185,10 +2244,16 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts)
 	struct ring_buffer_event *event;
 	unsigned long flags;
 
+ again:
 	spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
 	event = rb_iter_peek(iter, ts);
 	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
+	if (event && event->type == RINGBUF_TYPE_PADDING) {
+		cpu_relax();
+		goto again;
+	}
+
 	return event;
 }
 
@@ -2207,6 +2272,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
 	struct ring_buffer_event *event = NULL;
 	unsigned long flags;
 
+ again:
 	/* might be called in atomic */
 	preempt_disable();
 
@@ -2228,6 +2294,11 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts)
  out:
 	preempt_enable();
 
+	if (event && event->type == RINGBUF_TYPE_PADDING) {
+		cpu_relax();
+		goto again;
+	}
+
 	return event;
 }
 EXPORT_SYMBOL_GPL(ring_buffer_consume);
@@ -2306,6 +2377,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
 	struct ring_buffer_per_cpu *cpu_buffer = iter->cpu_buffer;
 	unsigned long flags;
 
+ again:
 	spin_lock_irqsave(&cpu_buffer->reader_lock, flags);
 	event = rb_iter_peek(iter, ts);
 	if (!event)
@@ -2315,6 +2387,11 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts)
  out:
 	spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags);
 
+	if (event && event->type == RINGBUF_TYPE_PADDING) {
+		cpu_relax();
+		goto again;
+	}
+
 	return event;
 }
 EXPORT_SYMBOL_GPL(ring_buffer_read);
-- 
cgit v1.2.3


From 7ce7e4249921d5073e764f7ff7ad83cfa9894bd7 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Sun, 22 Mar 2009 03:31:04 -0500
Subject: tracing: add per-event filtering

This patch adds per-event filtering to the event tracing subsystem.

It adds a 'filter' debugfs file to each event directory.  This file can
be written to to set filters; reading from it will display the current
set of filters set for that event.

Basically, any field listed in the 'format' file for an event can be
filtered on (including strings, but not yet other array types) using
either matching ('==') or non-matching ('!=') 'predicates'.  A
'predicate' can be either a single expression:

 # echo pid != 0 > filter

 # cat filter
 pid != 0

or a compound expression of up to 8 sub-expressions combined using '&&'
or '||':

 # echo comm == Xorg > filter
 # echo "&& sig != 29" > filter

 # cat filter
 comm == Xorg
 && sig != 29

Only events having field values matching an expression will be available
in the trace output; non-matching events are discarded.

Note that a compound expression is built up by echoing each
sub-expression separately - it's not the most efficient way to do
things, but it keeps the parser simple and assumes that compound
expressions will be relatively uncommon.  In any case, a subsequent
patch introducing a way to set filters for entire subsystems should
mitigate any need to do this for lots of events.

Setting a filter without an '&&' or '||' clears the previous filter
completely and sets the filter to the new expression:

 # cat filter
 comm == Xorg
 && sig != 29

 # echo comm != Xorg

 # cat filter
 comm != Xorg

To clear a filter, echo 0 to the filter file:

 # echo 0 > filter
 # cat filter
 none

The limit of 8 predicates for a compound expression is arbitrary - for
efficiency, it's implemented as an array of pointers to predicates, and
8 seemed more than enough for any filter...

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1237710665.7703.48.camel@charm-linux>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/Makefile               |   1 +
 kernel/trace/trace.h                |  28 ++++
 kernel/trace/trace_events.c         |  77 +++++++++
 kernel/trace/trace_events_filter.c  | 326 ++++++++++++++++++++++++++++++++++++
 kernel/trace/trace_events_stage_3.h |   4 +
 5 files changed, 436 insertions(+)
 create mode 100644 kernel/trace/trace_events_filter.c

(limited to 'kernel')

diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 0e45c206c2f9..2630f5121ec1 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -45,5 +45,6 @@ obj-$(CONFIG_EVENT_TRACER) += events.o
 obj-$(CONFIG_EVENT_TRACER) += trace_export.o
 obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
 obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
+obj-$(CONFIG_EVENT_TRACER) += trace_events_filter.o
 
 libftrace-y := ftrace.o
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 9288dc7ad14f..d9eb39e4bb38 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -795,6 +795,7 @@ struct ftrace_event_call {
 	int			(*show_format)(struct trace_seq *s);
 	int			(*define_fields)(void);
 	struct list_head	fields;
+	struct filter_pred	**preds;
 
 #ifdef CONFIG_EVENT_PROFILE
 	atomic_t	profile_count;
@@ -803,8 +804,35 @@ struct ftrace_event_call {
 #endif
 };
 
+#define MAX_FILTER_PRED 8
+
+struct filter_pred;
+
+typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event);
+
+struct filter_pred {
+	filter_pred_fn_t fn;
+	u64 val;
+	char *str_val;
+	int str_len;
+	char *field_name;
+	int offset;
+	int not;
+	int or;
+	int compound;
+	int clear;
+};
+
 int trace_define_field(struct ftrace_event_call *call, char *type,
 		       char *name, int offset, int size);
+extern void filter_free_pred(struct filter_pred *pred);
+extern int filter_print_preds(struct filter_pred **preds, char *buf);
+extern int filter_parse(char **pbuf, struct filter_pred *pred);
+extern int filter_add_pred(struct ftrace_event_call *call,
+			   struct filter_pred *pred);
+extern void filter_free_preds(struct ftrace_event_call *call);
+extern int filter_match_preds(struct ftrace_event_call *call, void *rec);
+
 void event_trace_printk(unsigned long ip, const char *fmt, ...);
 extern struct ftrace_event_call __start_ftrace_events[];
 extern struct ftrace_event_call __stop_ftrace_events[];
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 961b057da28b..97470c48956e 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -459,6 +459,71 @@ event_id_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos)
 	return r;
 }
 
+static ssize_t
+event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
+		  loff_t *ppos)
+{
+	struct ftrace_event_call *call = filp->private_data;
+	struct trace_seq *s;
+	int r;
+
+	if (*ppos)
+		return 0;
+
+	s = kmalloc(sizeof(*s), GFP_KERNEL);
+	if (!s)
+		return -ENOMEM;
+
+	trace_seq_init(s);
+
+	r = filter_print_preds(call->preds, s->buffer);
+	r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, r);
+
+	kfree(s);
+
+	return r;
+}
+
+static ssize_t
+event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
+		   loff_t *ppos)
+{
+	struct ftrace_event_call *call = filp->private_data;
+	char buf[64], *pbuf = buf;
+	struct filter_pred *pred;
+	int err;
+
+	if (cnt >= sizeof(buf))
+		return -EINVAL;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+
+	pred = kzalloc(sizeof(*pred), GFP_KERNEL);
+	if (!pred)
+		return -ENOMEM;
+
+	err = filter_parse(&pbuf, pred);
+	if (err < 0) {
+		filter_free_pred(pred);
+		return err;
+	}
+
+	if (pred->clear) {
+		filter_free_preds(call);
+		return cnt;
+	}
+
+	if (filter_add_pred(call, pred)) {
+		filter_free_pred(pred);
+		return -EINVAL;
+	}
+
+	*ppos += cnt;
+
+	return cnt;
+}
+
 static const struct seq_operations show_event_seq_ops = {
 	.start = t_start,
 	.next = t_next,
@@ -504,6 +569,12 @@ static const struct file_operations ftrace_event_id_fops = {
 	.read = event_id_read,
 };
 
+static const struct file_operations ftrace_event_filter_fops = {
+	.open = tracing_open_generic,
+	.read = event_filter_read,
+	.write = event_filter_write,
+};
+
 static struct dentry *event_trace_events_dir(void)
 {
 	static struct dentry *d_tracer;
@@ -619,6 +690,12 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
 		}
 	}
 
+	entry = debugfs_create_file("filter", 0444, call->dir, call,
+				    &ftrace_event_filter_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'%s/filter' entry\n", call->name);
+
 	/* A trace may not want to export its format */
 	if (!call->show_format)
 		return 0;
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
new file mode 100644
index 000000000000..8e8c5fa25be9
--- /dev/null
+++ b/kernel/trace/trace_events_filter.c
@@ -0,0 +1,326 @@
+/*
+ * trace_events_filter - generic event filtering
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
+ *
+ * Copyright (C) 2009 Tom Zanussi <tzanussi@gmail.com>
+ */
+
+#include <linux/debugfs.h>
+#include <linux/uaccess.h>
+#include <linux/module.h>
+#include <linux/ctype.h>
+
+#include "trace.h"
+
+static int filter_pred_64(struct filter_pred *pred, void *event)
+{
+	u64 *addr = (u64 *)(event + pred->offset);
+	u64 val = (u64)pred->val;
+	int match;
+
+	match = (val == *addr) ^ pred->not;
+
+	return match;
+}
+
+static int filter_pred_32(struct filter_pred *pred, void *event)
+{
+	u32 *addr = (u32 *)(event + pred->offset);
+	u32 val = (u32)pred->val;
+	int match;
+
+	match = (val == *addr) ^ pred->not;
+
+	return match;
+}
+
+static int filter_pred_16(struct filter_pred *pred, void *event)
+{
+	u16 *addr = (u16 *)(event + pred->offset);
+	u16 val = (u16)pred->val;
+	int match;
+
+	match = (val == *addr) ^ pred->not;
+
+	return match;
+}
+
+static int filter_pred_8(struct filter_pred *pred, void *event)
+{
+	u8 *addr = (u8 *)(event + pred->offset);
+	u8 val = (u8)pred->val;
+	int match;
+
+	match = (val == *addr) ^ pred->not;
+
+	return match;
+}
+
+static int filter_pred_string(struct filter_pred *pred, void *event)
+{
+	char *addr = (char *)(event + pred->offset);
+	int cmp, match;
+
+	cmp = strncmp(addr, pred->str_val, pred->str_len);
+
+	match = (!cmp) ^ pred->not;
+
+	return match;
+}
+
+/* return 1 if event matches, 0 otherwise (discard) */
+int filter_match_preds(struct ftrace_event_call *call, void *rec)
+{
+	int i, matched, and_failed = 0;
+	struct filter_pred *pred;
+
+	for (i = 0; i < MAX_FILTER_PRED; i++) {
+		if (call->preds[i]) {
+			pred = call->preds[i];
+			if (and_failed && !pred->or)
+				continue;
+			matched = pred->fn(pred, rec);
+			if (!matched && !pred->or) {
+				and_failed = 1;
+				continue;
+			} else if (matched && pred->or)
+				return 1;
+		} else
+			break;
+	}
+
+	if (and_failed)
+		return 0;
+
+	return 1;
+}
+
+int filter_print_preds(struct filter_pred **preds, char *buf)
+{
+	ssize_t this_len = 0;
+	char *field_name;
+	struct filter_pred *pred;
+	int i;
+
+	if (!preds) {
+		this_len += sprintf(buf + this_len, "none\n");
+		return this_len;
+	}
+
+	for (i = 0; i < MAX_FILTER_PRED; i++) {
+		if (preds[i]) {
+			pred = preds[i];
+			field_name = pred->field_name;
+			if (i)
+				this_len += sprintf(buf + this_len,
+					    pred->or ? "|| " : "&& ");
+			this_len += sprintf(buf + this_len,
+					    "%s ", field_name);
+			this_len += sprintf(buf + this_len,
+					    pred->not ? "!= " : "== ");
+			if (pred->str_val)
+				this_len += sprintf(buf + this_len,
+						    "%s\n", pred->str_val);
+			else
+				this_len += sprintf(buf + this_len,
+						    "%llu\n", pred->val);
+		} else
+			break;
+	}
+
+	return this_len;
+}
+
+static struct ftrace_event_field *
+find_event_field(struct ftrace_event_call *call, char *name)
+{
+	struct ftrace_event_field *field;
+	struct list_head *entry, *tmp;
+
+	list_for_each_safe(entry, tmp, &call->fields) {
+		field = list_entry(entry, struct ftrace_event_field, link);
+		if (!strcmp(field->name, name))
+			return field;
+	}
+
+	return NULL;
+}
+
+void filter_free_pred(struct filter_pred *pred)
+{
+	if (!pred)
+		return;
+
+	kfree(pred->field_name);
+	kfree(pred->str_val);
+	kfree(pred);
+}
+
+void filter_free_preds(struct ftrace_event_call *call)
+{
+	int i;
+
+	if (call->preds) {
+		for (i = 0; i < MAX_FILTER_PRED; i++)
+			filter_free_pred(call->preds[i]);
+		kfree(call->preds);
+		call->preds = NULL;
+	}
+}
+
+static int __filter_add_pred(struct ftrace_event_call *call,
+			     struct filter_pred *pred)
+{
+	int i;
+
+	if (call->preds && !pred->compound)
+		filter_free_preds(call);
+
+	if (!call->preds) {
+		call->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
+				      GFP_KERNEL);
+		if (!call->preds)
+			return -ENOMEM;
+	}
+
+	for (i = 0; i < MAX_FILTER_PRED; i++) {
+		if (!call->preds[i]) {
+			call->preds[i] = pred;
+			return 0;
+		}
+	}
+
+	return -ENOMEM;
+}
+
+static int is_string_field(const char *type)
+{
+	if (strchr(type, '[') && strstr(type, "char"))
+		return 1;
+
+	return 0;
+}
+
+int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred)
+{
+	struct ftrace_event_field *field;
+
+	field = find_event_field(call, pred->field_name);
+	if (!field)
+		return -EINVAL;
+
+	pred->offset = field->offset;
+
+	if (is_string_field(field->type)) {
+		pred->fn = filter_pred_string;
+		pred->str_len = field->size;
+		return __filter_add_pred(call, pred);
+	}
+
+	switch (field->size) {
+	case 8:
+		pred->fn = filter_pred_64;
+		break;
+	case 4:
+		pred->fn = filter_pred_32;
+		break;
+	case 2:
+		pred->fn = filter_pred_16;
+		break;
+	case 1:
+		pred->fn = filter_pred_8;
+		break;
+	default:
+		return -EINVAL;
+	}
+
+	return __filter_add_pred(call, pred);
+}
+
+int filter_parse(char **pbuf, struct filter_pred *pred)
+{
+	char *tmp, *tok, *val_str = NULL;
+	int tok_n = 0;
+
+	/* field ==/!= number, or/and field ==/!= number, number */
+	while ((tok = strsep(pbuf, " \n"))) {
+		if (tok_n == 0) {
+			if (!strcmp(tok, "0")) {
+				pred->clear = 1;
+				return 0;
+			} else if (!strcmp(tok, "&&")) {
+				pred->or = 0;
+				pred->compound = 1;
+			} else if (!strcmp(tok, "||")) {
+				pred->or = 1;
+				pred->compound = 1;
+			} else
+				pred->field_name = tok;
+			tok_n = 1;
+			continue;
+		}
+		if (tok_n == 1) {
+			if (!pred->field_name)
+				pred->field_name = tok;
+			else if (!strcmp(tok, "!="))
+				pred->not = 1;
+			else if (!strcmp(tok, "=="))
+				pred->not = 0;
+			else {
+				pred->field_name = NULL;
+				return -EINVAL;
+			}
+			tok_n = 2;
+			continue;
+		}
+		if (tok_n == 2) {
+			if (pred->compound) {
+				if (!strcmp(tok, "!="))
+					pred->not = 1;
+				else if (!strcmp(tok, "=="))
+					pred->not = 0;
+				else {
+					pred->field_name = NULL;
+					return -EINVAL;
+				}
+			} else {
+				val_str = tok;
+				break; /* done */
+			}
+			tok_n = 3;
+			continue;
+		}
+		if (tok_n == 3) {
+			val_str = tok;
+			break; /* done */
+		}
+	}
+
+	pred->field_name = kstrdup(pred->field_name, GFP_KERNEL);
+	if (!pred->field_name)
+		return -ENOMEM;
+
+	pred->val = simple_strtoull(val_str, &tmp, 10);
+	if (tmp == val_str) {
+		pred->str_val = kstrdup(val_str, GFP_KERNEL);
+		if (!pred->str_val)
+			return -ENOMEM;
+	}
+
+	return 0;
+}
+
+
diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h
index 468938f70141..ebf215e87d5e 100644
--- a/kernel/trace/trace_events_stage_3.h
+++ b/kernel/trace/trace_events_stage_3.h
@@ -204,6 +204,7 @@ static struct ftrace_event_call event_##call;				\
 									\
 static void ftrace_raw_event_##call(proto)				\
 {									\
+	struct ftrace_event_call *call = &event_##call;			\
 	struct ring_buffer_event *event;				\
 	struct ftrace_raw_##call *entry;				\
 	unsigned long irq_flags;					\
@@ -222,6 +223,9 @@ static void ftrace_raw_event_##call(proto)				\
 	assign;								\
 									\
 	trace_current_buffer_unlock_commit(event, irq_flags, pc);	\
+									\
+	if (call->preds && !filter_match_preds(call, entry))		\
+		ring_buffer_event_discard(event);			\
 }									\
 									\
 static int ftrace_raw_reg_event_##call(void)				\
-- 
cgit v1.2.3


From cfb180f3e71b2a280a254c8646a9ab1beab63f84 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Sun, 22 Mar 2009 03:31:17 -0500
Subject: tracing: add per-subsystem filtering

This patch adds per-subsystem filtering to the event tracing subsystem.

It adds a 'filter' debugfs file to each subsystem directory.  This file
can be written to to set filters; reading from it will display the
current set of filters set for that subsystem.

Basically what it does is propagate the filter down to each event
contained in the subsystem.  If a particular event doesn't have a field
with the name specified in the filter, it simply doesn't get set for
that event.  You can verify whether or not the filter was set for a
particular event by looking at the filter file for that event.

As with per-event filters, compound expressions are supported, echoing
'0' to the subsystem's filter file clears all filters in the subsystem,
etc.

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1237710677.7703.49.camel@charm-linux>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.h               | 15 +++++++
 kernel/trace/trace_events.c        | 86 +++++++++++++++++++++++++++++++++++---
 kernel/trace/trace_events_filter.c | 80 +++++++++++++++++++++++++++++++++++
 3 files changed, 175 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index d9eb39e4bb38..f267723c3c52 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -804,6 +804,18 @@ struct ftrace_event_call {
 #endif
 };
 
+struct event_subsystem {
+	struct list_head	list;
+	const char		*name;
+	struct dentry		*entry;
+	struct filter_pred	**preds;
+};
+
+#define events_for_each(event)						\
+	for (event = __start_ftrace_events;				\
+	     (unsigned long)event < (unsigned long)__stop_ftrace_events; \
+	     event++)
+
 #define MAX_FILTER_PRED 8
 
 struct filter_pred;
@@ -832,6 +844,9 @@ extern int filter_add_pred(struct ftrace_event_call *call,
 			   struct filter_pred *pred);
 extern void filter_free_preds(struct ftrace_event_call *call);
 extern int filter_match_preds(struct ftrace_event_call *call, void *rec);
+extern void filter_free_subsystem_preds(struct event_subsystem *system);
+extern int filter_add_subsystem_pred(struct event_subsystem *system,
+				     struct filter_pred *pred);
 
 void event_trace_printk(unsigned long ip, const char *fmt, ...);
 extern struct ftrace_event_call __start_ftrace_events[];
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 97470c48956e..97d4daaddd9a 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -524,6 +524,71 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
 	return cnt;
 }
 
+static ssize_t
+subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
+		      loff_t *ppos)
+{
+	struct event_subsystem *system = filp->private_data;
+	struct trace_seq *s;
+	int r;
+
+	if (*ppos)
+		return 0;
+
+	s = kmalloc(sizeof(*s), GFP_KERNEL);
+	if (!s)
+		return -ENOMEM;
+
+	trace_seq_init(s);
+
+	r = filter_print_preds(system->preds, s->buffer);
+	r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, r);
+
+	kfree(s);
+
+	return r;
+}
+
+static ssize_t
+subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
+		       loff_t *ppos)
+{
+	struct event_subsystem *system = filp->private_data;
+	char buf[64], *pbuf = buf;
+	struct filter_pred *pred;
+	int err;
+
+	if (cnt >= sizeof(buf))
+		return -EINVAL;
+
+	if (copy_from_user(&buf, ubuf, cnt))
+		return -EFAULT;
+
+	pred = kzalloc(sizeof(*pred), GFP_KERNEL);
+	if (!pred)
+		return -ENOMEM;
+
+	err = filter_parse(&pbuf, pred);
+	if (err < 0) {
+		filter_free_pred(pred);
+		return err;
+	}
+
+	if (pred->clear) {
+		filter_free_subsystem_preds(system);
+		return cnt;
+	}
+
+	if (filter_add_subsystem_pred(system, pred)) {
+		filter_free_pred(pred);
+		return -EINVAL;
+	}
+
+	*ppos += cnt;
+
+	return cnt;
+}
+
 static const struct seq_operations show_event_seq_ops = {
 	.start = t_start,
 	.next = t_next,
@@ -575,6 +640,12 @@ static const struct file_operations ftrace_event_filter_fops = {
 	.write = event_filter_write,
 };
 
+static const struct file_operations ftrace_subsystem_filter_fops = {
+	.open = tracing_open_generic,
+	.read = subsystem_filter_read,
+	.write = subsystem_filter_write,
+};
+
 static struct dentry *event_trace_events_dir(void)
 {
 	static struct dentry *d_tracer;
@@ -595,18 +666,13 @@ static struct dentry *event_trace_events_dir(void)
 	return d_events;
 }
 
-struct event_subsystem {
-	struct list_head	list;
-	const char		*name;
-	struct dentry		*entry;
-};
-
 static LIST_HEAD(event_subsystems);
 
 static struct dentry *
 event_subsystem_dir(const char *name, struct dentry *d_events)
 {
 	struct event_subsystem *system;
+	struct dentry *entry;
 
 	/* First see if we did not already create this dir */
 	list_for_each_entry(system, &event_subsystems, list) {
@@ -633,6 +699,14 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
 	system->name = name;
 	list_add(&system->list, &event_subsystems);
 
+	system->preds = NULL;
+
+	entry = debugfs_create_file("filter", 0444, system->entry, system,
+				    &ftrace_subsystem_filter_fops);
+	if (!entry)
+		pr_warning("Could not create debugfs "
+			   "'%s/filter' entry\n", name);
+
 	return system->entry;
 }
 
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 8e8c5fa25be9..1ab20cee0e4c 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -181,6 +181,27 @@ void filter_free_preds(struct ftrace_event_call *call)
 	}
 }
 
+void filter_free_subsystem_preds(struct event_subsystem *system)
+{
+	struct ftrace_event_call *call = __start_ftrace_events;
+	int i;
+
+	if (system->preds) {
+		for (i = 0; i < MAX_FILTER_PRED; i++)
+			filter_free_pred(system->preds[i]);
+		kfree(system->preds);
+		system->preds = NULL;
+	}
+
+	events_for_each(call) {
+		if (!call->name || !call->regfunc)
+			continue;
+
+		if (!strcmp(call->system, system->name))
+			filter_free_preds(call);
+	}
+}
+
 static int __filter_add_pred(struct ftrace_event_call *call,
 			     struct filter_pred *pred)
 {
@@ -250,6 +271,65 @@ int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred)
 	return __filter_add_pred(call, pred);
 }
 
+static struct filter_pred *copy_pred(struct filter_pred *pred)
+{
+	struct filter_pred *new_pred = kmalloc(sizeof(*pred), GFP_KERNEL);
+	if (!new_pred)
+		return NULL;
+
+	memcpy(new_pred, pred, sizeof(*pred));
+	if (pred->str_val) {
+		new_pred->str_val = kstrdup(pred->str_val, GFP_KERNEL);
+		new_pred->field_name = kstrdup(pred->field_name, GFP_KERNEL);
+		if (!new_pred->str_val) {
+			kfree(new_pred);
+			return NULL;
+		}
+	}
+
+	return new_pred;
+}
+
+int filter_add_subsystem_pred(struct event_subsystem *system,
+			      struct filter_pred *pred)
+{
+	struct ftrace_event_call *call = __start_ftrace_events;
+	struct filter_pred *event_pred;
+	int i;
+
+	if (system->preds && !pred->compound)
+		filter_free_subsystem_preds(system);
+
+	if (!system->preds) {
+		system->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred),
+					GFP_KERNEL);
+		if (!system->preds)
+			return -ENOMEM;
+	}
+
+	for (i = 0; i < MAX_FILTER_PRED; i++) {
+		if (!system->preds[i]) {
+			system->preds[i] = pred;
+			break;
+		}
+		if (i == MAX_FILTER_PRED - 1)
+			return -EINVAL;
+	}
+
+	events_for_each(call) {
+		if (!call->name || !call->regfunc)
+			continue;
+
+		if (!strcmp(call->system, system->name)) {
+			event_pred = copy_pred(pred);
+			if (event_pred)
+				filter_add_pred(call, event_pred);
+		}
+	}
+
+	return 0;
+}
+
 int filter_parse(char **pbuf, struct filter_pred *pred)
 {
 	char *tmp, *tok, *val_str = NULL;
-- 
cgit v1.2.3


From fe9f57f250ab4d781b99504caeb218ca2db14c1a Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 22 Mar 2009 18:41:59 +0100
Subject: tracing: add run-time field descriptions for event filtering, kfree
 fix

Impact: fix potential kfree of random data in (rare) failure path

Zero-initialize the field structure.

Reported-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <1237710639.7703.46.camel@charm-linux>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_events.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 97d4daaddd9a..594d78aaa185 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -24,26 +24,31 @@ int trace_define_field(struct ftrace_event_call *call, char *type,
 {
 	struct ftrace_event_field *field;
 
-	field = kmalloc(sizeof(*field), GFP_KERNEL);
+	field = kzalloc(sizeof(*field), GFP_KERNEL);
 	if (!field)
 		goto err;
+
 	field->name = kstrdup(name, GFP_KERNEL);
 	if (!field->name)
 		goto err;
+
 	field->type = kstrdup(type, GFP_KERNEL);
 	if (!field->type)
 		goto err;
+
 	field->offset = offset;
 	field->size = size;
 	list_add(&field->link, &call->fields);
 
 	return 0;
+
 err:
 	if (field) {
 		kfree(field->name);
 		kfree(field->type);
 	}
 	kfree(field);
+
 	return -ENOMEM;
 }
 
-- 
cgit v1.2.3


From 9bd7d099ab3f10dd666da399c064999bae427cd9 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 22 Mar 2009 23:10:43 +0100
Subject: tracing/events: make the filter files writable

We need the filter files to be writable, the current
filter file permissions are only set readable.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <1237759847-21025-1-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_events.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 594d78aaa185..19f61dd23219 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -706,7 +706,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
 
 	system->preds = NULL;
 
-	entry = debugfs_create_file("filter", 0444, system->entry, system,
+	entry = debugfs_create_file("filter", 0644, system->entry, system,
 				    &ftrace_subsystem_filter_fops);
 	if (!entry)
 		pr_warning("Could not create debugfs "
@@ -769,7 +769,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
 		}
 	}
 
-	entry = debugfs_create_file("filter", 0444, call->dir, call,
+	entry = debugfs_create_file("filter", 0644, call->dir, call,
 				    &ftrace_event_filter_fops);
 	if (!entry)
 		pr_warning("Could not create debugfs "
-- 
cgit v1.2.3


From 07edf7121374609709ef1b0889f6e7b8d6a62ec1 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 22 Mar 2009 23:10:46 +0100
Subject: tracing/events: don't use wake up for events

Impact: fix hard-lockup with sched switch events

Some ftrace events, such as sched wakeup, can be traced
while the runqueue lock is hold. Since they are using
trace_current_buffer_unlock_commit(), they call wake_up()
which can try to grab the runqueue lock too, resulting in
a deadlock.

Now for all event, we call a new helper:
trace_nowake_buffer_unlock_commit() which do pretty the same than
trace_current_buffer_unlock_commit() except than it doesn't call
trace_wake_up().

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <1237759847-21025-4-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c                | 26 +++++++++++++++++++++-----
 kernel/trace/trace.h                |  2 ++
 kernel/trace/trace_events_stage_3.h |  2 +-
 3 files changed, 24 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index e6fac0ffe6f0..6bad12819eb6 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -860,15 +860,25 @@ static void ftrace_trace_stack(struct trace_array *tr,
 static void ftrace_trace_userstack(struct trace_array *tr,
 				   unsigned long flags, int pc);
 
-void trace_buffer_unlock_commit(struct trace_array *tr,
-				struct ring_buffer_event *event,
-				unsigned long flags, int pc)
+static inline void __trace_buffer_unlock_commit(struct trace_array *tr,
+					struct ring_buffer_event *event,
+					unsigned long flags, int pc,
+					int wake)
 {
 	ring_buffer_unlock_commit(tr->buffer, event);
 
 	ftrace_trace_stack(tr, flags, 6, pc);
 	ftrace_trace_userstack(tr, flags, pc);
-	trace_wake_up();
+
+	if (wake)
+		trace_wake_up();
+}
+
+void trace_buffer_unlock_commit(struct trace_array *tr,
+					struct ring_buffer_event *event,
+					unsigned long flags, int pc)
+{
+	__trace_buffer_unlock_commit(tr, event, flags, pc, 1);
 }
 
 struct ring_buffer_event *
@@ -882,7 +892,13 @@ trace_current_buffer_lock_reserve(unsigned char type, unsigned long len,
 void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
 					unsigned long flags, int pc)
 {
-	return trace_buffer_unlock_commit(&global_trace, event, flags, pc);
+	return __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1);
+}
+
+void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event,
+					unsigned long flags, int pc)
+{
+	return __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0);
 }
 
 void
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index f267723c3c52..54fd9bcd0a65 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -483,6 +483,8 @@ trace_current_buffer_lock_reserve(unsigned char type, unsigned long len,
 				  unsigned long flags, int pc);
 void trace_current_buffer_unlock_commit(struct ring_buffer_event *event,
 					unsigned long flags, int pc);
+void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event,
+					unsigned long flags, int pc);
 
 struct trace_entry *tracing_get_trace_entry(struct trace_array *tr,
 						struct trace_array_cpu *data);
diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h
index ebf215e87d5e..9a3bd49b52e5 100644
--- a/kernel/trace/trace_events_stage_3.h
+++ b/kernel/trace/trace_events_stage_3.h
@@ -222,7 +222,7 @@ static void ftrace_raw_event_##call(proto)				\
 									\
 	assign;								\
 									\
-	trace_current_buffer_unlock_commit(event, irq_flags, pc);	\
+	trace_nowake_buffer_unlock_commit(event, irq_flags, pc);	\
 									\
 	if (call->preds && !filter_match_preds(call, entry))		\
 		ring_buffer_event_discard(event);			\
-- 
cgit v1.2.3


From 7e6ea92df3fd7cbe74e7985c6f3e40255c44b201 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 22 Mar 2009 23:10:47 +0100
Subject: tracing/ftrace: make nop-tracer use polling wait for events on pipe

Impact: display events when they arrive

Now that the events don't use wake_up() anymore, we need the nop
tracer to poll waiting for events on the pipe. Especially because
nop is useful to look at orphan traces types (traces types that
don't rely on specific tracers) because it doesn't produce traces
itself.

And unlike other tracers that trigger specific traces periodically,
nop triggers no traces by itself that can wake him.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <1237759847-21025-5-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_nop.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_nop.c b/kernel/trace/trace_nop.c
index 9aa84bde23cd..394f94417e2f 100644
--- a/kernel/trace/trace_nop.c
+++ b/kernel/trace/trace_nop.c
@@ -91,6 +91,7 @@ struct tracer nop_trace __read_mostly =
 	.name		= "nop",
 	.init		= nop_trace_init,
 	.reset		= nop_trace_reset,
+	.wait_pipe	= poll_wait_pipe,
 #ifdef CONFIG_FTRACE_SELFTEST
 	.selftest	= trace_selftest_startup_nop,
 #endif
-- 
cgit v1.2.3


From b118415bfad6d75792a85ac999e25149db8e6919 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Mon, 23 Mar 2009 00:18:39 +0100
Subject: tracing/events: don't discard an event after commit

When we want to filter an event, the filter test is done after
the event is commited to the ring-buffer to be discarded later if
needed.

But a reader could be reading this event while we are trying to discard
it. Other kind of racy events can even happen because the event is
commited and can be read and/or consumed.

What we want is to discard the event before committing it.

Reported-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
LKML-Reference: <1237763919-21505-1-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_events_stage_3.h | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h
index 9a3bd49b52e5..9d2fa78cecca 100644
--- a/kernel/trace/trace_events_stage_3.h
+++ b/kernel/trace/trace_events_stage_3.h
@@ -222,10 +222,11 @@ static void ftrace_raw_event_##call(proto)				\
 									\
 	assign;								\
 									\
-	trace_nowake_buffer_unlock_commit(event, irq_flags, pc);	\
-									\
 	if (call->preds && !filter_match_preds(call, entry))		\
 		ring_buffer_event_discard(event);			\
+									\
+	trace_nowake_buffer_unlock_commit(event, irq_flags, pc);	\
+									\
 }									\
 									\
 static int ftrace_raw_reg_event_##call(void)				\
-- 
cgit v1.2.3


From 75c8b417526529d0a7072e4d93ec99dbd483a6f4 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Mon, 23 Mar 2009 03:26:28 -0500
Subject: tracing/filters: use list_for_each_entry_safe

Impact: cleanup

Use list_for_each_entry_safe instead of list_for_each_entry in
find_event_field().

Reported-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <1237796788.7527.35.camel@charm-linux>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_events_filter.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 1ab20cee0e4c..c4a413b70f78 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -147,11 +147,9 @@ int filter_print_preds(struct filter_pred **preds, char *buf)
 static struct ftrace_event_field *
 find_event_field(struct ftrace_event_call *call, char *name)
 {
-	struct ftrace_event_field *field;
-	struct list_head *entry, *tmp;
+	struct ftrace_event_field *field, *next;
 
-	list_for_each_safe(entry, tmp, &call->fields) {
-		field = list_entry(entry, struct ftrace_event_field, link);
+	list_for_each_entry_safe(field, next, &call->fields, link) {
 		if (!strcmp(field->name, name))
 			return field;
 	}
-- 
cgit v1.2.3


From ee6cdabc820a29bd607f38d9cb335c3ceddc673b Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Mon, 23 Mar 2009 03:26:42 -0500
Subject: tracing/filters: fix bug in copy_pred()

Impact: fix potential crash on subsystem filter expression freeing

When making a copy of the predicate, pred->field_name needs to be
duplicated in the copy as well, otherwise bad things can happen due to
later multiple frees of the same string.

This affects only per-subsystem event filtering.

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: =?ISO-8859-1?Q?Fr=E9d=E9ric?= Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1237796802.7527.39.camel@charm-linux>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_events_filter.c | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index c4a413b70f78..fd01d8022ad1 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -276,11 +276,19 @@ static struct filter_pred *copy_pred(struct filter_pred *pred)
 		return NULL;
 
 	memcpy(new_pred, pred, sizeof(*pred));
+
+	if (pred->field_name) {
+		new_pred->field_name = kstrdup(pred->field_name, GFP_KERNEL);
+		if (!new_pred->field_name) {
+			kfree(new_pred);
+			return NULL;
+		}
+	}
+
 	if (pred->str_val) {
 		new_pred->str_val = kstrdup(pred->str_val, GFP_KERNEL);
-		new_pred->field_name = kstrdup(pred->field_name, GFP_KERNEL);
 		if (!new_pred->str_val) {
-			kfree(new_pred);
+			filter_free_pred(new_pred);
 			return NULL;
 		}
 	}
-- 
cgit v1.2.3


From c4cff064be678f1e8344d907499f2a81282edc19 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Mon, 23 Mar 2009 03:26:48 -0500
Subject: tracing/filters: clean up filter_add_subsystem_pred()

Impact: cleanup, memory leak fix

This patch cleans up filter_add_subsystem_pred():

- searches for the field before creating a copy of the pred

- fixes memory leak in the case a predicate isn't applied

- if -ENOMEM, makes sure there's no longer a reference to the
  pred so the caller can free the half-finished filter

- changes the confusing i == MAX_FILTER_PRED - 1 comparison
  previously remarked upon

This affects only per-subsystem event filtering.

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: =?ISO-8859-1?Q?Fr=E9d=E9ric?= Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1237796808.7527.40.camel@charm-linux>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_events.c        |  1 +
 kernel/trace/trace_events_filter.c | 31 ++++++++++++++++++++++++-------
 2 files changed, 25 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index 19f61dd23219..fdab30d6c835 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -585,6 +585,7 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
 	}
 
 	if (filter_add_subsystem_pred(system, pred)) {
+		filter_free_subsystem_preds(system);
 		filter_free_pred(pred);
 		return -EINVAL;
 	}
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index fd01d8022ad1..4117c2ebc245 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -318,22 +318,39 @@ int filter_add_subsystem_pred(struct event_subsystem *system,
 			system->preds[i] = pred;
 			break;
 		}
-		if (i == MAX_FILTER_PRED - 1)
-			return -EINVAL;
 	}
 
+	if (i == MAX_FILTER_PRED)
+		return -EINVAL;
+
 	events_for_each(call) {
+		int err;
+
 		if (!call->name || !call->regfunc)
 			continue;
 
-		if (!strcmp(call->system, system->name)) {
-			event_pred = copy_pred(pred);
-			if (event_pred)
-				filter_add_pred(call, event_pred);
-		}
+		if (strcmp(call->system, system->name))
+			continue;
+
+		if (!find_event_field(call, pred->field_name))
+			continue;
+
+		event_pred = copy_pred(pred);
+		if (!event_pred)
+			goto oom;
+
+		err = filter_add_pred(call, event_pred);
+		if (err)
+			filter_free_pred(event_pred);
+		if (err == -ENOMEM)
+			goto oom;
 	}
 
 	return 0;
+
+oom:
+	system->preds[i] = NULL;
+	return -ENOMEM;
 }
 
 int filter_parse(char **pbuf, struct filter_pred *pred)
-- 
cgit v1.2.3


From 3e1f60b80cafcb5d7e8d3665b35962fbb8fb9efa Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 22 Mar 2009 23:10:45 +0100
Subject: tracing/ftrace: check if debugfs is registered before creating files

Impact: fix a crash with ftrace={nop,boot} parameter

If the nop or initcall tracers are launched as boot tracers,
they will attempt to create their option directory and files.
But these tracers are registered very early and then assigned
as "boot tracers" very early if asked to.

Since they do this before debugfs has been registered (core initcall),
a crash is triggered.

Another early tracers could also come later. So we fix it by
checking if debugfs is initialized before creating the root
tracing directory.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Greg Kroah-Hartman <gregkh@suse.de>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <1237759847-21025-3-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index ace685c70186..f0e1337b1ebd 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -3513,6 +3513,9 @@ struct dentry *tracing_init_dentry(void)
 	if (d_tracer)
 		return d_tracer;
 
+	if (!debugfs_initialized())
+		return NULL;
+
 	d_tracer = debugfs_create_dir("tracing", NULL);
 
 	if (!d_tracer && !once) {
-- 
cgit v1.2.3


From 53da1d9456fe7f87a920a78fdbdcf1225d197cb7 Mon Sep 17 00:00:00 2001
From: Miklos Szeredi <mszeredi@suse.cz>
Date: Mon, 23 Mar 2009 16:07:24 +0100
Subject: fix ptrace slowness

This patch fixes bug #12208:

  Bug-Entry       : http://bugzilla.kernel.org/show_bug.cgi?id=12208
  Subject         : uml is very slow on 2.6.28 host

This turned out to be not a scheduler regression, but an already
existing problem in ptrace being triggered by subtle scheduler
changes.

The problem is this:

 - task A is ptracing task B
 - task B stops on a trace event
 - task A is woken up and preempts task B
 - task A calls ptrace on task B, which does ptrace_check_attach()
 - this calls wait_task_inactive(), which sees that task B is still on the runq
 - task A goes to sleep for a jiffy
 - ...

Since UML does lots of the above sequences, those jiffies quickly add
up to make it slow as hell.

This patch solves this by not rescheduling in read_unlock() after
ptrace_stop() has woken up the tracer.

Thanks to Oleg Nesterov and Ingo Molnar for the feedback.

Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
CC: stable@kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 2a74fe87c0dd..1c8814481a11 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1575,7 +1575,15 @@ static void ptrace_stop(int exit_code, int clear_code, siginfo_t *info)
 	read_lock(&tasklist_lock);
 	if (may_ptrace_stop()) {
 		do_notify_parent_cldstop(current, CLD_TRAPPED);
+		/*
+		 * Don't want to allow preemption here, because
+		 * sys_ptrace() needs this task to be inactive.
+		 *
+		 * XXX: implement read_unlock_no_resched().
+		 */
+		preempt_disable();
 		read_unlock(&tasklist_lock);
+		preempt_enable_no_resched();
 		schedule();
 	} else {
 		/*
-- 
cgit v1.2.3


From 37bebc70d7ad4144c571d74500db3bb26ec0c0eb Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 23 Mar 2009 20:34:11 +0100
Subject: posix timers: fix RLIMIT_CPU && fork()

See http://bugzilla.kernel.org/show_bug.cgi?id=12911

copy_signal() copies signal->rlim, but RLIMIT_CPU is "lost". Because
posix_cpu_timers_init_group() sets cputime_expires.prof_exp = 0 and thus
fastpath_timer_check() returns false unless we have other cpu timers.

This is the minimal fix for 2.6.29 (tested) and 2.6.28. The patch is not
optimal, we need further cleanups here. With this patch update_rlimit_cpu()
is not really needed, but I don't think it should be removed.

The proper fix (I think) is:

	- set_process_cpu_timer() should just start the cputimer->running
	  logic (it does), no need to change cputime_expires.xxx_exp

	- posix_cpu_timers_init_group() should set ->running when needed

	- fastpath_timer_check() can check ->running instead of
	  task_cputime_zero(signal->cputime_expires)

Reported-by: Peter Lojkin <ia6432@inbox.ru>
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Roland McGrath <roland@redhat.com>
Cc: <stable@kernel.org> [for 2.6.29.x]
LKML-Reference: <20090323193411.GA17514@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/posix-cpu-timers.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index e976e505648d..8e5d9a68b022 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -1370,7 +1370,8 @@ static inline int fastpath_timer_check(struct task_struct *tsk)
 		if (task_cputime_expired(&group_sample, &sig->cputime_expires))
 			return 1;
 	}
-	return 0;
+
+	return sig->rlim[RLIMIT_CPU].rlim_cur != RLIM_INFINITY;
 }
 
 /*
-- 
cgit v1.2.3


From 45b9560895b07a4a09d55d49235c984db512c5aa Mon Sep 17 00:00:00 2001
From: Anton Vorontsov <avorontsov@ru.mvista.com>
Date: Tue, 24 Mar 2009 01:07:24 +0300
Subject: tracing: Fix TRACING_SUPPORT dependency for PPC32

commit 40ada30f9621fbd831ac2437b9a2a399aa ("tracing: clean up menu"),
despite the "clean up" in its purpose, introduced a behavioural
change for Kconfig symbols: we no longer able to select tracing
support on PPC32 (because IRQFLAGS_SUPPORT isn't yet implemented).

The IRQFLAGS_SUPPORT is not mandatory for most tracers, tracing core
has a special case for platforms w/o irqflags (which, by the way, has
become useless as of the commit above).

Though according to Ingo Molnar, there was periodic build failures on
weird, unmaintained architectures that had no irqflags-tracing support
and hence didn't know the raw_irqs_save/restore primitives. Thus we'd
better not enable irqflags-less tracing for all architectures.

This patch restores the old behaviour for PPC32, and thus brings the
tracing back. Other architectures can either add themselves to the
exception list or (better) implement TRACE_IRQFLAGS_SUPPORT.

Signed-off-by: Anton Vorontsov <avorontsov@ru.mvista.com>
Acked-b: Steven Rostedt <rostedt@goodmis.org>
Cc: linuxppc-dev@ozlabs.org
LKML-Reference: <20090323220724.GA9851@oksana.dev.rtsoft.ru>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/Kconfig | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index b0a46f889659..8a4d72931042 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -63,7 +63,11 @@ config TRACING
 #
 config TRACING_SUPPORT
 	bool
-	depends on TRACE_IRQFLAGS_SUPPORT
+	# PPC32 has no irqflags tracing support, but it can use most of the
+	# tracers anyway, they were tested to build and work. Note that new
+	# exceptions to this list aren't welcomed, better implement the
+	# irqflags tracing for your architecture.
+	depends on TRACE_IRQFLAGS_SUPPORT || PPC32
 	depends on STACKTRACE_SUPPORT
 	default y
 
-- 
cgit v1.2.3


From 1618536961d31f9b3f55767b22d4a897f4204c26 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Mon, 23 Mar 2009 22:17:01 +0100
Subject: tracing/function-graph-tracer: fix functions call traces imbalance

Impact: fix traces output

Sometimes one can observe an imbalance in the traces between function
calls and function return traces:

func1() {
    }
}

The curly brace inside func1() is the return of another function nested
inside func1. The return trace have been inserted in the buffer but not
the entry.
We are storing a return address on the function traces stack while we
haven't inserted its entry on the buffer, hence the imbalance on the
traces.

This is because the tracers doesn't check all failures that can happen
on buffer insertion.

This patch reports the tracing recursion failures and the ring buffer
failures. In such cases, we now restore the original return address for
the function, giving up its return trace.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <1237843021-11695-1-git-send-email-fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 6bad12819eb6..89f0c2544ad0 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -924,7 +924,7 @@ trace_function(struct trace_array *tr,
 }
 
 #ifdef CONFIG_FUNCTION_GRAPH_TRACER
-static void __trace_graph_entry(struct trace_array *tr,
+static int __trace_graph_entry(struct trace_array *tr,
 				struct ftrace_graph_ent *trace,
 				unsigned long flags,
 				int pc)
@@ -933,15 +933,17 @@ static void __trace_graph_entry(struct trace_array *tr,
 	struct ftrace_graph_ent_entry *entry;
 
 	if (unlikely(local_read(&__get_cpu_var(ftrace_cpu_disabled))))
-		return;
+		return 0;
 
 	event = trace_buffer_lock_reserve(&global_trace, TRACE_GRAPH_ENT,
 					  sizeof(*entry), flags, pc);
 	if (!event)
-		return;
+		return 0;
 	entry	= ring_buffer_event_data(event);
 	entry->graph_ent			= *trace;
 	ring_buffer_unlock_commit(global_trace.buffer, event);
+
+	return 1;
 }
 
 static void __trace_graph_return(struct trace_array *tr,
@@ -1162,6 +1164,7 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
 	struct trace_array_cpu *data;
 	unsigned long flags;
 	long disabled;
+	int ret;
 	int cpu;
 	int pc;
 
@@ -1177,15 +1180,18 @@ int trace_graph_entry(struct ftrace_graph_ent *trace)
 	disabled = atomic_inc_return(&data->disabled);
 	if (likely(disabled == 1)) {
 		pc = preempt_count();
-		__trace_graph_entry(tr, trace, flags, pc);
+		ret = __trace_graph_entry(tr, trace, flags, pc);
+	} else {
+		ret = 0;
 	}
 	/* Only do the atomic if it is not already set */
 	if (!test_tsk_trace_graph(current))
 		set_tsk_trace_graph(current);
+
 	atomic_dec(&data->disabled);
 	local_irq_restore(flags);
 
-	return 1;
+	return ret;
 }
 
 void trace_graph_return(struct ftrace_graph_ret *trace)
-- 
cgit v1.2.3


From 1fc2d5c11918082536acf261ce6abb1f5511053f Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Tue, 24 Mar 2009 02:14:01 -0500
Subject: tracing/filters: use list_for_each_entry

Impact: cleanup

No need to use the safe version here, so use list_for_each_entry instead
of list_for_each_entry_safe in find_event_field().

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: =?ISO-8859-1?Q?Fr=E9d=E9ric?= Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1237878841.8339.57.camel@charm-linux>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_events_filter.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 4117c2ebc245..3f0b79f8a4bc 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -147,9 +147,9 @@ int filter_print_preds(struct filter_pred **preds, char *buf)
 static struct ftrace_event_field *
 find_event_field(struct ftrace_event_call *call, char *name)
 {
-	struct ftrace_event_field *field, *next;
+	struct ftrace_event_field *field;
 
-	list_for_each_entry_safe(field, next, &call->fields, link) {
+	list_for_each_entry(field, &call->fields, link) {
 		if (!strcmp(field->name, name))
 			return field;
 	}
-- 
cgit v1.2.3


From 09f1f245c79585383de63e3ca54d0f91824bff3a Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Tue, 24 Mar 2009 02:14:11 -0500
Subject: tracing/filters: free pred when clearing filters

Impact: fix (small) per trace filter modification memory leak

Free the current pred when clearing the filters via the filter files.

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: =?ISO-8859-1?Q?Fr=E9d=E9ric?= Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1237878851.8339.58.camel@charm-linux>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_events.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index fdab30d6c835..a9381384aa9e 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -516,6 +516,7 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
 
 	if (pred->clear) {
 		filter_free_preds(call);
+		filter_free_pred(pred);
 		return cnt;
 	}
 
@@ -581,6 +582,7 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt,
 
 	if (pred->clear) {
 		filter_free_subsystem_preds(system);
+		filter_free_pred(pred);
 		return cnt;
 	}
 
-- 
cgit v1.2.3


From 4bda2d517bfa3ce3d7044e06988cdddae7adffe2 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Tue, 24 Mar 2009 02:14:31 -0500
Subject: tracing/filters: use trace_seq_printf() to print filters

Impact: cleanup

Instead of just using the trace_seq buffer to print the filters, use
trace_seq_printf() as it was intended to be used.

Reported-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Cc: =?ISO-8859-1?Q?Fr=E9d=E9ric?= Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1237878871.8339.59.camel@charm-linux>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.h               |  3 ++-
 kernel/trace/trace_events.c        |  8 ++++----
 kernel/trace/trace_events_filter.c | 25 +++++++++----------------
 3 files changed, 15 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 54fd9bcd0a65..90a848debcba 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -840,7 +840,8 @@ struct filter_pred {
 int trace_define_field(struct ftrace_event_call *call, char *type,
 		       char *name, int offset, int size);
 extern void filter_free_pred(struct filter_pred *pred);
-extern int filter_print_preds(struct filter_pred **preds, char *buf);
+extern void filter_print_preds(struct filter_pred **preds,
+			       struct trace_seq *s);
 extern int filter_parse(char **pbuf, struct filter_pred *pred);
 extern int filter_add_pred(struct ftrace_event_call *call,
 			   struct filter_pred *pred);
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index a9381384aa9e..d132997ab756 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -481,8 +481,8 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
 
 	trace_seq_init(s);
 
-	r = filter_print_preds(call->preds, s->buffer);
-	r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, r);
+	filter_print_preds(call->preds, s);
+	r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
 
 	kfree(s);
 
@@ -547,8 +547,8 @@ subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt,
 
 	trace_seq_init(s);
 
-	r = filter_print_preds(system->preds, s->buffer);
-	r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, r);
+	filter_print_preds(system->preds, s);
+	r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len);
 
 	kfree(s);
 
diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 3f0b79f8a4bc..9fca8bb1c06b 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -24,6 +24,7 @@
 #include <linux/ctype.h>
 
 #include "trace.h"
+#include "trace_output.h"
 
 static int filter_pred_64(struct filter_pred *pred, void *event)
 {
@@ -108,16 +109,15 @@ int filter_match_preds(struct ftrace_event_call *call, void *rec)
 	return 1;
 }
 
-int filter_print_preds(struct filter_pred **preds, char *buf)
+void filter_print_preds(struct filter_pred **preds, struct trace_seq *s)
 {
-	ssize_t this_len = 0;
 	char *field_name;
 	struct filter_pred *pred;
 	int i;
 
 	if (!preds) {
-		this_len += sprintf(buf + this_len, "none\n");
-		return this_len;
+		trace_seq_printf(s, "none\n");
+		return;
 	}
 
 	for (i = 0; i < MAX_FILTER_PRED; i++) {
@@ -125,23 +125,16 @@ int filter_print_preds(struct filter_pred **preds, char *buf)
 			pred = preds[i];
 			field_name = pred->field_name;
 			if (i)
-				this_len += sprintf(buf + this_len,
-					    pred->or ? "|| " : "&& ");
-			this_len += sprintf(buf + this_len,
-					    "%s ", field_name);
-			this_len += sprintf(buf + this_len,
-					    pred->not ? "!= " : "== ");
+				trace_seq_printf(s, pred->or ? "|| " : "&& ");
+			trace_seq_printf(s, "%s ", field_name);
+			trace_seq_printf(s, pred->not ? "!= " : "== ");
 			if (pred->str_val)
-				this_len += sprintf(buf + this_len,
-						    "%s\n", pred->str_val);
+				trace_seq_printf(s, "%s\n", pred->str_val);
 			else
-				this_len += sprintf(buf + this_len,
-						    "%llu\n", pred->val);
+				trace_seq_printf(s, "%llu\n", pred->val);
 		} else
 			break;
 	}
-
-	return this_len;
 }
 
 static struct ftrace_event_field *
-- 
cgit v1.2.3


From 9f58a159d022c8f2533a27708aa267adf4f0e3ce Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Tue, 24 Mar 2009 02:14:42 -0500
Subject: tracing/filters: disallow integer values for string filters and vice
 versa

Impact: fix filter use boundary condition / crash

Make sure filters for string fields don't use integer values and vice
versa.  Getting it wrong can crash the system or produce bogus
results.

Signed-off-by: Tom Zanussi <tzanussi@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: =?ISO-8859-1?Q?Fr=E9d=E9ric?= Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1237878882.8339.61.camel@charm-linux>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_events_filter.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c
index 9fca8bb1c06b..026be412f356 100644
--- a/kernel/trace/trace_events_filter.c
+++ b/kernel/trace/trace_events_filter.c
@@ -237,9 +237,14 @@ int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred)
 	pred->offset = field->offset;
 
 	if (is_string_field(field->type)) {
+		if (!pred->str_val)
+			return -EINVAL;
 		pred->fn = filter_pred_string;
 		pred->str_len = field->size;
 		return __filter_add_pred(call, pred);
+	} else {
+		if (pred->str_val)
+			return -EINVAL;
 	}
 
 	switch (field->size) {
-- 
cgit v1.2.3


From 3aa551c9b4c40018f0e261a178e3d25478dc04a9 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Mon, 23 Mar 2009 18:28:15 +0100
Subject: genirq: add threaded interrupt handler support

Add support for threaded interrupt handlers:

A device driver can request that its main interrupt handler runs in a
thread. To achive this the device driver requests the interrupt with
request_threaded_irq() and provides additionally to the handler a
thread function. The handler function is called in hard interrupt
context and needs to check whether the interrupt originated from the
device. If the interrupt originated from the device then the handler
can either return IRQ_HANDLED or IRQ_WAKE_THREAD. IRQ_HANDLED is
returned when no further action is required. IRQ_WAKE_THREAD causes
the genirq code to invoke the threaded (main) handler. When
IRQ_WAKE_THREAD is returned handler must have disabled the interrupt
on the device level. This is mandatory for shared interrupt handlers,
but we need to do it as well for obscure x86 hardware where disabling
an interrupt on the IO_APIC level redirects the interrupt to the
legacy PIC interrupt lines.

Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Reviewed-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/hardirq.h   |   2 +-
 include/linux/interrupt.h |  37 ++++++++-
 include/linux/irq.h       |   5 ++
 include/linux/irqreturn.h |   2 +
 include/linux/sched.h     |   5 ++
 kernel/exit.c             |   2 +
 kernel/irq/handle.c       |  31 +++++++-
 kernel/irq/manage.c       | 192 ++++++++++++++++++++++++++++++++++++++++++----
 8 files changed, 259 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index f83288347dda..2dfaadbdb2ac 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -105,7 +105,7 @@
 # define IRQ_EXIT_OFFSET HARDIRQ_OFFSET
 #endif
 
-#ifdef CONFIG_SMP
+#if defined(CONFIG_SMP) || defined(CONFIG_GENERIC_HARDIRQS)
 extern void synchronize_irq(unsigned int irq);
 #else
 # define synchronize_irq(irq)	barrier()
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 0c9cb63e6895..6fc2b720c231 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -59,6 +59,16 @@
 #define IRQF_NOBALANCING	0x00000800
 #define IRQF_IRQPOLL		0x00001000
 
+/*
+ * Bits used by threaded handlers:
+ * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run
+ * IRQTF_DIED      - handler thread died
+ */
+enum {
+	IRQTF_RUNTHREAD,
+	IRQTF_DIED,
+};
+
 typedef irqreturn_t (*irq_handler_t)(int, void *);
 
 /**
@@ -71,6 +81,9 @@ typedef irqreturn_t (*irq_handler_t)(int, void *);
  * @next:	pointer to the next irqaction for shared interrupts
  * @irq:	interrupt number
  * @dir:	pointer to the proc/irq/NN/name entry
+ * @thread_fn:	interupt handler function for threaded interrupts
+ * @thread:	thread pointer for threaded interrupts
+ * @thread_flags:	flags related to @thread
  */
 struct irqaction {
 	irq_handler_t handler;
@@ -81,11 +94,31 @@ struct irqaction {
 	struct irqaction *next;
 	int irq;
 	struct proc_dir_entry *dir;
+	irq_handler_t thread_fn;
+	struct task_struct *thread;
+	unsigned long thread_flags;
 };
 
 extern irqreturn_t no_action(int cpl, void *dev_id);
-extern int __must_check request_irq(unsigned int, irq_handler_t handler,
-		       unsigned long, const char *, void *);
+
+extern int __must_check
+request_threaded_irq(unsigned int irq, irq_handler_t handler,
+		     irq_handler_t thread_fn,
+		     unsigned long flags, const char *name, void *dev);
+
+static inline int __must_check
+request_irq(unsigned int irq, irq_handler_t handler, unsigned long flags,
+	    const char *name, void *dev)
+{
+	return request_threaded_irq(irq, handler, NULL, flags, name, dev);
+}
+
+#ifdef CONFIG_GENERIC_HARDIRQS
+extern void exit_irq_thread(void);
+#else
+static inline void exit_irq_thread(void) { }
+#endif
+
 extern void free_irq(unsigned int, void *);
 
 struct device;
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 873e4ac11b81..8b1cf0630210 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -20,6 +20,7 @@
 #include <linux/irqreturn.h>
 #include <linux/irqnr.h>
 #include <linux/errno.h>
+#include <linux/wait.h>
 
 #include <asm/irq.h>
 #include <asm/ptrace.h>
@@ -155,6 +156,8 @@ struct irq_2_iommu;
  * @affinity:		IRQ affinity on SMP
  * @cpu:		cpu index useful for balancing
  * @pending_mask:	pending rebalanced interrupts
+ * @threads_active:	number of irqaction threads currently running
+ * @wait_for_threads:	wait queue for sync_irq to wait for threaded handlers
  * @dir:		/proc/irq/ procfs entry
  * @name:		flow handler name for /proc/interrupts output
  */
@@ -186,6 +189,8 @@ struct irq_desc {
 	cpumask_var_t		pending_mask;
 #endif
 #endif
+	atomic_t		threads_active;
+	wait_queue_head_t       wait_for_threads;
 #ifdef CONFIG_PROC_FS
 	struct proc_dir_entry	*dir;
 #endif
diff --git a/include/linux/irqreturn.h b/include/linux/irqreturn.h
index c5584ca5b8c9..819acaaac3f5 100644
--- a/include/linux/irqreturn.h
+++ b/include/linux/irqreturn.h
@@ -5,10 +5,12 @@
  * enum irqreturn
  * @IRQ_NONE		interrupt was not from this device
  * @IRQ_HANDLED		interrupt was handled by this device
+ * @IRQ_WAKE_THREAD	handler requests to wake the handler thread
  */
 enum irqreturn {
 	IRQ_NONE,
 	IRQ_HANDLED,
+	IRQ_WAKE_THREAD,
 };
 
 typedef enum irqreturn irqreturn_t;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 46d680643f89..38b77b0f56e5 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1292,6 +1292,11 @@ struct task_struct {
 /* Protection of (de-)allocation: mm, files, fs, tty, keyrings */
 	spinlock_t alloc_lock;
 
+#ifdef CONFIG_GENERIC_HARDIRQS
+	/* IRQ handler threads */
+	struct irqaction *irqaction;
+#endif
+
 	/* Protection of the PI data structures: */
 	spinlock_t pi_lock;
 
diff --git a/kernel/exit.c b/kernel/exit.c
index 167e1e3ad7c6..ca0b3488c4a9 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1037,6 +1037,8 @@ NORET_TYPE void do_exit(long code)
 		schedule();
 	}
 
+	exit_irq_thread();
+
 	exit_signals(tsk);  /* sets PF_EXITING */
 	/*
 	 * tsk->flags are checked in the futex code to protect against
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index 9ebf77968871..fe8f45374e86 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -357,8 +357,37 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
 
 	do {
 		ret = action->handler(irq, action->dev_id);
-		if (ret == IRQ_HANDLED)
+
+		switch (ret) {
+		case IRQ_WAKE_THREAD:
+			/*
+			 * Wake up the handler thread for this
+			 * action. In case the thread crashed and was
+			 * killed we just pretend that we handled the
+			 * interrupt. The hardirq handler above has
+			 * disabled the device interrupt, so no irq
+			 * storm is lurking.
+			 */
+			if (likely(!test_bit(IRQTF_DIED,
+					     &action->thread_flags))) {
+				set_bit(IRQTF_RUNTHREAD, &action->thread_flags);
+				wake_up_process(action->thread);
+			}
+
+			/*
+			 * Set it to handled so the spurious check
+			 * does not trigger.
+			 */
+			ret = IRQ_HANDLED;
+			/* Fall through to add to randomness */
+		case IRQ_HANDLED:
 			status |= action->flags;
+			break;
+
+		default:
+			break;
+		}
+
 		retval |= ret;
 		action = action->next;
 	} while (action);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 6458e99984c0..a4c1ab86cd25 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -8,16 +8,15 @@
  */
 
 #include <linux/irq.h>
+#include <linux/kthread.h>
 #include <linux/module.h>
 #include <linux/random.h>
 #include <linux/interrupt.h>
 #include <linux/slab.h>
+#include <linux/sched.h>
 
 #include "internals.h"
 
-#if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
-cpumask_var_t irq_default_affinity;
-
 /**
  *	synchronize_irq - wait for pending IRQ handlers (on other CPUs)
  *	@irq: interrupt number to wait for
@@ -53,9 +52,18 @@ void synchronize_irq(unsigned int irq)
 
 		/* Oops, that failed? */
 	} while (status & IRQ_INPROGRESS);
+
+	/*
+	 * We made sure that no hardirq handler is running. Now verify
+	 * that no threaded handlers are active.
+	 */
+	wait_event(desc->wait_for_threads, !atomic_read(&desc->threads_active));
 }
 EXPORT_SYMBOL(synchronize_irq);
 
+#ifdef CONFIG_SMP
+cpumask_var_t irq_default_affinity;
+
 /**
  *	irq_can_set_affinity - Check if the affinity of a given irq can be set
  *	@irq:		Interrupt to check
@@ -72,6 +80,18 @@ int irq_can_set_affinity(unsigned int irq)
 	return 1;
 }
 
+static void
+irq_set_thread_affinity(struct irq_desc *desc, const struct cpumask *cpumask)
+{
+	struct irqaction *action = desc->action;
+
+	while (action) {
+		if (action->thread)
+			set_cpus_allowed_ptr(action->thread, cpumask);
+		action = action->next;
+	}
+}
+
 /**
  *	irq_set_affinity - Set the irq affinity of a given irq
  *	@irq:		Interrupt to set affinity
@@ -100,6 +120,7 @@ int irq_set_affinity(unsigned int irq, const struct cpumask *cpumask)
 	cpumask_copy(desc->affinity, cpumask);
 	desc->chip->set_affinity(irq, cpumask);
 #endif
+	irq_set_thread_affinity(desc, cpumask);
 	desc->status |= IRQ_AFFINITY_SET;
 	spin_unlock_irqrestore(&desc->lock, flags);
 	return 0;
@@ -150,6 +171,8 @@ int irq_select_affinity_usr(unsigned int irq)
 
 	spin_lock_irqsave(&desc->lock, flags);
 	ret = setup_affinity(irq, desc);
+	if (!ret)
+		irq_set_thread_affinity(desc, desc->affinity);
 	spin_unlock_irqrestore(&desc->lock, flags);
 
 	return ret;
@@ -384,6 +407,93 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
 	return ret;
 }
 
+static inline int irq_thread_should_run(struct irqaction *action)
+{
+	return test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags);
+}
+
+static int irq_wait_for_interrupt(struct irqaction *action)
+{
+	while (!kthread_should_stop()) {
+		set_current_state(TASK_INTERRUPTIBLE);
+		if (irq_thread_should_run(action)) {
+			__set_current_state(TASK_RUNNING);
+			return 0;
+		} else
+			schedule();
+	}
+	return -1;
+}
+
+/*
+ * Interrupt handler thread
+ */
+static int irq_thread(void *data)
+{
+	struct sched_param param = { .sched_priority = MAX_USER_RT_PRIO/2, };
+	struct irqaction *action = data;
+	struct irq_desc *desc = irq_to_desc(action->irq);
+	int wake;
+
+	sched_setscheduler(current, SCHED_FIFO, &param);
+	current->irqaction = action;
+
+	while (!irq_wait_for_interrupt(action)) {
+
+		atomic_inc(&desc->threads_active);
+
+		spin_lock_irq(&desc->lock);
+		if (unlikely(desc->status & IRQ_DISABLED)) {
+			/*
+			 * CHECKME: We might need a dedicated
+			 * IRQ_THREAD_PENDING flag here, which
+			 * retriggers the thread in check_irq_resend()
+			 * but AFAICT IRQ_PENDING should be fine as it
+			 * retriggers the interrupt itself --- tglx
+			 */
+			desc->status |= IRQ_PENDING;
+			spin_unlock_irq(&desc->lock);
+		} else {
+			spin_unlock_irq(&desc->lock);
+
+			action->thread_fn(action->irq, action->dev_id);
+		}
+
+		wake = atomic_dec_and_test(&desc->threads_active);
+
+		if (wake && waitqueue_active(&desc->wait_for_threads))
+			wake_up(&desc->wait_for_threads);
+	}
+
+	/*
+	 * Clear irqaction. Otherwise exit_irq_thread() would make
+	 * fuzz about an active irq thread going into nirvana.
+	 */
+	current->irqaction = NULL;
+	return 0;
+}
+
+/*
+ * Called from do_exit()
+ */
+void exit_irq_thread(void)
+{
+	struct task_struct *tsk = current;
+
+	if (!tsk->irqaction)
+		return;
+
+	printk(KERN_ERR
+	       "exiting task \"%s\" (%d) is an active IRQ thread (irq %d)\n",
+	       tsk->comm ? tsk->comm : "", tsk->pid, tsk->irqaction->irq);
+
+	/*
+	 * Set the THREAD DIED flag to prevent further wakeups of the
+	 * soon to be gone threaded handler.
+	 */
+	set_bit(IRQTF_DIED, &tsk->irqaction->flags);
+}
+
 /*
  * Internal function to register an irqaction - typically used to
  * allocate special interrupts that are part of the architecture.
@@ -419,6 +529,26 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 		rand_initialize_irq(irq);
 	}
 
+	/*
+	 * Threaded handler ?
+	 */
+	if (new->thread_fn) {
+		struct task_struct *t;
+
+		t = kthread_create(irq_thread, new, "irq/%d-%s", irq,
+				   new->name);
+		if (IS_ERR(t))
+			return PTR_ERR(t);
+		/*
+		 * We keep the reference to the task struct even if
+		 * the thread dies to avoid that the interrupt code
+		 * references an already freed task_struct.
+		 */
+		get_task_struct(t);
+		new->thread = t;
+		wake_up_process(t);
+	}
+
 	/*
 	 * The following block of code has to be executed atomically
 	 */
@@ -456,15 +586,15 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 	if (!shared) {
 		irq_chip_set_defaults(desc->chip);
 
+		init_waitqueue_head(&desc->wait_for_threads);
+
 		/* Setup the type (level, edge polarity) if configured: */
 		if (new->flags & IRQF_TRIGGER_MASK) {
 			ret = __irq_set_trigger(desc, irq,
 					new->flags & IRQF_TRIGGER_MASK);
 
-			if (ret) {
-				spin_unlock_irqrestore(&desc->lock, flags);
-				return ret;
-			}
+			if (ret)
+				goto out_thread;
 		} else
 			compat_irq_chip_set_default_handler(desc);
 #if defined(CONFIG_IRQ_PER_CPU)
@@ -532,8 +662,19 @@ mismatch:
 		dump_stack();
 	}
 #endif
+	ret = -EBUSY;
+
+out_thread:
 	spin_unlock_irqrestore(&desc->lock, flags);
-	return -EBUSY;
+	if (new->thread) {
+		struct task_struct *t = new->thread;
+
+		new->thread = NULL;
+		if (likely(!test_bit(IRQTF_DIED, &new->thread_flags)))
+			kthread_stop(t);
+		put_task_struct(t);
+	}
+	return ret;
 }
 
 /**
@@ -559,6 +700,7 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
 {
 	struct irq_desc *desc = irq_to_desc(irq);
 	struct irqaction *action, **action_ptr;
+	struct task_struct *irqthread;
 	unsigned long flags;
 
 	WARN(in_interrupt(), "Trying to free IRQ %d from IRQ context!\n", irq);
@@ -605,6 +747,10 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
 		else
 			desc->chip->disable(irq);
 	}
+
+	irqthread = action->thread;
+	action->thread = NULL;
+
 	spin_unlock_irqrestore(&desc->lock, flags);
 
 	unregister_handler_proc(irq, action);
@@ -612,6 +758,12 @@ static struct irqaction *__free_irq(unsigned int irq, void *dev_id)
 	/* Make sure it's not being used on another CPU: */
 	synchronize_irq(irq);
 
+	if (irqthread) {
+		if (!test_bit(IRQTF_DIED, &action->thread_flags))
+			kthread_stop(irqthread);
+		put_task_struct(irqthread);
+	}
+
 #ifdef CONFIG_DEBUG_SHIRQ
 	/*
 	 * It's a shared IRQ -- the driver ought to be prepared for an IRQ
@@ -664,9 +816,12 @@ void free_irq(unsigned int irq, void *dev_id)
 EXPORT_SYMBOL(free_irq);
 
 /**
- *	request_irq - allocate an interrupt line
+ *	request_threaded_irq - allocate an interrupt line
  *	@irq: Interrupt line to allocate
- *	@handler: Function to be called when the IRQ occurs
+ *	@handler: Function to be called when the IRQ occurs.
+ *		  Primary handler for threaded interrupts
+ *      @thread_fn: Function called from the irq handler thread
+ *                  If NULL, no irq thread is created
  *	@irqflags: Interrupt type flags
  *	@devname: An ascii name for the claiming device
  *	@dev_id: A cookie passed back to the handler function
@@ -678,6 +833,15 @@ EXPORT_SYMBOL(free_irq);
  *	raises, you must take care both to initialise your hardware
  *	and to set up the interrupt handler in the right order.
  *
+ *	If you want to set up a threaded irq handler for your device
+ *	then you need to supply @handler and @thread_fn. @handler ist
+ *	still called in hard interrupt context and has to check
+ *	whether the interrupt originates from the device. If yes it
+ *	needs to disable the interrupt on the device and return
+ *	IRQ_THREAD_WAKE which will wake up the handler thread and run
+ *	@thread_fn. This split handler design is necessary to support
+ *	shared interrupts.
+ *
  *	Dev_id must be globally unique. Normally the address of the
  *	device data structure is used as the cookie. Since the handler
  *	receives this value it makes sense to use it.
@@ -693,8 +857,9 @@ EXPORT_SYMBOL(free_irq);
  *	IRQF_TRIGGER_*		Specify active edge(s) or level
  *
  */
-int request_irq(unsigned int irq, irq_handler_t handler,
-		unsigned long irqflags, const char *devname, void *dev_id)
+int request_threaded_irq(unsigned int irq, irq_handler_t handler,
+			 irq_handler_t thread_fn, unsigned long irqflags,
+			 const char *devname, void *dev_id)
 {
 	struct irqaction *action;
 	struct irq_desc *desc;
@@ -742,6 +907,7 @@ int request_irq(unsigned int irq, irq_handler_t handler,
 		return -ENOMEM;
 
 	action->handler = handler;
+	action->thread_fn = thread_fn;
 	action->flags = irqflags;
 	action->name = devname;
 	action->dev_id = dev_id;
@@ -771,4 +937,4 @@ int request_irq(unsigned int irq, irq_handler_t handler,
 #endif
 	return retval;
 }
-EXPORT_SYMBOL(request_irq);
+EXPORT_SYMBOL(request_threaded_irq);
-- 
cgit v1.2.3


From 935bd5b971f0df7c06d214d022cf8392e2f37952 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Mon, 23 Mar 2009 18:28:16 +0100
Subject: genirq: add support for threaded interrupts to devres

Some devices use devres_request_irq() for to install their interrupt
handler. Add support for threaded interrupts to devres as well.

[tglx - simplified and adapted to latest threadirq version]

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/interrupt.h | 17 ++++++++++++++---
 kernel/irq/devres.c       | 16 ++++++++++------
 2 files changed, 24 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 6fc2b720c231..dbf6a6fd116c 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -123,9 +123,20 @@ extern void free_irq(unsigned int, void *);
 
 struct device;
 
-extern int __must_check devm_request_irq(struct device *dev, unsigned int irq,
-			    irq_handler_t handler, unsigned long irqflags,
-			    const char *devname, void *dev_id);
+extern int __must_check
+devm_request_threaded_irq(struct device *dev, unsigned int irq,
+			  irq_handler_t handler, irq_handler_t thread_fn,
+			  unsigned long irqflags, const char *devname,
+			  void *dev_id);
+
+static inline int __must_check
+devm_request_irq(struct device *dev, unsigned int irq, irq_handler_t handler,
+		 unsigned long irqflags, const char *devname, void *dev_id)
+{
+	return devm_request_threaded_irq(dev, irq, handler, NULL, irqflags,
+					 devname, dev_id);
+}
+
 extern void devm_free_irq(struct device *dev, unsigned int irq, void *dev_id);
 
 /*
diff --git a/kernel/irq/devres.c b/kernel/irq/devres.c
index 38a25b8d8bff..d06df9c41cba 100644
--- a/kernel/irq/devres.c
+++ b/kernel/irq/devres.c
@@ -26,10 +26,12 @@ static int devm_irq_match(struct device *dev, void *res, void *data)
 }
 
 /**
- *	devm_request_irq - allocate an interrupt line for a managed device
+ *	devm_request_threaded_irq - allocate an interrupt line for a managed device
  *	@dev: device to request interrupt for
  *	@irq: Interrupt line to allocate
  *	@handler: Function to be called when the IRQ occurs
+ *	@thread_fn: function to be called in a threaded interrupt context. NULL
+ *		    for devices which handle everything in @handler
  *	@irqflags: Interrupt type flags
  *	@devname: An ascii name for the claiming device
  *	@dev_id: A cookie passed back to the handler function
@@ -42,9 +44,10 @@ static int devm_irq_match(struct device *dev, void *res, void *data)
  *	If an IRQ allocated with this function needs to be freed
  *	separately, dev_free_irq() must be used.
  */
-int devm_request_irq(struct device *dev, unsigned int irq,
-		     irq_handler_t handler, unsigned long irqflags,
-		     const char *devname, void *dev_id)
+int devm_request_threaded_irq(struct device *dev, unsigned int irq,
+			      irq_handler_t handler, irq_handler_t thread_fn,
+			      unsigned long irqflags, const char *devname,
+			      void *dev_id)
 {
 	struct irq_devres *dr;
 	int rc;
@@ -54,7 +57,8 @@ int devm_request_irq(struct device *dev, unsigned int irq,
 	if (!dr)
 		return -ENOMEM;
 
-	rc = request_irq(irq, handler, irqflags, devname, dev_id);
+	rc = request_threaded_irq(irq, handler, thread_fn, irqflags, devname,
+				  dev_id);
 	if (rc) {
 		devres_free(dr);
 		return rc;
@@ -66,7 +70,7 @@ int devm_request_irq(struct device *dev, unsigned int irq,
 
 	return 0;
 }
-EXPORT_SYMBOL(devm_request_irq);
+EXPORT_SYMBOL(devm_request_threaded_irq);
 
 /**
  *	devm_free_irq - free an interrupt
-- 
cgit v1.2.3


From f48fe81e5b032914183e9a17052313720c2cac56 Mon Sep 17 00:00:00 2001
From: Thomas Gleixner <tglx@linutronix.de>
Date: Tue, 24 Mar 2009 11:46:22 +0100
Subject: genirq: threaded irq handlers review fixups

Delta patch to address the review comments.

      - Implement warning when IRQ_WAKE_THREAD is requested and no
        thread handler installed
      - coding style fixes

Pointed-out-by: Christoph Hellwig <hch@infradead.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/interrupt.h |  2 ++
 kernel/irq/handle.c       | 29 ++++++++++++++++++++++++-----
 kernel/irq/manage.c       | 17 +++++++----------
 3 files changed, 33 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index dbf6a6fd116c..266a5f5f57cc 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -63,10 +63,12 @@
  * Bits used by threaded handlers:
  * IRQTF_RUNTHREAD - signals that the interrupt handler thread should run
  * IRQTF_DIED      - handler thread died
+ * IRQTF_WARNED    - warning "IRQ_WAKE_THREAD w/o thread_fn" has been printed
  */
 enum {
 	IRQTF_RUNTHREAD,
 	IRQTF_DIED,
+	IRQTF_WARNED,
 };
 
 typedef irqreturn_t (*irq_handler_t)(int, void *);
diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c
index fe8f45374e86..38b49a9e508a 100644
--- a/kernel/irq/handle.c
+++ b/kernel/irq/handle.c
@@ -338,6 +338,15 @@ irqreturn_t no_action(int cpl, void *dev_id)
 	return IRQ_NONE;
 }
 
+static void warn_no_thread(unsigned int irq, struct irqaction *action)
+{
+	if (test_and_set_bit(IRQTF_WARNED, &action->thread_flags))
+		return;
+
+	printk(KERN_WARNING "IRQ %d device %s returned IRQ_WAKE_THREAD "
+	       "but no thread function available.", irq, action->name);
+}
+
 /**
  * handle_IRQ_event - irq action chain handler
  * @irq:	the interrupt number
@@ -360,6 +369,21 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
 
 		switch (ret) {
 		case IRQ_WAKE_THREAD:
+			/*
+			 * Set result to handled so the spurious check
+			 * does not trigger.
+			 */
+			ret = IRQ_HANDLED;
+
+			/*
+			 * Catch drivers which return WAKE_THREAD but
+			 * did not set up a thread function
+			 */
+			if (unlikely(!action->thread_fn)) {
+				warn_no_thread(irq, action);
+				break;
+			}
+
 			/*
 			 * Wake up the handler thread for this
 			 * action. In case the thread crashed and was
@@ -374,11 +398,6 @@ irqreturn_t handle_IRQ_event(unsigned int irq, struct irqaction *action)
 				wake_up_process(action->thread);
 			}
 
-			/*
-			 * Set it to handled so the spurious check
-			 * does not trigger.
-			 */
-			ret = IRQ_HANDLED;
 			/* Fall through to add to randomness */
 		case IRQ_HANDLED:
 			status |= action->flags;
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index a4c1ab86cd25..a3eb7baf1e46 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -407,20 +407,17 @@ int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
 	return ret;
 }
 
-static inline int irq_thread_should_run(struct irqaction *action)
-{
-	return test_and_clear_bit(IRQTF_RUNTHREAD, &action->thread_flags);
-}
-
 static int irq_wait_for_interrupt(struct irqaction *action)
 {
 	while (!kthread_should_stop()) {
 		set_current_state(TASK_INTERRUPTIBLE);
-		if (irq_thread_should_run(action)) {
+
+		if (test_and_clear_bit(IRQTF_RUNTHREAD,
+				       &action->thread_flags)) {
 			__set_current_state(TASK_RUNNING);
 			return 0;
-		} else
-			schedule();
+		}
+		schedule();
 	}
 	return -1;
 }
@@ -820,8 +817,8 @@ EXPORT_SYMBOL(free_irq);
  *	@irq: Interrupt line to allocate
  *	@handler: Function to be called when the IRQ occurs.
  *		  Primary handler for threaded interrupts
- *      @thread_fn: Function called from the irq handler thread
- *                  If NULL, no irq thread is created
+ *	@thread_fn: Function called from the irq handler thread
+ *		    If NULL, no irq thread is created
  *	@irqflags: Interrupt type flags
  *	@devname: An ascii name for the claiming device
  *	@dev_id: A cookie passed back to the handler function
-- 
cgit v1.2.3


From e4955c9986a27bb47ddeb6cd55803053f547e2e9 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 24 Mar 2009 16:04:37 +0800
Subject: blktrace: mark ddir_act[] const

Impact: cleanup

ddir_act and what2act always stay immutable.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Jens Axboe <jens.axboe@oracle.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <49C89415.5080503@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 108f4f7715a5..1ffcbd4ac45b 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -147,8 +147,8 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector,
 /*
  * Data direction bit lookup
  */
-static u32 ddir_act[2] __read_mostly = { BLK_TC_ACT(BLK_TC_READ),
-					 BLK_TC_ACT(BLK_TC_WRITE) };
+static const u32 ddir_act[2] = { BLK_TC_ACT(BLK_TC_READ),
+				 BLK_TC_ACT(BLK_TC_WRITE) };
 
 /* The ilog2() calls fall out because they're constant */
 #define MASK_TC_BIT(rw, __name) ((rw & (1 << BIO_RW_ ## __name)) << \
@@ -1116,10 +1116,10 @@ static void blk_tracer_reset(struct trace_array *tr)
 	blk_tracer_stop(tr);
 }
 
-static struct {
+static const struct {
 	const char *act[2];
 	int	   (*print)(struct trace_seq *s, const struct trace_entry *ent);
-} what2act[] __read_mostly = {
+} what2act[] = {
 	[__BLK_TA_QUEUE]	= {{  "Q", "queue" },	   blk_log_generic },
 	[__BLK_TA_BACKMERGE]	= {{  "M", "backmerge" },  blk_log_generic },
 	[__BLK_TA_FRONTMERGE]	= {{  "F", "frontmerge" }, blk_log_generic },
-- 
cgit v1.2.3


From 65796348e09880e12b97267d39b8857c758440a6 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 24 Mar 2009 16:05:06 +0800
Subject: blktrace: fix wrong calculation of RWBS

Impact: fix the output of IO type category characters

Trace categories are the upper 16 bits, not the lower 16 bits.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Jens Axboe <jens.axboe@oracle.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <49C89432.8010805@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 1ffcbd4ac45b..9af41430ee54 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -922,23 +922,24 @@ static void blk_unregister_tracepoints(void)
 static void fill_rwbs(char *rwbs, const struct blk_io_trace *t)
 {
 	int i = 0;
+	int tc = t->action >> BLK_TC_SHIFT;
 
-	if (t->action & BLK_TC_DISCARD)
+	if (tc & BLK_TC_DISCARD)
 		rwbs[i++] = 'D';
-	else if (t->action & BLK_TC_WRITE)
+	else if (tc & BLK_TC_WRITE)
 		rwbs[i++] = 'W';
 	else if (t->bytes)
 		rwbs[i++] = 'R';
 	else
 		rwbs[i++] = 'N';
 
-	if (t->action & BLK_TC_AHEAD)
+	if (tc & BLK_TC_AHEAD)
 		rwbs[i++] = 'A';
-	if (t->action & BLK_TC_BARRIER)
+	if (tc & BLK_TC_BARRIER)
 		rwbs[i++] = 'B';
-	if (t->action & BLK_TC_SYNC)
+	if (tc & BLK_TC_SYNC)
 		rwbs[i++] = 'S';
-	if (t->action & BLK_TC_META)
+	if (tc & BLK_TC_META)
 		rwbs[i++] = 'M';
 
 	rwbs[i] = '\0';
-- 
cgit v1.2.3


From e0dc81bec0927fa0c8aabc521793161909eef7a5 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 24 Mar 2009 16:05:51 +0800
Subject: blktrace: fix t_error()

Impact: fix error flag output

t_error() should return t->error but not t->sector.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Jens Axboe <jens.axboe@oracle.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <49C8945F.5020802@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 9af41430ee54..f69f8bd8bef9 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -968,7 +968,7 @@ static inline unsigned long long t_sector(const struct trace_entry *ent)
 
 static inline __u16 t_error(const struct trace_entry *ent)
 {
-	return te_blk_io_trace(ent)->sector;
+	return te_blk_io_trace(ent)->error;
 }
 
 static __u64 get_pdu_int(const struct trace_entry *ent)
-- 
cgit v1.2.3


From 093419971e03362a00f499960557c119982ea09f Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 24 Mar 2009 17:43:30 +0800
Subject: blktrace: print human-readable act_mask

Impact: new feature, allow symbolic values in /debug/tracing/act_mask

Print stringified act_mask instead of hex value:

 # cat act_mask
 read,write,barrier,sync,queue,requeue,issue,complete,fs,pc,ahead,meta,
 discard,drv_data
 # echo "meta,write" > act_mask
 # cat act_mask
 write,meta

Also:
 - make act_mask accept "ahead", "meta", "discard" and "drv_data"
 - use strsep() instead of strchr() to parse user input
 - return -EINVAL if a token is not found in the mask map
 - fix a bug that 'value' is unsigned, so it can < 0
 - propagate error value of blk_trace_mask2str() to userspace, but not
   always return -ENXIO.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Jens Axboe <jens.axboe@oracle.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <49C8AB42.1000802@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 103 ++++++++++++++++++++++++++++++------------------
 1 file changed, 65 insertions(+), 38 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index f69f8bd8bef9..6fb274f5f34e 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1316,53 +1316,77 @@ struct attribute_group blk_trace_attr_group = {
 	.attrs = blk_trace_attrs,
 };
 
-static int blk_str2act_mask(const char *str)
+static const struct {
+	int mask;
+	const char *str;
+} mask_maps[] = {
+	{ BLK_TC_READ,		"read"		},
+	{ BLK_TC_WRITE,		"write"		},
+	{ BLK_TC_BARRIER,	"barrier"	},
+	{ BLK_TC_SYNC,		"sync"		},
+	{ BLK_TC_QUEUE,		"queue"		},
+	{ BLK_TC_REQUEUE,	"requeue"	},
+	{ BLK_TC_ISSUE,		"issue"		},
+	{ BLK_TC_COMPLETE,	"complete"	},
+	{ BLK_TC_FS,		"fs"		},
+	{ BLK_TC_PC,		"pc"		},
+	{ BLK_TC_AHEAD,		"ahead"		},
+	{ BLK_TC_META,		"meta"		},
+	{ BLK_TC_DISCARD,	"discard"	},
+	{ BLK_TC_DRV_DATA,	"drv_data"	},
+};
+
+static int blk_trace_str2mask(const char *str)
 {
+	int i;
 	int mask = 0;
-	char *copy = kstrdup(str, GFP_KERNEL), *s;
+	char *s, *token;
 
-	if (copy == NULL)
+	s = kstrdup(str, GFP_KERNEL);
+	if (s == NULL)
 		return -ENOMEM;
-
-	s = strstrip(copy);
+	s = strstrip(s);
 
 	while (1) {
-		char *sep = strchr(s, ',');
-
-		if (sep != NULL)
-			*sep = '\0';
-
-		if (strcasecmp(s, "barrier") == 0)
-			mask |= BLK_TC_BARRIER;
-		else if (strcasecmp(s, "complete") == 0)
-			mask |= BLK_TC_COMPLETE;
-		else if (strcasecmp(s, "fs") == 0)
-			mask |= BLK_TC_FS;
-		else if (strcasecmp(s, "issue") == 0)
-			mask |= BLK_TC_ISSUE;
-		else if (strcasecmp(s, "pc") == 0)
-			mask |= BLK_TC_PC;
-		else if (strcasecmp(s, "queue") == 0)
-			mask |= BLK_TC_QUEUE;
-		else if (strcasecmp(s, "read") == 0)
-			mask |= BLK_TC_READ;
-		else if (strcasecmp(s, "requeue") == 0)
-			mask |= BLK_TC_REQUEUE;
-		else if (strcasecmp(s, "sync") == 0)
-			mask |= BLK_TC_SYNC;
-		else if (strcasecmp(s, "write") == 0)
-			mask |= BLK_TC_WRITE;
-
-		if (sep == NULL)
+		token = strsep(&s, ",");
+		if (token == NULL)
 			break;
 
-		s = sep + 1;
+		if (*token == '\0')
+			continue;
+
+		for (i = 0; i < ARRAY_SIZE(mask_maps); i++) {
+			if (strcasecmp(token, mask_maps[i].str) == 0) {
+				mask |= mask_maps[i].mask;
+				break;
+			}
+		}
+		if (i == ARRAY_SIZE(mask_maps)) {
+			mask = -EINVAL;
+			break;
+		}
 	}
-	kfree(copy);
+	kfree(s);
 
 	return mask;
 }
 
+static ssize_t blk_trace_mask2str(char *buf, int mask)
+{
+	int i;
+	char *p = buf;
+
+	for (i = 0; i < ARRAY_SIZE(mask_maps); i++) {
+		if (mask & mask_maps[i].mask) {
+			p += sprintf(p, "%s%s",
+				    (p == buf) ? "" : ",", mask_maps[i].str);
+		}
+	}
+	*p++ = '\n';
+
+	return p - buf;
+}
+
 static struct request_queue *blk_trace_get_queue(struct block_device *bdev)
 {
 	if (bdev->bd_disk == NULL)
@@ -1399,7 +1423,7 @@ static ssize_t sysfs_blk_trace_attr_show(struct device *dev,
 	if (q->blk_trace == NULL)
 		ret = sprintf(buf, "disabled\n");
 	else if (attr == &dev_attr_act_mask)
-		ret = sprintf(buf, "%#x\n", q->blk_trace->act_mask);
+		ret = blk_trace_mask2str(buf, q->blk_trace->act_mask);
 	else if (attr == &dev_attr_pid)
 		ret = sprintf(buf, "%u\n", q->blk_trace->pid);
 	else if (attr == &dev_attr_start_lba)
@@ -1424,7 +1448,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
 	struct request_queue *q;
 	struct hd_struct *p;
 	u64 value;
-	ssize_t ret = -ENXIO;
+	ssize_t ret = -EINVAL;
 
 	if (count == 0)
 		goto out;
@@ -1432,13 +1456,16 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev,
 	if (attr == &dev_attr_act_mask) {
 		if (sscanf(buf, "%llx", &value) != 1) {
 			/* Assume it is a list of trace category names */
-			value = blk_str2act_mask(buf);
-			if (value < 0)
+			ret = blk_trace_str2mask(buf);
+			if (ret < 0)
 				goto out;
+			value = ret;
 		}
 	} else if (sscanf(buf, "%llu", &value) != 1)
 		goto out;
 
+	ret = -ENXIO;
+
 	lock_kernel();
 	p = dev_to_part(dev);
 	bdev = bdget(part_devt(p));
-- 
cgit v1.2.3


From 098335215a4921a8a54193829eaed602dca24df5 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Sat, 21 Mar 2009 02:44:50 -0400
Subject: tracing: fix memory leak in trace_stat

If the function profiler does not have any items recorded and one were
to cat the function stat file, the kernel would take a BUG with a NULL
pointer dereference.

Looking further into this, I found that returning NULL from stat_start
did not stop the stat logic, and would later call stat_next. This breaks
from the way seq_file works, so I looked into fixing the stat code.

This is where I noticed that the last next_entry is never freed.
It is allocated, and if the stat_next returns NULL, the code breaks out
of the loop, unlocks the mutex and exits. We never link the next_entry
nor do we free it. Thus it is a real memory leak.

This patch rearranges the code a bit to not only fix the memory leak,
but also to act more like seq_file where nothing is printed if there
is nothing to print. That is, stat_start returns NULL.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/trace_stat.c | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index 39310e3434ee..f71b85b22cfe 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -75,7 +75,7 @@ static int stat_seq_init(struct tracer_stat_session *session)
 {
 	struct trace_stat_list *iter_entry, *new_entry;
 	struct tracer_stat *ts = session->ts;
-	void *prev_stat;
+	void *stat;
 	int ret = 0;
 	int i;
 
@@ -85,6 +85,10 @@ static int stat_seq_init(struct tracer_stat_session *session)
 	if (!ts->stat_cmp)
 		ts->stat_cmp = dummy_cmp;
 
+	stat = ts->stat_start();
+	if (!stat)
+		goto exit;
+
 	/*
 	 * The first entry. Actually this is the second, but the first
 	 * one (the stat_list head) is pointless.
@@ -99,14 +103,19 @@ static int stat_seq_init(struct tracer_stat_session *session)
 
 	list_add(&new_entry->list, &session->stat_list);
 
-	new_entry->stat = ts->stat_start();
-	prev_stat = new_entry->stat;
+	new_entry->stat = stat;
 
 	/*
 	 * Iterate over the tracer stat entries and store them in a sorted
 	 * list.
 	 */
 	for (i = 1; ; i++) {
+		stat = ts->stat_next(stat, i);
+
+		/* End of insertion */
+		if (!stat)
+			break;
+
 		new_entry = kmalloc(sizeof(struct trace_stat_list), GFP_KERNEL);
 		if (!new_entry) {
 			ret = -ENOMEM;
@@ -114,11 +123,7 @@ static int stat_seq_init(struct tracer_stat_session *session)
 		}
 
 		INIT_LIST_HEAD(&new_entry->list);
-		new_entry->stat = ts->stat_next(prev_stat, i);
-
-		/* End of insertion */
-		if (!new_entry->stat)
-			break;
+		new_entry->stat = stat;
 
 		list_for_each_entry(iter_entry, &session->stat_list, list) {
 
@@ -137,8 +142,6 @@ static int stat_seq_init(struct tracer_stat_session *session)
 				break;
 			}
 		}
-
-		prev_stat = new_entry->stat;
 	}
 exit:
 	mutex_unlock(&session->stat_mutex);
-- 
cgit v1.2.3


From 5d1a03dc541dc6672e60e57249ed22f40654ca47 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 23 Mar 2009 23:38:49 -0400
Subject: function-graph: moved the timestamp from arch to generic code

This patch move the timestamp from happening in the arch specific
code into the general code. This allows for better control by the tracer
to time manipulation.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 arch/x86/kernel/ftrace.c             | 6 +-----
 include/linux/ftrace.h               | 3 +--
 kernel/trace/trace_functions_graph.c | 8 +++++---
 3 files changed, 7 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c
index 57b33edb7ce3..61df77532120 100644
--- a/arch/x86/kernel/ftrace.c
+++ b/arch/x86/kernel/ftrace.c
@@ -410,7 +410,6 @@ int ftrace_disable_ftrace_graph_caller(void)
 void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
 {
 	unsigned long old;
-	unsigned long long calltime;
 	int faulted;
 	struct ftrace_graph_ent trace;
 	unsigned long return_hooker = (unsigned long)
@@ -453,10 +452,7 @@ void prepare_ftrace_return(unsigned long *parent, unsigned long self_addr)
 		return;
 	}
 
-	calltime = trace_clock_local();
-
-	if (ftrace_push_return_trace(old, calltime,
-				self_addr, &trace.depth) == -EBUSY) {
+	if (ftrace_push_return_trace(old, self_addr, &trace.depth) == -EBUSY) {
 		*parent = old;
 		return;
 	}
diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index db3fed630db3..1141248c84ee 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -369,8 +369,7 @@ struct ftrace_ret_stack {
 extern void return_to_handler(void);
 
 extern int
-ftrace_push_return_trace(unsigned long ret, unsigned long long time,
-			 unsigned long func, int *depth);
+ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth);
 extern void
 ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret);
 
diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c
index e876816fa8e7..d28687e7b3a7 100644
--- a/kernel/trace/trace_functions_graph.c
+++ b/kernel/trace/trace_functions_graph.c
@@ -57,9 +57,9 @@ static struct tracer_flags tracer_flags = {
 
 /* Add a function return address to the trace stack on thread info.*/
 int
-ftrace_push_return_trace(unsigned long ret, unsigned long long time,
-			 unsigned long func, int *depth)
+ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth)
 {
+	unsigned long long calltime;
 	int index;
 
 	if (!current->ret_stack)
@@ -71,11 +71,13 @@ ftrace_push_return_trace(unsigned long ret, unsigned long long time,
 		return -EBUSY;
 	}
 
+	calltime = trace_clock_local();
+
 	index = ++current->curr_ret_stack;
 	barrier();
 	current->ret_stack[index].ret = ret;
 	current->ret_stack[index].func = func;
-	current->ret_stack[index].calltime = time;
+	current->ret_stack[index].calltime = calltime;
 	*depth = index;
 
 	return 0;
-- 
cgit v1.2.3


From 05ce5818adee8f8efd0a5ca0d900a6789012516b Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 24 Mar 2009 00:18:31 -0400
Subject: function-graph: prevent more than one tracer registering

Impact: prevent crash due to multiple function graph tracers

The function graph tracer can currently only handle a single tracer
being registered. If another tracer registers with the function
graph tracer it can crash the system.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ftrace.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 7847806eefef..c81a759fbf76 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2643,6 +2643,12 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc,
 
 	mutex_lock(&ftrace_lock);
 
+	/* we currently allow only one tracer registered at a time */
+	if (atomic_read(&ftrace_graph_active)) {
+		ret = -EBUSY;
+		goto out;
+	}
+
 	ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call;
 	register_pm_notifier(&ftrace_suspend_notifier);
 
-- 
cgit v1.2.3


From 8aef2d2856158a36c295a8d1288281e4839bff13 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 24 Mar 2009 01:10:15 -0400
Subject: function-graph: ignore times across schedule

Impact: more accurate timings

The current method of function graph tracing does not take into
account the time spent when a task is not running. This shows functions
that call schedule have increased costs:

 3) + 18.664 us   |      }
 ------------------------------------------
 3)    <idle>-0    =>  kblockd-123
 ------------------------------------------

 3)               |      finish_task_switch() {
 3)   1.441 us    |        _spin_unlock_irq();
 3)   3.966 us    |      }
 3) ! 2959.433 us |    }
 3) ! 2961.465 us |  }

This patch uses the tracepoint in the scheduling context switch to
account for time that has elapsed while a task is scheduled out.
Now we see:

 ------------------------------------------
 3)    <idle>-0    =>  edac-po-1067
 ------------------------------------------

 3)               |      finish_task_switch() {
 3)   0.685 us    |        _spin_unlock_irq();
 3)   2.331 us    |      }
 3) + 41.439 us   |    }
 3) + 42.663 us   |  }

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 include/linux/sched.h |  2 ++
 kernel/trace/ftrace.c | 36 ++++++++++++++++++++++++++++++++++++
 2 files changed, 38 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 89cd308cc7a5..471e36d30123 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1409,6 +1409,8 @@ struct task_struct {
 	int curr_ret_stack;
 	/* Stack of return addresses for return function tracing */
 	struct ftrace_ret_stack	*ret_stack;
+	/* time stamp for last schedule */
+	unsigned long long ftrace_timestamp;
 	/*
 	 * Number of functions that haven't been traced
 	 * because of depth overrun.
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index c81a759fbf76..0b90364d1a2c 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -29,6 +29,8 @@
 #include <linux/list.h>
 #include <linux/hash.h>
 
+#include <trace/sched.h>
+
 #include <asm/ftrace.h>
 
 #include "trace.h"
@@ -2590,6 +2592,31 @@ free:
 	return ret;
 }
 
+static void
+ftrace_graph_probe_sched_switch(struct rq *__rq, struct task_struct *prev,
+				struct task_struct *next)
+{
+	unsigned long long timestamp;
+	int index;
+
+	timestamp = trace_clock_local();
+
+	prev->ftrace_timestamp = timestamp;
+
+	/* only process tasks that we timestamped */
+	if (!next->ftrace_timestamp)
+		return;
+
+	/*
+	 * Update all the counters in next to make up for the
+	 * time next was sleeping.
+	 */
+	timestamp -= next->ftrace_timestamp;
+
+	for (index = next->curr_ret_stack; index >= 0; index--)
+		next->ret_stack[index].calltime += timestamp;
+}
+
 /* Allocate a return stack for each task */
 static int start_graph_tracing(void)
 {
@@ -2611,6 +2638,13 @@ static int start_graph_tracing(void)
 		ret = alloc_retstack_tasklist(ret_stack_list);
 	} while (ret == -EAGAIN);
 
+	if (!ret) {
+		ret = register_trace_sched_switch(ftrace_graph_probe_sched_switch);
+		if (ret)
+			pr_info("ftrace_graph: Couldn't activate tracepoint"
+				" probe to kernel_sched_switch\n");
+	}
+
 	kfree(ret_stack_list);
 	return ret;
 }
@@ -2674,6 +2708,7 @@ void unregister_ftrace_graph(void)
 	mutex_lock(&ftrace_lock);
 
 	atomic_dec(&ftrace_graph_active);
+	unregister_trace_sched_switch(ftrace_graph_probe_sched_switch);
 	ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
 	ftrace_graph_entry = ftrace_graph_entry_stub;
 	ftrace_shutdown(FTRACE_STOP_FUNC_RET);
@@ -2694,6 +2729,7 @@ void ftrace_graph_init_task(struct task_struct *t)
 		t->curr_ret_stack = -1;
 		atomic_set(&t->tracing_graph_pause, 0);
 		atomic_set(&t->trace_overrun, 0);
+		t->ftrace_timestamp = 0;
 	} else
 		t->ret_stack = NULL;
 }
-- 
cgit v1.2.3


From be6f164a02f394675e2ac2077dd354cebef5b4c0 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Tue, 24 Mar 2009 11:06:24 -0400
Subject: function-graph: add option for include sleep times

Impact: give user a choice to show times spent while sleeping

The user may want to see the time a function spent sleeping.
This patch adds the trace option "sleep-time" to allow that.
The "sleep-time" option is default on.

 echo sleep-time > /debug/tracing/trace_options

produces:

 ------------------------------------------
 2)  avahi-d-3428  =>    <idle>-0
 ------------------------------------------

 2)               |      finish_task_switch() {
 2)   0.621 us    |        _spin_unlock_irq();
 2)   2.202 us    |      }
 2) ! 1002.197 us |    }
 2) ! 1003.521 us |  }

where as,

 echo nosleep-time > /debug/tracing/trace_options

produces:

 0)    <idle>-0    =>  yum-upd-3416
 ------------------------------------------

 0)               |              finish_task_switch() {
 0)   0.643 us    |                _spin_unlock_irq();
 0)   2.342 us    |              }
 0) + 41.302 us   |            }
 0) + 42.453 us   |          }

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ftrace.c | 7 +++++++
 kernel/trace/trace.c  | 3 ++-
 kernel/trace/trace.h  | 1 +
 3 files changed, 10 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 0b90364d1a2c..02d2de9d08ba 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2599,6 +2599,13 @@ ftrace_graph_probe_sched_switch(struct rq *__rq, struct task_struct *prev,
 	unsigned long long timestamp;
 	int index;
 
+	/*
+	 * Does the user want to count the time a function was asleep.
+	 * If so, do not update the time stamps.
+	 */
+	if (trace_flags & TRACE_ITER_SLEEP_TIME)
+		return;
+
 	timestamp = trace_clock_local();
 
 	prev->ftrace_timestamp = timestamp;
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index f0e1337b1ebd..67c6a21dd427 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -255,7 +255,7 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait);
 
 /* trace_flags holds trace_options default values */
 unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK |
-	TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO;
+	TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME;
 
 /**
  * trace_wake_up - wake up tasks waiting for trace input
@@ -316,6 +316,7 @@ static const char *trace_options[] = {
 	"context-info",
 	"latency-format",
 	"global-clock",
+	"sleep-time",
 	NULL
 };
 
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 7cfb741be200..d7410bbb9a80 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -683,6 +683,7 @@ enum trace_iterator_flags {
 	TRACE_ITER_CONTEXT_INFO		= 0x20000, /* Print pid/cpu/time */
 	TRACE_ITER_LATENCY_FMT		= 0x40000,
 	TRACE_ITER_GLOBAL_CLK		= 0x80000,
+	TRACE_ITER_SLEEP_TIME		= 0x100000,
 };
 
 /*
-- 
cgit v1.2.3


From cc59c9e8d0165c632fd056c4a23e36f917507fb4 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Tue, 24 Mar 2009 11:03:01 +0800
Subject: ftrace: show virtual PID

Impact: fix PID output under namespaces

When current namespace is not the global namespace,
pid read from set_ftrace_pid is no correct.

 # ~/newpid_namespace_run bash
 # echo $$
 1
 # echo 1 > set_ftrace_pid
 # cat set_ftrace_pid
 3756

Since we write virtual PID to set_ftrace_pid, we need get
virtual PID when we read it.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Steven Rostedt <srostedt@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <49C84D65.9050606@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 02d2de9d08ba..bb377112b1bb 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2264,7 +2264,7 @@ ftrace_pid_read(struct file *file, char __user *ubuf,
 	if (ftrace_pid_trace == ftrace_swapper_pid)
 		r = sprintf(buf, "swapper tasks\n");
 	else if (ftrace_pid_trace)
-		r = sprintf(buf, "%u\n", pid_nr(ftrace_pid_trace));
+		r = sprintf(buf, "%u\n", pid_vnr(ftrace_pid_trace));
 	else
 		r = sprintf(buf, "no pid\n");
 
-- 
cgit v1.2.3


From ee000b7f9fe429d2470c674ccec8d344f6789e0d Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Tue, 24 Mar 2009 13:38:06 +0800
Subject: tracing: use union for multi-usages field

Impact: cleanup

struct dyn_ftrace::ip has different usages in his lifecycle,
we use union for it. And also for struct dyn_ftrace::flags.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Steven Rostedt <srostedt@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <49C871BE.3080405@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/ftrace.h | 12 +++++++++---
 kernel/trace/ftrace.c  |  8 ++++----
 2 files changed, 13 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h
index 1141248c84ee..015a3d22cf74 100644
--- a/include/linux/ftrace.h
+++ b/include/linux/ftrace.h
@@ -145,9 +145,15 @@ enum {
 };
 
 struct dyn_ftrace {
-	unsigned long		ip; /* address of mcount call-site */
-	unsigned long		flags;
-	struct dyn_arch_ftrace	arch;
+	union {
+		unsigned long		ip; /* address of mcount call-site */
+		struct dyn_ftrace	*freelist;
+	};
+	union {
+		unsigned long		flags;
+		struct dyn_ftrace	*newlist;
+	};
+	struct dyn_arch_ftrace		arch;
 };
 
 int ftrace_force_update(void);
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index bb377112b1bb..7b8722baf153 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -341,7 +341,7 @@ static inline int record_frozen(struct dyn_ftrace *rec)
 
 static void ftrace_free_rec(struct dyn_ftrace *rec)
 {
-	rec->ip = (unsigned long)ftrace_free_records;
+	rec->freelist = ftrace_free_records;
 	ftrace_free_records = rec;
 	rec->flags |= FTRACE_FL_FREE;
 }
@@ -379,7 +379,7 @@ static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip)
 			return NULL;
 		}
 
-		ftrace_free_records = (void *)rec->ip;
+		ftrace_free_records = rec->freelist;
 		memset(rec, 0, sizeof(*rec));
 		return rec;
 	}
@@ -411,7 +411,7 @@ ftrace_record_ip(unsigned long ip)
 		return NULL;
 
 	rec->ip = ip;
-	rec->flags = (unsigned long)ftrace_new_addrs;
+	rec->newlist = ftrace_new_addrs;
 	ftrace_new_addrs = rec;
 
 	return rec;
@@ -731,7 +731,7 @@ static int ftrace_update_code(struct module *mod)
 			return -1;
 
 		p = ftrace_new_addrs;
-		ftrace_new_addrs = (struct dyn_ftrace *)p->flags;
+		ftrace_new_addrs = p->newlist;
 		p->flags = 0L;
 
 		/* convert record (i.e, patch mcount-call with NOP) */
-- 
cgit v1.2.3


From 67aa0f767af488a7f1e41cccb4f7a4893f24a1ab Mon Sep 17 00:00:00 2001
From: Luis Henriques <henrix@sapo.pt>
Date: Tue, 24 Mar 2009 22:10:02 +0000
Subject: sched: remove unused fields from struct rq

Impact: cleanup, new schedstat ABI

Since they are used on in statistics and are always set to zero, the
following fields from struct rq have been removed: yld_exp_empty,
yld_act_empty and yld_both_empty.

Both Sched Debug and SCHEDSTAT_VERSION versions has also been
incremented since ABIs have been changed.

The schedtop tool has been updated to properly handle new version of
schedstat:

   http://rt.wiki.kernel.org/index.php/Schedtop_utility

Signed-off-by: Luis Henriques <henrix@sapo.pt>
Acked-by: Gregory Haskins <ghaskins@novell.com>
Acked-by: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <20090324221002.GA10061@hades.domain.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c       | 3 ---
 kernel/sched_debug.c | 5 +----
 kernel/sched_stats.h | 7 +++----
 3 files changed, 4 insertions(+), 11 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index d2dfe4c1a225..7b389c74f8ff 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -638,9 +638,6 @@ struct rq {
 	/* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */
 
 	/* sys_sched_yield() stats */
-	unsigned int yld_exp_empty;
-	unsigned int yld_act_empty;
-	unsigned int yld_both_empty;
 	unsigned int yld_count;
 
 	/* schedule() stats */
diff --git a/kernel/sched_debug.c b/kernel/sched_debug.c
index 4daebffa0565..467ca72f1657 100644
--- a/kernel/sched_debug.c
+++ b/kernel/sched_debug.c
@@ -286,9 +286,6 @@ static void print_cpu(struct seq_file *m, int cpu)
 #ifdef CONFIG_SCHEDSTATS
 #define P(n) SEQ_printf(m, "  .%-30s: %d\n", #n, rq->n);
 
-	P(yld_exp_empty);
-	P(yld_act_empty);
-	P(yld_both_empty);
 	P(yld_count);
 
 	P(sched_switch);
@@ -313,7 +310,7 @@ static int sched_debug_show(struct seq_file *m, void *v)
 	u64 now = ktime_to_ns(ktime_get());
 	int cpu;
 
-	SEQ_printf(m, "Sched Debug Version: v0.08, %s %.*s\n",
+	SEQ_printf(m, "Sched Debug Version: v0.09, %s %.*s\n",
 		init_utsname()->release,
 		(int)strcspn(init_utsname()->version, " "),
 		init_utsname()->version);
diff --git a/kernel/sched_stats.h b/kernel/sched_stats.h
index a8f93dd374e1..32d2bd4061b0 100644
--- a/kernel/sched_stats.h
+++ b/kernel/sched_stats.h
@@ -4,7 +4,7 @@
  * bump this up when changing the output format or the meaning of an existing
  * format, so that tools can adapt (or abort)
  */
-#define SCHEDSTAT_VERSION 14
+#define SCHEDSTAT_VERSION 15
 
 static int show_schedstat(struct seq_file *seq, void *v)
 {
@@ -26,9 +26,8 @@ static int show_schedstat(struct seq_file *seq, void *v)
 
 		/* runqueue-specific stats */
 		seq_printf(seq,
-		    "cpu%d %u %u %u %u %u %u %u %u %u %llu %llu %lu",
-		    cpu, rq->yld_both_empty,
-		    rq->yld_act_empty, rq->yld_exp_empty, rq->yld_count,
+		    "cpu%d %u %u %u %u %u %u %llu %llu %lu",
+		    cpu, rq->yld_count,
 		    rq->sched_switch, rq->sched_count, rq->sched_goidle,
 		    rq->ttwu_count, rq->ttwu_local,
 		    rq->rq_cpu_time,
-- 
cgit v1.2.3


From e9d376f0fa66bd630fe27403669c6ae6c22a868f Mon Sep 17 00:00:00 2001
From: Jason Baron <jbaron@redhat.com>
Date: Thu, 5 Feb 2009 11:51:38 -0500
Subject: dynamic debug: combine dprintk and dynamic printk

This patch combines Greg Bank's dprintk() work with the existing dynamic
printk patchset, we are now calling it 'dynamic debug'.

The new feature of this patchset is a richer /debugfs control file interface,
(an example output from my system is at the bottom), which allows fined grained
control over the the debug output. The output can be controlled by function,
file, module, format string, and line number.

for example, enabled all debug messages in module 'nf_conntrack':

echo -n 'module nf_conntrack +p' > /mnt/debugfs/dynamic_debug/control

to disable them:

echo -n 'module nf_conntrack -p' > /mnt/debugfs/dynamic_debug/control

A further explanation can be found in the documentation patch.

Signed-off-by: Greg Banks <gnb@sgi.com>
Signed-off-by: Jason Baron <jbaron@redhat.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@suse.de>
---
 Documentation/kernel-parameters.txt |   5 -
 include/asm-generic/vmlinux.lds.h   |  15 +-
 include/linux/device.h              |   2 +-
 include/linux/dynamic_debug.h       |  88 +++++
 include/linux/dynamic_printk.h      |  93 -----
 include/linux/kernel.h              |   4 +-
 kernel/module.c                     |  25 +-
 lib/Kconfig.debug                   |   2 +-
 lib/Makefile                        |   2 +-
 lib/dynamic_debug.c                 | 756 ++++++++++++++++++++++++++++++++++++
 lib/dynamic_printk.c                | 414 --------------------
 net/netfilter/nf_conntrack_pptp.c   |   2 +-
 scripts/Makefile.lib                |   2 +-
 13 files changed, 867 insertions(+), 543 deletions(-)
 create mode 100644 include/linux/dynamic_debug.h
 delete mode 100644 include/linux/dynamic_printk.h
 create mode 100644 lib/dynamic_debug.c
 delete mode 100644 lib/dynamic_printk.c

(limited to 'kernel')

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 54f21a5c262b..3a1aa8a4affc 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -1816,11 +1816,6 @@ and is between 256 and 4096 characters. It is defined in the file
 			autoconfiguration.
 			Ranges are in pairs (memory base and size).
 
-	dynamic_printk	Enables pr_debug()/dev_dbg() calls if
-			CONFIG_DYNAMIC_PRINTK_DEBUG has been enabled.
-			These can also be switched on/off via
-			<debugfs>/dynamic_printk/modules
-
 	print-fatal-signals=
 			[KNL] debug: print fatal signals
 			print-fatal-signals=1: print segfault info to
diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h
index c61fab1dd2f8..aca40b93bd28 100644
--- a/include/asm-generic/vmlinux.lds.h
+++ b/include/asm-generic/vmlinux.lds.h
@@ -80,6 +80,11 @@
 	VMLINUX_SYMBOL(__start___tracepoints) = .;			\
 	*(__tracepoints)						\
 	VMLINUX_SYMBOL(__stop___tracepoints) = .;			\
+	/* implement dynamic printk debug */				\
+	. = ALIGN(8);							\
+	VMLINUX_SYMBOL(__start___verbose) = .;                          \
+	*(__verbose)                                                    \
+	VMLINUX_SYMBOL(__stop___verbose) = .;				\
 	LIKELY_PROFILE()		       				\
 	BRANCH_PROFILE()
 
@@ -309,15 +314,7 @@
 	CPU_DISCARD(init.data)						\
 	CPU_DISCARD(init.rodata)					\
 	MEM_DISCARD(init.data)						\
-	MEM_DISCARD(init.rodata)					\
-	/* implement dynamic printk debug */				\
-	VMLINUX_SYMBOL(__start___verbose_strings) = .;                  \
-	*(__verbose_strings)                                            \
-	VMLINUX_SYMBOL(__stop___verbose_strings) = .;                   \
-	. = ALIGN(8);							\
-	VMLINUX_SYMBOL(__start___verbose) = .;                          \
-	*(__verbose)                                                    \
-	VMLINUX_SYMBOL(__stop___verbose) = .;
+	MEM_DISCARD(init.rodata)
 
 #define INIT_TEXT							\
 	*(.init.text)							\
diff --git a/include/linux/device.h b/include/linux/device.h
index f98d0cfb4f81..2918c0e8fdfd 100644
--- a/include/linux/device.h
+++ b/include/linux/device.h
@@ -582,7 +582,7 @@ extern const char *dev_driver_string(const struct device *dev);
 #if defined(DEBUG)
 #define dev_dbg(dev, format, arg...)		\
 	dev_printk(KERN_DEBUG , dev , format , ## arg)
-#elif defined(CONFIG_DYNAMIC_PRINTK_DEBUG)
+#elif defined(CONFIG_DYNAMIC_DEBUG)
 #define dev_dbg(dev, format, ...) do { \
 	dynamic_dev_dbg(dev, format, ##__VA_ARGS__); \
 	} while (0)
diff --git a/include/linux/dynamic_debug.h b/include/linux/dynamic_debug.h
new file mode 100644
index 000000000000..07781aaa1164
--- /dev/null
+++ b/include/linux/dynamic_debug.h
@@ -0,0 +1,88 @@
+#ifndef _DYNAMIC_DEBUG_H
+#define _DYNAMIC_DEBUG_H
+
+/* dynamic_printk_enabled, and dynamic_printk_enabled2 are bitmasks in which
+ * bit n is set to 1 if any modname hashes into the bucket n, 0 otherwise. They
+ * use independent hash functions, to reduce the chance of false positives.
+ */
+extern long long dynamic_debug_enabled;
+extern long long dynamic_debug_enabled2;
+
+/*
+ * An instance of this structure is created in a special
+ * ELF section at every dynamic debug callsite.  At runtime,
+ * the special section is treated as an array of these.
+ */
+struct _ddebug {
+	/*
+	 * These fields are used to drive the user interface
+	 * for selecting and displaying debug callsites.
+	 */
+	const char *modname;
+	const char *function;
+	const char *filename;
+	const char *format;
+	char primary_hash;
+	char secondary_hash;
+	unsigned int lineno:24;
+	/*
+ 	 * The flags field controls the behaviour at the callsite.
+ 	 * The bits here are changed dynamically when the user
+ 	 * writes commands to <debugfs>/dynamic_debug/ddebug
+	 */
+#define _DPRINTK_FLAGS_PRINT   (1<<0)  /* printk() a message using the format */
+#define _DPRINTK_FLAGS_DEFAULT 0
+	unsigned int flags:8;
+} __attribute__((aligned(8)));
+
+
+int ddebug_add_module(struct _ddebug *tab, unsigned int n,
+				const char *modname);
+
+#if defined(CONFIG_DYNAMIC_DEBUG)
+extern int ddebug_remove_module(char *mod_name);
+
+#define __dynamic_dbg_enabled(dd)  ({	     \
+	int __ret = 0;							     \
+	if (unlikely((dynamic_debug_enabled & (1LL << DEBUG_HASH)) &&	     \
+			(dynamic_debug_enabled2 & (1LL << DEBUG_HASH2))))   \
+				if (unlikely(dd.flags))			     \
+					__ret = 1;			     \
+	__ret; })
+
+#define dynamic_pr_debug(fmt, ...) do {					\
+	static struct _ddebug descriptor				\
+	__used								\
+	__attribute__((section("__verbose"), aligned(8))) =		\
+	{ KBUILD_MODNAME, __func__, __FILE__, fmt, DEBUG_HASH,	\
+		DEBUG_HASH2, __LINE__, _DPRINTK_FLAGS_DEFAULT };	\
+	if (__dynamic_dbg_enabled(descriptor))				\
+		printk(KERN_DEBUG KBUILD_MODNAME ":" fmt,		\
+				##__VA_ARGS__);				\
+	} while (0)
+
+
+#define dynamic_dev_dbg(dev, fmt, ...) do {				\
+	static struct _ddebug descriptor				\
+	__used								\
+	__attribute__((section("__verbose"), aligned(8))) =		\
+	{ KBUILD_MODNAME, __func__, __FILE__, fmt, DEBUG_HASH,	\
+		DEBUG_HASH2, __LINE__, _DPRINTK_FLAGS_DEFAULT };	\
+	if (__dynamic_dbg_enabled(descriptor))				\
+			dev_printk(KERN_DEBUG, dev,			\
+					KBUILD_MODNAME ": " fmt,	\
+					##__VA_ARGS__);			\
+	} while (0)
+
+#else
+
+static inline int ddebug_remove_module(char *mod)
+{
+	return 0;
+}
+
+#define dynamic_pr_debug(fmt, ...)  do { } while (0)
+#define dynamic_dev_dbg(dev, format, ...)  do { } while (0)
+#endif
+
+#endif
diff --git a/include/linux/dynamic_printk.h b/include/linux/dynamic_printk.h
deleted file mode 100644
index 2d528d009074..000000000000
--- a/include/linux/dynamic_printk.h
+++ /dev/null
@@ -1,93 +0,0 @@
-#ifndef _DYNAMIC_PRINTK_H
-#define _DYNAMIC_PRINTK_H
-
-#define DYNAMIC_DEBUG_HASH_BITS 6
-#define DEBUG_HASH_TABLE_SIZE (1 << DYNAMIC_DEBUG_HASH_BITS)
-
-#define TYPE_BOOLEAN 1
-
-#define DYNAMIC_ENABLED_ALL 0
-#define DYNAMIC_ENABLED_NONE 1
-#define DYNAMIC_ENABLED_SOME 2
-
-extern int dynamic_enabled;
-
-/* dynamic_printk_enabled, and dynamic_printk_enabled2 are bitmasks in which
- * bit n is set to 1 if any modname hashes into the bucket n, 0 otherwise. They
- * use independent hash functions, to reduce the chance of false positives.
- */
-extern long long dynamic_printk_enabled;
-extern long long dynamic_printk_enabled2;
-
-struct mod_debug {
-	char *modname;
-	char *logical_modname;
-	char *flag_names;
-	int type;
-	int hash;
-	int hash2;
-} __attribute__((aligned(8)));
-
-int register_dynamic_debug_module(char *mod_name, int type, char *share_name,
-					char *flags, int hash, int hash2);
-
-#if defined(CONFIG_DYNAMIC_PRINTK_DEBUG)
-extern int unregister_dynamic_debug_module(char *mod_name);
-extern int __dynamic_dbg_enabled_helper(char *modname, int type,
-					int value, int hash);
-
-#define __dynamic_dbg_enabled(module, type, value, level, hash)  ({	     \
-	int __ret = 0;							     \
-	if (unlikely((dynamic_printk_enabled & (1LL << DEBUG_HASH)) &&	     \
-			(dynamic_printk_enabled2 & (1LL << DEBUG_HASH2))))   \
-			__ret = __dynamic_dbg_enabled_helper(module, type,   \
-								value, hash);\
-	__ret; })
-
-#define dynamic_pr_debug(fmt, ...) do {					    \
-	static char mod_name[]						    \
-	__attribute__((section("__verbose_strings")))			    \
-	 = KBUILD_MODNAME;						    \
-	static struct mod_debug descriptor				    \
-	__used								    \
-	__attribute__((section("__verbose"), aligned(8))) =		    \
-	{ mod_name, mod_name, NULL, TYPE_BOOLEAN, DEBUG_HASH, DEBUG_HASH2 };\
-	if (__dynamic_dbg_enabled(KBUILD_MODNAME, TYPE_BOOLEAN,		    \
-						0, 0, DEBUG_HASH))	    \
-		printk(KERN_DEBUG KBUILD_MODNAME ":" fmt,		    \
-				##__VA_ARGS__);				    \
-	} while (0)
-
-#define dynamic_dev_dbg(dev, format, ...) do {				    \
-	static char mod_name[]						    \
-	__attribute__((section("__verbose_strings")))			    \
-	 = KBUILD_MODNAME;						    \
-	static struct mod_debug descriptor				    \
-	__used								    \
-	__attribute__((section("__verbose"), aligned(8))) =		    \
-	{ mod_name, mod_name, NULL, TYPE_BOOLEAN, DEBUG_HASH, DEBUG_HASH2 };\
-	if (__dynamic_dbg_enabled(KBUILD_MODNAME, TYPE_BOOLEAN,		    \
-						0, 0, DEBUG_HASH))	    \
-			dev_printk(KERN_DEBUG, dev,			    \
-					KBUILD_MODNAME ": " format,	    \
-					##__VA_ARGS__);			    \
-	} while (0)
-
-#else
-
-static inline int unregister_dynamic_debug_module(const char *mod_name)
-{
-	return 0;
-}
-static inline int __dynamic_dbg_enabled_helper(char *modname, int type,
-						int value, int hash)
-{
-	return 0;
-}
-
-#define __dynamic_dbg_enabled(module, type, value, level, hash)  ({ 0; })
-#define dynamic_pr_debug(fmt, ...)  do { } while (0)
-#define dynamic_dev_dbg(dev, format, ...)  do { } while (0)
-#endif
-
-#endif
diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index 7fa371898e3e..b5496ecbec71 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -16,7 +16,7 @@
 #include <linux/log2.h>
 #include <linux/typecheck.h>
 #include <linux/ratelimit.h>
-#include <linux/dynamic_printk.h>
+#include <linux/dynamic_debug.h>
 #include <asm/byteorder.h>
 #include <asm/bug.h>
 
@@ -358,7 +358,7 @@ static inline char *pack_hex_byte(char *buf, u8 byte)
 #if defined(DEBUG)
 #define pr_debug(fmt, ...) \
 	printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__)
-#elif defined(CONFIG_DYNAMIC_PRINTK_DEBUG)
+#elif defined(CONFIG_DYNAMIC_DEBUG)
 #define pr_debug(fmt, ...) do { \
 	dynamic_pr_debug(pr_fmt(fmt), ##__VA_ARGS__); \
 	} while (0)
diff --git a/kernel/module.c b/kernel/module.c
index 1196f5d11700..77672233387f 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -822,7 +822,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user,
 	mutex_lock(&module_mutex);
 	/* Store the name of the last unloaded module for diagnostic purposes */
 	strlcpy(last_unloaded_module, mod->name, sizeof(last_unloaded_module));
-	unregister_dynamic_debug_module(mod->name);
+	ddebug_remove_module(mod->name);
 	free_module(mod);
 
  out:
@@ -1827,19 +1827,13 @@ static inline void add_kallsyms(struct module *mod,
 }
 #endif /* CONFIG_KALLSYMS */
 
-static void dynamic_printk_setup(struct mod_debug *debug, unsigned int num)
+static void dynamic_debug_setup(struct _ddebug *debug, unsigned int num)
 {
-#ifdef CONFIG_DYNAMIC_PRINTK_DEBUG
-	unsigned int i;
-
-	for (i = 0; i < num; i++) {
-		register_dynamic_debug_module(debug[i].modname,
-					      debug[i].type,
-					      debug[i].logical_modname,
-					      debug[i].flag_names,
-					      debug[i].hash, debug[i].hash2);
-	}
-#endif /* CONFIG_DYNAMIC_PRINTK_DEBUG */
+#ifdef CONFIG_DYNAMIC_DEBUG
+	if (ddebug_add_module(debug, num, debug->modname))
+		printk(KERN_ERR "dynamic debug error adding module: %s\n",
+					debug->modname);
+#endif
 }
 
 static void *module_alloc_update_bounds(unsigned long size)
@@ -2213,12 +2207,13 @@ static noinline struct module *load_module(void __user *umod,
 	add_kallsyms(mod, sechdrs, symindex, strindex, secstrings);
 
 	if (!mod->taints) {
-		struct mod_debug *debug;
+		struct _ddebug *debug;
 		unsigned int num_debug;
 
 		debug = section_objs(hdr, sechdrs, secstrings, "__verbose",
 				     sizeof(*debug), &num_debug);
-		dynamic_printk_setup(debug, num_debug);
+		if (debug)
+			dynamic_debug_setup(debug, num_debug);
 	}
 
 	/* sechdrs[0].sh_size is always zero */
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 1bcf9cd4baa0..0dd1c04c7323 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -847,7 +847,7 @@ config BUILD_DOCSRC
 
 	  Say N if you are unsure.
 
-config DYNAMIC_PRINTK_DEBUG
+config DYNAMIC_DEBUG
 	bool "Enable dynamic printk() call support"
 	default n
 	depends on PRINTK
diff --git a/lib/Makefile b/lib/Makefile
index 32b0e64ded27..8633d6be9d21 100644
--- a/lib/Makefile
+++ b/lib/Makefile
@@ -82,7 +82,7 @@ obj-$(CONFIG_HAVE_LMB) += lmb.o
 
 obj-$(CONFIG_HAVE_ARCH_TRACEHOOK) += syscall.o
 
-obj-$(CONFIG_DYNAMIC_PRINTK_DEBUG) += dynamic_printk.o
+obj-$(CONFIG_DYNAMIC_DEBUG) += dynamic_debug.o
 
 hostprogs-y	:= gen_crc32table
 clean-files	:= crc32table.h
diff --git a/lib/dynamic_debug.c b/lib/dynamic_debug.c
new file mode 100644
index 000000000000..9e123ae326bc
--- /dev/null
+++ b/lib/dynamic_debug.c
@@ -0,0 +1,756 @@
+/*
+ * lib/dynamic_debug.c
+ *
+ * make pr_debug()/dev_dbg() calls runtime configurable based upon their
+ * source module.
+ *
+ * Copyright (C) 2008 Jason Baron <jbaron@redhat.com>
+ * By Greg Banks <gnb@melbourne.sgi.com>
+ * Copyright (c) 2008 Silicon Graphics Inc.  All Rights Reserved.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/kallsyms.h>
+#include <linux/version.h>
+#include <linux/types.h>
+#include <linux/mutex.h>
+#include <linux/proc_fs.h>
+#include <linux/seq_file.h>
+#include <linux/list.h>
+#include <linux/sysctl.h>
+#include <linux/ctype.h>
+#include <linux/uaccess.h>
+#include <linux/dynamic_debug.h>
+#include <linux/debugfs.h>
+
+extern struct _ddebug __start___verbose[];
+extern struct _ddebug __stop___verbose[];
+
+/* dynamic_debug_enabled, and dynamic_debug_enabled2 are bitmasks in which
+ * bit n is set to 1 if any modname hashes into the bucket n, 0 otherwise. They
+ * use independent hash functions, to reduce the chance of false positives.
+ */
+long long dynamic_debug_enabled;
+EXPORT_SYMBOL_GPL(dynamic_debug_enabled);
+long long dynamic_debug_enabled2;
+EXPORT_SYMBOL_GPL(dynamic_debug_enabled2);
+
+struct ddebug_table {
+	struct list_head link;
+	char *mod_name;
+	unsigned int num_ddebugs;
+	unsigned int num_enabled;
+	struct _ddebug *ddebugs;
+};
+
+struct ddebug_query {
+	const char *filename;
+	const char *module;
+	const char *function;
+	const char *format;
+	unsigned int first_lineno, last_lineno;
+};
+
+struct ddebug_iter {
+	struct ddebug_table *table;
+	unsigned int idx;
+};
+
+static DEFINE_MUTEX(ddebug_lock);
+static LIST_HEAD(ddebug_tables);
+static int verbose = 0;
+
+/* Return the last part of a pathname */
+static inline const char *basename(const char *path)
+{
+	const char *tail = strrchr(path, '/');
+	return tail ? tail+1 : path;
+}
+
+/* format a string into buf[] which describes the _ddebug's flags */
+static char *ddebug_describe_flags(struct _ddebug *dp, char *buf,
+				    size_t maxlen)
+{
+	char *p = buf;
+
+	BUG_ON(maxlen < 4);
+	if (dp->flags & _DPRINTK_FLAGS_PRINT)
+		*p++ = 'p';
+	if (p == buf)
+		*p++ = '-';
+	*p = '\0';
+
+	return buf;
+}
+
+/*
+ * must be called with ddebug_lock held
+ */
+
+static int disabled_hash(char hash, bool first_table)
+{
+	struct ddebug_table *dt;
+	char table_hash_value;
+
+	list_for_each_entry(dt, &ddebug_tables, link) {
+		if (first_table)
+			table_hash_value = dt->ddebugs->primary_hash;
+		else
+			table_hash_value = dt->ddebugs->secondary_hash;
+		if (dt->num_enabled && (hash == table_hash_value))
+			return 0;
+	}
+	return 1;
+}
+
+/*
+ * Search the tables for _ddebug's which match the given
+ * `query' and apply the `flags' and `mask' to them.  Tells
+ * the user which ddebug's were changed, or whether none
+ * were matched.
+ */
+static void ddebug_change(const struct ddebug_query *query,
+			   unsigned int flags, unsigned int mask)
+{
+	int i;
+	struct ddebug_table *dt;
+	unsigned int newflags;
+	unsigned int nfound = 0;
+	char flagbuf[8];
+
+	/* search for matching ddebugs */
+	mutex_lock(&ddebug_lock);
+	list_for_each_entry(dt, &ddebug_tables, link) {
+
+		/* match against the module name */
+		if (query->module != NULL &&
+		    strcmp(query->module, dt->mod_name))
+			continue;
+
+		for (i = 0 ; i < dt->num_ddebugs ; i++) {
+			struct _ddebug *dp = &dt->ddebugs[i];
+
+			/* match against the source filename */
+			if (query->filename != NULL &&
+			    strcmp(query->filename, dp->filename) &&
+			    strcmp(query->filename, basename(dp->filename)))
+				continue;
+
+			/* match against the function */
+			if (query->function != NULL &&
+			    strcmp(query->function, dp->function))
+				continue;
+
+			/* match against the format */
+			if (query->format != NULL &&
+			    strstr(dp->format, query->format) == NULL)
+				continue;
+
+			/* match against the line number range */
+			if (query->first_lineno &&
+			    dp->lineno < query->first_lineno)
+				continue;
+			if (query->last_lineno &&
+			    dp->lineno > query->last_lineno)
+				continue;
+
+			nfound++;
+
+			newflags = (dp->flags & mask) | flags;
+			if (newflags == dp->flags)
+				continue;
+
+			if (!newflags)
+				dt->num_enabled--;
+			else if (!dp-flags)
+				dt->num_enabled++;
+			dp->flags = newflags;
+			if (newflags) {
+				dynamic_debug_enabled |=
+						(1LL << dp->primary_hash);
+				dynamic_debug_enabled2 |=
+						(1LL << dp->secondary_hash);
+			} else {
+				if (disabled_hash(dp->primary_hash, true))
+					dynamic_debug_enabled &=
+						~(1LL << dp->primary_hash);
+				if (disabled_hash(dp->secondary_hash, false))
+					dynamic_debug_enabled2 &=
+						~(1LL << dp->secondary_hash);
+			}
+			if (verbose)
+				printk(KERN_INFO
+					"ddebug: changed %s:%d [%s]%s %s\n",
+					dp->filename, dp->lineno,
+					dt->mod_name, dp->function,
+					ddebug_describe_flags(dp, flagbuf,
+							sizeof(flagbuf)));
+		}
+	}
+	mutex_unlock(&ddebug_lock);
+
+	if (!nfound && verbose)
+		printk(KERN_INFO "ddebug: no matches for query\n");
+}
+
+/*
+ * Wrapper around strsep() to collapse the multiple empty tokens
+ * that it returns when fed sequences of separator characters.
+ * Now, if we had strtok_r()...
+ */
+static inline char *nearly_strtok_r(char **p, const char *sep)
+{
+	char *r;
+
+	while ((r = strsep(p, sep)) != NULL && *r == '\0')
+		;
+	return r;
+}
+
+/*
+ * Split the buffer `buf' into space-separated words.
+ * Return the number of such words or <0 on error.
+ */
+static int ddebug_tokenize(char *buf, char *words[], int maxwords)
+{
+	int nwords = 0;
+
+	while (nwords < maxwords &&
+	       (words[nwords] = nearly_strtok_r(&buf, " \t\r\n")) != NULL)
+		nwords++;
+	if (buf)
+		return -EINVAL;	/* ran out of words[] before bytes */
+
+	if (verbose) {
+		int i;
+		printk(KERN_INFO "%s: split into words:", __func__);
+		for (i = 0 ; i < nwords ; i++)
+			printk(" \"%s\"", words[i]);
+		printk("\n");
+	}
+
+	return nwords;
+}
+
+/*
+ * Parse a single line number.  Note that the empty string ""
+ * is treated as a special case and converted to zero, which
+ * is later treated as a "don't care" value.
+ */
+static inline int parse_lineno(const char *str, unsigned int *val)
+{
+	char *end = NULL;
+	BUG_ON(str == NULL);
+	if (*str == '\0') {
+		*val = 0;
+		return 0;
+	}
+	*val = simple_strtoul(str, &end, 10);
+	return end == NULL || end == str || *end != '\0' ? -EINVAL : 0;
+}
+
+/*
+ * Undo octal escaping in a string, inplace.  This is useful to
+ * allow the user to express a query which matches a format
+ * containing embedded spaces.
+ */
+#define isodigit(c)		((c) >= '0' && (c) <= '7')
+static char *unescape(char *str)
+{
+	char *in = str;
+	char *out = str;
+
+	while (*in) {
+		if (*in == '\\') {
+			if (in[1] == '\\') {
+				*out++ = '\\';
+				in += 2;
+				continue;
+			} else if (in[1] == 't') {
+				*out++ = '\t';
+				in += 2;
+				continue;
+			} else if (in[1] == 'n') {
+				*out++ = '\n';
+				in += 2;
+				continue;
+			} else if (isodigit(in[1]) &&
+			         isodigit(in[2]) &&
+			         isodigit(in[3])) {
+				*out++ = ((in[1] - '0')<<6) |
+				          ((in[2] - '0')<<3) |
+				          (in[3] - '0');
+				in += 4;
+				continue;
+			}
+		}
+		*out++ = *in++;
+	}
+	*out = '\0';
+
+	return str;
+}
+
+/*
+ * Parse words[] as a ddebug query specification, which is a series
+ * of (keyword, value) pairs chosen from these possibilities:
+ *
+ * func <function-name>
+ * file <full-pathname>
+ * file <base-filename>
+ * module <module-name>
+ * format <escaped-string-to-find-in-format>
+ * line <lineno>
+ * line <first-lineno>-<last-lineno> // where either may be empty
+ */
+static int ddebug_parse_query(char *words[], int nwords,
+			       struct ddebug_query *query)
+{
+	unsigned int i;
+
+	/* check we have an even number of words */
+	if (nwords % 2 != 0)
+		return -EINVAL;
+	memset(query, 0, sizeof(*query));
+
+	for (i = 0 ; i < nwords ; i += 2) {
+		if (!strcmp(words[i], "func"))
+			query->function = words[i+1];
+		else if (!strcmp(words[i], "file"))
+			query->filename = words[i+1];
+		else if (!strcmp(words[i], "module"))
+			query->module = words[i+1];
+		else if (!strcmp(words[i], "format"))
+			query->format = unescape(words[i+1]);
+		else if (!strcmp(words[i], "line")) {
+			char *first = words[i+1];
+			char *last = strchr(first, '-');
+			if (last)
+				*last++ = '\0';
+			if (parse_lineno(first, &query->first_lineno) < 0)
+				return -EINVAL;
+			if (last != NULL) {
+				/* range <first>-<last> */
+				if (parse_lineno(last, &query->last_lineno) < 0)
+					return -EINVAL;
+			} else {
+				query->last_lineno = query->first_lineno;
+			}
+		} else {
+			if (verbose)
+				printk(KERN_ERR "%s: unknown keyword \"%s\"\n",
+					__func__, words[i]);
+			return -EINVAL;
+		}
+	}
+
+	if (verbose)
+		printk(KERN_INFO "%s: q->function=\"%s\" q->filename=\"%s\" "
+		       "q->module=\"%s\" q->format=\"%s\" q->lineno=%u-%u\n",
+			__func__, query->function, query->filename,
+			query->module, query->format, query->first_lineno,
+			query->last_lineno);
+
+	return 0;
+}
+
+/*
+ * Parse `str' as a flags specification, format [-+=][p]+.
+ * Sets up *maskp and *flagsp to be used when changing the
+ * flags fields of matched _ddebug's.  Returns 0 on success
+ * or <0 on error.
+ */
+static int ddebug_parse_flags(const char *str, unsigned int *flagsp,
+			       unsigned int *maskp)
+{
+	unsigned flags = 0;
+	int op = '=';
+
+	switch (*str) {
+	case '+':
+	case '-':
+	case '=':
+		op = *str++;
+		break;
+	default:
+		return -EINVAL;
+	}
+	if (verbose)
+		printk(KERN_INFO "%s: op='%c'\n", __func__, op);
+
+	for ( ; *str ; ++str) {
+		switch (*str) {
+		case 'p':
+			flags |= _DPRINTK_FLAGS_PRINT;
+			break;
+		default:
+			return -EINVAL;
+		}
+	}
+	if (flags == 0)
+		return -EINVAL;
+	if (verbose)
+		printk(KERN_INFO "%s: flags=0x%x\n", __func__, flags);
+
+	/* calculate final *flagsp, *maskp according to mask and op */
+	switch (op) {
+	case '=':
+		*maskp = 0;
+		*flagsp = flags;
+		break;
+	case '+':
+		*maskp = ~0U;
+		*flagsp = flags;
+		break;
+	case '-':
+		*maskp = ~flags;
+		*flagsp = 0;
+		break;
+	}
+	if (verbose)
+		printk(KERN_INFO "%s: *flagsp=0x%x *maskp=0x%x\n",
+			__func__, *flagsp, *maskp);
+	return 0;
+}
+
+/*
+ * File_ops->write method for <debugfs>/dynamic_debug/conrol.  Gathers the
+ * command text from userspace, parses and executes it.
+ */
+static ssize_t ddebug_proc_write(struct file *file, const char __user *ubuf,
+				  size_t len, loff_t *offp)
+{
+	unsigned int flags = 0, mask = 0;
+	struct ddebug_query query;
+#define MAXWORDS 9
+	int nwords;
+	char *words[MAXWORDS];
+	char tmpbuf[256];
+
+	if (len == 0)
+		return 0;
+	/* we don't check *offp -- multiple writes() are allowed */
+	if (len > sizeof(tmpbuf)-1)
+		return -E2BIG;
+	if (copy_from_user(tmpbuf, ubuf, len))
+		return -EFAULT;
+	tmpbuf[len] = '\0';
+	if (verbose)
+		printk(KERN_INFO "%s: read %d bytes from userspace\n",
+			__func__, (int)len);
+
+	nwords = ddebug_tokenize(tmpbuf, words, MAXWORDS);
+	if (nwords < 0)
+		return -EINVAL;
+	if (ddebug_parse_query(words, nwords-1, &query))
+		return -EINVAL;
+	if (ddebug_parse_flags(words[nwords-1], &flags, &mask))
+		return -EINVAL;
+
+	/* actually go and implement the change */
+	ddebug_change(&query, flags, mask);
+
+	*offp += len;
+	return len;
+}
+
+/*
+ * Set the iterator to point to the first _ddebug object
+ * and return a pointer to that first object.  Returns
+ * NULL if there are no _ddebugs at all.
+ */
+static struct _ddebug *ddebug_iter_first(struct ddebug_iter *iter)
+{
+	if (list_empty(&ddebug_tables)) {
+		iter->table = NULL;
+		iter->idx = 0;
+		return NULL;
+	}
+	iter->table = list_entry(ddebug_tables.next,
+				 struct ddebug_table, link);
+	iter->idx = 0;
+	return &iter->table->ddebugs[iter->idx];
+}
+
+/*
+ * Advance the iterator to point to the next _ddebug
+ * object from the one the iterator currently points at,
+ * and returns a pointer to the new _ddebug.  Returns
+ * NULL if the iterator has seen all the _ddebugs.
+ */
+static struct _ddebug *ddebug_iter_next(struct ddebug_iter *iter)
+{
+	if (iter->table == NULL)
+		return NULL;
+	if (++iter->idx == iter->table->num_ddebugs) {
+		/* iterate to next table */
+		iter->idx = 0;
+		if (list_is_last(&iter->table->link, &ddebug_tables)) {
+			iter->table = NULL;
+			return NULL;
+		}
+		iter->table = list_entry(iter->table->link.next,
+					 struct ddebug_table, link);
+	}
+	return &iter->table->ddebugs[iter->idx];
+}
+
+/*
+ * Seq_ops start method.  Called at the start of every
+ * read() call from userspace.  Takes the ddebug_lock and
+ * seeks the seq_file's iterator to the given position.
+ */
+static void *ddebug_proc_start(struct seq_file *m, loff_t *pos)
+{
+	struct ddebug_iter *iter = m->private;
+	struct _ddebug *dp;
+	int n = *pos;
+
+	if (verbose)
+		printk(KERN_INFO "%s: called m=%p *pos=%lld\n",
+			__func__, m, (unsigned long long)*pos);
+
+	mutex_lock(&ddebug_lock);
+
+	if (!n)
+		return SEQ_START_TOKEN;
+	if (n < 0)
+		return NULL;
+	dp = ddebug_iter_first(iter);
+	while (dp != NULL && --n > 0)
+		dp = ddebug_iter_next(iter);
+	return dp;
+}
+
+/*
+ * Seq_ops next method.  Called several times within a read()
+ * call from userspace, with ddebug_lock held.  Walks to the
+ * next _ddebug object with a special case for the header line.
+ */
+static void *ddebug_proc_next(struct seq_file *m, void *p, loff_t *pos)
+{
+	struct ddebug_iter *iter = m->private;
+	struct _ddebug *dp;
+
+	if (verbose)
+		printk(KERN_INFO "%s: called m=%p p=%p *pos=%lld\n",
+			__func__, m, p, (unsigned long long)*pos);
+
+	if (p == SEQ_START_TOKEN)
+		dp = ddebug_iter_first(iter);
+	else
+		dp = ddebug_iter_next(iter);
+	++*pos;
+	return dp;
+}
+
+/*
+ * Seq_ops show method.  Called several times within a read()
+ * call from userspace, with ddebug_lock held.  Formats the
+ * current _ddebug as a single human-readable line, with a
+ * special case for the header line.
+ */
+static int ddebug_proc_show(struct seq_file *m, void *p)
+{
+	struct ddebug_iter *iter = m->private;
+	struct _ddebug *dp = p;
+	char flagsbuf[8];
+
+	if (verbose)
+		printk(KERN_INFO "%s: called m=%p p=%p\n",
+			__func__, m, p);
+
+	if (p == SEQ_START_TOKEN) {
+		seq_puts(m,
+			"# filename:lineno [module]function flags format\n");
+		return 0;
+	}
+
+	seq_printf(m, "%s:%u [%s]%s %s \"",
+		   dp->filename, dp->lineno,
+		   iter->table->mod_name, dp->function,
+		   ddebug_describe_flags(dp, flagsbuf, sizeof(flagsbuf)));
+	seq_escape(m, dp->format, "\t\r\n\"");
+	seq_puts(m, "\"\n");
+
+	return 0;
+}
+
+/*
+ * Seq_ops stop method.  Called at the end of each read()
+ * call from userspace.  Drops ddebug_lock.
+ */
+static void ddebug_proc_stop(struct seq_file *m, void *p)
+{
+	if (verbose)
+		printk(KERN_INFO "%s: called m=%p p=%p\n",
+			__func__, m, p);
+	mutex_unlock(&ddebug_lock);
+}
+
+static const struct seq_operations ddebug_proc_seqops = {
+	.start = ddebug_proc_start,
+	.next = ddebug_proc_next,
+	.show = ddebug_proc_show,
+	.stop = ddebug_proc_stop
+};
+
+/*
+ * File_ops->open method for <debugfs>/dynamic_debug/control.  Does the seq_file
+ * setup dance, and also creates an iterator to walk the _ddebugs.
+ * Note that we create a seq_file always, even for O_WRONLY files
+ * where it's not needed, as doing so simplifies the ->release method.
+ */
+static int ddebug_proc_open(struct inode *inode, struct file *file)
+{
+	struct ddebug_iter *iter;
+	int err;
+
+	if (verbose)
+		printk(KERN_INFO "%s: called\n", __func__);
+
+	iter = kzalloc(sizeof(*iter), GFP_KERNEL);
+	if (iter == NULL)
+		return -ENOMEM;
+
+	err = seq_open(file, &ddebug_proc_seqops);
+	if (err) {
+		kfree(iter);
+		return err;
+	}
+	((struct seq_file *) file->private_data)->private = iter;
+	return 0;
+}
+
+static const struct file_operations ddebug_proc_fops = {
+	.owner = THIS_MODULE,
+	.open = ddebug_proc_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = seq_release_private,
+	.write = ddebug_proc_write
+};
+
+/*
+ * Allocate a new ddebug_table for the given module
+ * and add it to the global list.
+ */
+int ddebug_add_module(struct _ddebug *tab, unsigned int n,
+			     const char *name)
+{
+	struct ddebug_table *dt;
+	char *new_name;
+
+	dt = kzalloc(sizeof(*dt), GFP_KERNEL);
+	if (dt == NULL)
+		return -ENOMEM;
+	new_name = kstrdup(name, GFP_KERNEL);
+	if (new_name == NULL) {
+		kfree(dt);
+		return -ENOMEM;
+	}
+	dt->mod_name = new_name;
+	dt->num_ddebugs = n;
+	dt->num_enabled = 0;
+	dt->ddebugs = tab;
+
+	mutex_lock(&ddebug_lock);
+	list_add_tail(&dt->link, &ddebug_tables);
+	mutex_unlock(&ddebug_lock);
+
+	if (verbose)
+		printk(KERN_INFO "%u debug prints in module %s\n",
+				 n, dt->mod_name);
+	return 0;
+}
+EXPORT_SYMBOL_GPL(ddebug_add_module);
+
+static void ddebug_table_free(struct ddebug_table *dt)
+{
+	list_del_init(&dt->link);
+	kfree(dt->mod_name);
+	kfree(dt);
+}
+
+/*
+ * Called in response to a module being unloaded.  Removes
+ * any ddebug_table's which point at the module.
+ */
+int ddebug_remove_module(char *mod_name)
+{
+	struct ddebug_table *dt, *nextdt;
+	int ret = -ENOENT;
+
+	if (verbose)
+		printk(KERN_INFO "%s: removing module \"%s\"\n",
+				__func__, mod_name);
+
+	mutex_lock(&ddebug_lock);
+	list_for_each_entry_safe(dt, nextdt, &ddebug_tables, link) {
+		if (!strcmp(dt->mod_name, mod_name)) {
+			ddebug_table_free(dt);
+			ret = 0;
+		}
+	}
+	mutex_unlock(&ddebug_lock);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(ddebug_remove_module);
+
+static void ddebug_remove_all_tables(void)
+{
+	mutex_lock(&ddebug_lock);
+	while (!list_empty(&ddebug_tables)) {
+		struct ddebug_table *dt = list_entry(ddebug_tables.next,
+						      struct ddebug_table,
+						      link);
+		ddebug_table_free(dt);
+	}
+	mutex_unlock(&ddebug_lock);
+}
+
+static int __init dynamic_debug_init(void)
+{
+	struct dentry *dir, *file;
+	struct _ddebug *iter, *iter_start;
+	const char *modname = NULL;
+	int ret = 0;
+	int n = 0;
+
+	dir = debugfs_create_dir("dynamic_debug", NULL);
+	if (!dir)
+		return -ENOMEM;
+	file = debugfs_create_file("control", 0644, dir, NULL,
+					&ddebug_proc_fops);
+	if (!file) {
+		debugfs_remove(dir);
+		return -ENOMEM;
+	}
+	if (__start___verbose != __stop___verbose) {
+		iter = __start___verbose;
+		modname = iter->modname;
+		iter_start = iter;
+		for (; iter < __stop___verbose; iter++) {
+			if (strcmp(modname, iter->modname)) {
+				ret = ddebug_add_module(iter_start, n, modname);
+				if (ret)
+					goto out_free;
+				n = 0;
+				modname = iter->modname;
+				iter_start = iter;
+			}
+			n++;
+		}
+		ret = ddebug_add_module(iter_start, n, modname);
+	}
+out_free:
+	if (ret) {
+		ddebug_remove_all_tables();
+		debugfs_remove(dir);
+		debugfs_remove(file);
+	}
+	return 0;
+}
+module_init(dynamic_debug_init);
diff --git a/lib/dynamic_printk.c b/lib/dynamic_printk.c
deleted file mode 100644
index 165a19763dc9..000000000000
--- a/lib/dynamic_printk.c
+++ /dev/null
@@ -1,414 +0,0 @@
-/*
- * lib/dynamic_printk.c
- *
- * make pr_debug()/dev_dbg() calls runtime configurable based upon their
- * their source module.
- *
- * Copyright (C) 2008 Red Hat, Inc., Jason Baron <jbaron@redhat.com>
- */
-
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/uaccess.h>
-#include <linux/seq_file.h>
-#include <linux/debugfs.h>
-#include <linux/fs.h>
-
-extern struct mod_debug __start___verbose[];
-extern struct mod_debug __stop___verbose[];
-
-struct debug_name {
-	struct hlist_node hlist;
-	struct hlist_node hlist2;
-	int hash1;
-	int hash2;
-	char *name;
-	int enable;
-	int type;
-};
-
-static int nr_entries;
-static int num_enabled;
-int dynamic_enabled = DYNAMIC_ENABLED_NONE;
-static struct hlist_head module_table[DEBUG_HASH_TABLE_SIZE] =
-	{ [0 ... DEBUG_HASH_TABLE_SIZE-1] = HLIST_HEAD_INIT };
-static struct hlist_head module_table2[DEBUG_HASH_TABLE_SIZE] =
-	{ [0 ... DEBUG_HASH_TABLE_SIZE-1] = HLIST_HEAD_INIT };
-static DECLARE_MUTEX(debug_list_mutex);
-
-/* dynamic_printk_enabled, and dynamic_printk_enabled2 are bitmasks in which
- * bit n is set to 1 if any modname hashes into the bucket n, 0 otherwise. They
- * use independent hash functions, to reduce the chance of false positives.
- */
-long long dynamic_printk_enabled;
-EXPORT_SYMBOL_GPL(dynamic_printk_enabled);
-long long dynamic_printk_enabled2;
-EXPORT_SYMBOL_GPL(dynamic_printk_enabled2);
-
-/* returns the debug module pointer. */
-static struct debug_name *find_debug_module(char *module_name)
-{
-	int i;
-	struct hlist_head *head;
-	struct hlist_node *node;
-	struct debug_name *element;
-
-	element = NULL;
-	for (i = 0; i < DEBUG_HASH_TABLE_SIZE; i++) {
-		head = &module_table[i];
-		hlist_for_each_entry_rcu(element, node, head, hlist)
-			if (!strcmp(element->name, module_name))
-				return element;
-	}
-	return NULL;
-}
-
-/* returns the debug module pointer. */
-static struct debug_name *find_debug_module_hash(char *module_name, int hash)
-{
-	struct hlist_head *head;
-	struct hlist_node *node;
-	struct debug_name *element;
-
-	element = NULL;
-	head = &module_table[hash];
-	hlist_for_each_entry_rcu(element, node, head, hlist)
-		if (!strcmp(element->name, module_name))
-			return element;
-	return NULL;
-}
-
-/* caller must hold mutex*/
-static int __add_debug_module(char *mod_name, int hash, int hash2)
-{
-	struct debug_name *new;
-	char *module_name;
-	int ret = 0;
-
-	if (find_debug_module(mod_name)) {
-		ret = -EINVAL;
-		goto out;
-	}
-	module_name = kmalloc(strlen(mod_name) + 1, GFP_KERNEL);
-	if (!module_name) {
-		ret = -ENOMEM;
-		goto out;
-	}
-	module_name = strcpy(module_name, mod_name);
-	module_name[strlen(mod_name)] = '\0';
-	new = kzalloc(sizeof(struct debug_name), GFP_KERNEL);
-	if (!new) {
-		kfree(module_name);
-		ret = -ENOMEM;
-		goto out;
-	}
-	INIT_HLIST_NODE(&new->hlist);
-	INIT_HLIST_NODE(&new->hlist2);
-	new->name = module_name;
-	new->hash1 = hash;
-	new->hash2 = hash2;
-	hlist_add_head_rcu(&new->hlist, &module_table[hash]);
-	hlist_add_head_rcu(&new->hlist2, &module_table2[hash2]);
-	nr_entries++;
-out:
-	return ret;
-}
-
-int unregister_dynamic_debug_module(char *mod_name)
-{
-	struct debug_name *element;
-	int ret = 0;
-
-	down(&debug_list_mutex);
-	element = find_debug_module(mod_name);
-	if (!element) {
-		ret = -EINVAL;
-		goto out;
-	}
-	hlist_del_rcu(&element->hlist);
-	hlist_del_rcu(&element->hlist2);
-	synchronize_rcu();
-	kfree(element->name);
-	if (element->enable)
-		num_enabled--;
-	kfree(element);
-	nr_entries--;
-out:
-	up(&debug_list_mutex);
-	return ret;
-}
-EXPORT_SYMBOL_GPL(unregister_dynamic_debug_module);
-
-int register_dynamic_debug_module(char *mod_name, int type, char *share_name,
-					char *flags, int hash, int hash2)
-{
-	struct debug_name *elem;
-	int ret = 0;
-
-	down(&debug_list_mutex);
-	elem = find_debug_module(mod_name);
-	if (!elem) {
-		if (__add_debug_module(mod_name, hash, hash2))
-			goto out;
-		elem = find_debug_module(mod_name);
-		if (dynamic_enabled == DYNAMIC_ENABLED_ALL &&
-				!strcmp(mod_name, share_name)) {
-			elem->enable = true;
-			num_enabled++;
-		}
-	}
-	elem->type |= type;
-out:
-	up(&debug_list_mutex);
-	return ret;
-}
-EXPORT_SYMBOL_GPL(register_dynamic_debug_module);
-
-int __dynamic_dbg_enabled_helper(char *mod_name, int type, int value, int hash)
-{
-	struct debug_name *elem;
-	int ret = 0;
-
-	if (dynamic_enabled == DYNAMIC_ENABLED_ALL)
-		return 1;
-	rcu_read_lock();
-	elem = find_debug_module_hash(mod_name, hash);
-	if (elem && elem->enable)
-		ret = 1;
-	rcu_read_unlock();
-	return ret;
-}
-EXPORT_SYMBOL_GPL(__dynamic_dbg_enabled_helper);
-
-static void set_all(bool enable)
-{
-	struct debug_name *e;
-	struct hlist_node *node;
-	int i;
-	long long enable_mask;
-
-	for (i = 0; i < DEBUG_HASH_TABLE_SIZE; i++) {
-		if (module_table[i].first != NULL) {
-			hlist_for_each_entry(e, node, &module_table[i], hlist) {
-				e->enable = enable;
-			}
-		}
-	}
-	if (enable)
-		enable_mask = ULLONG_MAX;
-	else
-		enable_mask = 0;
-	dynamic_printk_enabled = enable_mask;
-	dynamic_printk_enabled2 = enable_mask;
-}
-
-static int disabled_hash(int i, bool first_table)
-{
-	struct debug_name *e;
-	struct hlist_node *node;
-
-	if (first_table) {
-		hlist_for_each_entry(e, node, &module_table[i], hlist) {
-			if (e->enable)
-				return 0;
-		}
-	} else {
-		hlist_for_each_entry(e, node, &module_table2[i], hlist2) {
-			if (e->enable)
-				return 0;
-		}
-	}
-	return 1;
-}
-
-static ssize_t pr_debug_write(struct file *file, const char __user *buf,
-				size_t length, loff_t *ppos)
-{
-	char *buffer, *s, *value_str, *setting_str;
-	int err, value;
-	struct debug_name *elem = NULL;
-	int all = 0;
-
-	if (length > PAGE_SIZE || length < 0)
-		return -EINVAL;
-
-	buffer = (char *)__get_free_page(GFP_KERNEL);
-	if (!buffer)
-		return -ENOMEM;
-
-	err = -EFAULT;
-	if (copy_from_user(buffer, buf, length))
-		goto out;
-
-	err = -EINVAL;
-	if (length < PAGE_SIZE)
-		buffer[length] = '\0';
-	else if (buffer[PAGE_SIZE-1])
-		goto out;
-
-	err = -EINVAL;
-	down(&debug_list_mutex);
-
-	if (strncmp("set", buffer, 3))
-		goto out_up;
-	s = buffer + 3;
-	setting_str = strsep(&s, "=");
-	if (s == NULL)
-		goto out_up;
-	setting_str = strstrip(setting_str);
-	value_str = strsep(&s, " ");
-	if (s == NULL)
-		goto out_up;
-	s = strstrip(s);
-	if (!strncmp(s, "all", 3))
-		all = 1;
-	else
-		elem = find_debug_module(s);
-	if (!strncmp(setting_str, "enable", 6)) {
-		value = !!simple_strtol(value_str, NULL, 10);
-		if (all) {
-			if (value) {
-				set_all(true);
-				num_enabled = nr_entries;
-				dynamic_enabled = DYNAMIC_ENABLED_ALL;
-			} else {
-				set_all(false);
-				num_enabled = 0;
-				dynamic_enabled = DYNAMIC_ENABLED_NONE;
-			}
-			err = 0;
-		} else if (elem) {
-			if (value && (elem->enable == 0)) {
-				dynamic_printk_enabled |= (1LL << elem->hash1);
-				dynamic_printk_enabled2 |= (1LL << elem->hash2);
-				elem->enable = 1;
-				num_enabled++;
-				dynamic_enabled = DYNAMIC_ENABLED_SOME;
-				err = 0;
-				printk(KERN_DEBUG
-					"debugging enabled for module %s\n",
-					elem->name);
-			} else if (!value && (elem->enable == 1)) {
-				elem->enable = 0;
-				num_enabled--;
-				if (disabled_hash(elem->hash1, true))
-					dynamic_printk_enabled &=
-							~(1LL << elem->hash1);
-				if (disabled_hash(elem->hash2, false))
-					dynamic_printk_enabled2 &=
-							~(1LL << elem->hash2);
-				if (num_enabled)
-					dynamic_enabled = DYNAMIC_ENABLED_SOME;
-				else
-					dynamic_enabled = DYNAMIC_ENABLED_NONE;
-				err = 0;
-				printk(KERN_DEBUG
-					"debugging disabled for module %s\n",
-					elem->name);
-			}
-		}
-	}
-	if (!err)
-		err = length;
-out_up:
-	up(&debug_list_mutex);
-out:
-	free_page((unsigned long)buffer);
-	return err;
-}
-
-static void *pr_debug_seq_start(struct seq_file *f, loff_t *pos)
-{
-	return (*pos < DEBUG_HASH_TABLE_SIZE) ? pos : NULL;
-}
-
-static void *pr_debug_seq_next(struct seq_file *s, void *v, loff_t *pos)
-{
-	(*pos)++;
-	if (*pos >= DEBUG_HASH_TABLE_SIZE)
-		return NULL;
-	return pos;
-}
-
-static void pr_debug_seq_stop(struct seq_file *s, void *v)
-{
-	/* Nothing to do */
-}
-
-static int pr_debug_seq_show(struct seq_file *s, void *v)
-{
-	struct hlist_head *head;
-	struct hlist_node *node;
-	struct debug_name *elem;
-	unsigned int i = *(loff_t *) v;
-
-	rcu_read_lock();
-	head = &module_table[i];
-	hlist_for_each_entry_rcu(elem, node, head, hlist) {
-		seq_printf(s, "%s enabled=%d", elem->name, elem->enable);
-		seq_printf(s, "\n");
-	}
-	rcu_read_unlock();
-	return 0;
-}
-
-static struct seq_operations pr_debug_seq_ops = {
-	.start = pr_debug_seq_start,
-	.next  = pr_debug_seq_next,
-	.stop  = pr_debug_seq_stop,
-	.show  = pr_debug_seq_show
-};
-
-static int pr_debug_open(struct inode *inode, struct file *filp)
-{
-	return seq_open(filp, &pr_debug_seq_ops);
-}
-
-static const struct file_operations pr_debug_operations = {
-	.open		= pr_debug_open,
-	.read		= seq_read,
-	.write		= pr_debug_write,
-	.llseek		= seq_lseek,
-	.release	= seq_release,
-};
-
-static int __init dynamic_printk_init(void)
-{
-	struct dentry *dir, *file;
-	struct mod_debug *iter;
-	unsigned long value;
-
-	dir = debugfs_create_dir("dynamic_printk", NULL);
-	if (!dir)
-		return -ENOMEM;
-	file = debugfs_create_file("modules", 0644, dir, NULL,
-					&pr_debug_operations);
-	if (!file) {
-		debugfs_remove(dir);
-		return -ENOMEM;
-	}
-	for (value = (unsigned long)__start___verbose;
-		value < (unsigned long)__stop___verbose;
-		value += sizeof(struct mod_debug)) {
-			iter = (struct mod_debug *)value;
-			register_dynamic_debug_module(iter->modname,
-				iter->type,
-				iter->logical_modname,
-				iter->flag_names, iter->hash, iter->hash2);
-	}
-	if (dynamic_enabled == DYNAMIC_ENABLED_ALL)
-		set_all(true);
-	return 0;
-}
-module_init(dynamic_printk_init);
-/* may want to move this earlier so we can get traces as early as possible */
-
-static int __init dynamic_printk_setup(char *str)
-{
-	if (str)
-		return -ENOENT;
-	dynamic_enabled = DYNAMIC_ENABLED_ALL;
-	return 0;
-}
-/* Use early_param(), so we can get debug output as early as possible */
-early_param("dynamic_printk", dynamic_printk_setup);
diff --git a/net/netfilter/nf_conntrack_pptp.c b/net/netfilter/nf_conntrack_pptp.c
index 9e169ef2e854..12bd09dbd36c 100644
--- a/net/netfilter/nf_conntrack_pptp.c
+++ b/net/netfilter/nf_conntrack_pptp.c
@@ -66,7 +66,7 @@ void
 			     struct nf_conntrack_expect *exp) __read_mostly;
 EXPORT_SYMBOL_GPL(nf_nat_pptp_hook_expectfn);
 
-#if defined(DEBUG) || defined(CONFIG_DYNAMIC_PRINTK_DEBUG)
+#if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG)
 /* PptpControlMessageType names */
 const char *const pptp_msg_name[] = {
 	"UNKNOWN_MESSAGE",
diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib
index e06365775bdf..c18fa150b6fe 100644
--- a/scripts/Makefile.lib
+++ b/scripts/Makefile.lib
@@ -97,7 +97,7 @@ modname_flags  = $(if $(filter 1,$(words $(modname))),\
                  -D"KBUILD_MODNAME=KBUILD_STR($(call name-fix,$(modname)))")
 
 #hash values
-ifdef CONFIG_DYNAMIC_PRINTK_DEBUG
+ifdef CONFIG_DYNAMIC_DEBUG
 debug_flags = -D"DEBUG_HASH=$(shell ./scripts/basic/hash djb2 $(@D)$(modname))"\
               -D"DEBUG_HASH2=$(shell ./scripts/basic/hash r5 $(@D)$(modname))"
 else
-- 
cgit v1.2.3


From 67bb6c036d1fc3d332c8527a36a546e3e72e822c Mon Sep 17 00:00:00 2001
From: Gautham R Shenoy <ego@in.ibm.com>
Date: Wed, 25 Mar 2009 14:43:35 +0530
Subject: sched: Simple helper functions for find_busiest_group()

Impact: cleanup

Currently the load idx calculation code is in find_busiest_group().
Move that to a static inline helper function.

Similary, to find the first cpu of a sched_group we use
cpumask_first(sched_group_cpus(group))

Use a helper to that. It improves readability in some cases.

Signed-off-by: Gautham R Shenoy <ego@in.ibm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: "Balbir Singh" <balbir@in.ibm.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: "Dhaval Giani" <dhaval@linux.vnet.ibm.com>
Cc: Bharata B Rao <bharata@linux.vnet.ibm.com>
Cc: "Vaidyanathan Srinivasan" <svaidy@linux.vnet.ibm.com>
LKML-Reference: <20090325091335.13992.55424.stgit@sofia.in.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 55 +++++++++++++++++++++++++++++++++++++++++++------------
 1 file changed, 43 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 7b389c74f8ff..6aec1e7a72a3 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3189,6 +3189,43 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
 
 	return 0;
 }
+/********** Helpers for find_busiest_group ************************/
+
+/**
+ * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
+ * @group: The group whose first cpu is to be returned.
+ */
+static inline unsigned int group_first_cpu(struct sched_group *group)
+{
+	return cpumask_first(sched_group_cpus(group));
+}
+
+/**
+ * get_sd_load_idx - Obtain the load index for a given sched domain.
+ * @sd: The sched_domain whose load_idx is to be obtained.
+ * @idle: The Idle status of the CPU for whose sd load_icx is obtained.
+ */
+static inline int get_sd_load_idx(struct sched_domain *sd,
+					enum cpu_idle_type idle)
+{
+	int load_idx;
+
+	switch (idle) {
+	case CPU_NOT_IDLE:
+		load_idx = sd->busy_idx;
+		break;
+
+	case CPU_NEWLY_IDLE:
+		load_idx = sd->newidle_idx;
+		break;
+	default:
+		load_idx = sd->idle_idx;
+		break;
+	}
+
+	return load_idx;
+}
+/******* find_busiest_group() helpers end here *********************/
 
 /*
  * find_busiest_group finds and returns the busiest CPU group within the
@@ -3217,12 +3254,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	busiest_load_per_task = busiest_nr_running = 0;
 	this_load_per_task = this_nr_running = 0;
 
-	if (idle == CPU_NOT_IDLE)
-		load_idx = sd->busy_idx;
-	else if (idle == CPU_NEWLY_IDLE)
-		load_idx = sd->newidle_idx;
-	else
-		load_idx = sd->idle_idx;
+	load_idx = get_sd_load_idx(sd, idle);
 
 	do {
 		unsigned long load, group_capacity, max_cpu_load, min_cpu_load;
@@ -3238,7 +3270,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 					       sched_group_cpus(group));
 
 		if (local_group)
-			balance_cpu = cpumask_first(sched_group_cpus(group));
+			balance_cpu = group_first_cpu(group);
 
 		/* Tally up the load of all CPUs in the group */
 		sum_weighted_load = sum_nr_running = avg_load = 0;
@@ -3359,8 +3391,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		 */
 		if ((sum_nr_running < min_nr_running) ||
 		    (sum_nr_running == min_nr_running &&
-		     cpumask_first(sched_group_cpus(group)) >
-		     cpumask_first(sched_group_cpus(group_min)))) {
+		     group_first_cpu(group) > group_first_cpu(group_min))) {
 			group_min = group;
 			min_nr_running = sum_nr_running;
 			min_load_per_task = sum_weighted_load /
@@ -3375,8 +3406,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		if (sum_nr_running <= group_capacity - 1) {
 			if (sum_nr_running > leader_nr_running ||
 			    (sum_nr_running == leader_nr_running &&
-			     cpumask_first(sched_group_cpus(group)) <
-			     cpumask_first(sched_group_cpus(group_leader)))) {
+			     group_first_cpu(group) <
+			     group_first_cpu(group_leader))) {
 				group_leader = group;
 				leader_nr_running = sum_nr_running;
 			}
@@ -3504,7 +3535,7 @@ out_balanced:
 		*imbalance = min_load_per_task;
 		if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
 			cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
-				cpumask_first(sched_group_cpus(group_leader));
+				group_first_cpu(group_leader);
 		}
 		return group_min;
 	}
-- 
cgit v1.2.3


From 6dfdb0629019f307ab18864b1fd3e5dbb02f383c Mon Sep 17 00:00:00 2001
From: Gautham R Shenoy <ego@in.ibm.com>
Date: Wed, 25 Mar 2009 14:43:40 +0530
Subject: sched: Fix indentations in find_busiest_group() using gotos

Impact: cleanup

Some indentations in find_busiest_group() can minimized by using
early exits with the help of gotos. This improves readability in
a couple of cases.

Signed-off-by: Gautham R Shenoy <ego@in.ibm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: "Balbir Singh" <balbir@in.ibm.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: "Dhaval Giani" <dhaval@linux.vnet.ibm.com>
Cc: Bharata B Rao <bharata@linux.vnet.ibm.com>
Cc: "Vaidyanathan Srinivasan" <svaidy@linux.vnet.ibm.com>
LKML-Reference: <20090325091340.13992.45062.stgit@sofia.in.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 32 +++++++++++++++++---------------
 1 file changed, 17 insertions(+), 15 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 6aec1e7a72a3..f87adbe999e0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3403,14 +3403,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		 * capacity but still has some space to pick up some load
 		 * from other group and save more power
 		 */
-		if (sum_nr_running <= group_capacity - 1) {
-			if (sum_nr_running > leader_nr_running ||
-			    (sum_nr_running == leader_nr_running &&
-			     group_first_cpu(group) <
-			     group_first_cpu(group_leader))) {
-				group_leader = group;
-				leader_nr_running = sum_nr_running;
-			}
+		if (sum_nr_running > group_capacity - 1)
+			goto group_next;
+
+		if (sum_nr_running > leader_nr_running ||
+		    (sum_nr_running == leader_nr_running &&
+		     group_first_cpu(group) < group_first_cpu(group_leader))) {
+			group_leader = group;
+			leader_nr_running = sum_nr_running;
 		}
 group_next:
 #endif
@@ -3531,14 +3531,16 @@ out_balanced:
 	if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
 		goto ret;
 
-	if (this == group_leader && group_leader != group_min) {
-		*imbalance = min_load_per_task;
-		if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
-			cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
-				group_first_cpu(group_leader);
-		}
-		return group_min;
+	if (this != group_leader || group_leader == group_min)
+		goto ret;
+
+	*imbalance = min_load_per_task;
+	if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
+		cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
+			group_first_cpu(group_leader);
 	}
+	return group_min;
+
 #endif
 ret:
 	*imbalance = 0;
-- 
cgit v1.2.3


From 381be78fdc829a22f6327a0ed09f54b6270a976d Mon Sep 17 00:00:00 2001
From: Gautham R Shenoy <ego@in.ibm.com>
Date: Wed, 25 Mar 2009 14:43:46 +0530
Subject: sched: Define structure to store the sched_group statistics for fbg()

Impact: cleanup

Currently a whole bunch of variables are used to store the
various statistics pertaining to the groups we iterate over
in find_busiest_group().

Group them together in a single data structure and add
appropriate comments.

This will be useful later on when we create helper functions
to calculate the sched_group statistics.

Credit: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Signed-off-by: Gautham R Shenoy <ego@in.ibm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: "Balbir Singh" <balbir@in.ibm.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: "Dhaval Giani" <dhaval@linux.vnet.ibm.com>
Cc: Bharata B Rao <bharata@linux.vnet.ibm.com>
LKML-Reference: <20090325091345.13992.20099.stgit@sofia.in.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 79 ++++++++++++++++++++++++++++++++++------------------------
 1 file changed, 46 insertions(+), 33 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index f87adbe999e0..109db122de50 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3191,6 +3191,18 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
 }
 /********** Helpers for find_busiest_group ************************/
 
+/**
+ * sg_lb_stats - stats of a sched_group required for load_balancing
+ */
+struct sg_lb_stats {
+	unsigned long avg_load; /*Avg load across the CPUs of the group */
+	unsigned long group_load; /* Total load over the CPUs of the group */
+	unsigned long sum_nr_running; /* Nr tasks running in the group */
+	unsigned long sum_weighted_load; /* Weighted load of group's tasks */
+	unsigned long group_capacity;
+	int group_imb; /* Is there an imbalance in the group ? */
+};
+
 /**
  * group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
  * @group: The group whose first cpu is to be returned.
@@ -3257,23 +3269,22 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	load_idx = get_sd_load_idx(sd, idle);
 
 	do {
-		unsigned long load, group_capacity, max_cpu_load, min_cpu_load;
+		struct sg_lb_stats sgs;
+		unsigned long load, max_cpu_load, min_cpu_load;
 		int local_group;
 		int i;
-		int __group_imb = 0;
 		unsigned int balance_cpu = -1, first_idle_cpu = 0;
-		unsigned long sum_nr_running, sum_weighted_load;
 		unsigned long sum_avg_load_per_task;
 		unsigned long avg_load_per_task;
 
 		local_group = cpumask_test_cpu(this_cpu,
 					       sched_group_cpus(group));
+		memset(&sgs, 0, sizeof(sgs));
 
 		if (local_group)
 			balance_cpu = group_first_cpu(group);
 
 		/* Tally up the load of all CPUs in the group */
-		sum_weighted_load = sum_nr_running = avg_load = 0;
 		sum_avg_load_per_task = avg_load_per_task = 0;
 
 		max_cpu_load = 0;
@@ -3301,9 +3312,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 					min_cpu_load = load;
 			}
 
-			avg_load += load;
-			sum_nr_running += rq->nr_running;
-			sum_weighted_load += weighted_cpuload(i);
+			sgs.group_load += load;
+			sgs.sum_nr_running += rq->nr_running;
+			sgs.sum_weighted_load += weighted_cpuload(i);
 
 			sum_avg_load_per_task += cpu_avg_load_per_task(i);
 		}
@@ -3320,12 +3331,12 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 			goto ret;
 		}
 
-		total_load += avg_load;
+		total_load += sgs.group_load;
 		total_pwr += group->__cpu_power;
 
 		/* Adjust by relative CPU power of the group */
-		avg_load = sg_div_cpu_power(group,
-				avg_load * SCHED_LOAD_SCALE);
+		sgs.avg_load = sg_div_cpu_power(group,
+				sgs.group_load * SCHED_LOAD_SCALE);
 
 
 		/*
@@ -3341,22 +3352,23 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 				sum_avg_load_per_task * SCHED_LOAD_SCALE);
 
 		if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
-			__group_imb = 1;
+			sgs.group_imb = 1;
 
-		group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
+		sgs.group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
 
 		if (local_group) {
-			this_load = avg_load;
+			this_load = sgs.avg_load;
 			this = group;
-			this_nr_running = sum_nr_running;
-			this_load_per_task = sum_weighted_load;
-		} else if (avg_load > max_load &&
-			   (sum_nr_running > group_capacity || __group_imb)) {
-			max_load = avg_load;
+			this_nr_running = sgs.sum_nr_running;
+			this_load_per_task = sgs.sum_weighted_load;
+		} else if (sgs.avg_load > max_load &&
+			   (sgs.sum_nr_running > sgs.group_capacity ||
+				sgs.group_imb)) {
+			max_load = sgs.avg_load;
 			busiest = group;
-			busiest_nr_running = sum_nr_running;
-			busiest_load_per_task = sum_weighted_load;
-			group_imb = __group_imb;
+			busiest_nr_running = sgs.sum_nr_running;
+			busiest_load_per_task = sgs.sum_weighted_load;
+			group_imb = sgs.group_imb;
 		}
 
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -3372,7 +3384,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		 * If the local group is idle or completely loaded
 		 * no need to do power savings balance at this domain
 		 */
-		if (local_group && (this_nr_running >= group_capacity ||
+		if (local_group && (this_nr_running >= sgs.group_capacity ||
 				    !this_nr_running))
 			power_savings_balance = 0;
 
@@ -3380,8 +3392,9 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		 * If a group is already running at full capacity or idle,
 		 * don't include that group in power savings calculations
 		 */
-		if (!power_savings_balance || sum_nr_running >= group_capacity
-		    || !sum_nr_running)
+		if (!power_savings_balance ||
+			sgs.sum_nr_running >= sgs.group_capacity ||
+			!sgs.sum_nr_running)
 			goto group_next;
 
 		/*
@@ -3389,13 +3402,13 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		 * This is the group from where we need to pick up the load
 		 * for saving power
 		 */
-		if ((sum_nr_running < min_nr_running) ||
-		    (sum_nr_running == min_nr_running &&
+		if ((sgs.sum_nr_running < min_nr_running) ||
+		    (sgs.sum_nr_running == min_nr_running &&
 		     group_first_cpu(group) > group_first_cpu(group_min))) {
 			group_min = group;
-			min_nr_running = sum_nr_running;
-			min_load_per_task = sum_weighted_load /
-						sum_nr_running;
+			min_nr_running = sgs.sum_nr_running;
+			min_load_per_task = sgs.sum_weighted_load /
+						sgs.sum_nr_running;
 		}
 
 		/*
@@ -3403,14 +3416,14 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		 * capacity but still has some space to pick up some load
 		 * from other group and save more power
 		 */
-		if (sum_nr_running > group_capacity - 1)
+		if (sgs.sum_nr_running > sgs.group_capacity - 1)
 			goto group_next;
 
-		if (sum_nr_running > leader_nr_running ||
-		    (sum_nr_running == leader_nr_running &&
+		if (sgs.sum_nr_running > leader_nr_running ||
+		    (sgs.sum_nr_running == leader_nr_running &&
 		     group_first_cpu(group) < group_first_cpu(group_leader))) {
 			group_leader = group;
-			leader_nr_running = sum_nr_running;
+			leader_nr_running = sgs.sum_nr_running;
 		}
 group_next:
 #endif
-- 
cgit v1.2.3


From 1f8c553d0f11d85f7993fe21015695d266771c00 Mon Sep 17 00:00:00 2001
From: Gautham R Shenoy <ego@in.ibm.com>
Date: Wed, 25 Mar 2009 14:43:51 +0530
Subject: sched: Create a helper function to calculate sched_group stats for
 fbg()

Impact: cleanup

Create a helper function named update_sg_lb_stats() which
can be invoked to calculate the individual group's statistics
in find_busiest_group().

This reduces the lenght of find_busiest_group() considerably.

Credit: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Signed-off-by: Gautham R Shenoy <ego@in.ibm.com>
Aked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: "Balbir Singh" <balbir@in.ibm.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: "Dhaval Giani" <dhaval@linux.vnet.ibm.com>
Cc: Bharata B Rao <bharata@linux.vnet.ibm.com>
LKML-Reference: <20090325091351.13992.43461.stgit@sofia.in.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 175 ++++++++++++++++++++++++++++++++-------------------------
 1 file changed, 100 insertions(+), 75 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 109db122de50..1893d5562f5f 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3237,6 +3237,103 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
 
 	return load_idx;
 }
+
+
+/**
+ * update_sg_lb_stats - Update sched_group's statistics for load balancing.
+ * @group: sched_group whose statistics are to be updated.
+ * @this_cpu: Cpu for which load balance is currently performed.
+ * @idle: Idle status of this_cpu
+ * @load_idx: Load index of sched_domain of this_cpu for load calc.
+ * @sd_idle: Idle status of the sched_domain containing group.
+ * @local_group: Does group contain this_cpu.
+ * @cpus: Set of cpus considered for load balancing.
+ * @balance: Should we balance.
+ * @sgs: variable to hold the statistics for this group.
+ */
+static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
+			enum cpu_idle_type idle, int load_idx, int *sd_idle,
+			int local_group, const struct cpumask *cpus,
+			int *balance, struct sg_lb_stats *sgs)
+{
+	unsigned long load, max_cpu_load, min_cpu_load;
+	int i;
+	unsigned int balance_cpu = -1, first_idle_cpu = 0;
+	unsigned long sum_avg_load_per_task;
+	unsigned long avg_load_per_task;
+
+	if (local_group)
+		balance_cpu = group_first_cpu(group);
+
+	/* Tally up the load of all CPUs in the group */
+	sum_avg_load_per_task = avg_load_per_task = 0;
+	max_cpu_load = 0;
+	min_cpu_load = ~0UL;
+
+	for_each_cpu_and(i, sched_group_cpus(group), cpus) {
+		struct rq *rq = cpu_rq(i);
+
+		if (*sd_idle && rq->nr_running)
+			*sd_idle = 0;
+
+		/* Bias balancing toward cpus of our domain */
+		if (local_group) {
+			if (idle_cpu(i) && !first_idle_cpu) {
+				first_idle_cpu = 1;
+				balance_cpu = i;
+			}
+
+			load = target_load(i, load_idx);
+		} else {
+			load = source_load(i, load_idx);
+			if (load > max_cpu_load)
+				max_cpu_load = load;
+			if (min_cpu_load > load)
+				min_cpu_load = load;
+		}
+
+		sgs->group_load += load;
+		sgs->sum_nr_running += rq->nr_running;
+		sgs->sum_weighted_load += weighted_cpuload(i);
+
+		sum_avg_load_per_task += cpu_avg_load_per_task(i);
+	}
+
+	/*
+	 * First idle cpu or the first cpu(busiest) in this sched group
+	 * is eligible for doing load balancing at this and above
+	 * domains. In the newly idle case, we will allow all the cpu's
+	 * to do the newly idle load balance.
+	 */
+	if (idle != CPU_NEWLY_IDLE && local_group &&
+	    balance_cpu != this_cpu && balance) {
+		*balance = 0;
+		return;
+	}
+
+	/* Adjust by relative CPU power of the group */
+	sgs->avg_load = sg_div_cpu_power(group,
+			sgs->group_load * SCHED_LOAD_SCALE);
+
+
+	/*
+	 * Consider the group unbalanced when the imbalance is larger
+	 * than the average weight of two tasks.
+	 *
+	 * APZ: with cgroup the avg task weight can vary wildly and
+	 *      might not be a suitable number - should we keep a
+	 *      normalized nr_running number somewhere that negates
+	 *      the hierarchy?
+	 */
+	avg_load_per_task = sg_div_cpu_power(group,
+			sum_avg_load_per_task * SCHED_LOAD_SCALE);
+
+	if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
+		sgs->group_imb = 1;
+
+	sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
+
+}
 /******* find_busiest_group() helpers end here *********************/
 
 /*
@@ -3270,92 +3367,20 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 
 	do {
 		struct sg_lb_stats sgs;
-		unsigned long load, max_cpu_load, min_cpu_load;
 		int local_group;
-		int i;
-		unsigned int balance_cpu = -1, first_idle_cpu = 0;
-		unsigned long sum_avg_load_per_task;
-		unsigned long avg_load_per_task;
 
 		local_group = cpumask_test_cpu(this_cpu,
 					       sched_group_cpus(group));
 		memset(&sgs, 0, sizeof(sgs));
+		update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle,
+				local_group, cpus, balance, &sgs);
 
-		if (local_group)
-			balance_cpu = group_first_cpu(group);
-
-		/* Tally up the load of all CPUs in the group */
-		sum_avg_load_per_task = avg_load_per_task = 0;
-
-		max_cpu_load = 0;
-		min_cpu_load = ~0UL;
-
-		for_each_cpu_and(i, sched_group_cpus(group), cpus) {
-			struct rq *rq = cpu_rq(i);
-
-			if (*sd_idle && rq->nr_running)
-				*sd_idle = 0;
-
-			/* Bias balancing toward cpus of our domain */
-			if (local_group) {
-				if (idle_cpu(i) && !first_idle_cpu) {
-					first_idle_cpu = 1;
-					balance_cpu = i;
-				}
-
-				load = target_load(i, load_idx);
-			} else {
-				load = source_load(i, load_idx);
-				if (load > max_cpu_load)
-					max_cpu_load = load;
-				if (min_cpu_load > load)
-					min_cpu_load = load;
-			}
-
-			sgs.group_load += load;
-			sgs.sum_nr_running += rq->nr_running;
-			sgs.sum_weighted_load += weighted_cpuload(i);
-
-			sum_avg_load_per_task += cpu_avg_load_per_task(i);
-		}
-
-		/*
-		 * First idle cpu or the first cpu(busiest) in this sched group
-		 * is eligible for doing load balancing at this and above
-		 * domains. In the newly idle case, we will allow all the cpu's
-		 * to do the newly idle load balance.
-		 */
-		if (idle != CPU_NEWLY_IDLE && local_group &&
-		    balance_cpu != this_cpu && balance) {
-			*balance = 0;
+		if (balance && !(*balance))
 			goto ret;
-		}
 
 		total_load += sgs.group_load;
 		total_pwr += group->__cpu_power;
 
-		/* Adjust by relative CPU power of the group */
-		sgs.avg_load = sg_div_cpu_power(group,
-				sgs.group_load * SCHED_LOAD_SCALE);
-
-
-		/*
-		 * Consider the group unbalanced when the imbalance is larger
-		 * than the average weight of two tasks.
-		 *
-		 * APZ: with cgroup the avg task weight can vary wildly and
-		 *      might not be a suitable number - should we keep a
-		 *      normalized nr_running number somewhere that negates
-		 *      the hierarchy?
-		 */
-		avg_load_per_task = sg_div_cpu_power(group,
-				sum_avg_load_per_task * SCHED_LOAD_SCALE);
-
-		if ((max_cpu_load - min_cpu_load) > 2*avg_load_per_task)
-			sgs.group_imb = 1;
-
-		sgs.group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
-
 		if (local_group) {
 			this_load = sgs.avg_load;
 			this = group;
-- 
cgit v1.2.3


From 222d656dea57e4e084fbd1e9383e6fed2ca9fa61 Mon Sep 17 00:00:00 2001
From: Gautham R Shenoy <ego@in.ibm.com>
Date: Wed, 25 Mar 2009 14:43:56 +0530
Subject: sched: Define structure to store the sched_domain statistics for
 fbg()

Impact: cleanup

Currently we use a lot of local variables in find_busiest_group()
to capture the various statistics related to the sched_domain.
Group them together into a single data structure.

This will help us to offload the job of updating the sched_domain
statistics to a helper function.

Credit: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Signed-off-by: Gautham R Shenoy <ego@in.ibm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: "Balbir Singh" <balbir@in.ibm.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: "Dhaval Giani" <dhaval@linux.vnet.ibm.com>
Cc: Bharata B Rao <bharata@linux.vnet.ibm.com>
LKML-Reference: <20090325091356.13992.25970.stgit@sofia.in.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 207 +++++++++++++++++++++++++++++++++------------------------
 1 file changed, 121 insertions(+), 86 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 1893d5562f5f..8198dbe8e4aa 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3190,6 +3190,37 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
 	return 0;
 }
 /********** Helpers for find_busiest_group ************************/
+/**
+ * sd_lb_stats - Structure to store the statistics of a sched_domain
+ * 		during load balancing.
+ */
+struct sd_lb_stats {
+	struct sched_group *busiest; /* Busiest group in this sd */
+	struct sched_group *this;  /* Local group in this sd */
+	unsigned long total_load;  /* Total load of all groups in sd */
+	unsigned long total_pwr;   /*	Total power of all groups in sd */
+	unsigned long avg_load;	   /* Average load across all groups in sd */
+
+	/** Statistics of this group */
+	unsigned long this_load;
+	unsigned long this_load_per_task;
+	unsigned long this_nr_running;
+
+	/* Statistics of the busiest group */
+	unsigned long max_load;
+	unsigned long busiest_load_per_task;
+	unsigned long busiest_nr_running;
+
+	int group_imb; /* Is there imbalance in this sd */
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+	int power_savings_balance; /* Is powersave balance needed for this sd */
+	struct sched_group *group_min; /* Least loaded group in sd */
+	struct sched_group *group_leader; /* Group which relieves group_min */
+	unsigned long min_load_per_task; /* load_per_task in group_min */
+	unsigned long leader_nr_running; /* Nr running of group_leader */
+	unsigned long min_nr_running; /* Nr running of group_min */
+#endif
+};
 
 /**
  * sg_lb_stats - stats of a sched_group required for load_balancing
@@ -3346,23 +3377,16 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		   unsigned long *imbalance, enum cpu_idle_type idle,
 		   int *sd_idle, const struct cpumask *cpus, int *balance)
 {
-	struct sched_group *busiest = NULL, *this = NULL, *group = sd->groups;
-	unsigned long max_load, avg_load, total_load, this_load, total_pwr;
+	struct sd_lb_stats sds;
+	struct sched_group *group = sd->groups;
 	unsigned long max_pull;
-	unsigned long busiest_load_per_task, busiest_nr_running;
-	unsigned long this_load_per_task, this_nr_running;
-	int load_idx, group_imb = 0;
+	int load_idx;
+
+	memset(&sds, 0, sizeof(sds));
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-	int power_savings_balance = 1;
-	unsigned long leader_nr_running = 0, min_load_per_task = 0;
-	unsigned long min_nr_running = ULONG_MAX;
-	struct sched_group *group_min = NULL, *group_leader = NULL;
+	sds.power_savings_balance = 1;
+	sds.min_nr_running = ULONG_MAX;
 #endif
-
-	max_load = this_load = total_load = total_pwr = 0;
-	busiest_load_per_task = busiest_nr_running = 0;
-	this_load_per_task = this_nr_running = 0;
-
 	load_idx = get_sd_load_idx(sd, idle);
 
 	do {
@@ -3378,22 +3402,22 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		if (balance && !(*balance))
 			goto ret;
 
-		total_load += sgs.group_load;
-		total_pwr += group->__cpu_power;
+		sds.total_load += sgs.group_load;
+		sds.total_pwr += group->__cpu_power;
 
 		if (local_group) {
-			this_load = sgs.avg_load;
-			this = group;
-			this_nr_running = sgs.sum_nr_running;
-			this_load_per_task = sgs.sum_weighted_load;
-		} else if (sgs.avg_load > max_load &&
+			sds.this_load = sgs.avg_load;
+			sds.this = group;
+			sds.this_nr_running = sgs.sum_nr_running;
+			sds.this_load_per_task = sgs.sum_weighted_load;
+		} else if (sgs.avg_load > sds.max_load &&
 			   (sgs.sum_nr_running > sgs.group_capacity ||
 				sgs.group_imb)) {
-			max_load = sgs.avg_load;
-			busiest = group;
-			busiest_nr_running = sgs.sum_nr_running;
-			busiest_load_per_task = sgs.sum_weighted_load;
-			group_imb = sgs.group_imb;
+			sds.max_load = sgs.avg_load;
+			sds.busiest = group;
+			sds.busiest_nr_running = sgs.sum_nr_running;
+			sds.busiest_load_per_task = sgs.sum_weighted_load;
+			sds.group_imb = sgs.group_imb;
 		}
 
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -3409,15 +3433,16 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		 * If the local group is idle or completely loaded
 		 * no need to do power savings balance at this domain
 		 */
-		if (local_group && (this_nr_running >= sgs.group_capacity ||
-				    !this_nr_running))
-			power_savings_balance = 0;
+		if (local_group &&
+			(sds.this_nr_running >= sgs.group_capacity ||
+			!sds.this_nr_running))
+			sds.power_savings_balance = 0;
 
 		/*
 		 * If a group is already running at full capacity or idle,
 		 * don't include that group in power savings calculations
 		 */
-		if (!power_savings_balance ||
+		if (!sds.power_savings_balance ||
 			sgs.sum_nr_running >= sgs.group_capacity ||
 			!sgs.sum_nr_running)
 			goto group_next;
@@ -3427,12 +3452,13 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		 * This is the group from where we need to pick up the load
 		 * for saving power
 		 */
-		if ((sgs.sum_nr_running < min_nr_running) ||
-		    (sgs.sum_nr_running == min_nr_running &&
-		     group_first_cpu(group) > group_first_cpu(group_min))) {
-			group_min = group;
-			min_nr_running = sgs.sum_nr_running;
-			min_load_per_task = sgs.sum_weighted_load /
+		if ((sgs.sum_nr_running < sds.min_nr_running) ||
+		    (sgs.sum_nr_running == sds.min_nr_running &&
+		     group_first_cpu(group) >
+			group_first_cpu(sds.group_min))) {
+			sds.group_min = group;
+			sds.min_nr_running = sgs.sum_nr_running;
+			sds.min_load_per_task = sgs.sum_weighted_load /
 						sgs.sum_nr_running;
 		}
 
@@ -3444,29 +3470,32 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		if (sgs.sum_nr_running > sgs.group_capacity - 1)
 			goto group_next;
 
-		if (sgs.sum_nr_running > leader_nr_running ||
-		    (sgs.sum_nr_running == leader_nr_running &&
-		     group_first_cpu(group) < group_first_cpu(group_leader))) {
-			group_leader = group;
-			leader_nr_running = sgs.sum_nr_running;
+		if (sgs.sum_nr_running > sds.leader_nr_running ||
+		    (sgs.sum_nr_running == sds.leader_nr_running &&
+		     group_first_cpu(group) <
+			group_first_cpu(sds.group_leader))) {
+			sds.group_leader = group;
+			sds.leader_nr_running = sgs.sum_nr_running;
 		}
 group_next:
 #endif
 		group = group->next;
 	} while (group != sd->groups);
 
-	if (!busiest || this_load >= max_load || busiest_nr_running == 0)
+	if (!sds.busiest || sds.this_load >= sds.max_load
+		|| sds.busiest_nr_running == 0)
 		goto out_balanced;
 
-	avg_load = (SCHED_LOAD_SCALE * total_load) / total_pwr;
+	sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
 
-	if (this_load >= avg_load ||
-			100*max_load <= sd->imbalance_pct*this_load)
+	if (sds.this_load >= sds.avg_load ||
+			100*sds.max_load <= sd->imbalance_pct * sds.this_load)
 		goto out_balanced;
 
-	busiest_load_per_task /= busiest_nr_running;
-	if (group_imb)
-		busiest_load_per_task = min(busiest_load_per_task, avg_load);
+	sds.busiest_load_per_task /= sds.busiest_nr_running;
+	if (sds.group_imb)
+		sds.busiest_load_per_task =
+			min(sds.busiest_load_per_task, sds.avg_load);
 
 	/*
 	 * We're trying to get all the cpus to the average_load, so we don't
@@ -3479,7 +3508,7 @@ group_next:
 	 * by pulling tasks to us. Be careful of negative numbers as they'll
 	 * appear as very large values with unsigned longs.
 	 */
-	if (max_load <= busiest_load_per_task)
+	if (sds.max_load <= sds.busiest_load_per_task)
 		goto out_balanced;
 
 	/*
@@ -3487,17 +3516,18 @@ group_next:
 	 * max load less than avg load(as we skip the groups at or below
 	 * its cpu_power, while calculating max_load..)
 	 */
-	if (max_load < avg_load) {
+	if (sds.max_load < sds.avg_load) {
 		*imbalance = 0;
 		goto small_imbalance;
 	}
 
 	/* Don't want to pull so many tasks that a group would go idle */
-	max_pull = min(max_load - avg_load, max_load - busiest_load_per_task);
+	max_pull = min(sds.max_load - sds.avg_load,
+			sds.max_load - sds.busiest_load_per_task);
 
 	/* How much load to actually move to equalise the imbalance */
-	*imbalance = min(max_pull * busiest->__cpu_power,
-				(avg_load - this_load) * this->__cpu_power)
+	*imbalance = min(max_pull * sds.busiest->__cpu_power,
+			(sds.avg_load - sds.this_load) * sds.this->__cpu_power)
 			/ SCHED_LOAD_SCALE;
 
 	/*
@@ -3506,24 +3536,27 @@ group_next:
 	 * a think about bumping its value to force at least one task to be
 	 * moved
 	 */
-	if (*imbalance < busiest_load_per_task) {
+	if (*imbalance < sds.busiest_load_per_task) {
 		unsigned long tmp, pwr_now, pwr_move;
 		unsigned int imbn;
 
 small_imbalance:
 		pwr_move = pwr_now = 0;
 		imbn = 2;
-		if (this_nr_running) {
-			this_load_per_task /= this_nr_running;
-			if (busiest_load_per_task > this_load_per_task)
+		if (sds.this_nr_running) {
+			sds.this_load_per_task /= sds.this_nr_running;
+			if (sds.busiest_load_per_task >
+					sds.this_load_per_task)
 				imbn = 1;
 		} else
-			this_load_per_task = cpu_avg_load_per_task(this_cpu);
-
-		if (max_load - this_load + busiest_load_per_task >=
-					busiest_load_per_task * imbn) {
-			*imbalance = busiest_load_per_task;
-			return busiest;
+			sds.this_load_per_task =
+				cpu_avg_load_per_task(this_cpu);
+
+		if (sds.max_load - sds.this_load +
+			sds.busiest_load_per_task >=
+				sds.busiest_load_per_task * imbn) {
+			*imbalance = sds.busiest_load_per_task;
+			return sds.busiest;
 		}
 
 		/*
@@ -3532,52 +3565,54 @@ small_imbalance:
 		 * moving them.
 		 */
 
-		pwr_now += busiest->__cpu_power *
-				min(busiest_load_per_task, max_load);
-		pwr_now += this->__cpu_power *
-				min(this_load_per_task, this_load);
+		pwr_now += sds.busiest->__cpu_power *
+				min(sds.busiest_load_per_task, sds.max_load);
+		pwr_now += sds.this->__cpu_power *
+				min(sds.this_load_per_task, sds.this_load);
 		pwr_now /= SCHED_LOAD_SCALE;
 
 		/* Amount of load we'd subtract */
-		tmp = sg_div_cpu_power(busiest,
-				busiest_load_per_task * SCHED_LOAD_SCALE);
-		if (max_load > tmp)
-			pwr_move += busiest->__cpu_power *
-				min(busiest_load_per_task, max_load - tmp);
+		tmp = sg_div_cpu_power(sds.busiest,
+				sds.busiest_load_per_task * SCHED_LOAD_SCALE);
+		if (sds.max_load > tmp)
+			pwr_move += sds.busiest->__cpu_power *
+				min(sds.busiest_load_per_task,
+						sds.max_load - tmp);
 
 		/* Amount of load we'd add */
-		if (max_load * busiest->__cpu_power <
-				busiest_load_per_task * SCHED_LOAD_SCALE)
-			tmp = sg_div_cpu_power(this,
-					max_load * busiest->__cpu_power);
+		if (sds.max_load * sds.busiest->__cpu_power <
+				sds.busiest_load_per_task * SCHED_LOAD_SCALE)
+			tmp = sg_div_cpu_power(sds.this,
+				sds.max_load * sds.busiest->__cpu_power);
 		else
-			tmp = sg_div_cpu_power(this,
-				busiest_load_per_task * SCHED_LOAD_SCALE);
-		pwr_move += this->__cpu_power *
-				min(this_load_per_task, this_load + tmp);
+			tmp = sg_div_cpu_power(sds.this,
+				sds.busiest_load_per_task * SCHED_LOAD_SCALE);
+		pwr_move += sds.this->__cpu_power *
+				min(sds.this_load_per_task,
+					sds.this_load + tmp);
 		pwr_move /= SCHED_LOAD_SCALE;
 
 		/* Move if we gain throughput */
 		if (pwr_move > pwr_now)
-			*imbalance = busiest_load_per_task;
+			*imbalance = sds.busiest_load_per_task;
 	}
 
-	return busiest;
+	return sds.busiest;
 
 out_balanced:
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
 	if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
 		goto ret;
 
-	if (this != group_leader || group_leader == group_min)
+	if (sds.this != sds.group_leader || sds.group_leader == sds.group_min)
 		goto ret;
 
-	*imbalance = min_load_per_task;
+	*imbalance = sds.min_load_per_task;
 	if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
 		cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
-			group_first_cpu(group_leader);
+			group_first_cpu(sds.group_leader);
 	}
-	return group_min;
+	return sds.group_min;
 
 #endif
 ret:
-- 
cgit v1.2.3


From 37abe198b1246ddd206319c43502a687db62d347 Mon Sep 17 00:00:00 2001
From: Gautham R Shenoy <ego@in.ibm.com>
Date: Wed, 25 Mar 2009 14:44:01 +0530
Subject: sched: Create a helper function to calculate sched_domain stats for
 fbg()

Impact: cleanup

Create a helper function named update_sd_lb_stats() to update the
various sched_domain related statistics in find_busiest_group().

With this we would have moved all the statistics computation out of
find_busiest_group().

Credit: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Signed-off-by: Gautham R Shenoy <ego@in.ibm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: "Balbir Singh" <balbir@in.ibm.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: "Dhaval Giani" <dhaval@linux.vnet.ibm.com>
Cc: Bharata B Rao <bharata@linux.vnet.ibm.com>
LKML-Reference: <20090325091401.13992.88737.stgit@sofia.in.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 117 +++++++++++++++++++++++++++++++++++----------------------
 1 file changed, 73 insertions(+), 44 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 8198dbe8e4aa..ec715f97202e 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3365,32 +3365,33 @@ static inline void update_sg_lb_stats(struct sched_group *group, int this_cpu,
 	sgs->group_capacity = group->__cpu_power / SCHED_LOAD_SCALE;
 
 }
-/******* find_busiest_group() helpers end here *********************/
 
-/*
- * find_busiest_group finds and returns the busiest CPU group within the
- * domain. It calculates and returns the amount of weighted load which
- * should be moved to restore balance via the imbalance parameter.
+/**
+ * update_sd_lb_stats - Update sched_group's statistics for load balancing.
+ * @sd: sched_domain whose statistics are to be updated.
+ * @this_cpu: Cpu for which load balance is currently performed.
+ * @idle: Idle status of this_cpu
+ * @sd_idle: Idle status of the sched_domain containing group.
+ * @cpus: Set of cpus considered for load balancing.
+ * @balance: Should we balance.
+ * @sds: variable to hold the statistics for this sched_domain.
  */
-static struct sched_group *
-find_busiest_group(struct sched_domain *sd, int this_cpu,
-		   unsigned long *imbalance, enum cpu_idle_type idle,
-		   int *sd_idle, const struct cpumask *cpus, int *balance)
+static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
+			enum cpu_idle_type idle, int *sd_idle,
+			const struct cpumask *cpus, int *balance,
+			struct sd_lb_stats *sds)
 {
-	struct sd_lb_stats sds;
 	struct sched_group *group = sd->groups;
-	unsigned long max_pull;
+	struct sg_lb_stats sgs;
 	int load_idx;
 
-	memset(&sds, 0, sizeof(sds));
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-	sds.power_savings_balance = 1;
-	sds.min_nr_running = ULONG_MAX;
+	sds->power_savings_balance = 1;
+	sds->min_nr_running = ULONG_MAX;
 #endif
 	load_idx = get_sd_load_idx(sd, idle);
 
 	do {
-		struct sg_lb_stats sgs;
 		int local_group;
 
 		local_group = cpumask_test_cpu(this_cpu,
@@ -3399,25 +3400,25 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		update_sg_lb_stats(group, this_cpu, idle, load_idx, sd_idle,
 				local_group, cpus, balance, &sgs);
 
-		if (balance && !(*balance))
-			goto ret;
+		if (local_group && balance && !(*balance))
+			return;
 
-		sds.total_load += sgs.group_load;
-		sds.total_pwr += group->__cpu_power;
+		sds->total_load += sgs.group_load;
+		sds->total_pwr += group->__cpu_power;
 
 		if (local_group) {
-			sds.this_load = sgs.avg_load;
-			sds.this = group;
-			sds.this_nr_running = sgs.sum_nr_running;
-			sds.this_load_per_task = sgs.sum_weighted_load;
-		} else if (sgs.avg_load > sds.max_load &&
+			sds->this_load = sgs.avg_load;
+			sds->this = group;
+			sds->this_nr_running = sgs.sum_nr_running;
+			sds->this_load_per_task = sgs.sum_weighted_load;
+		} else if (sgs.avg_load > sds->max_load &&
 			   (sgs.sum_nr_running > sgs.group_capacity ||
 				sgs.group_imb)) {
-			sds.max_load = sgs.avg_load;
-			sds.busiest = group;
-			sds.busiest_nr_running = sgs.sum_nr_running;
-			sds.busiest_load_per_task = sgs.sum_weighted_load;
-			sds.group_imb = sgs.group_imb;
+			sds->max_load = sgs.avg_load;
+			sds->busiest = group;
+			sds->busiest_nr_running = sgs.sum_nr_running;
+			sds->busiest_load_per_task = sgs.sum_weighted_load;
+			sds->group_imb = sgs.group_imb;
 		}
 
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
@@ -3434,15 +3435,15 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		 * no need to do power savings balance at this domain
 		 */
 		if (local_group &&
-			(sds.this_nr_running >= sgs.group_capacity ||
-			!sds.this_nr_running))
-			sds.power_savings_balance = 0;
+			(sds->this_nr_running >= sgs.group_capacity ||
+			!sds->this_nr_running))
+			sds->power_savings_balance = 0;
 
 		/*
 		 * If a group is already running at full capacity or idle,
 		 * don't include that group in power savings calculations
 		 */
-		if (!sds.power_savings_balance ||
+		if (!sds->power_savings_balance ||
 			sgs.sum_nr_running >= sgs.group_capacity ||
 			!sgs.sum_nr_running)
 			goto group_next;
@@ -3452,13 +3453,13 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		 * This is the group from where we need to pick up the load
 		 * for saving power
 		 */
-		if ((sgs.sum_nr_running < sds.min_nr_running) ||
-		    (sgs.sum_nr_running == sds.min_nr_running &&
+		if ((sgs.sum_nr_running < sds->min_nr_running) ||
+		    (sgs.sum_nr_running == sds->min_nr_running &&
 		     group_first_cpu(group) >
-			group_first_cpu(sds.group_min))) {
-			sds.group_min = group;
-			sds.min_nr_running = sgs.sum_nr_running;
-			sds.min_load_per_task = sgs.sum_weighted_load /
+			group_first_cpu(sds->group_min))) {
+			sds->group_min = group;
+			sds->min_nr_running = sgs.sum_nr_running;
+			sds->min_load_per_task = sgs.sum_weighted_load /
 						sgs.sum_nr_running;
 		}
 
@@ -3470,18 +3471,46 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		if (sgs.sum_nr_running > sgs.group_capacity - 1)
 			goto group_next;
 
-		if (sgs.sum_nr_running > sds.leader_nr_running ||
-		    (sgs.sum_nr_running == sds.leader_nr_running &&
+		if (sgs.sum_nr_running > sds->leader_nr_running ||
+		    (sgs.sum_nr_running == sds->leader_nr_running &&
 		     group_first_cpu(group) <
-			group_first_cpu(sds.group_leader))) {
-			sds.group_leader = group;
-			sds.leader_nr_running = sgs.sum_nr_running;
+			group_first_cpu(sds->group_leader))) {
+			sds->group_leader = group;
+			sds->leader_nr_running = sgs.sum_nr_running;
 		}
 group_next:
 #endif
 		group = group->next;
 	} while (group != sd->groups);
 
+}
+/******* find_busiest_group() helpers end here *********************/
+
+/*
+ * find_busiest_group finds and returns the busiest CPU group within the
+ * domain. It calculates and returns the amount of weighted load which
+ * should be moved to restore balance via the imbalance parameter.
+ */
+static struct sched_group *
+find_busiest_group(struct sched_domain *sd, int this_cpu,
+		   unsigned long *imbalance, enum cpu_idle_type idle,
+		   int *sd_idle, const struct cpumask *cpus, int *balance)
+{
+	struct sd_lb_stats sds;
+	unsigned long max_pull;
+
+	memset(&sds, 0, sizeof(sds));
+
+	/*
+	 * Compute the various statistics relavent for load balancing at
+	 * this level.
+	 */
+	update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
+					balance, &sds);
+
+	if (balance && !(*balance))
+		goto ret;
+
 	if (!sds.busiest || sds.this_load >= sds.max_load
 		|| sds.busiest_nr_running == 0)
 		goto out_balanced;
-- 
cgit v1.2.3


From 2e6f44aeda426054fc58464df1ad571aecca0c92 Mon Sep 17 00:00:00 2001
From: Gautham R Shenoy <ego@in.ibm.com>
Date: Wed, 25 Mar 2009 14:44:06 +0530
Subject: sched: Create helper to calculate small_imbalance in fbg()

Impact: cleanup

We have two places in find_busiest_group() where we need to calculate
the minor imbalance before returning the busiest group. Encapsulate
this functionality into a seperate helper function.

Credit: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Signed-off-by: Gautham R Shenoy <ego@in.ibm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: "Balbir Singh" <balbir@in.ibm.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: "Dhaval Giani" <dhaval@linux.vnet.ibm.com>
Cc: Bharata B Rao <bharata@linux.vnet.ibm.com>
LKML-Reference: <20090325091406.13992.54316.stgit@sofia.in.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 131 ++++++++++++++++++++++++++++++---------------------------
 1 file changed, 70 insertions(+), 61 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index ec715f97202e..540147e5e82b 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3484,6 +3484,71 @@ group_next:
 	} while (group != sd->groups);
 
 }
+
+/**
+ * fix_small_imbalance - Calculate the minor imbalance that exists
+ * 			amongst the groups of a sched_domain, during
+ * 			load balancing.
+ * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
+ * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
+ * @imbalance: Variable to store the imbalance.
+ */
+static inline void fix_small_imbalance(struct sd_lb_stats *sds,
+				int this_cpu, unsigned long *imbalance)
+{
+	unsigned long tmp, pwr_now = 0, pwr_move = 0;
+	unsigned int imbn = 2;
+
+	if (sds->this_nr_running) {
+		sds->this_load_per_task /= sds->this_nr_running;
+		if (sds->busiest_load_per_task >
+				sds->this_load_per_task)
+			imbn = 1;
+	} else
+		sds->this_load_per_task =
+			cpu_avg_load_per_task(this_cpu);
+
+	if (sds->max_load - sds->this_load + sds->busiest_load_per_task >=
+			sds->busiest_load_per_task * imbn) {
+		*imbalance = sds->busiest_load_per_task;
+		return;
+	}
+
+	/*
+	 * OK, we don't have enough imbalance to justify moving tasks,
+	 * however we may be able to increase total CPU power used by
+	 * moving them.
+	 */
+
+	pwr_now += sds->busiest->__cpu_power *
+			min(sds->busiest_load_per_task, sds->max_load);
+	pwr_now += sds->this->__cpu_power *
+			min(sds->this_load_per_task, sds->this_load);
+	pwr_now /= SCHED_LOAD_SCALE;
+
+	/* Amount of load we'd subtract */
+	tmp = sg_div_cpu_power(sds->busiest,
+			sds->busiest_load_per_task * SCHED_LOAD_SCALE);
+	if (sds->max_load > tmp)
+		pwr_move += sds->busiest->__cpu_power *
+			min(sds->busiest_load_per_task, sds->max_load - tmp);
+
+	/* Amount of load we'd add */
+	if (sds->max_load * sds->busiest->__cpu_power <
+		sds->busiest_load_per_task * SCHED_LOAD_SCALE)
+		tmp = sg_div_cpu_power(sds->this,
+			sds->max_load * sds->busiest->__cpu_power);
+	else
+		tmp = sg_div_cpu_power(sds->this,
+			sds->busiest_load_per_task * SCHED_LOAD_SCALE);
+	pwr_move += sds->this->__cpu_power *
+			min(sds->this_load_per_task, sds->this_load + tmp);
+	pwr_move /= SCHED_LOAD_SCALE;
+
+	/* Move if we gain throughput */
+	if (pwr_move > pwr_now)
+		*imbalance = sds->busiest_load_per_task;
+}
 /******* find_busiest_group() helpers end here *********************/
 
 /*
@@ -3547,7 +3612,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	 */
 	if (sds.max_load < sds.avg_load) {
 		*imbalance = 0;
-		goto small_imbalance;
+		fix_small_imbalance(&sds, this_cpu, imbalance);
+		goto ret_busiest;
 	}
 
 	/* Don't want to pull so many tasks that a group would go idle */
@@ -3565,67 +3631,10 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	 * a think about bumping its value to force at least one task to be
 	 * moved
 	 */
-	if (*imbalance < sds.busiest_load_per_task) {
-		unsigned long tmp, pwr_now, pwr_move;
-		unsigned int imbn;
-
-small_imbalance:
-		pwr_move = pwr_now = 0;
-		imbn = 2;
-		if (sds.this_nr_running) {
-			sds.this_load_per_task /= sds.this_nr_running;
-			if (sds.busiest_load_per_task >
-					sds.this_load_per_task)
-				imbn = 1;
-		} else
-			sds.this_load_per_task =
-				cpu_avg_load_per_task(this_cpu);
-
-		if (sds.max_load - sds.this_load +
-			sds.busiest_load_per_task >=
-				sds.busiest_load_per_task * imbn) {
-			*imbalance = sds.busiest_load_per_task;
-			return sds.busiest;
-		}
-
-		/*
-		 * OK, we don't have enough imbalance to justify moving tasks,
-		 * however we may be able to increase total CPU power used by
-		 * moving them.
-		 */
-
-		pwr_now += sds.busiest->__cpu_power *
-				min(sds.busiest_load_per_task, sds.max_load);
-		pwr_now += sds.this->__cpu_power *
-				min(sds.this_load_per_task, sds.this_load);
-		pwr_now /= SCHED_LOAD_SCALE;
-
-		/* Amount of load we'd subtract */
-		tmp = sg_div_cpu_power(sds.busiest,
-				sds.busiest_load_per_task * SCHED_LOAD_SCALE);
-		if (sds.max_load > tmp)
-			pwr_move += sds.busiest->__cpu_power *
-				min(sds.busiest_load_per_task,
-						sds.max_load - tmp);
-
-		/* Amount of load we'd add */
-		if (sds.max_load * sds.busiest->__cpu_power <
-				sds.busiest_load_per_task * SCHED_LOAD_SCALE)
-			tmp = sg_div_cpu_power(sds.this,
-				sds.max_load * sds.busiest->__cpu_power);
-		else
-			tmp = sg_div_cpu_power(sds.this,
-				sds.busiest_load_per_task * SCHED_LOAD_SCALE);
-		pwr_move += sds.this->__cpu_power *
-				min(sds.this_load_per_task,
-					sds.this_load + tmp);
-		pwr_move /= SCHED_LOAD_SCALE;
-
-		/* Move if we gain throughput */
-		if (pwr_move > pwr_now)
-			*imbalance = sds.busiest_load_per_task;
-	}
+	if (*imbalance < sds.busiest_load_per_task)
+		fix_small_imbalance(&sds, this_cpu, imbalance);
 
+ret_busiest:
 	return sds.busiest;
 
 out_balanced:
-- 
cgit v1.2.3


From dbc523a3b86f9e1765b5e70e6886913b99cc5cec Mon Sep 17 00:00:00 2001
From: Gautham R Shenoy <ego@in.ibm.com>
Date: Wed, 25 Mar 2009 14:44:12 +0530
Subject: sched: Create a helper function to calculate imbalance

Move all the imbalance calculation out of find_busiest_group()
through this helper function.

With this change, the structure of find_busiest_group() will be
as follows:

- update_sched_domain_statistics.

- check if imbalance exits.

- update imbalance and return busiest.

Signed-off-by: Gautham R Shenoy <ego@in.ibm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: "Balbir Singh" <balbir@in.ibm.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: "Dhaval Giani" <dhaval@linux.vnet.ibm.com>
Cc: Bharata B Rao <bharata@linux.vnet.ibm.com>
Cc: "Vaidyanathan Srinivasan" <svaidy@linux.vnet.ibm.com>
LKML-Reference: <20090325091411.13992.43293.stgit@sofia.in.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 78 +++++++++++++++++++++++++++++++++-------------------------
 1 file changed, 45 insertions(+), 33 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 540147e5e82b..934f615ccceb 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3487,8 +3487,8 @@ group_next:
 
 /**
  * fix_small_imbalance - Calculate the minor imbalance that exists
- * 			amongst the groups of a sched_domain, during
- * 			load balancing.
+ *			amongst the groups of a sched_domain, during
+ *			load balancing.
  * @sds: Statistics of the sched_domain whose imbalance is to be calculated.
  * @this_cpu: The cpu at whose sched_domain we're performing load-balance.
  * @imbalance: Variable to store the imbalance.
@@ -3549,6 +3549,47 @@ static inline void fix_small_imbalance(struct sd_lb_stats *sds,
 	if (pwr_move > pwr_now)
 		*imbalance = sds->busiest_load_per_task;
 }
+
+/**
+ * calculate_imbalance - Calculate the amount of imbalance present within the
+ *			 groups of a given sched_domain during load balance.
+ * @sds: statistics of the sched_domain whose imbalance is to be calculated.
+ * @this_cpu: Cpu for which currently load balance is being performed.
+ * @imbalance: The variable to store the imbalance.
+ */
+static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
+		unsigned long *imbalance)
+{
+	unsigned long max_pull;
+	/*
+	 * In the presence of smp nice balancing, certain scenarios can have
+	 * max load less than avg load(as we skip the groups at or below
+	 * its cpu_power, while calculating max_load..)
+	 */
+	if (sds->max_load < sds->avg_load) {
+		*imbalance = 0;
+		return fix_small_imbalance(sds, this_cpu, imbalance);
+	}
+
+	/* Don't want to pull so many tasks that a group would go idle */
+	max_pull = min(sds->max_load - sds->avg_load,
+			sds->max_load - sds->busiest_load_per_task);
+
+	/* How much load to actually move to equalise the imbalance */
+	*imbalance = min(max_pull * sds->busiest->__cpu_power,
+		(sds->avg_load - sds->this_load) * sds->this->__cpu_power)
+			/ SCHED_LOAD_SCALE;
+
+	/*
+	 * if *imbalance is less than the average load per runnable task
+	 * there is no gaurantee that any tasks will be moved so we'll have
+	 * a think about bumping its value to force at least one task to be
+	 * moved
+	 */
+	if (*imbalance < sds->busiest_load_per_task)
+		return fix_small_imbalance(sds, this_cpu, imbalance);
+
+}
 /******* find_busiest_group() helpers end here *********************/
 
 /*
@@ -3562,7 +3603,6 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 		   int *sd_idle, const struct cpumask *cpus, int *balance)
 {
 	struct sd_lb_stats sds;
-	unsigned long max_pull;
 
 	memset(&sds, 0, sizeof(sds));
 
@@ -3605,36 +3645,8 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	if (sds.max_load <= sds.busiest_load_per_task)
 		goto out_balanced;
 
-	/*
-	 * In the presence of smp nice balancing, certain scenarios can have
-	 * max load less than avg load(as we skip the groups at or below
-	 * its cpu_power, while calculating max_load..)
-	 */
-	if (sds.max_load < sds.avg_load) {
-		*imbalance = 0;
-		fix_small_imbalance(&sds, this_cpu, imbalance);
-		goto ret_busiest;
-	}
-
-	/* Don't want to pull so many tasks that a group would go idle */
-	max_pull = min(sds.max_load - sds.avg_load,
-			sds.max_load - sds.busiest_load_per_task);
-
-	/* How much load to actually move to equalise the imbalance */
-	*imbalance = min(max_pull * sds.busiest->__cpu_power,
-			(sds.avg_load - sds.this_load) * sds.this->__cpu_power)
-			/ SCHED_LOAD_SCALE;
-
-	/*
-	 * if *imbalance is less than the average load per runnable task
-	 * there is no gaurantee that any tasks will be moved so we'll have
-	 * a think about bumping its value to force at least one task to be
-	 * moved
-	 */
-	if (*imbalance < sds.busiest_load_per_task)
-		fix_small_imbalance(&sds, this_cpu, imbalance);
-
-ret_busiest:
+	/* Looks like there is an imbalance. Compute it */
+	calculate_imbalance(&sds, this_cpu, imbalance);
 	return sds.busiest;
 
 out_balanced:
-- 
cgit v1.2.3


From a021dc03376707c55a3483e32c16b8986d4414cc Mon Sep 17 00:00:00 2001
From: Gautham R Shenoy <ego@in.ibm.com>
Date: Wed, 25 Mar 2009 14:44:17 +0530
Subject: sched: Optimize the !power_savings_balance during fbg()

Impact: cleanup, micro-optimization

We don't need to perform power_savings balance if either the
cpu is NOT_IDLE or if the sched_domain doesn't contain the
SD_POWERSAVINGS_BALANCE flag set.

Currently, we check for these conditions multiple number of
times, even though these variables don't change over the scope
of find_busiest_group().

Check once, and store the value in the already exiting
"power_savings_balance" variable.

Signed-off-by: Gautham R Shenoy <ego@in.ibm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: "Balbir Singh" <balbir@in.ibm.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: "Dhaval Giani" <dhaval@linux.vnet.ibm.com>
Cc: Bharata B Rao <bharata@linux.vnet.ibm.com>
Cc: "Vaidyanathan Srinivasan" <svaidy@linux.vnet.ibm.com>
LKML-Reference: <20090325091417.13992.2657.stgit@sofia.in.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 23 ++++++++++++++---------
 1 file changed, 14 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 934f615ccceb..71e8dcaf2c79 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3386,8 +3386,17 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 	int load_idx;
 
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-	sds->power_savings_balance = 1;
-	sds->min_nr_running = ULONG_MAX;
+	/*
+	 * Busy processors will not participate in power savings
+	 * balance.
+	 */
+	if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
+		sds->power_savings_balance = 0;
+	else {
+		sds->power_savings_balance = 1;
+		sds->min_nr_running = ULONG_MAX;
+		sds->leader_nr_running = 0;
+	}
 #endif
 	load_idx = get_sd_load_idx(sd, idle);
 
@@ -3422,12 +3431,8 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 		}
 
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-		/*
-		 * Busy processors will not participate in power savings
-		 * balance.
-		 */
-		if (idle == CPU_NOT_IDLE ||
-				!(sd->flags & SD_POWERSAVINGS_BALANCE))
+
+		if (!sds->power_savings_balance)
 			goto group_next;
 
 		/*
@@ -3651,7 +3656,7 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 
 out_balanced:
 #if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-	if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
+	if (!sds.power_savings_balance)
 		goto ret;
 
 	if (sds.this != sds.group_leader || sds.group_leader == sds.group_min)
-- 
cgit v1.2.3


From c071df18525a95b37dd5821a6dc4af83bd18675e Mon Sep 17 00:00:00 2001
From: Gautham R Shenoy <ego@in.ibm.com>
Date: Wed, 25 Mar 2009 14:44:22 +0530
Subject: sched: Refactor the power savings balance code

Impact: cleanup

Create seperate helper functions to initialize the
power-savings-balance related variables, to update them and
to check if we have a scope for performing power-savings balance.

Add no-op inline functions for the !(CONFIG_SCHED_MC || CONFIG_SCHED_SMT)
case.

This will eliminate all the #ifdef jungle in find_busiest_group() and the
other helper functions.

Signed-off-by: Gautham R Shenoy <ego@in.ibm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: "Balbir Singh" <balbir@in.ibm.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: "Dhaval Giani" <dhaval@linux.vnet.ibm.com>
Cc: Bharata B Rao <bharata@linux.vnet.ibm.com>
Cc: "Vaidyanathan Srinivasan" <svaidy@linux.vnet.ibm.com>
LKML-Reference: <20090325091422.13992.73616.stgit@sofia.in.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 236 +++++++++++++++++++++++++++++++++++++--------------------
 1 file changed, 153 insertions(+), 83 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 71e8dcaf2c79..5f21658b0f67 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3270,6 +3270,151 @@ static inline int get_sd_load_idx(struct sched_domain *sd,
 }
 
 
+#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
+/**
+ * init_sd_power_savings_stats - Initialize power savings statistics for
+ * the given sched_domain, during load balancing.
+ *
+ * @sd: Sched domain whose power-savings statistics are to be initialized.
+ * @sds: Variable containing the statistics for sd.
+ * @idle: Idle status of the CPU at which we're performing load-balancing.
+ */
+static inline void init_sd_power_savings_stats(struct sched_domain *sd,
+	struct sd_lb_stats *sds, enum cpu_idle_type idle)
+{
+	/*
+	 * Busy processors will not participate in power savings
+	 * balance.
+	 */
+	if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
+		sds->power_savings_balance = 0;
+	else {
+		sds->power_savings_balance = 1;
+		sds->min_nr_running = ULONG_MAX;
+		sds->leader_nr_running = 0;
+	}
+}
+
+/**
+ * update_sd_power_savings_stats - Update the power saving stats for a
+ * sched_domain while performing load balancing.
+ *
+ * @group: sched_group belonging to the sched_domain under consideration.
+ * @sds: Variable containing the statistics of the sched_domain
+ * @local_group: Does group contain the CPU for which we're performing
+ * 		load balancing ?
+ * @sgs: Variable containing the statistics of the group.
+ */
+static inline void update_sd_power_savings_stats(struct sched_group *group,
+	struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
+{
+
+	if (!sds->power_savings_balance)
+		return;
+
+	/*
+	 * If the local group is idle or completely loaded
+	 * no need to do power savings balance at this domain
+	 */
+	if (local_group && (sds->this_nr_running >= sgs->group_capacity ||
+				!sds->this_nr_running))
+		sds->power_savings_balance = 0;
+
+	/*
+	 * If a group is already running at full capacity or idle,
+	 * don't include that group in power savings calculations
+	 */
+	if (!sds->power_savings_balance ||
+		sgs->sum_nr_running >= sgs->group_capacity ||
+		!sgs->sum_nr_running)
+		return;
+
+	/*
+	 * Calculate the group which has the least non-idle load.
+	 * This is the group from where we need to pick up the load
+	 * for saving power
+	 */
+	if ((sgs->sum_nr_running < sds->min_nr_running) ||
+	    (sgs->sum_nr_running == sds->min_nr_running &&
+	     group_first_cpu(group) > group_first_cpu(sds->group_min))) {
+		sds->group_min = group;
+		sds->min_nr_running = sgs->sum_nr_running;
+		sds->min_load_per_task = sgs->sum_weighted_load /
+						sgs->sum_nr_running;
+	}
+
+	/*
+	 * Calculate the group which is almost near its
+	 * capacity but still has some space to pick up some load
+	 * from other group and save more power
+	 */
+	if (sgs->sum_nr_running > sgs->group_capacity - 1)
+		return;
+
+	if (sgs->sum_nr_running > sds->leader_nr_running ||
+	    (sgs->sum_nr_running == sds->leader_nr_running &&
+	     group_first_cpu(group) < group_first_cpu(sds->group_leader))) {
+		sds->group_leader = group;
+		sds->leader_nr_running = sgs->sum_nr_running;
+	}
+}
+
+/**
+ * check_power_save_busiest_group - Check if we have potential to perform
+ *	some power-savings balance. If yes, set the busiest group to be
+ *	the least loaded group in the sched_domain, so that it's CPUs can
+ *	be put to idle.
+ *
+ * @sds: Variable containing the statistics of the sched_domain
+ *	under consideration.
+ * @this_cpu: Cpu at which we're currently performing load-balancing.
+ * @imbalance: Variable to store the imbalance.
+ *
+ * Returns 1 if there is potential to perform power-savings balance.
+ * Else returns 0.
+ */
+static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
+					int this_cpu, unsigned long *imbalance)
+{
+	if (!sds->power_savings_balance)
+		return 0;
+
+	if (sds->this != sds->group_leader ||
+			sds->group_leader == sds->group_min)
+		return 0;
+
+	*imbalance = sds->min_load_per_task;
+	sds->busiest = sds->group_min;
+
+	if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
+		cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
+			group_first_cpu(sds->group_leader);
+	}
+
+	return 1;
+
+}
+#else /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
+static inline void init_sd_power_savings_stats(struct sched_domain *sd,
+	struct sd_lb_stats *sds, enum cpu_idle_type idle)
+{
+	return;
+}
+
+static inline void update_sd_power_savings_stats(struct sched_group *group,
+	struct sd_lb_stats *sds, int local_group, struct sg_lb_stats *sgs)
+{
+	return;
+}
+
+static inline int check_power_save_busiest_group(struct sd_lb_stats *sds,
+					int this_cpu, unsigned long *imbalance)
+{
+	return 0;
+}
+#endif /* CONFIG_SCHED_MC || CONFIG_SCHED_SMT */
+
+
 /**
  * update_sg_lb_stats - Update sched_group's statistics for load balancing.
  * @group: sched_group whose statistics are to be updated.
@@ -3385,19 +3530,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 	struct sg_lb_stats sgs;
 	int load_idx;
 
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-	/*
-	 * Busy processors will not participate in power savings
-	 * balance.
-	 */
-	if (idle == CPU_NOT_IDLE || !(sd->flags & SD_POWERSAVINGS_BALANCE))
-		sds->power_savings_balance = 0;
-	else {
-		sds->power_savings_balance = 1;
-		sds->min_nr_running = ULONG_MAX;
-		sds->leader_nr_running = 0;
-	}
-#endif
+	init_sd_power_savings_stats(sd, sds, idle);
 	load_idx = get_sd_load_idx(sd, idle);
 
 	do {
@@ -3430,61 +3563,7 @@ static inline void update_sd_lb_stats(struct sched_domain *sd, int this_cpu,
 			sds->group_imb = sgs.group_imb;
 		}
 
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-
-		if (!sds->power_savings_balance)
-			goto group_next;
-
-		/*
-		 * If the local group is idle or completely loaded
-		 * no need to do power savings balance at this domain
-		 */
-		if (local_group &&
-			(sds->this_nr_running >= sgs.group_capacity ||
-			!sds->this_nr_running))
-			sds->power_savings_balance = 0;
-
-		/*
-		 * If a group is already running at full capacity or idle,
-		 * don't include that group in power savings calculations
-		 */
-		if (!sds->power_savings_balance ||
-			sgs.sum_nr_running >= sgs.group_capacity ||
-			!sgs.sum_nr_running)
-			goto group_next;
-
-		/*
-		 * Calculate the group which has the least non-idle load.
-		 * This is the group from where we need to pick up the load
-		 * for saving power
-		 */
-		if ((sgs.sum_nr_running < sds->min_nr_running) ||
-		    (sgs.sum_nr_running == sds->min_nr_running &&
-		     group_first_cpu(group) >
-			group_first_cpu(sds->group_min))) {
-			sds->group_min = group;
-			sds->min_nr_running = sgs.sum_nr_running;
-			sds->min_load_per_task = sgs.sum_weighted_load /
-						sgs.sum_nr_running;
-		}
-
-		/*
-		 * Calculate the group which is almost near its
-		 * capacity but still has some space to pick up some load
-		 * from other group and save more power
-		 */
-		if (sgs.sum_nr_running > sgs.group_capacity - 1)
-			goto group_next;
-
-		if (sgs.sum_nr_running > sds->leader_nr_running ||
-		    (sgs.sum_nr_running == sds->leader_nr_running &&
-		     group_first_cpu(group) <
-			group_first_cpu(sds->group_leader))) {
-			sds->group_leader = group;
-			sds->leader_nr_running = sgs.sum_nr_running;
-		}
-group_next:
-#endif
+		update_sd_power_savings_stats(group, sds, local_group, &sgs);
 		group = group->next;
 	} while (group != sd->groups);
 
@@ -3655,21 +3734,12 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	return sds.busiest;
 
 out_balanced:
-#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT)
-	if (!sds.power_savings_balance)
-		goto ret;
-
-	if (sds.this != sds.group_leader || sds.group_leader == sds.group_min)
-		goto ret;
-
-	*imbalance = sds.min_load_per_task;
-	if (sched_mc_power_savings >= POWERSAVINGS_BALANCE_WAKEUP) {
-		cpu_rq(this_cpu)->rd->sched_mc_preferred_wakeup_cpu =
-			group_first_cpu(sds.group_leader);
-	}
-	return sds.group_min;
-
-#endif
+	/*
+	 * There is no obvious imbalance. But check if we can do some balancing
+	 * to save power.
+	 */
+	if (check_power_save_busiest_group(&sds, this_cpu, imbalance))
+		return sds.busiest;
 ret:
 	*imbalance = 0;
 	return NULL;
-- 
cgit v1.2.3


From b7bb4c9bb01941fe8feb653f3410e7ed0c9bb786 Mon Sep 17 00:00:00 2001
From: Gautham R Shenoy <ego@in.ibm.com>
Date: Wed, 25 Mar 2009 14:44:27 +0530
Subject: sched: Add comments to find_busiest_group() function

Impact: cleanup

Add /** style comments around find_busiest_group(). Also add a few
explanatory comments.

This concludes the find_busiest_group() cleanup. The function is
now down to 72 lines from the original 313 lines.

Signed-off-by: Gautham R Shenoy <ego@in.ibm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Suresh Siddha <suresh.b.siddha@intel.com>
Cc: "Balbir Singh" <balbir@in.ibm.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: "Dhaval Giani" <dhaval@linux.vnet.ibm.com>
Cc: Bharata B Rao <bharata@linux.vnet.ibm.com>
Cc: "Vaidyanathan Srinivasan" <svaidy@linux.vnet.ibm.com>
LKML-Reference: <20090325091427.13992.18933.stgit@sofia.in.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 50 ++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 42 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index 5f21658b0f67..9f8506d68fdc 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3676,10 +3676,30 @@ static inline void calculate_imbalance(struct sd_lb_stats *sds, int this_cpu,
 }
 /******* find_busiest_group() helpers end here *********************/
 
-/*
- * find_busiest_group finds and returns the busiest CPU group within the
- * domain. It calculates and returns the amount of weighted load which
- * should be moved to restore balance via the imbalance parameter.
+/**
+ * find_busiest_group - Returns the busiest group within the sched_domain
+ * if there is an imbalance. If there isn't an imbalance, and
+ * the user has opted for power-savings, it returns a group whose
+ * CPUs can be put to idle by rebalancing those tasks elsewhere, if
+ * such a group exists.
+ *
+ * Also calculates the amount of weighted load which should be moved
+ * to restore balance.
+ *
+ * @sd: The sched_domain whose busiest group is to be returned.
+ * @this_cpu: The cpu for which load balancing is currently being performed.
+ * @imbalance: Variable which stores amount of weighted load which should
+ *		be moved to restore balance/put a group to idle.
+ * @idle: The idle status of this_cpu.
+ * @sd_idle: The idleness of sd
+ * @cpus: The set of CPUs under consideration for load-balancing.
+ * @balance: Pointer to a variable indicating if this_cpu
+ *	is the appropriate cpu to perform load balancing at this_level.
+ *
+ * Returns:	- the busiest group if imbalance exists.
+ *		- If no imbalance and user has opted for power-savings balance,
+ *		   return the least loaded group whose CPUs can be
+ *		   put to idle by rebalancing its tasks onto our group.
  */
 static struct sched_group *
 find_busiest_group(struct sched_domain *sd, int this_cpu,
@@ -3697,17 +3717,31 @@ find_busiest_group(struct sched_domain *sd, int this_cpu,
 	update_sd_lb_stats(sd, this_cpu, idle, sd_idle, cpus,
 					balance, &sds);
 
+	/* Cases where imbalance does not exist from POV of this_cpu */
+	/* 1) this_cpu is not the appropriate cpu to perform load balancing
+	 *    at this level.
+	 * 2) There is no busy sibling group to pull from.
+	 * 3) This group is the busiest group.
+	 * 4) This group is more busy than the avg busieness at this
+	 *    sched_domain.
+	 * 5) The imbalance is within the specified limit.
+	 * 6) Any rebalance would lead to ping-pong
+	 */
 	if (balance && !(*balance))
 		goto ret;
 
-	if (!sds.busiest || sds.this_load >= sds.max_load
-		|| sds.busiest_nr_running == 0)
+	if (!sds.busiest || sds.busiest_nr_running == 0)
+		goto out_balanced;
+
+	if (sds.this_load >= sds.max_load)
 		goto out_balanced;
 
 	sds.avg_load = (SCHED_LOAD_SCALE * sds.total_load) / sds.total_pwr;
 
-	if (sds.this_load >= sds.avg_load ||
-			100*sds.max_load <= sd->imbalance_pct * sds.this_load)
+	if (sds.this_load >= sds.avg_load)
+		goto out_balanced;
+
+	if (100 * sds.max_load <= sd->imbalance_pct * sds.this_load)
 		goto out_balanced;
 
 	sds.busiest_load_per_task /= sds.busiest_nr_running;
-- 
cgit v1.2.3


From e6f489013b985b58d096a3091ece0ed579367232 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Wed, 25 Mar 2009 16:27:17 +0800
Subject: trace_stat: don't call seq_printf() in seq_operation->start()

Impact: Fix incorrect way using seq_file's API

Use SEQ_START_TOKEN instead of calling ->stat_headers()
int seq_operation->start().

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Acked-by: Steven Rostedt <srostedt@redhat.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
LKML-Reference: <49C9EAE5.5070202@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_stat.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index f71b85b22cfe..8c129dd480ad 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -163,7 +163,7 @@ static void *stat_seq_start(struct seq_file *s, loff_t *pos)
 
 	/* If we are in the beginning of the file, print the headers */
 	if (!*pos && session->ts->stat_headers)
-		session->ts->stat_headers(s);
+		return SEQ_START_TOKEN;
 
 	return seq_list_start(&session->stat_list, *pos);
 }
@@ -172,6 +172,9 @@ static void *stat_seq_next(struct seq_file *s, void *p, loff_t *pos)
 {
 	struct tracer_stat_session *session = s->private;
 
+	if (p == SEQ_START_TOKEN)
+		return seq_list_start(&session->stat_list, *pos);
+
 	return seq_list_next(p, &session->stat_list, pos);
 }
 
@@ -186,6 +189,9 @@ static int stat_seq_show(struct seq_file *s, void *v)
 	struct tracer_stat_session *session = s->private;
 	struct trace_stat_list *l = list_entry(v, struct trace_stat_list, list);
 
+	if (v == SEQ_START_TOKEN)
+		return session->ts->stat_headers(s);
+
 	return session->ts->stat_show(s, l->stat);
 }
 
-- 
cgit v1.2.3


From 220ba351dfa57eca4bec5ce0098a276446a47958 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Wed, 25 Mar 2009 16:58:39 +0800
Subject: trace_stat: keep original order

Impact: make trace_stat files show items with the original order

trace_stat tracer reverse the items, it makes the output
looks a little ugly.

Example, when we read trace_stat/workqueues, we get cpu#7's stat.
at first, and then cpu#6... cpu#0.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Acked-by: Steven Rostedt <srostedt@redhat.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <49C9F23F.5040307@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_stat.c | 18 ++++++++----------
 1 file changed, 8 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c
index 8c129dd480ad..acdebd771a93 100644
--- a/kernel/trace/trace_stat.c
+++ b/kernel/trace/trace_stat.c
@@ -125,23 +125,21 @@ static int stat_seq_init(struct tracer_stat_session *session)
 		INIT_LIST_HEAD(&new_entry->list);
 		new_entry->stat = stat;
 
-		list_for_each_entry(iter_entry, &session->stat_list, list) {
+		list_for_each_entry_reverse(iter_entry, &session->stat_list,
+				list) {
 
 			/* Insertion with a descendent sorting */
-			if (ts->stat_cmp(new_entry->stat,
-						iter_entry->stat) > 0) {
+			if (ts->stat_cmp(iter_entry->stat,
+					new_entry->stat) >= 0) {
 
-				list_add_tail(&new_entry->list,
-						&iter_entry->list);
-				break;
-
-			/* The current smaller value */
-			} else if (list_is_last(&iter_entry->list,
-						&session->stat_list)) {
 				list_add(&new_entry->list, &iter_entry->list);
 				break;
 			}
 		}
+
+		/* The current larger value */
+		if (list_empty(&new_entry->list))
+			list_add(&new_entry->list, &session->stat_list);
 	}
 exit:
 	mutex_unlock(&session->stat_mutex);
-- 
cgit v1.2.3


From 2f63b840bc8a816ac879ee773b035cf3e433fae4 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Wed, 25 Mar 2009 16:59:18 +0800
Subject: trace_workqueues: fix empty line's output

Empty lines separate cpus stat. After previous
fix(trace_stat: keep original order) applied, the empty lines
are displayed at incorrect position.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Acked-by: Steven Rostedt <srostedt@redhat.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <49C9F266.2060706@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_workqueue.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c
index 9ab035b58cf1..797201e4a137 100644
--- a/kernel/trace/trace_workqueue.c
+++ b/kernel/trace/trace_workqueue.c
@@ -196,6 +196,11 @@ static int workqueue_stat_show(struct seq_file *s, void *p)
 	struct pid *pid;
 	struct task_struct *tsk;
 
+	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
+	if (&cws->list == workqueue_cpu_stat(cpu)->list.next)
+		seq_printf(s, "\n");
+	spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
+
 	pid = find_get_pid(cws->pid);
 	if (pid) {
 		tsk = get_pid_task(pid, PIDTYPE_PID);
@@ -208,18 +213,13 @@ static int workqueue_stat_show(struct seq_file *s, void *p)
 		put_pid(pid);
 	}
 
-	spin_lock_irqsave(&workqueue_cpu_stat(cpu)->lock, flags);
-	if (&cws->list == workqueue_cpu_stat(cpu)->list.next)
-		seq_printf(s, "\n");
-	spin_unlock_irqrestore(&workqueue_cpu_stat(cpu)->lock, flags);
-
 	return 0;
 }
 
 static int workqueue_stat_headers(struct seq_file *s)
 {
 	seq_printf(s, "# CPU  INSERTED  EXECUTED   NAME\n");
-	seq_printf(s, "# |      |         |          |\n\n");
+	seq_printf(s, "# |      |         |          |\n");
 	return 0;
 }
 
-- 
cgit v1.2.3


From 2a4efa42450762cbfa5c5712aa4cc9f06924c9fd Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Wed, 25 Mar 2009 12:06:05 +0800
Subject: ftrace: Using FTRACE_WARN_ON() to check "freed record" in
 ftrace_release()

 "Because when we call ftrace_free_rec we change the rec->ip to point to the
  next record in the chain. Something is very wrong if rec->ip >= s &&
  rec->ip < e and the record is already free."

 "Note, use FTRACE_WARN_ON() macro. This way it shuts down ftrace if it is
  hit and helps to avoid further damage later."
                   -- Steven Rostedt <rostedt@goodmis.org>

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <srostedt@redhat.com>
---
 kernel/trace/ftrace.c | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 7b8722baf153..1752a63f37c0 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -358,9 +358,14 @@ void ftrace_release(void *start, unsigned long size)
 
 	mutex_lock(&ftrace_lock);
 	do_for_each_ftrace_rec(pg, rec) {
-		if ((rec->ip >= s) && (rec->ip < e) &&
-		    !(rec->flags & FTRACE_FL_FREE))
+		if ((rec->ip >= s) && (rec->ip < e)) {
+			/*
+			 * rec->ip is changed in ftrace_free_rec()
+			 * It should not between s and e if record was freed.
+			 */
+			FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE);
 			ftrace_free_rec(rec);
+		}
 	} while_for_each_ftrace_rec();
 	mutex_unlock(&ftrace_lock);
 }
-- 
cgit v1.2.3


From 9a8118baaeb0eaa148913bed77bf9c6335f6ca63 Mon Sep 17 00:00:00 2001
From: Tom Zanussi <tzanussi@gmail.com>
Date: Thu, 26 Mar 2009 01:24:34 -0500
Subject: tracing: filter fix for TRACE_EVENT_FORMAT events

Impact: fix crash (hang) when using TRACE_EVENT_FORMAT filter files

filters are only hooked up to the tracepoint events defined using
TRACE_EVENT but not the tracers that use TRACE_EVENT_FORMAT, such
as ftrace.

Do not display the filter files at all for TRACE_EVENT_FORMAT events
for the time being.

Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: =?ISO-8859-1?Q?Fr=E9d=E9ric?= Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1237878882.8339.61.camel@charm-linux>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_events.c | 18 +++++-------------
 1 file changed, 5 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index d132997ab756..64ec4d278ffb 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -680,7 +680,6 @@ static struct dentry *
 event_subsystem_dir(const char *name, struct dentry *d_events)
 {
 	struct event_subsystem *system;
-	struct dentry *entry;
 
 	/* First see if we did not already create this dir */
 	list_for_each_entry(system, &event_subsystems, list) {
@@ -709,12 +708,6 @@ event_subsystem_dir(const char *name, struct dentry *d_events)
 
 	system->preds = NULL;
 
-	entry = debugfs_create_file("filter", 0644, system->entry, system,
-				    &ftrace_subsystem_filter_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'%s/filter' entry\n", name);
-
 	return system->entry;
 }
 
@@ -770,14 +763,13 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events)
 				   " events/%s\n", call->name);
 			return ret;
 		}
+		entry = debugfs_create_file("filter", 0644, call->dir, call,
+					    &ftrace_event_filter_fops);
+		if (!entry)
+			pr_warning("Could not create debugfs "
+				   "'%s/filter' entry\n", call->name);
 	}
 
-	entry = debugfs_create_file("filter", 0644, call->dir, call,
-				    &ftrace_event_filter_fops);
-	if (!entry)
-		pr_warning("Could not create debugfs "
-			   "'%s/filter' entry\n", call->name);
-
 	/* A trace may not want to export its format */
 	if (!call->show_format)
 		return 0;
-- 
cgit v1.2.3


From 3ba13d179e8c24c68eac32b93593a6b10fcd1572 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Fri, 20 Feb 2009 06:02:22 +0000
Subject: constify dentry_operations: rest

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/ia64/kernel/perfmon.c | 2 +-
 fs/anon_inodes.c           | 2 +-
 fs/libfs.c                 | 2 +-
 fs/pipe.c                  | 2 +-
 kernel/cgroup.c            | 2 +-
 net/socket.c               | 2 +-
 net/sunrpc/rpc_pipe.c      | 2 +-
 7 files changed, 7 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index 0e499757309b..5c0f408cfd71 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -2196,7 +2196,7 @@ pfmfs_delete_dentry(struct dentry *dentry)
 	return 1;
 }
 
-static struct dentry_operations pfmfs_dentry_operations = {
+static const struct dentry_operations pfmfs_dentry_operations = {
 	.d_delete = pfmfs_delete_dentry,
 };
 
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 3bbdb9d02376..1dd96d4406c0 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -48,7 +48,7 @@ static struct file_system_type anon_inode_fs_type = {
 	.get_sb		= anon_inodefs_get_sb,
 	.kill_sb	= kill_anon_super,
 };
-static struct dentry_operations anon_inodefs_dentry_operations = {
+static const struct dentry_operations anon_inodefs_dentry_operations = {
 	.d_delete	= anon_inodefs_delete_dentry,
 };
 
diff --git a/fs/libfs.c b/fs/libfs.c
index 49b44099dabb..ec600bd33e75 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -44,7 +44,7 @@ static int simple_delete_dentry(struct dentry *dentry)
  */
 struct dentry *simple_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd)
 {
-	static struct dentry_operations simple_dentry_operations = {
+	static const struct dentry_operations simple_dentry_operations = {
 		.d_delete = simple_delete_dentry,
 	};
 
diff --git a/fs/pipe.c b/fs/pipe.c
index df3719562fc1..6ddf05209a4c 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -870,7 +870,7 @@ static char *pipefs_dname(struct dentry *dentry, char *buffer, int buflen)
 				dentry->d_inode->i_ino);
 }
 
-static struct dentry_operations pipefs_dentry_operations = {
+static const struct dentry_operations pipefs_dentry_operations = {
 	.d_delete	= pipefs_delete_dentry,
 	.d_dname	= pipefs_dname,
 };
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 9edb5c4b79b4..b01100ebd074 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1627,7 +1627,7 @@ static struct inode_operations cgroup_dir_inode_operations = {
 static int cgroup_create_file(struct dentry *dentry, int mode,
 				struct super_block *sb)
 {
-	static struct dentry_operations cgroup_dops = {
+	static const struct dentry_operations cgroup_dops = {
 		.d_iput = cgroup_diput,
 	};
 
diff --git a/net/socket.c b/net/socket.c
index 35dd7371752a..2f895f60ca8a 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -328,7 +328,7 @@ static char *sockfs_dname(struct dentry *dentry, char *buffer, int buflen)
 				dentry->d_inode->i_ino);
 }
 
-static struct dentry_operations sockfs_dentry_operations = {
+static const struct dentry_operations sockfs_dentry_operations = {
 	.d_delete = sockfs_delete_dentry,
 	.d_dname  = sockfs_dname,
 };
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index 577385a4a5dc..9ced0628d69c 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -480,7 +480,7 @@ static int rpc_delete_dentry(struct dentry *dentry)
 	return 1;
 }
 
-static struct dentry_operations rpc_dentry_operations = {
+static const struct dentry_operations rpc_dentry_operations = {
 	.d_delete = rpc_delete_dentry,
 };
 
-- 
cgit v1.2.3


From a3ec947c85ec339884b30ef6a08133e9311fdae1 Mon Sep 17 00:00:00 2001
From: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Date: Wed, 4 Mar 2009 12:06:34 -0800
Subject: vfs: simple_set_mnt() should return void

simple_set_mnt() is defined as returning 'int' but always returns 0.
Callers assume simple_set_mnt() never fails and don't properly cleanup if
it were to _ever_ fail.  For instance, get_sb_single() and get_sb_nodev()
should:

        up_write(sb->s_unmount);
        deactivate_super(sb);

if simple_set_mnt() fails.

Since simple_set_mnt() never fails, would be cleaner if it did not
return anything.

[akpm@linux-foundation.org: fix build]
Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 drivers/mtd/mtdsuper.c | 7 +++++--
 fs/9p/vfs_super.c      | 5 +++--
 fs/cifs/cifsfs.c       | 3 ++-
 fs/devpts/inode.c      | 3 ++-
 fs/libfs.c             | 3 ++-
 fs/namespace.c         | 3 +--
 fs/proc/root.c         | 3 ++-
 fs/super.c             | 9 ++++++---
 fs/ubifs/super.c       | 3 ++-
 include/linux/fs.h     | 2 +-
 kernel/cgroup.c        | 3 ++-
 11 files changed, 28 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/drivers/mtd/mtdsuper.c b/drivers/mtd/mtdsuper.c
index 00d46e137b2a..92285d0089c2 100644
--- a/drivers/mtd/mtdsuper.c
+++ b/drivers/mtd/mtdsuper.c
@@ -81,13 +81,16 @@ static int get_sb_mtd_aux(struct file_system_type *fs_type, int flags,
 
 	/* go */
 	sb->s_flags |= MS_ACTIVE;
-	return simple_set_mnt(mnt, sb);
+	simple_set_mnt(mnt, sb);
+
+	return 0;
 
 	/* new mountpoint for an already mounted superblock */
 already_mounted:
 	DEBUG(1, "MTDSB: Device %d (\"%s\") is already mounted\n",
 	      mtd->index, mtd->name);
-	ret = simple_set_mnt(mnt, sb);
+	simple_set_mnt(mnt, sb);
+	ret = 0;
 	goto out_put;
 
 out_error:
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 93212e40221a..5f8ab8adb5f5 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -168,8 +168,9 @@ static int v9fs_get_sb(struct file_system_type *fs_type, int flags,
 	p9stat_free(st);
 	kfree(st);
 
-P9_DPRINTK(P9_DEBUG_VFS, " return simple set mount\n");
-	return simple_set_mnt(mnt, sb);
+P9_DPRINTK(P9_DEBUG_VFS, " simple set mount, return 0\n");
+	simple_set_mnt(mnt, sb);
+	return 0;
 
 release_sb:
 	if (sb) {
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index 13ea53251dcf..38491fd3871d 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -606,7 +606,8 @@ cifs_get_sb(struct file_system_type *fs_type,
 		return rc;
 	}
 	sb->s_flags |= MS_ACTIVE;
-	return simple_set_mnt(mnt, sb);
+	simple_set_mnt(mnt, sb);
+	return 0;
 }
 
 static ssize_t cifs_file_aio_write(struct kiocb *iocb, const struct iovec *iov,
diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index 140b43144cd8..b0a76340a4cd 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -454,7 +454,8 @@ static int get_init_pts_sb(struct file_system_type *fs_type, int flags,
 		s->s_flags |= MS_ACTIVE;
 	}
 	do_remount_sb(s, flags, data, 0);
-	return simple_set_mnt(mnt, s);
+	simple_set_mnt(mnt, s);
+	return 0;
 }
 
 /*
diff --git a/fs/libfs.c b/fs/libfs.c
index ec600bd33e75..4910a36f516e 100644
--- a/fs/libfs.c
+++ b/fs/libfs.c
@@ -242,7 +242,8 @@ int get_sb_pseudo(struct file_system_type *fs_type, char *name,
 	d_instantiate(dentry, root);
 	s->s_root = dentry;
 	s->s_flags |= MS_ACTIVE;
-	return simple_set_mnt(mnt, s);
+	simple_set_mnt(mnt, s);
+	return 0;
 
 Enomem:
 	up_write(&s->s_umount);
diff --git a/fs/namespace.c b/fs/namespace.c
index 06f8e63f6cb1..2432ca6bb223 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -397,11 +397,10 @@ static void __mnt_unmake_readonly(struct vfsmount *mnt)
 	spin_unlock(&vfsmount_lock);
 }
 
-int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
+void simple_set_mnt(struct vfsmount *mnt, struct super_block *sb)
 {
 	mnt->mnt_sb = sb;
 	mnt->mnt_root = dget(sb->s_root);
-	return 0;
 }
 
 EXPORT_SYMBOL(simple_set_mnt);
diff --git a/fs/proc/root.c b/fs/proc/root.c
index f6299a25594e..1e15a2b176e8 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -83,7 +83,8 @@ static int proc_get_sb(struct file_system_type *fs_type,
 		ns->proc_mnt = mnt;
 	}
 
-	return simple_set_mnt(mnt, sb);
+	simple_set_mnt(mnt, sb);
+	return 0;
 }
 
 static void proc_kill_sb(struct super_block *sb)
diff --git a/fs/super.c b/fs/super.c
index 6ce501447ada..e512fab64c93 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -831,7 +831,8 @@ int get_sb_bdev(struct file_system_type *fs_type,
 		bdev->bd_super = s;
 	}
 
-	return simple_set_mnt(mnt, s);
+	simple_set_mnt(mnt, s);
+	return 0;
 
 error_s:
 	error = PTR_ERR(s);
@@ -877,7 +878,8 @@ int get_sb_nodev(struct file_system_type *fs_type,
 		return error;
 	}
 	s->s_flags |= MS_ACTIVE;
-	return simple_set_mnt(mnt, s);
+	simple_set_mnt(mnt, s);
+	return 0;
 }
 
 EXPORT_SYMBOL(get_sb_nodev);
@@ -909,7 +911,8 @@ int get_sb_single(struct file_system_type *fs_type,
 		s->s_flags |= MS_ACTIVE;
 	}
 	do_remount_sb(s, flags, data, 0);
-	return simple_set_mnt(mnt, s);
+	simple_set_mnt(mnt, s);
+	return 0;
 }
 
 EXPORT_SYMBOL(get_sb_single);
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index 1182b66a5491..c5c98355459a 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -2034,7 +2034,8 @@ static int ubifs_get_sb(struct file_system_type *fs_type, int flags,
 	/* 'fill_super()' opens ubi again so we must close it here */
 	ubi_close_volume(ubi);
 
-	return simple_set_mnt(mnt, sb);
+	simple_set_mnt(mnt, sb);
+	return 0;
 
 out_deact:
 	up_write(&sb->s_umount);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index c2c4454a268a..a7d73914a9f7 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1719,7 +1719,7 @@ struct super_block *sget(struct file_system_type *type,
 extern int get_sb_pseudo(struct file_system_type *, char *,
 	const struct super_operations *ops, unsigned long,
 	struct vfsmount *mnt);
-extern int simple_set_mnt(struct vfsmount *mnt, struct super_block *sb);
+extern void simple_set_mnt(struct vfsmount *mnt, struct super_block *sb);
 int __put_super_and_need_restart(struct super_block *sb);
 
 /* Alas, no aliases. Too much hassle with bringing module.h everywhere */
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index b01100ebd074..c500ca7239b2 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1071,7 +1071,8 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
 		mutex_unlock(&cgroup_mutex);
 	}
 
-	return simple_set_mnt(mnt, sb);
+	simple_set_mnt(mnt, sb);
+	return 0;
 
  free_cg_links:
 	free_cg_links(&tmp_cg_links);
-- 
cgit v1.2.3


From 9710794383ee5008d67f1a6613a4717bf6de47bc Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sun, 15 Mar 2009 11:11:44 -0700
Subject: async: remove the temporary (2.6.29) "async is off by default" code

Now that everyone has been able to test the async code (and it's being used
in the Moblin betas by default), we can enable it by default.
The various fixes needed have gone into 2.6.29 already.

[With an important bugfix from Stefan Richter]

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
---
 kernel/async.c | 18 +++++-------------
 1 file changed, 5 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/async.c b/kernel/async.c
index f565891f2c9b..968ef9457d4e 100644
--- a/kernel/async.c
+++ b/kernel/async.c
@@ -49,6 +49,7 @@ asynchronous and synchronous parts of the kernel.
 */
 
 #include <linux/async.h>
+#include <linux/bug.h>
 #include <linux/module.h>
 #include <linux/wait.h>
 #include <linux/sched.h>
@@ -387,20 +388,11 @@ static int async_manager_thread(void *unused)
 
 static int __init async_init(void)
 {
-	if (async_enabled)
-		if (IS_ERR(kthread_run(async_manager_thread, NULL,
-				       "async/mgr")))
-			async_enabled = 0;
-	return 0;
-}
+	async_enabled =
+		!IS_ERR(kthread_run(async_manager_thread, NULL, "async/mgr"));
 
-static int __init setup_async(char *str)
-{
-	async_enabled = 1;
-	return 1;
+	WARN_ON(!async_enabled);
+	return 0;
 }
 
-__setup("fastboot", setup_async);
-
-
 core_initcall(async_init);
-- 
cgit v1.2.3


From d5ac537e5fb6fc12384c9f3ed6a15e912dfbbc2a Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Sat, 28 Mar 2009 21:52:47 -0700
Subject: sched: fix errors in struct & function comments

Fix kernel-doc errors in sched.c:  the structs don't have
kernel-doc notation and the short function description needs to
be one line only.

  Error(kernel/sched.c:3197): cannot understand prototype: 'struct sd_lb_stats '
  Error(kernel/sched.c:3228): cannot understand prototype: 'struct sg_lb_stats '
  Error(kernel/sched.c:3375): duplicate section name 'Description'

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sched.c | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched.c b/kernel/sched.c
index f4c413bdd38d..5757e03cfac0 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3190,7 +3190,7 @@ static int move_one_task(struct rq *this_rq, int this_cpu, struct rq *busiest,
 	return 0;
 }
 /********** Helpers for find_busiest_group ************************/
-/**
+/*
  * sd_lb_stats - Structure to store the statistics of a sched_domain
  * 		during load balancing.
  */
@@ -3222,7 +3222,7 @@ struct sd_lb_stats {
 #endif
 };
 
-/**
+/*
  * sg_lb_stats - stats of a sched_group required for load_balancing
  */
 struct sg_lb_stats {
@@ -3360,16 +3360,17 @@ static inline void update_sd_power_savings_stats(struct sched_group *group,
 }
 
 /**
- * check_power_save_busiest_group - Check if we have potential to perform
- *	some power-savings balance. If yes, set the busiest group to be
- *	the least loaded group in the sched_domain, so that it's CPUs can
- *	be put to idle.
- *
+ * check_power_save_busiest_group - see if there is potential for some power-savings balance
  * @sds: Variable containing the statistics of the sched_domain
  *	under consideration.
  * @this_cpu: Cpu at which we're currently performing load-balancing.
  * @imbalance: Variable to store the imbalance.
  *
+ * Description:
+ * Check if we have potential to perform some power-savings balance.
+ * If yes, set the busiest group to be the least loaded group in the
+ * sched_domain, so that it's CPUs can be put to idle.
+ *
  * Returns 1 if there is potential to perform power-savings balance.
  * Else returns 0.
  */
-- 
cgit v1.2.3


From 1a2142afa5646ad5af44bbe1febaa5e0b7e71156 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 30 Mar 2009 22:05:10 -0600
Subject: cpumask: remove dangerous CPU_MASK_ALL_PTR, &CPU_MASK_ALL

Impact: cleanup

(Thanks to Al Viro for reminding me of this, via Ingo)

CPU_MASK_ALL is the (deprecated) "all bits set" cpumask, defined as so:

	#define CPU_MASK_ALL (cpumask_t) { { ... } }

Taking the address of such a temporary is questionable at best,
unfortunately 321a8e9d (cpumask: add CPU_MASK_ALL_PTR macro) added
CPU_MASK_ALL_PTR:

	#define CPU_MASK_ALL_PTR (&CPU_MASK_ALL)

Which formalizes this practice.  One day gcc could bite us over this
usage (though we seem to have gotten away with it so far).

So replace everywhere which used &CPU_MASK_ALL or CPU_MASK_ALL_PTR
with the modern "cpu_all_mask" (a real const struct cpumask *).

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Acked-by: Ingo Molnar <mingo@elte.hu>
Reported-by: Al Viro <viro@zeniv.linux.org.uk>
Cc: Mike Travis <travis@sgi.com>
---
 init/main.c      | 2 +-
 kernel/kmod.c    | 2 +-
 kernel/kthread.c | 4 ++--
 mm/pdflush.c     | 2 +-
 4 files changed, 5 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/init/main.c b/init/main.c
index 6bf83afd654d..1ac7ec78e601 100644
--- a/init/main.c
+++ b/init/main.c
@@ -842,7 +842,7 @@ static int __init kernel_init(void * unused)
 	/*
 	 * init can run on any cpu.
 	 */
-	set_cpus_allowed_ptr(current, CPU_MASK_ALL_PTR);
+	set_cpus_allowed_ptr(current, cpu_all_mask);
 	/*
 	 * Tell the world that we're going to be the grim
 	 * reaper of innocent orphaned children.
diff --git a/kernel/kmod.c b/kernel/kmod.c
index a27a5f64443d..f0c8f545180d 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -167,7 +167,7 @@ static int ____call_usermodehelper(void *data)
 	}
 
 	/* We can run anywhere, unlike our parent keventd(). */
-	set_cpus_allowed_ptr(current, CPU_MASK_ALL_PTR);
+	set_cpus_allowed_ptr(current, cpu_all_mask);
 
 	/*
 	 * Our parent is keventd, which runs with elevated scheduling priority.
diff --git a/kernel/kthread.c b/kernel/kthread.c
index 4fbc456f393d..84bbadd4d021 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -110,7 +110,7 @@ static void create_kthread(struct kthread_create_info *create)
 		 */
 		sched_setscheduler(create->result, SCHED_NORMAL, &param);
 		set_user_nice(create->result, KTHREAD_NICE_LEVEL);
-		set_cpus_allowed_ptr(create->result, CPU_MASK_ALL_PTR);
+		set_cpus_allowed_ptr(create->result, cpu_all_mask);
 	}
 	complete(&create->done);
 }
@@ -240,7 +240,7 @@ int kthreadd(void *unused)
 	set_task_comm(tsk, "kthreadd");
 	ignore_signals(tsk);
 	set_user_nice(tsk, KTHREAD_NICE_LEVEL);
-	set_cpus_allowed_ptr(tsk, CPU_MASK_ALL_PTR);
+	set_cpus_allowed_ptr(tsk, cpu_all_mask);
 
 	current->flags |= PF_NOFREEZE | PF_FREEZER_NOSIG;
 
diff --git a/mm/pdflush.c b/mm/pdflush.c
index 15de509b68fd..118905e3d788 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -191,7 +191,7 @@ static int pdflush(void *dummy)
 
 	/*
 	 * Some configs put our parent kthread in a limited cpuset,
-	 * which kthread() overrides, forcing cpus_allowed == CPU_MASK_ALL.
+	 * which kthread() overrides, forcing cpus_allowed == cpu_all_mask.
 	 * Our needs are more modest - cut back to our cpusets cpus_allowed.
 	 * This is needed as pdflush's are dynamically created and destroyed.
 	 * The boottime pdflush's are easily placed w/o these 2 lines.
-- 
cgit v1.2.3


From 2b17fa506c418e9fb02bbbc7f426d2bbb5b247a6 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 30 Mar 2009 22:05:12 -0600
Subject: cpumask: use set_cpu_active in init/main.c

cpu_active_map is deprecated in favor of cpu_active_mask, which is
const for safety: we use accessors now (set_cpu_active) is we really
want to make a change.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 init/main.c  | 3 +--
 kernel/cpu.c | 6 +++---
 2 files changed, 4 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/init/main.c b/init/main.c
index 1ac7ec78e601..d6b388fbffa6 100644
--- a/init/main.c
+++ b/init/main.c
@@ -407,8 +407,7 @@ static void __init smp_init(void)
 	 * Set up the current CPU as possible to migrate to.
 	 * The other ones will be done by cpu_up/cpu_down()
 	 */
-	cpu = smp_processor_id();
-	cpu_set(cpu, cpu_active_map);
+	set_cpu_active(smp_processor_id(), true);
 
 	/* FIXME: This should be done in userspace --RR */
 	for_each_present_cpu(cpu) {
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 79e40f00dcb8..395b6974dc8d 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -281,7 +281,7 @@ int __ref cpu_down(unsigned int cpu)
 		goto out;
 	}
 
-	cpu_clear(cpu, cpu_active_map);
+	set_cpu_active(cpu, false);
 
 	/*
 	 * Make sure the all cpus did the reschedule and are not
@@ -296,7 +296,7 @@ int __ref cpu_down(unsigned int cpu)
 	err = _cpu_down(cpu, 0);
 
 	if (cpu_online(cpu))
-		cpu_set(cpu, cpu_active_map);
+		set_cpu_active(cpu, true);
 
 out:
 	cpu_maps_update_done();
@@ -333,7 +333,7 @@ static int __cpuinit _cpu_up(unsigned int cpu, int tasks_frozen)
 		goto out_notify;
 	BUG_ON(!cpu_online(cpu));
 
-	cpu_set(cpu, cpu_active_map);
+	set_cpu_active(cpu, true);
 
 	/* Now call notifier in preparation. */
 	raw_notifier_call_chain(&cpu_chain, CPU_ONLINE | mod, hcpu);
-- 
cgit v1.2.3


From 9489424454c93f4d225d7af47978f8c7e84bf4d4 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 30 Mar 2009 22:05:12 -0600
Subject: cpumask: use mm_cpumask() wrapper: kernel/fork.c

Impact: futureproof

Makes code futureproof against the impending change to mm->cpu_vm_mask.

It's also a chance to use the new cpumask_ ops which take a pointer.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/fork.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index 6715ebc3761d..47c15840a381 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -284,7 +284,7 @@ static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm)
 	mm->free_area_cache = oldmm->mmap_base;
 	mm->cached_hole_size = ~0UL;
 	mm->map_count = 0;
-	cpus_clear(mm->cpu_vm_mask);
+	cpumask_clear(mm_cpumask(mm));
 	mm->mm_rb = RB_ROOT;
 	rb_link = &mm->mm_rb.rb_node;
 	rb_parent = NULL;
-- 
cgit v1.2.3


From aa85ea5b89c36c51200d795dd788139bd9b8cf50 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 30 Mar 2009 22:05:15 -0600
Subject: cpumask: use new cpumask_ functions in core code.

Impact: cleanup

Time to clean up remaining laggards using the old cpu_ functions.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Cc: Greg Kroah-Hartman <gregkh@suse.de>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Trond.Myklebust@netapp.com
---
 drivers/base/cpu.c     | 2 +-
 include/linux/cpuset.h | 4 ++--
 kernel/workqueue.c     | 6 +++---
 mm/allocpercpu.c       | 2 +-
 mm/vmstat.c            | 2 +-
 net/sunrpc/svc.c       | 2 +-
 6 files changed, 9 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c
index 5b257a57bc57..e62a4ccea54d 100644
--- a/drivers/base/cpu.c
+++ b/drivers/base/cpu.c
@@ -119,7 +119,7 @@ static ssize_t print_cpus_map(char *buf, const struct cpumask *map)
 #define	print_cpus_func(type) \
 static ssize_t print_cpus_##type(struct sysdev_class *class, char *buf)	\
 {									\
-	return print_cpus_map(buf, &cpu_##type##_map);			\
+	return print_cpus_map(buf, cpu_##type##_mask);			\
 }									\
 static struct sysdev_class_attribute attr_##type##_map = 		\
 	_SYSDEV_CLASS_ATTR(type, 0444, print_cpus_##type, NULL)
diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 90c6074a36ca..2e0d79678deb 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -90,12 +90,12 @@ static inline void cpuset_init_smp(void) {}
 static inline void cpuset_cpus_allowed(struct task_struct *p,
 				       struct cpumask *mask)
 {
-	*mask = cpu_possible_map;
+	cpumask_copy(mask, cpu_possible_mask);
 }
 static inline void cpuset_cpus_allowed_locked(struct task_struct *p,
 					      struct cpumask *mask)
 {
-	*mask = cpu_possible_map;
+	cpumask_copy(mask, cpu_possible_mask);
 }
 
 static inline nodemask_t cpuset_mems_allowed(struct task_struct *p)
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 1f0c509b40d3..9aedd9fd825b 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -416,7 +416,7 @@ void flush_workqueue(struct workqueue_struct *wq)
 	might_sleep();
 	lock_map_acquire(&wq->lockdep_map);
 	lock_map_release(&wq->lockdep_map);
-	for_each_cpu_mask_nr(cpu, *cpu_map)
+	for_each_cpu(cpu, cpu_map)
 		flush_cpu_workqueue(per_cpu_ptr(wq->cpu_wq, cpu));
 }
 EXPORT_SYMBOL_GPL(flush_workqueue);
@@ -547,7 +547,7 @@ static void wait_on_work(struct work_struct *work)
 	wq = cwq->wq;
 	cpu_map = wq_cpu_map(wq);
 
-	for_each_cpu_mask_nr(cpu, *cpu_map)
+	for_each_cpu(cpu, cpu_map)
 		wait_on_cpu_work(per_cpu_ptr(wq->cpu_wq, cpu), work);
 }
 
@@ -911,7 +911,7 @@ void destroy_workqueue(struct workqueue_struct *wq)
 	list_del(&wq->list);
 	spin_unlock(&workqueue_lock);
 
-	for_each_cpu_mask_nr(cpu, *cpu_map)
+	for_each_cpu(cpu, cpu_map)
 		cleanup_workqueue_thread(per_cpu_ptr(wq->cpu_wq, cpu));
  	cpu_maps_update_done();
 
diff --git a/mm/allocpercpu.c b/mm/allocpercpu.c
index 1882923bc706..139d5b7b6621 100644
--- a/mm/allocpercpu.c
+++ b/mm/allocpercpu.c
@@ -143,7 +143,7 @@ void free_percpu(void *__pdata)
 {
 	if (unlikely(!__pdata))
 		return;
-	__percpu_depopulate_mask(__pdata, &cpu_possible_map);
+	__percpu_depopulate_mask(__pdata, cpu_possible_mask);
 	kfree(__percpu_disguise(__pdata));
 }
 EXPORT_SYMBOL_GPL(free_percpu);
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 91149746bb8d..8cd81ea1ddc1 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -27,7 +27,7 @@ static void sum_vm_events(unsigned long *ret, const struct cpumask *cpumask)
 
 	memset(ret, 0, NR_VM_EVENT_ITEMS * sizeof(unsigned long));
 
-	for_each_cpu_mask_nr(cpu, *cpumask) {
+	for_each_cpu(cpu, cpumask) {
 		struct vm_event_state *this = &per_cpu(vm_event_states, cpu);
 
 		for (i = 0; i < NR_VM_EVENT_ITEMS; i++)
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index c51fed4d1af1..bb507e2bb94d 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -312,7 +312,7 @@ svc_pool_map_set_cpumask(struct task_struct *task, unsigned int pidx)
 	switch (m->mode) {
 	case SVC_POOL_PERCPU:
 	{
-		set_cpus_allowed_ptr(task, &cpumask_of_cpu(node));
+		set_cpus_allowed_ptr(task, cpumask_of(node));
 		break;
 	}
 	case SVC_POOL_PERNODE:
-- 
cgit v1.2.3


From 73d0a4b107d58908305f272bfae9bd17f74a2c81 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 30 Mar 2009 22:05:16 -0600
Subject: cpumask: convert rcutorture.c

We're getting rid of cpumasks on the stack.

Simply change tmp_mask to a global, and allocate it in
rcu_torture_init().

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Acked-by: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Cc: Josh Triplett <josh@freedesktop.org>
---
 kernel/rcutorture.c | 25 +++++++++++++++++--------
 1 file changed, 17 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/rcutorture.c b/kernel/rcutorture.c
index 7c4142a79f0a..9b4a975a4b4a 100644
--- a/kernel/rcutorture.c
+++ b/kernel/rcutorture.c
@@ -126,6 +126,7 @@ static atomic_t n_rcu_torture_mberror;
 static atomic_t n_rcu_torture_error;
 static long n_rcu_torture_timers = 0;
 static struct list_head rcu_torture_removed;
+static cpumask_var_t shuffle_tmp_mask;
 
 static int stutter_pause_test = 0;
 
@@ -889,10 +890,9 @@ static int rcu_idle_cpu;	/* Force all torture tasks off this CPU */
  */
 static void rcu_torture_shuffle_tasks(void)
 {
-	cpumask_t tmp_mask;
 	int i;
 
-	cpus_setall(tmp_mask);
+	cpumask_setall(shuffle_tmp_mask);
 	get_online_cpus();
 
 	/* No point in shuffling if there is only one online CPU (ex: UP) */
@@ -902,29 +902,29 @@ static void rcu_torture_shuffle_tasks(void)
 	}
 
 	if (rcu_idle_cpu != -1)
-		cpu_clear(rcu_idle_cpu, tmp_mask);
+		cpumask_clear_cpu(rcu_idle_cpu, shuffle_tmp_mask);
 
-	set_cpus_allowed_ptr(current, &tmp_mask);
+	set_cpus_allowed_ptr(current, shuffle_tmp_mask);
 
 	if (reader_tasks) {
 		for (i = 0; i < nrealreaders; i++)
 			if (reader_tasks[i])
 				set_cpus_allowed_ptr(reader_tasks[i],
-						     &tmp_mask);
+						     shuffle_tmp_mask);
 	}
 
 	if (fakewriter_tasks) {
 		for (i = 0; i < nfakewriters; i++)
 			if (fakewriter_tasks[i])
 				set_cpus_allowed_ptr(fakewriter_tasks[i],
-						     &tmp_mask);
+						     shuffle_tmp_mask);
 	}
 
 	if (writer_task)
-		set_cpus_allowed_ptr(writer_task, &tmp_mask);
+		set_cpus_allowed_ptr(writer_task, shuffle_tmp_mask);
 
 	if (stats_task)
-		set_cpus_allowed_ptr(stats_task, &tmp_mask);
+		set_cpus_allowed_ptr(stats_task, shuffle_tmp_mask);
 
 	if (rcu_idle_cpu == -1)
 		rcu_idle_cpu = num_online_cpus() - 1;
@@ -1012,6 +1012,7 @@ rcu_torture_cleanup(void)
 	if (shuffler_task) {
 		VERBOSE_PRINTK_STRING("Stopping rcu_torture_shuffle task");
 		kthread_stop(shuffler_task);
+		free_cpumask_var(shuffle_tmp_mask);
 	}
 	shuffler_task = NULL;
 
@@ -1190,10 +1191,18 @@ rcu_torture_init(void)
 	}
 	if (test_no_idle_hz) {
 		rcu_idle_cpu = num_online_cpus() - 1;
+
+		if (!alloc_cpumask_var(&shuffle_tmp_mask, GFP_KERNEL)) {
+			firsterr = -ENOMEM;
+			VERBOSE_PRINTK_ERRSTRING("Failed to alloc mask");
+			goto unwind;
+		}
+
 		/* Create the shuffler thread */
 		shuffler_task = kthread_run(rcu_torture_shuffle, NULL,
 					  "rcu_torture_shuffle");
 		if (IS_ERR(shuffler_task)) {
+			free_cpumask_var(shuffle_tmp_mask);
 			firsterr = PTR_ERR(shuffler_task);
 			VERBOSE_PRINTK_ERRSTRING("Failed to create shuffler");
 			shuffler_task = NULL;
-- 
cgit v1.2.3


From 612a726faf8486fa48b34fa37115ce1e7421d383 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Mon, 30 Mar 2009 22:05:16 -0600
Subject: cpumask: remove cpumask_t from core

Impact: cleanup

struct cpumask is nicer, and we use it to make where we've made code
safe for CONFIG_CPUMASK_OFFSTACK=y.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/sched_cpupri.h | 2 +-
 kernel/stop_machine.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sched_cpupri.h b/kernel/sched_cpupri.h
index 642a94ef8a0a..9a7e859b8fbf 100644
--- a/kernel/sched_cpupri.h
+++ b/kernel/sched_cpupri.h
@@ -25,7 +25,7 @@ struct cpupri {
 
 #ifdef CONFIG_SMP
 int  cpupri_find(struct cpupri *cp,
-		 struct task_struct *p, cpumask_t *lowest_mask);
+		 struct task_struct *p, struct cpumask *lowest_mask);
 void cpupri_set(struct cpupri *cp, int cpu, int pri);
 int cpupri_init(struct cpupri *cp, bool bootmem);
 void cpupri_cleanup(struct cpupri *cp);
diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index 74541ca49536..912823e2a11b 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -44,7 +44,7 @@ static DEFINE_MUTEX(setup_lock);
 static int refcount;
 static struct workqueue_struct *stop_machine_wq;
 static struct stop_machine_data active, idle;
-static const cpumask_t *active_cpus;
+static const struct cpumask *active_cpus;
 static void *stop_machine_work;
 
 static void set_state(enum stopmachine_state newstate)
-- 
cgit v1.2.3


From 2a93a1f21480f3aa134fe0c48954b64b8a97ef25 Mon Sep 17 00:00:00 2001
From: Uwe Kleine-Koenig <ukleinek@strlen.de>
Date: Mon, 12 Jan 2009 23:35:50 +0100
Subject: trivial: fix typo "resgister" -> "register"

Signed-off-by: Uwe Kleine-Koenig <ukleinek@strlen.de>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 kernel/trace/ftrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index fdf913dfc7e8..53e8c8bc0c98 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -1908,7 +1908,7 @@ int register_ftrace_function(struct ftrace_ops *ops)
 }
 
 /**
- * unregister_ftrace_function - unresgister a function for profiling.
+ * unregister_ftrace_function - unregister a function for profiling.
  * @ops - ops structure that holds the function to unregister
  *
  * Unregister a function that was added to be called by ftrace profiling.
-- 
cgit v1.2.3


From 877d03105d04b2c13e241130277fa69c8d2564f0 Mon Sep 17 00:00:00 2001
From: Nick Andrew <nick@nick-andrew.net>
Date: Mon, 26 Jan 2009 11:06:57 +0100
Subject: trivial: Fix misspelling of firmware

Fix misspelling of firmware.

Signed-off-by: Nick Andrew <nick@nick-andrew.net>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 Documentation/ia64/kvm.txt                                    | 2 +-
 Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe/firmware.txt | 2 +-
 arch/mips/sgi-ip27/ip27-smp.c                                 | 2 +-
 arch/sparc/kernel/head_64.S                                   | 2 +-
 drivers/net/sb1250-mac.c                                      | 2 +-
 drivers/net/tg3.c                                             | 2 +-
 drivers/net/wireless/ipw2x00/ipw2100.c                        | 2 +-
 drivers/net/wireless/ipw2x00/ipw2200.c                        | 2 +-
 drivers/net/wireless/iwlwifi/iwl-agn.c                        | 2 +-
 drivers/net/wireless/iwlwifi/iwl3945-base.c                   | 2 +-
 drivers/net/wireless/libertas/cmd.c                           | 2 +-
 drivers/pci/pci.c                                             | 2 +-
 drivers/platform/x86/thinkpad_acpi.c                          | 2 +-
 drivers/staging/otus/hal/hpmain.c                             | 2 +-
 drivers/usb/atm/ueagle-atm.c                                  | 2 +-
 drivers/usb/serial/ChangeLog.history                          | 2 +-
 include/linux/libata.h                                        | 2 +-
 kernel/power/disk.c                                           | 4 ++--
 sound/oss/pss.c                                               | 2 +-
 sound/sh/aica.c                                               | 2 +-
 20 files changed, 21 insertions(+), 21 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/ia64/kvm.txt b/Documentation/ia64/kvm.txt
index 84f7cb3d5bec..ffb5c80bec3e 100644
--- a/Documentation/ia64/kvm.txt
+++ b/Documentation/ia64/kvm.txt
@@ -42,7 +42,7 @@ Note: For step 2, please make sure that host page size == TARGET_PAGE_SIZE of qe
 		hg clone http://xenbits.xensource.com/ext/efi-vfirmware.hg
 	    you can get the firmware's binary in the directory of efi-vfirmware.hg/binaries.
 
-	(3) Rename the firware you owned to Flash.fd, and copy it to /usr/local/share/qemu
+	(3) Rename the firmware you owned to Flash.fd, and copy it to /usr/local/share/qemu
 
 4. Boot up Linux or Windows guests:
 	4.1 Create or install a image for guest boot. If you have xen experience, it should be easy.
diff --git a/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe/firmware.txt b/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe/firmware.txt
index 6c238f59b2a9..249db3a15d15 100644
--- a/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe/firmware.txt
+++ b/Documentation/powerpc/dts-bindings/fsl/cpm_qe/qe/firmware.txt
@@ -1,6 +1,6 @@
 * Uploaded QE firmware
 
-      If a new firwmare has been uploaded to the QE (usually by the
+      If a new firmware has been uploaded to the QE (usually by the
       boot loader), then a 'firmware' child node should be added to the QE
       node.  This node provides information on the uploaded firmware that
       device drivers may need.
diff --git a/arch/mips/sgi-ip27/ip27-smp.c b/arch/mips/sgi-ip27/ip27-smp.c
index 5b47d6b65275..cbcd7eb83bd1 100644
--- a/arch/mips/sgi-ip27/ip27-smp.c
+++ b/arch/mips/sgi-ip27/ip27-smp.c
@@ -221,7 +221,7 @@ static void __init ip27_smp_setup(void)
 	 * Assumption to be fixed: we're always booted on logical / physical
 	 * processor 0.  While we're always running on logical processor 0
 	 * this still means this is physical processor zero; it might for
-	 * example be disabled in the firwware.
+	 * example be disabled in the firmware.
 	 */
 	alloc_cpupda(0, 0);
 }
diff --git a/arch/sparc/kernel/head_64.S b/arch/sparc/kernel/head_64.S
index a46c3a21e26d..3a1b7bf03cff 100644
--- a/arch/sparc/kernel/head_64.S
+++ b/arch/sparc/kernel/head_64.S
@@ -686,7 +686,7 @@ tlb_fixup_done:
 	 * point.
 	 *
 	 * There used to be enormous complexity wrt. transferring
-	 * over from the firwmare's trap table to the Linux kernel's.
+	 * over from the firmware's trap table to the Linux kernel's.
 	 * For example, there was a chicken & egg problem wrt. building
 	 * the OBP page tables, yet needing to be on the Linux kernel
 	 * trap table (to translate PAGE_OFFSET addresses) in order to
diff --git a/drivers/net/sb1250-mac.c b/drivers/net/sb1250-mac.c
index 88dd2e09832f..ce7551e17ba7 100644
--- a/drivers/net/sb1250-mac.c
+++ b/drivers/net/sb1250-mac.c
@@ -2299,7 +2299,7 @@ static int sbmac_init(struct platform_device *pldev, long long base)
 	eaddr = sc->sbm_hwaddr;
 
 	/*
-	 * Read the ethernet address.  The firwmare left this programmed
+	 * Read the ethernet address.  The firmware left this programmed
 	 * for us in the ethernet address register for each mac.
 	 */
 
diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c
index f7efcecc4108..ed60b18addac 100644
--- a/drivers/net/tg3.c
+++ b/drivers/net/tg3.c
@@ -11225,7 +11225,7 @@ static int __devinit tg3_phy_probe(struct tg3 *tp)
 		return tg3_phy_init(tp);
 
 	/* Reading the PHY ID register can conflict with ASF
-	 * firwmare access to the PHY hardware.
+	 * firmware access to the PHY hardware.
 	 */
 	err = 0;
 	if ((tp->tg3_flags & TG3_FLAG_ENABLE_ASF) ||
diff --git a/drivers/net/wireless/ipw2x00/ipw2100.c b/drivers/net/wireless/ipw2x00/ipw2100.c
index 115b70487502..f4e963ba768b 100644
--- a/drivers/net/wireless/ipw2x00/ipw2100.c
+++ b/drivers/net/wireless/ipw2x00/ipw2100.c
@@ -2362,7 +2362,7 @@ static void ipw2100_corruption_detected(struct ipw2100_priv *priv, int i)
 		       i * sizeof(struct ipw2100_status));
 
 #ifdef IPW2100_DEBUG_C3
-	/* Halt the fimrware so we can get a good image */
+	/* Halt the firmware so we can get a good image */
 	write_register(priv->net_dev, IPW_REG_RESET_REG,
 		       IPW_AUX_HOST_RESET_REG_STOP_MASTER);
 	j = 5;
diff --git a/drivers/net/wireless/ipw2x00/ipw2200.c b/drivers/net/wireless/ipw2x00/ipw2200.c
index b3449948a25a..f6174fdc12bf 100644
--- a/drivers/net/wireless/ipw2x00/ipw2200.c
+++ b/drivers/net/wireless/ipw2x00/ipw2200.c
@@ -8844,7 +8844,7 @@ static int ipw_wx_set_mode(struct net_device *dev,
 #endif				/* CONFIG_IPW2200_MONITOR */
 
 	/* Free the existing firmware and reset the fw_loaded
-	 * flag so ipw_load() will bring in the new firmawre */
+	 * flag so ipw_load() will bring in the new firmware */
 	free_firmware();
 
 	priv->ieee->iw_mode = wrqu->mode;
diff --git a/drivers/net/wireless/iwlwifi/iwl-agn.c b/drivers/net/wireless/iwlwifi/iwl-agn.c
index 663dc83be501..3889158b359c 100644
--- a/drivers/net/wireless/iwlwifi/iwl-agn.c
+++ b/drivers/net/wireless/iwlwifi/iwl-agn.c
@@ -1337,7 +1337,7 @@ static int iwl_read_ucode(struct iwl_priv *priv)
 
 	/* api_ver should match the api version forming part of the
 	 * firmware filename ... but we don't check for that and only rely
-	 * on the API version read from firware header from here on forward */
+	 * on the API version read from firmware header from here on forward */
 
 	if (api_ver < api_min || api_ver > api_max) {
 		IWL_ERR(priv, "Driver unable to support your firmware API. "
diff --git a/drivers/net/wireless/iwlwifi/iwl3945-base.c b/drivers/net/wireless/iwlwifi/iwl3945-base.c
index a71b08ca7c71..9d5f97dd7c73 100644
--- a/drivers/net/wireless/iwlwifi/iwl3945-base.c
+++ b/drivers/net/wireless/iwlwifi/iwl3945-base.c
@@ -2562,7 +2562,7 @@ static int iwl3945_read_ucode(struct iwl_priv *priv)
 
 	/* api_ver should match the api version forming part of the
 	 * firmware filename ... but we don't check for that and only rely
-	 * on the API version read from firware header from here on forward */
+	 * on the API version read from firmware header from here on forward */
 
 	if (api_ver < api_min || api_ver > api_max) {
 		IWL_ERR(priv, "Driver unable to support your firmware API. "
diff --git a/drivers/net/wireless/libertas/cmd.c b/drivers/net/wireless/libertas/cmd.c
index 639dd02d3d31..8c3605cdc64c 100644
--- a/drivers/net/wireless/libertas/cmd.c
+++ b/drivers/net/wireless/libertas/cmd.c
@@ -1649,7 +1649,7 @@ static struct cmd_ctrl_node *lbs_get_cmd_ctrl_node(struct lbs_private *priv)
 
 /**
  *  @brief This function executes next command in command
- *  pending queue. It will put fimware back to PS mode
+ *  pending queue. It will put firmware back to PS mode
  *  if applicable.
  *
  *  @param priv     A pointer to struct lbs_private structure
diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 6d6120007af4..dab33a21d49a 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -550,7 +550,7 @@ void pci_update_current_state(struct pci_dev *dev, pci_power_t state)
  * @dev: PCI device to handle.
  * @state: PCI power state (D0, D1, D2, D3hot) to put the device into.
  *
- * Transition a device to a new power state, using the platform formware and/or
+ * Transition a device to a new power state, using the platform firmware and/or
  * the device's PCI PM registers.
  *
  * RETURN VALUE:
diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c
index d2433204a40c..814cb6520673 100644
--- a/drivers/platform/x86/thinkpad_acpi.c
+++ b/drivers/platform/x86/thinkpad_acpi.c
@@ -5811,7 +5811,7 @@ static struct ibm_struct volume_driver_data = {
  *	ThinkPads from this same time period (and earlier) probably lack the
  *	tachometer as well.
  *
- *	Unfortunately a lot of ThinkPads with new-style ECs but whose firwmare
+ *	Unfortunately a lot of ThinkPads with new-style ECs but whose firmware
  *	was never fixed by IBM to report the EC firmware version string
  *	probably support the tachometer (like the early X models), so
  *	detecting it is quite hard.  We need more data to know for sure.
diff --git a/drivers/staging/otus/hal/hpmain.c b/drivers/staging/otus/hal/hpmain.c
index 2e65c466aae8..dab278326931 100644
--- a/drivers/staging/otus/hal/hpmain.c
+++ b/drivers/staging/otus/hal/hpmain.c
@@ -152,7 +152,7 @@ u16_t zfHpInit(zdev_t* dev, u32_t frequency)
     else
     {
     #ifndef ZM_OTUS_LINUX_PHASE_2
-        /* donwload the normal frimware */
+        /* download the normal firmware */
         if ((ret = zfFirmwareDownload(dev, (u32_t*)zcFwImage,
                 (u32_t)zcFwImageSize, ZM_FIRMWARE_WLAN_ADDR)) != ZM_SUCCESS)
         {
diff --git a/drivers/usb/atm/ueagle-atm.c b/drivers/usb/atm/ueagle-atm.c
index b6483dd98acc..9cf9ff69e3e3 100644
--- a/drivers/usb/atm/ueagle-atm.c
+++ b/drivers/usb/atm/ueagle-atm.c
@@ -626,7 +626,7 @@ static void uea_upload_pre_firmware(const struct firmware *fw_entry, void *conte
 		goto err_fw_corrupted;
 
 	/*
-	 * Start to upload formware : send reset
+	 * Start to upload firmware : send reset
 	 */
 	value = 1;
 	ret = uea_send_modem_cmd(usb, F8051_USBCS, sizeof(value), &value);
diff --git a/drivers/usb/serial/ChangeLog.history b/drivers/usb/serial/ChangeLog.history
index c1b279939bbf..f13fd488ebec 100644
--- a/drivers/usb/serial/ChangeLog.history
+++ b/drivers/usb/serial/ChangeLog.history
@@ -715,7 +715,7 @@ io_edgeport.c Change Log comments:
 
  0.2 (01/30/2000) greg kroah-hartman
 	Milestone 1 release.
-	Device is found by USB subsystem, enumerated, fimware is downloaded
+	Device is found by USB subsystem, enumerated, firmware is downloaded
 	and the descriptors are printed to the debug log, config is set, and
 	green light starts to blink. Open port works, and data can be sent
 	and received at the default settings of the UART. Loopback connector
diff --git a/include/linux/libata.h b/include/linux/libata.h
index 76262d83656b..b450a2628855 100644
--- a/include/linux/libata.h
+++ b/include/linux/libata.h
@@ -379,7 +379,7 @@ enum {
 	ATA_HORKAGE_BRIDGE_OK	= (1 << 10),	/* no bridge limits */
 	ATA_HORKAGE_ATAPI_MOD16_DMA = (1 << 11), /* use ATAPI DMA for commands
 						    not multiple of 16 bytes */
-	ATA_HORKAGE_FIRMWARE_WARN = (1 << 12),	/* firwmare update warning */
+	ATA_HORKAGE_FIRMWARE_WARN = (1 << 12),	/* firmware update warning */
 	ATA_HORKAGE_1_5_GBPS	= (1 << 13),	/* force 1.5 Gbps */
 
 	 /* DMA mask for user DMA control: User visible values; DO NOT
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 4a4a206b1979..9d1c1a0de350 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -265,7 +265,7 @@ static int create_image(int platform_mode)
  *	hibernation_snapshot - quiesce devices and create the hibernation
  *	snapshot image.
  *	@platform_mode - if set, use the platform driver, if available, to
- *			 prepare the platform frimware for the power transition.
+ *			 prepare the platform firmware for the power transition.
  *
  *	Must be called with pm_mutex held
  */
@@ -378,7 +378,7 @@ static int resume_target_kernel(void)
  *	hibernation_restore - quiesce devices and restore the hibernation
  *	snapshot image.  If successful, control returns in hibernation_snaphot()
  *	@platform_mode - if set, use the platform driver, if available, to
- *			 prepare the platform frimware for the transition.
+ *			 prepare the platform firmware for the transition.
  *
  *	Must be called with pm_mutex held
  */
diff --git a/sound/oss/pss.c b/sound/oss/pss.c
index 16517a5a1301..83f5ee236b12 100644
--- a/sound/oss/pss.c
+++ b/sound/oss/pss.c
@@ -46,7 +46,7 @@
  *          load the driver as it did in previous versions.
  * 04-07-1999: Anthony Barbachan <barbcode@xmen.cis.fordham.edu>
  *          Added module parameter pss_firmware to allow the user to tell 
- *          the driver where the fireware file is located.  The default 
+ *          the driver where the firmware file is located.  The default 
  *          setting is the previous hardcoded setting "/etc/sound/pss_synth".
  * 00-03-03: Christoph Hellwig <chhellwig@infradead.org>
  *	    Adapted to module_init/module_exit
diff --git a/sound/sh/aica.c b/sound/sh/aica.c
index f551233c5a08..583a3693df75 100644
--- a/sound/sh/aica.c
+++ b/sound/sh/aica.c
@@ -565,7 +565,7 @@ static int load_aica_firmware(void)
 	err = request_firmware(&fw_entry, "aica_firmware.bin", &pd->dev);
 	if (unlikely(err))
 		return err;
-	/* write firware into memory */
+	/* write firmware into memory */
 	spu_disable();
 	spu_memload(0, fw_entry->data, fw_entry->size);
 	spu_enable();
-- 
cgit v1.2.3


From 692105b8ac5bcd75dc65f6a8f10bdbd0f0f34dcf Mon Sep 17 00:00:00 2001
From: Matt LaPlante <kernel1@cyberdogtech.com>
Date: Mon, 26 Jan 2009 11:12:25 +0100
Subject: trivial: fix typos/grammar errors in Kconfig texts

Signed-off-by: Matt LaPlante <kernel1@cyberdogtech.com>
Acked-by: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
---
 arch/arm/mach-omap1/Kconfig          |  2 +-
 arch/avr32/Kconfig                   |  6 +++---
 arch/blackfin/Kconfig                |  6 +++---
 arch/cris/Kconfig                    |  6 +++---
 arch/cris/arch-v32/Kconfig           |  2 +-
 arch/cris/arch-v32/drivers/Kconfig   |  2 +-
 arch/cris/arch-v32/mach-fs/Kconfig   |  2 +-
 arch/mips/Kconfig                    |  2 +-
 arch/powerpc/Kconfig                 |  2 +-
 arch/powerpc/platforms/Kconfig       |  2 +-
 arch/powerpc/sysdev/bestcomm/Kconfig |  4 ++--
 arch/sh/Kconfig                      |  4 ++--
 arch/x86/Kconfig                     |  4 ++--
 drivers/ata/Kconfig                  |  2 +-
 drivers/gpio/Kconfig                 |  6 +++---
 drivers/hid/Kconfig                  |  2 +-
 drivers/input/Kconfig                |  2 +-
 drivers/isdn/mISDN/Kconfig           | 10 ++++++----
 drivers/leds/Kconfig                 |  6 +++---
 drivers/media/common/tuners/Kconfig  |  2 +-
 drivers/media/dvb/frontends/Kconfig  |  2 +-
 drivers/mfd/Kconfig                  |  2 +-
 drivers/misc/Kconfig                 |  6 +++---
 drivers/mmc/host/Kconfig             |  2 +-
 drivers/scsi/Kconfig                 |  4 ++--
 drivers/serial/Kconfig               |  2 +-
 drivers/staging/Kconfig              |  4 ++--
 drivers/staging/comedi/Kconfig       |  4 ++--
 drivers/staging/go7007/Kconfig       |  4 ++--
 drivers/staging/panel/Kconfig        |  2 +-
 drivers/usb/gadget/Kconfig           |  2 +-
 drivers/usb/serial/Kconfig           |  4 ++--
 drivers/uwb/Kconfig                  |  4 ++--
 drivers/xen/Kconfig                  |  2 +-
 fs/ext4/Kconfig                      |  2 +-
 fs/ubifs/Kconfig                     |  4 ++--
 init/Kconfig                         |  6 +++---
 kernel/trace/Kconfig                 |  9 ++++-----
 net/Kconfig                          |  2 +-
 net/ipv6/Kconfig                     | 18 +++++++++---------
 net/mac80211/Kconfig                 |  2 +-
 net/netfilter/Kconfig                |  2 +-
 net/phonet/Kconfig                   |  2 +-
 net/sunrpc/Kconfig                   |  2 +-
 net/wimax/Kconfig                    |  2 +-
 sound/soc/blackfin/Kconfig           |  2 +-
 46 files changed, 86 insertions(+), 85 deletions(-)

(limited to 'kernel')

diff --git a/arch/arm/mach-omap1/Kconfig b/arch/arm/mach-omap1/Kconfig
index 3f325d3718a9..cd8de89c5fad 100644
--- a/arch/arm/mach-omap1/Kconfig
+++ b/arch/arm/mach-omap1/Kconfig
@@ -109,7 +109,7 @@ config MACH_OMAP_PALMZ71
 	help
 	 Support for the Palm Zire71 PDA. To boot the kernel,
 	 you'll need a PalmOS compatible bootloader; check out
-	 http://hackndev.com/palm/z71 for more informations.
+	 http://hackndev.com/palm/z71 for more information.
 	 Say Y here if you have such a PDA, say N otherwise.
 
 config MACH_OMAP_PALMTT
diff --git a/arch/avr32/Kconfig b/arch/avr32/Kconfig
index 05fe3053dcae..414a8ad97f52 100644
--- a/arch/avr32/Kconfig
+++ b/arch/avr32/Kconfig
@@ -127,13 +127,13 @@ config BOARD_HAMMERHEAD
 	select CPU_AT32AP7000
 	select USB_ARCH_HAS_HCD
 	help
-	  The Hammerhead platform is built around a AVR32 32-bit microcontroller from Atmel.
+	  The Hammerhead platform is built around an AVR32 32-bit microcontroller from Atmel.
 	  It offers versatile peripherals, such as ethernet, usb device, usb host etc.
 
-	  The board also incooperates a power supply and is a Power over Ethernet (PoE) Powered
+	  The board also incorporates a power supply and is a Power over Ethernet (PoE) Powered
 	  Device (PD).
 
-	  Additonally, a Cyclone III FPGA from Altera is integrated on the board. The FPGA is
+	  Additionally, a Cyclone III FPGA from Altera is integrated on the board. The FPGA is
 	  mapped into the 32-bit AVR memory bus. The FPGA offers two DDR2 SDRAM interfaces, which
 	  will cover even the most exceptional need of memory bandwidth. Together with the onboard
 	  video decoder the board is ready for video processing.
diff --git a/arch/blackfin/Kconfig b/arch/blackfin/Kconfig
index 0c1f86e3e44a..3640cdc38aac 100644
--- a/arch/blackfin/Kconfig
+++ b/arch/blackfin/Kconfig
@@ -777,7 +777,7 @@ config CACHELINE_ALIGNED_L1
 	default n if BF54x
 	depends on !BF531
 	help
-	  If enabled, cacheline_anligned data is linked
+	  If enabled, cacheline_aligned data is linked
 	  into L1 data memory. (less latency)
 
 config SYSCALL_TAB_L1
@@ -957,7 +957,7 @@ config MPU
 	  memory they do not own.  This comes at a performance penalty
 	  and is recommended only for debugging.
 
-comment "Asynchonous Memory Configuration"
+comment "Asynchronous Memory Configuration"
 
 menu "EBIU_AMGCTL Global Control"
 config C_AMCKEN
@@ -989,7 +989,7 @@ config C_B3PEN
 	default n
 
 choice
-	prompt"Enable Asynchonous Memory Banks"
+	prompt "Enable Asynchronous Memory Banks"
 	default C_AMBEN_ALL
 
 config C_AMBEN
diff --git a/arch/cris/Kconfig b/arch/cris/Kconfig
index 3462245fe9fb..7adac388a771 100644
--- a/arch/cris/Kconfig
+++ b/arch/cris/Kconfig
@@ -438,7 +438,7 @@ config ETRAX_SERIAL_PORT0_DMA1_IN
 	help
 	  Enables the DMA1 input channel for ser0 (ttyS0).
 	  If you do not enable DMA, an interrupt for each character will be
-	  used when receiveing data.
+	  used when receiving data.
 	  Normally you want to use DMA, unless you use the DMA channel for
 	  something else.
 
@@ -565,7 +565,7 @@ config ETRAX_SERIAL_PORT2_DMA7_IN
 	help
 	  Enables the DMA7 input channel for ser2 (ttyS2).
 	  If you do not enable DMA, an interrupt for each character will be
-	  used when receiveing data.
+	  used when receiving data.
 	  Normally you want to use DMA, unless you use the DMA channel for
 	  something else.
 
@@ -604,7 +604,7 @@ config ETRAX_SERIAL_PORT3_DMA3_IN
 	help
 	  Enables the DMA3 input channel for ser3 (ttyS3).
 	  If you do not enable DMA, an interrupt for each character will be
-	  used when receiveing data.
+	  used when receiving data.
 	  Normally you want to use DMA, unless you use the DMA channel for
 	  something else.
 
diff --git a/arch/cris/arch-v32/Kconfig b/arch/cris/arch-v32/Kconfig
index 005ed2b3f7f4..21bbd93be34f 100644
--- a/arch/cris/arch-v32/Kconfig
+++ b/arch/cris/arch-v32/Kconfig
@@ -28,7 +28,7 @@ config	ETRAX_NBR_LED_GRP_ONE
 	help
 	  Select this if you want one Ethernet LED group. This LED group
 	  can be used for one or more Ethernet interfaces. However, it is
-	  recomended that each Ethernet interface use a dedicated LED group.
+	  recommended that each Ethernet interface use a dedicated LED group.
 
 config	ETRAX_NBR_LED_GRP_TWO
 	bool "Use two LED groups"
diff --git a/arch/cris/arch-v32/drivers/Kconfig b/arch/cris/arch-v32/drivers/Kconfig
index 7a64fcef9d07..b9e328e688be 100644
--- a/arch/cris/arch-v32/drivers/Kconfig
+++ b/arch/cris/arch-v32/drivers/Kconfig
@@ -342,7 +342,7 @@ config ETRAX_SERIAL_PORT4_DMA9_IN
 	help
 	  Enables the DMA9 input channel for ser4 (ttyS4).
 	  If you do not enable DMA, an interrupt for each character will be
-	  used when receiveing data.
+	  used when receiving data.
 	  Normally you want to use DMA, unless you use the DMA channel for
 	  something else.
 
diff --git a/arch/cris/arch-v32/mach-fs/Kconfig b/arch/cris/arch-v32/mach-fs/Kconfig
index f6d74475f1c6..774de82abef6 100644
--- a/arch/cris/arch-v32/mach-fs/Kconfig
+++ b/arch/cris/arch-v32/mach-fs/Kconfig
@@ -59,7 +59,7 @@ config ETRAX_SDRAM_GRP1_CONFIG
 	depends on ETRAX_ARCH_V32
 	default "0"
 	help
-	  SDRAM configuration for group 1. The defult value is 0
+	  SDRAM configuration for group 1. The default value is 0
 	  because group 1 is not used in the default configuration,
 	  described in the help for SDRAM_GRP0_CONFIG.
 
diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig
index 206cb7953b0c..92508c521673 100644
--- a/arch/mips/Kconfig
+++ b/arch/mips/Kconfig
@@ -136,7 +136,7 @@ config MACH_JAZZ
 	help
 	 This a family of machines based on the MIPS R4030 chipset which was
 	 used by several vendors to build RISC/os and Windows NT workstations.
-	 Members include the Acer PICA, MIPS Magnum 4000, MIPS Millenium and
+	 Members include the Acer PICA, MIPS Magnum 4000, MIPS Millennium and
 	 Olivetti M700-10 workstations.
 
 config LASAT
diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 74cc312c347c..2e2160c3de7c 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -342,7 +342,7 @@ config PHYP_DUMP
 	help
 	  Hypervisor-assisted dump is meant to be a kdump replacement
 	  offering robustness and speed not possible without system
-	  hypervisor assistence.
+	  hypervisor assistance.
 
 	  If unsure, say "N"
 
diff --git a/arch/powerpc/platforms/Kconfig b/arch/powerpc/platforms/Kconfig
index 200b9cb900ea..9a8bb53d315d 100644
--- a/arch/powerpc/platforms/Kconfig
+++ b/arch/powerpc/platforms/Kconfig
@@ -298,7 +298,7 @@ config CPM
 config OF_RTC
 	bool
 	help
-	  Uses information from the OF or flattened device tree to instatiate
+	  Uses information from the OF or flattened device tree to instantiate
 	  platform devices for direct mapped RTC chips like the DS1742 or DS1743.
 
 source "arch/powerpc/sysdev/bestcomm/Kconfig"
diff --git a/arch/powerpc/sysdev/bestcomm/Kconfig b/arch/powerpc/sysdev/bestcomm/Kconfig
index 0b192a1c429d..29e427085efb 100644
--- a/arch/powerpc/sysdev/bestcomm/Kconfig
+++ b/arch/powerpc/sysdev/bestcomm/Kconfig
@@ -9,8 +9,8 @@ config PPC_BESTCOMM
 	select PPC_LIB_RHEAP
 	help
 	  BestComm is the name of the communication coprocessor found
-	  on the Freescale MPC5200 family of processor. It's usage is
-	  optionnal for some drivers (like ATA), but required for
+	  on the Freescale MPC5200 family of processor.  Its usage is
+	  optional for some drivers (like ATA), but required for
 	  others (like FEC).
 
 	  If you want to use drivers that require DMA operations,
diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig
index 8d50d527c595..2d52b515c241 100644
--- a/arch/sh/Kconfig
+++ b/arch/sh/Kconfig
@@ -640,10 +640,10 @@ config GUSA_RB
 	depends on GUSA && CPU_SH3 || (CPU_SH4 && !CPU_SH4A)
 	help
 	  Enabling this option will allow the kernel to implement some
-	  atomic operations using a software implemention of load-locked/
+	  atomic operations using a software implementation of load-locked/
 	  store-conditional (LLSC). On machines which do not have hardware
 	  LLSC, this should be more efficient than the other alternative of
-	  disabling insterrupts around the atomic sequence.
+	  disabling interrupts around the atomic sequence.
 
 endmenu
 
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 06c02c00d7d9..d7b5621382f6 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1129,7 +1129,7 @@ config NODES_SHIFT
 	depends on NEED_MULTIPLE_NODES
 	---help---
 	  Specify the maximum number of NUMA Nodes available on the target
-	  system.  Increases memory reserved to accomodate various tables.
+	  system.  Increases memory reserved to accommodate various tables.
 
 config HAVE_ARCH_BOOTMEM
 	def_bool y
@@ -1307,7 +1307,7 @@ config MTRR_SANITIZER
 	  add writeback entries.
 
 	  Can be disabled with disable_mtrr_cleanup on the kernel command line.
-	  The largest mtrr entry size for a continous block can be set with
+	  The largest mtrr entry size for a continuous block can be set with
 	  mtrr_chunk_size.
 
 	  If unsure, say Y.
diff --git a/drivers/ata/Kconfig b/drivers/ata/Kconfig
index 0bcf26464670..9120717c0701 100644
--- a/drivers/ata/Kconfig
+++ b/drivers/ata/Kconfig
@@ -86,7 +86,7 @@ config ATA_SFF
 
 	  For users with exclusively modern controllers like AHCI,
 	  Silicon Image 3124, or Marvell 6440, you may choose to
-	  disable this uneeded SFF support.
+	  disable this unneeded SFF support.
 
 	  If unsure, say Y.
 
diff --git a/drivers/gpio/Kconfig b/drivers/gpio/Kconfig
index 3d2565441b36..edb02530e461 100644
--- a/drivers/gpio/Kconfig
+++ b/drivers/gpio/Kconfig
@@ -42,9 +42,9 @@ config DEBUG_GPIO
 	depends on DEBUG_KERNEL
 	help
 	  Say Y here to add some extra checks and diagnostics to GPIO calls.
-	  The checks help ensure that GPIOs have been properly initialized
-	  before they are used and that sleeping calls aren not made from
-	  nonsleeping contexts.  They can make bitbanged serial protocols
+	  These checks help ensure that GPIOs have been properly initialized
+	  before they are used, and that sleeping calls are not made from
+	  non-sleeping contexts.  They can make bitbanged serial protocols
 	  slower.  The diagnostics help catch the type of setup errors
 	  that are most common when setting up new platforms or boards.
 
diff --git a/drivers/hid/Kconfig b/drivers/hid/Kconfig
index e85c8fe9ffcf..504cfaa6160f 100644
--- a/drivers/hid/Kconfig
+++ b/drivers/hid/Kconfig
@@ -243,7 +243,7 @@ config GREENASIA_FF
 	select INPUT_FF_MEMLESS
 	---help---
 	Say Y here if you have a GreenAsia (Product ID 0x12) based game controller
-	(like MANTA Warior MM816 and SpeedLink Strike2 SL-6635) or adapter
+	(like MANTA Warrior MM816 and SpeedLink Strike2 SL-6635) or adapter
 	and want to enable force feedback support for it.
 
 config HID_TOPSEED
diff --git a/drivers/input/Kconfig b/drivers/input/Kconfig
index 5f9d860925a1..cd50c00ab20f 100644
--- a/drivers/input/Kconfig
+++ b/drivers/input/Kconfig
@@ -143,7 +143,7 @@ config INPUT_APMPOWER
 	---help---
 	  Say Y here if you want suspend key events to trigger a user
 	  requested suspend through APM. This is useful on embedded
-	  systems where such behviour is desired without userspace
+	  systems where such behaviour is desired without userspace
 	  interaction. If unsure, say N.
 
 	  To compile this driver as a module, choose M here: the
diff --git a/drivers/isdn/mISDN/Kconfig b/drivers/isdn/mISDN/Kconfig
index 4938355c4072..1747a02a019a 100644
--- a/drivers/isdn/mISDN/Kconfig
+++ b/drivers/isdn/mISDN/Kconfig
@@ -14,13 +14,15 @@ config MISDN_DSP
 	depends on MISDN
 	help
 	  Enable support for digital audio processing capability.
+
 	  This module may be used for special applications that require
-	  cross connecting of bchannels, conferencing, dtmf decoding
+	  cross connecting of bchannels, conferencing, dtmf decoding,
 	  echo cancelation, tone generation, and Blowfish encryption and
-	  decryption.
-	  It may use hardware features if available.
+	  decryption. It may use hardware features if available.
+
 	  E.g. it is required for PBX4Linux. Go to http://isdn.eversberg.eu
-	  and get more informations about this module and it's usage.
+	  and get more information about this module and its usage.
+
 	  If unsure, say 'N'.
 
 config MISDN_L1OIP
diff --git a/drivers/leds/Kconfig b/drivers/leds/Kconfig
index 556aeca0d860..d9db17624f12 100644
--- a/drivers/leds/Kconfig
+++ b/drivers/leds/Kconfig
@@ -100,7 +100,7 @@ config LEDS_HP6XX
 	tristate "LED Support for the HP Jornada 6xx"
 	depends on LEDS_CLASS && SH_HP6XX
 	help
-	  This option enables led support for the handheld
+	  This option enables LED support for the handheld
 	  HP Jornada 620/660/680/690.
 
 config LEDS_PCA9532
@@ -108,7 +108,7 @@ config LEDS_PCA9532
 	depends on LEDS_CLASS && I2C && INPUT && EXPERIMENTAL
 	help
 	  This option enables support for NXP pca9532
-	  led controller. It is generally only usefull
+	  LED controller. It is generally only useful
 	  as a platform driver
 
 config LEDS_GPIO
@@ -144,7 +144,7 @@ config LEDS_CLEVO_MAIL
 	  	Positivo Mobile (Clevo M5x0V)
 
 	  If your model is not listed here you can try the "nodetect"
-	  module paramter.
+	  module parameter.
 
 	  To compile this driver as a module, choose M here: the
 	  module will be called leds-clevo-mail.
diff --git a/drivers/media/common/tuners/Kconfig b/drivers/media/common/tuners/Kconfig
index 6f92beaa5ac8..da058c174049 100644
--- a/drivers/media/common/tuners/Kconfig
+++ b/drivers/media/common/tuners/Kconfig
@@ -147,7 +147,7 @@ config MEDIA_TUNER_XC5000
 	default m if DVB_FE_CUSTOMISE
 	help
 	  A driver for the silicon tuner XC5000 from Xceive.
-	  This device is only used inside a SiP called togther with a
+	  This device is only used inside a SiP called together with a
 	  demodulator for now.
 
 config MEDIA_TUNER_MXL5005S
diff --git a/drivers/media/dvb/frontends/Kconfig b/drivers/media/dvb/frontends/Kconfig
index 00269560793a..baf808e9d6e1 100644
--- a/drivers/media/dvb/frontends/Kconfig
+++ b/drivers/media/dvb/frontends/Kconfig
@@ -439,7 +439,7 @@ config DVB_TUNER_DIB0070
 	default m if DVB_FE_CUSTOMISE
 	help
 	  A driver for the silicon baseband tuner DiB0070 from DiBcom.
-	  This device is only used inside a SiP called togther with a
+	  This device is only used inside a SiP called together with a
 	  demodulator for now.
 
 comment "SEC control devices for DVB-S"
diff --git a/drivers/mfd/Kconfig b/drivers/mfd/Kconfig
index 06a2b0f7737c..75f35dbb11dc 100644
--- a/drivers/mfd/Kconfig
+++ b/drivers/mfd/Kconfig
@@ -88,7 +88,7 @@ config MENELAUS
 	help
 	  If you say yes here you get support for the Texas Instruments
 	  TWL92330/Menelaus Power Management chip. This include voltage
-	  regulators, Dual slot memory card tranceivers, real-time clock
+	  regulators, Dual slot memory card transceivers, real-time clock
 	  and other features that are often used in portable devices like
 	  cell phones and PDAs.
 
diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig
index 1c484084ed4f..7a1e948840e6 100644
--- a/drivers/misc/Kconfig
+++ b/drivers/misc/Kconfig
@@ -18,8 +18,8 @@ config ATMEL_PWM
 	depends on AVR32 || ARCH_AT91SAM9263 || ARCH_AT91SAM9RL || ARCH_AT91CAP9
 	help
 	  This option enables device driver support for the PWM channels
-	  on certain Atmel prcoessors.  Pulse Width Modulation is used for
-	  purposes including software controlled power-efficent backlights
+	  on certain Atmel processors.  Pulse Width Modulation is used for
+	  purposes including software controlled power-efficient backlights
 	  on LCD displays, motor control, and waveform generation.
 
 config ATMEL_TCLIB
@@ -142,7 +142,7 @@ config ATMEL_SSC
 	tristate "Device driver for Atmel SSC peripheral"
 	depends on AVR32 || ARCH_AT91
 	---help---
-	  This option enables device driver support for Atmel Syncronized
+	  This option enables device driver support for Atmel Synchronized
 	  Serial Communication peripheral (SSC).
 
 	  The SSC peripheral supports a wide variety of serial frame based
diff --git a/drivers/mmc/host/Kconfig b/drivers/mmc/host/Kconfig
index 99d4b28d52ed..6fbb246c40bb 100644
--- a/drivers/mmc/host/Kconfig
+++ b/drivers/mmc/host/Kconfig
@@ -177,7 +177,7 @@ config MMC_SPI
 	select CRC7
 	select CRC_ITU_T
 	help
-	  Some systems accss MMC/SD/SDIO cards using a SPI controller
+	  Some systems access MMC/SD/SDIO cards using a SPI controller
 	  instead of using a "native" MMC/SD/SDIO controller.  This has a
 	  disadvantage of being relatively high overhead, but a compensating
 	  advantage of working on many systems without dedicated MMC/SD/SDIO
diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig
index e2f44e6c0bcb..20297c521e50 100644
--- a/drivers/scsi/Kconfig
+++ b/drivers/scsi/Kconfig
@@ -1380,7 +1380,7 @@ config SCSI_LPFC_DEBUG_FS
 	bool "Emulex LightPulse Fibre Channel debugfs Support"
 	depends on SCSI_LPFC && DEBUG_FS
 	help
-	  This makes debugging infomation from the lpfc driver
+	  This makes debugging information from the lpfc driver
 	  available via the debugfs filesystem.
 
 config SCSI_SIM710
@@ -1388,7 +1388,7 @@ config SCSI_SIM710
 	depends on (EISA || MCA) && SCSI
 	select SCSI_SPI_ATTRS
 	---help---
-	  This driver for NCR53c710 based SCSI host adapters.
+	  This driver is for NCR53c710 based SCSI host adapters.
 
 	  It currently supports Compaq EISA cards and NCR MCA cards
 
diff --git a/drivers/serial/Kconfig b/drivers/serial/Kconfig
index 9be11b0963f2..aa9d3a4c2d50 100644
--- a/drivers/serial/Kconfig
+++ b/drivers/serial/Kconfig
@@ -1374,7 +1374,7 @@ config SERIAL_BFIN_SPORT
 	depends on BLACKFIN && EXPERIMENTAL
 	select SERIAL_CORE
 	help
-	  Enble support SPORT emulate UART on Blackfin series.
+	  Enable SPORT emulate UART on Blackfin series.
 
 	  To compile this driver as a module, choose M here: the
 	  module will be called bfin_sport_uart.
diff --git a/drivers/staging/Kconfig b/drivers/staging/Kconfig
index 211af86a6c55..92981c2383ee 100644
--- a/drivers/staging/Kconfig
+++ b/drivers/staging/Kconfig
@@ -4,7 +4,7 @@ menuconfig STAGING
 	---help---
 	  This option allows you to select a number of drivers that are
 	  not of the "normal" Linux kernel quality level.  These drivers
-	  are placed here in order to get a wider audience for use of
+	  are placed here in order to get a wider audience to make use of
 	  them.  Please note that these drivers are under heavy
 	  development, may or may not work, and may contain userspace
 	  interfaces that most likely will be changed in the near
@@ -12,7 +12,7 @@ menuconfig STAGING
 
 	  Using any of these drivers will taint your kernel which might
 	  affect support options from both the community, and various
-	  commercial support orginizations.
+	  commercial support organizations.
 
 	  If you wish to work on these drivers, to help improve them, or
 	  to report problems you have with them, please see the
diff --git a/drivers/staging/comedi/Kconfig b/drivers/staging/comedi/Kconfig
index b47ca1e7e383..83a93a5c6392 100644
--- a/drivers/staging/comedi/Kconfig
+++ b/drivers/staging/comedi/Kconfig
@@ -1,9 +1,9 @@
 config COMEDI
-	tristate "Data Acquision support (comedi)"
+	tristate "Data acquisition support (comedi)"
 	default N
 	depends on m
 	---help---
-	  Enable support a wide range of data acquision devices
+	  Enable support a wide range of data acquisition devices
 	  for Linux.
 
 config COMEDI_RT
diff --git a/drivers/staging/go7007/Kconfig b/drivers/staging/go7007/Kconfig
index f2cf7f66ae05..ca6ade6c4b47 100644
--- a/drivers/staging/go7007/Kconfig
+++ b/drivers/staging/go7007/Kconfig
@@ -10,7 +10,7 @@ config VIDEO_GO7007
 	select CRC32
 	default N
 	---help---
-	  This is a video4linux driver for some wierd device...
+	  This is a video4linux driver for some weird device...
 
 	  To compile this driver as a module, choose M here: the
 	  module will be called go7007
@@ -20,7 +20,7 @@ config VIDEO_GO7007_USB
 	depends on VIDEO_GO7007 && USB
 	default N
 	---help---
-	  This is a video4linux driver for some wierd device...
+	  This is a video4linux driver for some weird device...
 
 	  To compile this driver as a module, choose M here: the
 	  module will be called go7007-usb
diff --git a/drivers/staging/panel/Kconfig b/drivers/staging/panel/Kconfig
index c4b30f2a549b..3abe7c9d558d 100644
--- a/drivers/staging/panel/Kconfig
+++ b/drivers/staging/panel/Kconfig
@@ -110,7 +110,7 @@ config PANEL_LCD_BWIDTH
 	---help---
 	  Most LCDs use a standard controller which supports hardware lines of 40
 	  characters, although sometimes only 16, 20 or 24 of them are really wired
-	  to the terminal. This results in some non-visible but adressable characters,
+	  to the terminal. This results in some non-visible but addressable characters,
 	  and is the case for most parallel LCDs. Other LCDs, and some serial ones,
 	  however, use the same line width internally as what is visible. The KS0074
 	  for example, uses 16 characters per line for 16 visible characters per line.
diff --git a/drivers/usb/gadget/Kconfig b/drivers/usb/gadget/Kconfig
index 770b3eaa9184..080bb1e4b847 100644
--- a/drivers/usb/gadget/Kconfig
+++ b/drivers/usb/gadget/Kconfig
@@ -392,7 +392,7 @@ config USB_GADGET_FSL_QE
 	   controllers having QE or CPM2, given minor tweaks.
 
 	   Set CONFIG_USB_GADGET to "m" to build this driver as a
-	   dynmically linked module called "fsl_qe_udc".
+	   dynamically linked module called "fsl_qe_udc".
 
 config USB_FSL_QE
 	tristate
diff --git a/drivers/usb/serial/Kconfig b/drivers/usb/serial/Kconfig
index a65f9196b0a0..c480ea4c19f2 100644
--- a/drivers/usb/serial/Kconfig
+++ b/drivers/usb/serial/Kconfig
@@ -518,8 +518,8 @@ config USB_SERIAL_SIERRAWIRELESS
 	help
 	  Say M here if you want to use Sierra Wireless devices.
 
-	  Many deviecs have a feature known as TRU-Install, for those devices
-	  to work properly the USB Storage Sierra feature must be enabled.
+	  Many devices have a feature known as TRU-Install. For those devices
+	  to work properly, the USB Storage Sierra feature must be enabled.
 
 	  To compile this driver as a module, choose M here: the
 	  module will be called sierra.
diff --git a/drivers/uwb/Kconfig b/drivers/uwb/Kconfig
index ca783127af36..bac8e7a6f17b 100644
--- a/drivers/uwb/Kconfig
+++ b/drivers/uwb/Kconfig
@@ -48,10 +48,10 @@ config UWB_WHCI
         help
           This driver enables the radio controller for WHCI cards.
 
-          WHCI is an specification developed by Intel
+          WHCI is a specification developed by Intel
           (http://www.intel.com/technology/comms/wusb/whci.htm) much
           in the spirit of USB's EHCI, but for UWB and Wireless USB
-          radio/host controllers connected via memmory mapping (eg:
+          radio/host controllers connected via memory mapping (eg:
           PCI). Most of these cards come also with a Wireless USB host
           controller.
 
diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig
index 526187c8a12d..8ac9cddac575 100644
--- a/drivers/xen/Kconfig
+++ b/drivers/xen/Kconfig
@@ -37,7 +37,7 @@ config XEN_COMPAT_XENFS
          The old xenstore userspace tools expect to find "xenbus"
          under /proc/xen, but "xenbus" is now found at the root of the
          xenfs filesystem.  Selecting this causes the kernel to create
-         the compatibilty mount point /proc/xen if it is running on
+         the compatibility mount point /proc/xen if it is running on
          a xen platform.
          If in doubt, say yes.
 
diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig
index 7505482a08fa..418b6f3b0ae8 100644
--- a/fs/ext4/Kconfig
+++ b/fs/ext4/Kconfig
@@ -18,7 +18,7 @@ config EXT4_FS
 	  filesystem; while there will be some performance gains from
 	  the delayed allocation and inode table readahead, the best
 	  performance gains will require enabling ext4 features in the
-	  filesystem, or formating a new filesystem as an ext4
+	  filesystem, or formatting a new filesystem as an ext4
 	  filesystem initially.
 
 	  To compile this file system support as a module, choose M here. The
diff --git a/fs/ubifs/Kconfig b/fs/ubifs/Kconfig
index e35b54d5059d..830e3f76f442 100644
--- a/fs/ubifs/Kconfig
+++ b/fs/ubifs/Kconfig
@@ -22,7 +22,7 @@ config UBIFS_FS_ADVANCED_COMPR
 	depends on UBIFS_FS
 	help
 	  This option allows to explicitly choose which compressions, if any,
-	  are enabled in UBIFS. Removing compressors means inbility to read
+	  are enabled in UBIFS. Removing compressors means inability to read
 	  existing file systems.
 
 	  If unsure, say 'N'.
@@ -32,7 +32,7 @@ config UBIFS_FS_LZO
 	depends on UBIFS_FS
 	default y
 	help
-	   LZO compressor is generally faster then zlib but compresses worse.
+	   LZO compressor is generally faster than zlib but compresses worse.
 	   Say 'Y' if unsure.
 
 config UBIFS_FS_ZLIB
diff --git a/init/Kconfig b/init/Kconfig
index 14c483d2b7c9..bcffc0e47647 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -687,7 +687,7 @@ config PID_NS
 	depends on NAMESPACES && EXPERIMENTAL
 	help
 	  Support process id namespaces.  This allows having multiple
-	  process with the same pid as long as they are in different
+	  processes with the same pid as long as they are in different
 	  pid namespaces.  This is a building block of containers.
 
 	  Unless you want to work with an experimental feature
@@ -952,7 +952,7 @@ config COMPAT_BRK
 	  Randomizing heap placement makes heap exploits harder, but it
 	  also breaks ancient binaries (including anything libc5 based).
 	  This option changes the bootup default to heap randomization
-	  disabled, and can be overriden runtime by setting
+	  disabled, and can be overridden at runtime by setting
 	  /proc/sys/kernel/randomize_va_space to 2.
 
 	  On non-ancient distros (post-2000 ones) N is usually a safe choice.
@@ -1110,7 +1110,7 @@ config INIT_ALL_POSSIBLE
 	  cpu_possible_map, some of them chose to initialize cpu_possible_map
 	  with all 1s, and others with all 0s.  When they were centralised,
 	  it was better to provide this option than to break all the archs
-	  and have several arch maintainers persuing me down dark alleys.
+	  and have several arch maintainers pursuing me down dark alleys.
 
 config STOP_MACHINE
 	bool
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 34e707e5ab87..504086ab4443 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -72,11 +72,10 @@ config FUNCTION_GRAPH_TRACER
 	help
 	  Enable the kernel to trace a function at both its return
 	  and its entry.
-	  It's first purpose is to trace the duration of functions and
-	  draw a call graph for each thread with some informations like
-	  the return value.
-	  This is done by setting the current return address on the current
-	  task structure into a stack of calls.
+	  Its first purpose is to trace the duration of functions and
+	  draw a call graph for each thread with some information like
+	  the return value. This is done by setting the current return 
+	  address on the current task structure into a stack of calls.
 
 config IRQSOFF_TRACER
 	bool "Interrupts-off Latency Tracer"
diff --git a/net/Kconfig b/net/Kconfig
index ec93e7e38b38..ce77db4fcec8 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -140,7 +140,7 @@ config NETFILTER_ADVANCED
 	default y
 	help
 	  If you say Y here you can select between all the netfilter modules.
-	  If you say N the more ununsual ones will not be shown and the
+	  If you say N the more unusual ones will not be shown and the
 	  basic ones needed by most people will default to 'M'.
 
 	  If unsure, say Y.
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index ec992159b5f8..ca8cb326d1d2 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -22,17 +22,17 @@ menuconfig IPV6
 if IPV6
 
 config IPV6_PRIVACY
-	bool "IPv6: Privacy Extensions support"
+	bool "IPv6: Privacy Extensions (RFC 3041) support"
 	---help---
 	  Privacy Extensions for Stateless Address Autoconfiguration in IPv6
-	  support.  With this option, additional periodically-alter 
-	  pseudo-random global-scope unicast address(es) will assigned to
+	  support.  With this option, additional periodically-altered
+	  pseudo-random global-scope unicast address(es) will be assigned to
 	  your interface(s).
 	
-	  We use our standard pseudo random algorithm to generate randomized
-	  interface identifier, instead of one described in RFC 3041.
+	  We use our standard pseudo-random algorithm to generate the
+          randomized interface identifier, instead of one described in RFC 3041.
 
-	  By default, kernel do not generate temporary addresses.
+	  By default the kernel does not generate temporary addresses.
 	  To use temporary addresses, do
 	
 	        echo 2 >/proc/sys/net/ipv6/conf/all/use_tempaddr 
@@ -43,9 +43,9 @@ config IPV6_ROUTER_PREF
 	bool "IPv6: Router Preference (RFC 4191) support"
 	---help---
 	  Router Preference is an optional extension to the Router
-	  Advertisement message to improve the ability of hosts
-	  to pick more appropriate router, especially when the hosts
-	  is placed in a multi-homed network.
+	  Advertisement message which improves the ability of hosts
+	  to pick an appropriate router, especially when the hosts
+	  are placed in a multi-homed network.
 
 	  If unsure, say N.
 
diff --git a/net/mac80211/Kconfig b/net/mac80211/Kconfig
index 60c16162474c..f3d9ae350fb6 100644
--- a/net/mac80211/Kconfig
+++ b/net/mac80211/Kconfig
@@ -33,7 +33,7 @@ choice
 	---help---
 	  This option selects the default rate control algorithm
 	  mac80211 will use. Note that this default can still be
-	  overriden through the ieee80211_default_rc_algo module
+	  overridden through the ieee80211_default_rc_algo module
 	  parameter if different algorithms are available.
 
 config MAC80211_RC_DEFAULT_PID
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 2c967e4f706c..bb279bf59a1b 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -52,7 +52,7 @@ config NF_CT_ACCT
 
 	  Please note that currently this option only sets a default state.
 	  You may change it at boot time with nf_conntrack.acct=0/1 kernel
-	  paramater or by loading the nf_conntrack module with acct=0/1.
+	  parameter or by loading the nf_conntrack module with acct=0/1.
 
 	  You may also disable/enable it on a running system with:
 	   sysctl net.netfilter.nf_conntrack_acct=0/1
diff --git a/net/phonet/Kconfig b/net/phonet/Kconfig
index 51a5669573f2..6ec7d55b1769 100644
--- a/net/phonet/Kconfig
+++ b/net/phonet/Kconfig
@@ -6,7 +6,7 @@ config PHONET
 	tristate "Phonet protocols family"
 	help
 	  The Phone Network protocol (PhoNet) is a packet-oriented
-	  communication protocol developped by Nokia for use with its modems.
+	  communication protocol developed by Nokia for use with its modems.
 
 	  This is required for Maemo to use cellular data connectivity (if
 	  supported). It can also be used to control Nokia phones
diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig
index 5592883e1e4a..3f7faa9688ae 100644
--- a/net/sunrpc/Kconfig
+++ b/net/sunrpc/Kconfig
@@ -69,7 +69,7 @@ config RPCSEC_GSS_SPKM3
 	select CRYPTO_CBC
 	help
 	  Choose Y here to enable Secure RPC using the SPKM3 public key
-	  GSS-API mechansim (RFC 2025).
+	  GSS-API mechanism (RFC 2025).
 
 	  Secure RPC calls with SPKM3 require an auxiliary userspace
 	  daemon which may be found in the Linux nfs-utils package
diff --git a/net/wimax/Kconfig b/net/wimax/Kconfig
index 18495cdcd10d..1b46747a5f5a 100644
--- a/net/wimax/Kconfig
+++ b/net/wimax/Kconfig
@@ -8,7 +8,7 @@
 #
 # As well, enablement of the RFKILL code means we need the INPUT layer
 # support to inject events coming from hw rfkill switches. That
-# dependency could be killed if input.h provided appropiate means to
+# dependency could be killed if input.h provided appropriate means to
 # work when input is disabled.
 
 comment "WiMAX Wireless Broadband support requires CONFIG_INPUT enabled"
diff --git a/sound/soc/blackfin/Kconfig b/sound/soc/blackfin/Kconfig
index 0a2f8f9eff53..811596f4c092 100644
--- a/sound/soc/blackfin/Kconfig
+++ b/sound/soc/blackfin/Kconfig
@@ -42,7 +42,7 @@ config SND_BF5XX_AC97
 	  You will also need to select the audio interfaces to support below.
 
 	  Note:
-	  AC97 codecs which do not implment the slot-16 mode will not function
+	  AC97 codecs which do not implement the slot-16 mode will not function
 	  properly with this driver. This driver is known to work with the
 	  Analog Devices line of AC97 codecs.
 
-- 
cgit v1.2.3


From 0a0c5168df270a50e3518e4f12bddb31f8f5f38f Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Mon, 16 Mar 2009 22:33:49 +0100
Subject: PM: Introduce functions for suspending and resuming device interrupts

Introduce helper functions allowing us to prevent device drivers from
getting any interrupts (without disabling interrupts on the CPU)
during suspend (or hibernation) and to make them start to receive
interrupts again during the subsequent resume.  These functions make it
possible to keep timer interrupts enabled while the "late" suspend and
"early" resume callbacks provided by device drivers are being
executed.  In turn, this allows device drivers' "late" suspend and
"early" resume callbacks to sleep, execute ACPI callbacks etc.

The functions introduced here will be used to rework the handling of
interrupts during suspend (hibernation) and resume.  Namely,
interrupts will only be disabled on the CPU right before suspending
sysdevs, while device drivers will be prevented from receiving
interrupts, with the help of the new helper function, before their
"late" suspend callbacks run (and analogously during resume).

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/interrupt.h |  9 ++++++
 include/linux/irq.h       |  1 +
 kernel/irq/Makefile       |  1 +
 kernel/irq/internals.h    |  2 ++
 kernel/irq/manage.c       | 31 ++++++++++++++-----
 kernel/irq/pm.c           | 79 +++++++++++++++++++++++++++++++++++++++++++++++
 6 files changed, 116 insertions(+), 7 deletions(-)
 create mode 100644 kernel/irq/pm.c

(limited to 'kernel')

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 0c9cb63e6895..c68bffd182bb 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -117,6 +117,15 @@ extern void disable_irq_nosync(unsigned int irq);
 extern void disable_irq(unsigned int irq);
 extern void enable_irq(unsigned int irq);
 
+/* The following three functions are for the core kernel use only. */
+extern void suspend_device_irqs(void);
+extern void resume_device_irqs(void);
+#ifdef CONFIG_PM_SLEEP
+extern int check_wakeup_irqs(void);
+#else
+static inline int check_wakeup_irqs(void) { return 0; }
+#endif
+
 #if defined(CONFIG_SMP) && defined(CONFIG_GENERIC_HARDIRQS)
 
 extern cpumask_var_t irq_default_affinity;
diff --git a/include/linux/irq.h b/include/linux/irq.h
index 9c62fbe2ef30..974890b3c52f 100644
--- a/include/linux/irq.h
+++ b/include/linux/irq.h
@@ -67,6 +67,7 @@ typedef	void (*irq_flow_handler_t)(unsigned int irq,
 #define IRQ_SPURIOUS_DISABLED	0x00800000	/* IRQ was disabled by the spurious trap */
 #define IRQ_MOVE_PCNTXT		0x01000000	/* IRQ migration from process context */
 #define IRQ_AFFINITY_SET	0x02000000	/* IRQ affinity was set from userspace*/
+#define IRQ_SUSPENDED		0x04000000	/* IRQ has gone through suspend sequence */
 
 #ifdef CONFIG_IRQ_PER_CPU
 # define CHECK_IRQ_PER_CPU(var) ((var) & IRQ_PER_CPU)
diff --git a/kernel/irq/Makefile b/kernel/irq/Makefile
index 4dd5b1edac98..3394f8f52964 100644
--- a/kernel/irq/Makefile
+++ b/kernel/irq/Makefile
@@ -4,3 +4,4 @@ obj-$(CONFIG_GENERIC_IRQ_PROBE) += autoprobe.o
 obj-$(CONFIG_PROC_FS) += proc.o
 obj-$(CONFIG_GENERIC_PENDING_IRQ) += migration.o
 obj-$(CONFIG_NUMA_MIGRATE_IRQ_DESC) += numa_migrate.o
+obj-$(CONFIG_PM_SLEEP) += pm.o
diff --git a/kernel/irq/internals.h b/kernel/irq/internals.h
index ee1aa9f8e8b9..01ce20eab38f 100644
--- a/kernel/irq/internals.h
+++ b/kernel/irq/internals.h
@@ -12,6 +12,8 @@ extern void compat_irq_chip_set_default_handler(struct irq_desc *desc);
 
 extern int __irq_set_trigger(struct irq_desc *desc, unsigned int irq,
 		unsigned long flags);
+extern void __disable_irq(struct irq_desc *desc, unsigned int irq, bool susp);
+extern void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume);
 
 extern struct lock_class_key irq_desc_lock_class;
 extern void init_kstat_irqs(struct irq_desc *desc, int cpu, int nr);
diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c
index 6458e99984c0..1516ab77355c 100644
--- a/kernel/irq/manage.c
+++ b/kernel/irq/manage.c
@@ -162,6 +162,20 @@ static inline int setup_affinity(unsigned int irq, struct irq_desc *desc)
 }
 #endif
 
+void __disable_irq(struct irq_desc *desc, unsigned int irq, bool suspend)
+{
+	if (suspend) {
+		if (!desc->action || (desc->action->flags & IRQF_TIMER))
+			return;
+		desc->status |= IRQ_SUSPENDED;
+	}
+
+	if (!desc->depth++) {
+		desc->status |= IRQ_DISABLED;
+		desc->chip->disable(irq);
+	}
+}
+
 /**
  *	disable_irq_nosync - disable an irq without waiting
  *	@irq: Interrupt to disable
@@ -182,10 +196,7 @@ void disable_irq_nosync(unsigned int irq)
 		return;
 
 	spin_lock_irqsave(&desc->lock, flags);
-	if (!desc->depth++) {
-		desc->status |= IRQ_DISABLED;
-		desc->chip->disable(irq);
-	}
+	__disable_irq(desc, irq, false);
 	spin_unlock_irqrestore(&desc->lock, flags);
 }
 EXPORT_SYMBOL(disable_irq_nosync);
@@ -215,15 +226,21 @@ void disable_irq(unsigned int irq)
 }
 EXPORT_SYMBOL(disable_irq);
 
-static void __enable_irq(struct irq_desc *desc, unsigned int irq)
+void __enable_irq(struct irq_desc *desc, unsigned int irq, bool resume)
 {
+	if (resume)
+		desc->status &= ~IRQ_SUSPENDED;
+
 	switch (desc->depth) {
 	case 0:
+ err_out:
 		WARN(1, KERN_WARNING "Unbalanced enable for IRQ %d\n", irq);
 		break;
 	case 1: {
 		unsigned int status = desc->status & ~IRQ_DISABLED;
 
+		if (desc->status & IRQ_SUSPENDED)
+			goto err_out;
 		/* Prevent probing on this irq: */
 		desc->status = status | IRQ_NOPROBE;
 		check_irq_resend(desc, irq);
@@ -253,7 +270,7 @@ void enable_irq(unsigned int irq)
 		return;
 
 	spin_lock_irqsave(&desc->lock, flags);
-	__enable_irq(desc, irq);
+	__enable_irq(desc, irq, false);
 	spin_unlock_irqrestore(&desc->lock, flags);
 }
 EXPORT_SYMBOL(enable_irq);
@@ -511,7 +528,7 @@ __setup_irq(unsigned int irq, struct irq_desc *desc, struct irqaction *new)
 	 */
 	if (shared && (desc->status & IRQ_SPURIOUS_DISABLED)) {
 		desc->status &= ~IRQ_SPURIOUS_DISABLED;
-		__enable_irq(desc, irq);
+		__enable_irq(desc, irq, false);
 	}
 
 	spin_unlock_irqrestore(&desc->lock, flags);
diff --git a/kernel/irq/pm.c b/kernel/irq/pm.c
new file mode 100644
index 000000000000..638d8bedec14
--- /dev/null
+++ b/kernel/irq/pm.c
@@ -0,0 +1,79 @@
+/*
+ * linux/kernel/irq/pm.c
+ *
+ * Copyright (C) 2009 Rafael J. Wysocki <rjw@sisk.pl>, Novell Inc.
+ *
+ * This file contains power management functions related to interrupts.
+ */
+
+#include <linux/irq.h>
+#include <linux/module.h>
+#include <linux/interrupt.h>
+
+#include "internals.h"
+
+/**
+ * suspend_device_irqs - disable all currently enabled interrupt lines
+ *
+ * During system-wide suspend or hibernation device interrupts need to be
+ * disabled at the chip level and this function is provided for this purpose.
+ * It disables all interrupt lines that are enabled at the moment and sets the
+ * IRQ_SUSPENDED flag for them.
+ */
+void suspend_device_irqs(void)
+{
+	struct irq_desc *desc;
+	int irq;
+
+	for_each_irq_desc(irq, desc) {
+		unsigned long flags;
+
+		spin_lock_irqsave(&desc->lock, flags);
+		__disable_irq(desc, irq, true);
+		spin_unlock_irqrestore(&desc->lock, flags);
+	}
+
+	for_each_irq_desc(irq, desc)
+		if (desc->status & IRQ_SUSPENDED)
+			synchronize_irq(irq);
+}
+EXPORT_SYMBOL_GPL(suspend_device_irqs);
+
+/**
+ * resume_device_irqs - enable interrupt lines disabled by suspend_device_irqs()
+ *
+ * Enable all interrupt lines previously disabled by suspend_device_irqs() that
+ * have the IRQ_SUSPENDED flag set.
+ */
+void resume_device_irqs(void)
+{
+	struct irq_desc *desc;
+	int irq;
+
+	for_each_irq_desc(irq, desc) {
+		unsigned long flags;
+
+		if (!(desc->status & IRQ_SUSPENDED))
+			continue;
+
+		spin_lock_irqsave(&desc->lock, flags);
+		__enable_irq(desc, irq, true);
+		spin_unlock_irqrestore(&desc->lock, flags);
+	}
+}
+EXPORT_SYMBOL_GPL(resume_device_irqs);
+
+/**
+ * check_wakeup_irqs - check if any wake-up interrupts are pending
+ */
+int check_wakeup_irqs(void)
+{
+	struct irq_desc *desc;
+	int irq;
+
+	for_each_irq_desc(irq, desc)
+		if ((desc->status & IRQ_WAKEUP) && (desc->status & IRQ_PENDING))
+			return -EBUSY;
+
+	return 0;
+}
-- 
cgit v1.2.3


From 2ed8d2b3a81bdbb0418301628ccdb008ac9f40b7 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Mon, 16 Mar 2009 22:34:06 +0100
Subject: PM: Rework handling of interrupts during suspend-resume

Use the functions introduced in by the previous patch,
suspend_device_irqs(), resume_device_irqs() and check_wakeup_irqs(),
to rework the handling of interrupts during suspend (hibernation) and
resume.  Namely, interrupts will only be disabled on the CPU right
before suspending sysdevs, while device drivers will be prevented
from receiving interrupts, with the help of the new helper function,
before their "late" suspend callbacks run (and analogously during
resume).

In addition, since the device interrups are now disabled before the
CPU has turned all interrupts off and the CPU will ACK the interrupts
setting the IRQ_PENDING bit for them, check in sysdev_suspend() if
any wake-up interrupts are pending and abort suspend if that's the
case.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/apm_32.c  | 15 +++++++++++----
 drivers/base/power/main.c | 20 +++++++++++---------
 drivers/base/sys.c        |  8 ++++++++
 drivers/xen/manage.c      | 16 +++++++++-------
 kernel/kexec.c            |  8 ++++----
 kernel/power/disk.c       | 39 +++++++++++++++++++++++++++++----------
 kernel/power/main.c       | 17 +++++++++++------
 7 files changed, 83 insertions(+), 40 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/kernel/apm_32.c b/arch/x86/kernel/apm_32.c
index 10033fe718e0..ac7783a67432 100644
--- a/arch/x86/kernel/apm_32.c
+++ b/arch/x86/kernel/apm_32.c
@@ -1190,8 +1190,10 @@ static int suspend(int vetoable)
 	struct apm_user	*as;
 
 	device_suspend(PMSG_SUSPEND);
-	local_irq_disable();
+
 	device_power_down(PMSG_SUSPEND);
+
+	local_irq_disable();
 	sysdev_suspend(PMSG_SUSPEND);
 
 	local_irq_enable();
@@ -1209,9 +1211,12 @@ static int suspend(int vetoable)
 	if (err != APM_SUCCESS)
 		apm_error("suspend", err);
 	err = (err == APM_SUCCESS) ? 0 : -EIO;
+
 	sysdev_resume();
-	device_power_up(PMSG_RESUME);
 	local_irq_enable();
+
+	device_power_up(PMSG_RESUME);
+
 	device_resume(PMSG_RESUME);
 	queue_event(APM_NORMAL_RESUME, NULL);
 	spin_lock(&user_list_lock);
@@ -1228,8 +1233,9 @@ static void standby(void)
 {
 	int err;
 
-	local_irq_disable();
 	device_power_down(PMSG_SUSPEND);
+
+	local_irq_disable();
 	sysdev_suspend(PMSG_SUSPEND);
 	local_irq_enable();
 
@@ -1239,8 +1245,9 @@ static void standby(void)
 
 	local_irq_disable();
 	sysdev_resume();
-	device_power_up(PMSG_RESUME);
 	local_irq_enable();
+
+	device_power_up(PMSG_RESUME);
 }
 
 static apm_event_t get_event(void)
diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c
index e255341682c8..69b4ddb7de3b 100644
--- a/drivers/base/power/main.c
+++ b/drivers/base/power/main.c
@@ -23,6 +23,7 @@
 #include <linux/pm.h>
 #include <linux/resume-trace.h>
 #include <linux/rwsem.h>
+#include <linux/interrupt.h>
 
 #include "../base.h"
 #include "power.h"
@@ -349,7 +350,8 @@ static int resume_device_noirq(struct device *dev, pm_message_t state)
  *	Execute the appropriate "noirq resume" callback for all devices marked
  *	as DPM_OFF_IRQ.
  *
- *	Must be called with interrupts disabled and only one CPU running.
+ *	Must be called under dpm_list_mtx.  Device drivers should not receive
+ *	interrupts while it's being executed.
  */
 static void dpm_power_up(pm_message_t state)
 {
@@ -370,14 +372,13 @@ static void dpm_power_up(pm_message_t state)
  *	device_power_up - Turn on all devices that need special attention.
  *	@state: PM transition of the system being carried out.
  *
- *	Power on system devices, then devices that required we shut them down
- *	with interrupts disabled.
- *
- *	Must be called with interrupts disabled.
+ *	Call the "early" resume handlers and enable device drivers to receive
+ *	interrupts.
  */
 void device_power_up(pm_message_t state)
 {
 	dpm_power_up(state);
+	resume_device_irqs();
 }
 EXPORT_SYMBOL_GPL(device_power_up);
 
@@ -602,16 +603,17 @@ static int suspend_device_noirq(struct device *dev, pm_message_t state)
  *	device_power_down - Shut down special devices.
  *	@state: PM transition of the system being carried out.
  *
- *	Power down devices that require interrupts to be disabled.
- *	Then power down system devices.
+ *	Prevent device drivers from receiving interrupts and call the "late"
+ *	suspend handlers.
  *
- *	Must be called with interrupts disabled and only one CPU running.
+ *	Must be called under dpm_list_mtx.
  */
 int device_power_down(pm_message_t state)
 {
 	struct device *dev;
 	int error = 0;
 
+	suspend_device_irqs();
 	list_for_each_entry_reverse(dev, &dpm_list, power.entry) {
 		error = suspend_device_noirq(dev, state);
 		if (error) {
@@ -621,7 +623,7 @@ int device_power_down(pm_message_t state)
 		dev->power.status = DPM_OFF_IRQ;
 	}
 	if (error)
-		dpm_power_up(resume_event(state));
+		device_power_up(resume_event(state));
 	return error;
 }
 EXPORT_SYMBOL_GPL(device_power_down);
diff --git a/drivers/base/sys.c b/drivers/base/sys.c
index cbd36cf59a0f..76ce75bad91e 100644
--- a/drivers/base/sys.c
+++ b/drivers/base/sys.c
@@ -22,6 +22,7 @@
 #include <linux/pm.h>
 #include <linux/device.h>
 #include <linux/mutex.h>
+#include <linux/interrupt.h>
 
 #include "base.h"
 
@@ -369,6 +370,13 @@ int sysdev_suspend(pm_message_t state)
 	struct sysdev_driver *drv, *err_drv;
 	int ret;
 
+	pr_debug("Checking wake-up interrupts\n");
+
+	/* Return error code if there are any wake-up interrupts pending */
+	ret = check_wakeup_irqs();
+	if (ret)
+		return ret;
+
 	pr_debug("Suspending System Devices\n");
 
 	list_for_each_entry_reverse(cls, &system_kset->list, kset.kobj.entry) {
diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c
index 3ccd348d112d..0d61db1e7b49 100644
--- a/drivers/xen/manage.c
+++ b/drivers/xen/manage.c
@@ -39,12 +39,6 @@ static int xen_suspend(void *data)
 
 	BUG_ON(!irqs_disabled());
 
-	err = device_power_down(PMSG_SUSPEND);
-	if (err) {
-		printk(KERN_ERR "xen_suspend: device_power_down failed: %d\n",
-		       err);
-		return err;
-	}
 	err = sysdev_suspend(PMSG_SUSPEND);
 	if (err) {
 		printk(KERN_ERR "xen_suspend: sysdev_suspend failed: %d\n",
@@ -69,7 +63,6 @@ static int xen_suspend(void *data)
 	xen_mm_unpin_all();
 
 	sysdev_resume();
-	device_power_up(PMSG_RESUME);
 
 	if (!*cancelled) {
 		xen_irq_resume();
@@ -108,6 +101,12 @@ static void do_suspend(void)
 	/* XXX use normal device tree? */
 	xenbus_suspend();
 
+	err = device_power_down(PMSG_SUSPEND);
+	if (err) {
+		printk(KERN_ERR "device_power_down failed: %d\n", err);
+		goto resume_devices;
+	}
+
 	err = stop_machine(xen_suspend, &cancelled, cpumask_of(0));
 	if (err) {
 		printk(KERN_ERR "failed to start xen_suspend: %d\n", err);
@@ -120,6 +119,9 @@ static void do_suspend(void)
 	} else
 		xenbus_suspend_cancel();
 
+	device_power_up(PMSG_RESUME);
+
+resume_devices:
 	device_resume(PMSG_RESUME);
 
 	/* Make sure timer events get retriggered on all CPUs */
diff --git a/kernel/kexec.c b/kernel/kexec.c
index c7fd6692939d..dade9af6bf21 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1454,7 +1454,6 @@ int kernel_kexec(void)
 		if (error)
 			goto Resume_devices;
 		device_pm_lock();
-		local_irq_disable();
 		/* At this point, device_suspend() has been called,
 		 * but *not* device_power_down(). We *must*
 		 * device_power_down() now.  Otherwise, drivers for
@@ -1464,8 +1463,9 @@ int kernel_kexec(void)
 		 */
 		error = device_power_down(PMSG_FREEZE);
 		if (error)
-			goto Enable_irqs;
+			goto Unlock_pm;
 
+		local_irq_disable();
 		/* Suspend system devices */
 		error = sysdev_suspend(PMSG_FREEZE);
 		if (error)
@@ -1484,9 +1484,9 @@ int kernel_kexec(void)
 	if (kexec_image->preserve_context) {
 		sysdev_resume();
  Power_up_devices:
-		device_power_up(PMSG_RESTORE);
- Enable_irqs:
 		local_irq_enable();
+		device_power_up(PMSG_RESTORE);
+ Unlock_pm:
 		device_pm_unlock();
 		enable_nonboot_cpus();
  Resume_devices:
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 4a4a206b1979..320bb0949bdf 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -214,7 +214,7 @@ static int create_image(int platform_mode)
 		return error;
 
 	device_pm_lock();
-	local_irq_disable();
+
 	/* At this point, device_suspend() has been called, but *not*
 	 * device_power_down(). We *must* call device_power_down() now.
 	 * Otherwise, drivers for some devices (e.g. interrupt controllers)
@@ -225,8 +225,11 @@ static int create_image(int platform_mode)
 	if (error) {
 		printk(KERN_ERR "PM: Some devices failed to power down, "
 			"aborting hibernation\n");
-		goto Enable_irqs;
+		goto Unlock;
 	}
+
+	local_irq_disable();
+
 	sysdev_suspend(PMSG_FREEZE);
 	if (error) {
 		printk(KERN_ERR "PM: Some devices failed to power down, "
@@ -252,12 +255,16 @@ static int create_image(int platform_mode)
 	/* NOTE:  device_power_up() is just a resume() for devices
 	 * that suspended with irqs off ... no overall powerup.
 	 */
+
  Power_up_devices:
+	local_irq_enable();
+
 	device_power_up(in_suspend ?
 		(error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
- Enable_irqs:
-	local_irq_enable();
+
+ Unlock:
 	device_pm_unlock();
+
 	return error;
 }
 
@@ -336,13 +343,16 @@ static int resume_target_kernel(void)
 	int error;
 
 	device_pm_lock();
-	local_irq_disable();
+
 	error = device_power_down(PMSG_QUIESCE);
 	if (error) {
 		printk(KERN_ERR "PM: Some devices failed to power down, "
 			"aborting resume\n");
-		goto Enable_irqs;
+		goto Unlock;
 	}
+
+	local_irq_disable();
+
 	sysdev_suspend(PMSG_QUIESCE);
 	/* We'll ignore saved state, but this gets preempt count (etc) right */
 	save_processor_state();
@@ -366,11 +376,16 @@ static int resume_target_kernel(void)
 	swsusp_free();
 	restore_processor_state();
 	touch_softlockup_watchdog();
+
 	sysdev_resume();
-	device_power_up(PMSG_RECOVER);
- Enable_irqs:
+
 	local_irq_enable();
+
+	device_power_up(PMSG_RECOVER);
+
+ Unlock:
 	device_pm_unlock();
+
 	return error;
 }
 
@@ -447,15 +462,16 @@ int hibernation_platform_enter(void)
 		goto Finish;
 
 	device_pm_lock();
-	local_irq_disable();
+
 	error = device_power_down(PMSG_HIBERNATE);
 	if (!error) {
+		local_irq_disable();
 		sysdev_suspend(PMSG_HIBERNATE);
 		hibernation_ops->enter();
 		/* We should never get here */
 		while (1);
 	}
-	local_irq_enable();
+
 	device_pm_unlock();
 
 	/*
@@ -464,12 +480,15 @@ int hibernation_platform_enter(void)
 	 */
  Finish:
 	hibernation_ops->finish();
+
  Resume_devices:
 	entering_platform_hibernation = false;
 	device_resume(PMSG_RESTORE);
 	resume_console();
+
  Close:
 	hibernation_ops->end();
+
 	return error;
 }
 
diff --git a/kernel/power/main.c b/kernel/power/main.c
index c9632f841f64..f0a466736c01 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -287,17 +287,19 @@ void __attribute__ ((weak)) arch_suspend_enable_irqs(void)
  */
 static int suspend_enter(suspend_state_t state)
 {
-	int error = 0;
+	int error;
 
 	device_pm_lock();
-	arch_suspend_disable_irqs();
-	BUG_ON(!irqs_disabled());
 
-	if ((error = device_power_down(PMSG_SUSPEND))) {
+	error = device_power_down(PMSG_SUSPEND);
+	if (error) {
 		printk(KERN_ERR "PM: Some devices failed to power down\n");
 		goto Done;
 	}
 
+	arch_suspend_disable_irqs();
+	BUG_ON(!irqs_disabled());
+
 	error = sysdev_suspend(PMSG_SUSPEND);
 	if (!error) {
 		if (!suspend_test(TEST_CORE))
@@ -305,11 +307,14 @@ static int suspend_enter(suspend_state_t state)
 		sysdev_resume();
 	}
 
-	device_power_up(PMSG_RESUME);
- Done:
 	arch_suspend_enable_irqs();
 	BUG_ON(irqs_disabled());
+
+	device_power_up(PMSG_RESUME);
+
+ Done:
 	device_pm_unlock();
+
 	return error;
 }
 
-- 
cgit v1.2.3


From 900af0d973856d6feb6fc088c2d0d3fde57707d3 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Mon, 16 Mar 2009 22:34:15 +0100
Subject: PM: Change suspend code ordering

Change the ordering of the suspend core code so that the platform
"prepare" callback is executed and the nonboot CPUs are disabled
after calling device drivers' "late suspend" methods.

This change will allow us to rework the PCI PM core so that the power
state of devices is changed in the "late" phase of suspend (and
analogously in the "early" phase of resume), which in turn will allow
us to avoid the race condition where a device using shared interrupts
is put into a low power state with interrupts enabled and then an
interrupt (for another device) comes in and confuses its driver.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/power/main.c | 38 ++++++++++++++++++++++----------------
 1 file changed, 22 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/main.c b/kernel/power/main.c
index f0a466736c01..f172f41858bb 100644
--- a/kernel/power/main.c
+++ b/kernel/power/main.c
@@ -297,6 +297,19 @@ static int suspend_enter(suspend_state_t state)
 		goto Done;
 	}
 
+	if (suspend_ops->prepare) {
+		error = suspend_ops->prepare();
+		if (error)
+			goto Power_up_devices;
+	}
+
+	if (suspend_test(TEST_PLATFORM))
+		goto Platfrom_finish;
+
+	error = disable_nonboot_cpus();
+	if (error || suspend_test(TEST_CPUS))
+		goto Enable_cpus;
+
 	arch_suspend_disable_irqs();
 	BUG_ON(!irqs_disabled());
 
@@ -310,6 +323,14 @@ static int suspend_enter(suspend_state_t state)
 	arch_suspend_enable_irqs();
 	BUG_ON(irqs_disabled());
 
+ Enable_cpus:
+	enable_nonboot_cpus();
+
+ Platfrom_finish:
+	if (suspend_ops->finish)
+		suspend_ops->finish();
+
+ Power_up_devices:
 	device_power_up(PMSG_RESUME);
 
  Done:
@@ -346,23 +367,8 @@ int suspend_devices_and_enter(suspend_state_t state)
 	if (suspend_test(TEST_DEVICES))
 		goto Recover_platform;
 
-	if (suspend_ops->prepare) {
-		error = suspend_ops->prepare();
-		if (error)
-			goto Resume_devices;
-	}
-
-	if (suspend_test(TEST_PLATFORM))
-		goto Finish;
+	suspend_enter(state);
 
-	error = disable_nonboot_cpus();
-	if (!error && !suspend_test(TEST_CPUS))
-		suspend_enter(state);
-
-	enable_nonboot_cpus();
- Finish:
-	if (suspend_ops->finish)
-		suspend_ops->finish();
  Resume_devices:
 	suspend_test_start();
 	device_resume(PMSG_RESUME);
-- 
cgit v1.2.3


From 4aecd6718939eb3c4145b248369b65f7483a8a02 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Mon, 16 Mar 2009 22:34:26 +0100
Subject: PM: Change hibernation code ordering

Change the ordering of the hibernation core code so that the platform
"prepare" callbacks are executed and the nonboot CPUs are disabled
after calling device drivers' "late suspend" methods.

This change (along with the previous analogous change of the suspend
core code) will allow us to rework the PCI PM core so that the power
state of devices is changed in the "late" phase of suspend (and
analogously in the "early" phase of resume), which in turn will allow
us to avoid the race condition where a device using shared interrupts
is put into a low power state with interrupts enabled and then an
interrupt (for another device) comes in and confuses its driver.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/power/disk.c | 109 +++++++++++++++++++++++++++++-----------------------
 1 file changed, 61 insertions(+), 48 deletions(-)

(limited to 'kernel')

diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index 320bb0949bdf..e886d1332a10 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -228,13 +228,22 @@ static int create_image(int platform_mode)
 		goto Unlock;
 	}
 
+	error = platform_pre_snapshot(platform_mode);
+	if (error || hibernation_test(TEST_PLATFORM))
+		goto Platform_finish;
+
+	error = disable_nonboot_cpus();
+	if (error || hibernation_test(TEST_CPUS)
+	    || hibernation_testmode(HIBERNATION_TEST))
+		goto Enable_cpus;
+
 	local_irq_disable();
 
 	sysdev_suspend(PMSG_FREEZE);
 	if (error) {
 		printk(KERN_ERR "PM: Some devices failed to power down, "
 			"aborting hibernation\n");
-		goto Power_up_devices;
+		goto Enable_irqs;
 	}
 
 	if (hibernation_test(TEST_CORE))
@@ -250,15 +259,22 @@ static int create_image(int platform_mode)
 	restore_processor_state();
 	if (!in_suspend)
 		platform_leave(platform_mode);
+
  Power_up:
 	sysdev_resume();
 	/* NOTE:  device_power_up() is just a resume() for devices
 	 * that suspended with irqs off ... no overall powerup.
 	 */
 
- Power_up_devices:
+ Enable_irqs:
 	local_irq_enable();
 
+ Enable_cpus:
+	enable_nonboot_cpus();
+
+ Platform_finish:
+	platform_finish(platform_mode);
+
 	device_power_up(in_suspend ?
 		(error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
 
@@ -298,25 +314,9 @@ int hibernation_snapshot(int platform_mode)
 	if (hibernation_test(TEST_DEVICES))
 		goto Recover_platform;
 
-	error = platform_pre_snapshot(platform_mode);
-	if (error || hibernation_test(TEST_PLATFORM))
-		goto Finish;
-
-	error = disable_nonboot_cpus();
-	if (!error) {
-		if (hibernation_test(TEST_CPUS))
-			goto Enable_cpus;
-
-		if (hibernation_testmode(HIBERNATION_TEST))
-			goto Enable_cpus;
+	error = create_image(platform_mode);
+	/* Control returns here after successful restore */
 
-		error = create_image(platform_mode);
-		/* Control returns here after successful restore */
-	}
- Enable_cpus:
-	enable_nonboot_cpus();
- Finish:
-	platform_finish(platform_mode);
  Resume_devices:
 	device_resume(in_suspend ?
 		(error ? PMSG_RECOVER : PMSG_THAW) : PMSG_RESTORE);
@@ -338,7 +338,7 @@ int hibernation_snapshot(int platform_mode)
  *	kernel.
  */
 
-static int resume_target_kernel(void)
+static int resume_target_kernel(bool platform_mode)
 {
 	int error;
 
@@ -351,9 +351,20 @@ static int resume_target_kernel(void)
 		goto Unlock;
 	}
 
+	error = platform_pre_restore(platform_mode);
+	if (error)
+		goto Cleanup;
+
+	error = disable_nonboot_cpus();
+	if (error)
+		goto Enable_cpus;
+
 	local_irq_disable();
 
-	sysdev_suspend(PMSG_QUIESCE);
+	error = sysdev_suspend(PMSG_QUIESCE);
+	if (error)
+		goto Enable_irqs;
+
 	/* We'll ignore saved state, but this gets preempt count (etc) right */
 	save_processor_state();
 	error = restore_highmem();
@@ -379,8 +390,15 @@ static int resume_target_kernel(void)
 
 	sysdev_resume();
 
+ Enable_irqs:
 	local_irq_enable();
 
+ Enable_cpus:
+	enable_nonboot_cpus();
+
+ Cleanup:
+	platform_restore_cleanup(platform_mode);
+
 	device_power_up(PMSG_RECOVER);
 
  Unlock:
@@ -405,19 +423,10 @@ int hibernation_restore(int platform_mode)
 	pm_prepare_console();
 	suspend_console();
 	error = device_suspend(PMSG_QUIESCE);
-	if (error)
-		goto Finish;
-
-	error = platform_pre_restore(platform_mode);
 	if (!error) {
-		error = disable_nonboot_cpus();
-		if (!error)
-			error = resume_target_kernel();
-		enable_nonboot_cpus();
+		error = resume_target_kernel(platform_mode);
+		device_resume(PMSG_RECOVER);
 	}
-	platform_restore_cleanup(platform_mode);
-	device_resume(PMSG_RECOVER);
- Finish:
 	resume_console();
 	pm_restore_console();
 	return error;
@@ -453,34 +462,38 @@ int hibernation_platform_enter(void)
 		goto Resume_devices;
 	}
 
+	device_pm_lock();
+
+	error = device_power_down(PMSG_HIBERNATE);
+	if (error)
+		goto Unlock;
+
 	error = hibernation_ops->prepare();
 	if (error)
-		goto Resume_devices;
+		goto Platofrm_finish;
 
 	error = disable_nonboot_cpus();
 	if (error)
-		goto Finish;
-
-	device_pm_lock();
-
-	error = device_power_down(PMSG_HIBERNATE);
-	if (!error) {
-		local_irq_disable();
-		sysdev_suspend(PMSG_HIBERNATE);
-		hibernation_ops->enter();
-		/* We should never get here */
-		while (1);
-	}
+		goto Platofrm_finish;
 
-	device_pm_unlock();
+	local_irq_disable();
+	sysdev_suspend(PMSG_HIBERNATE);
+	hibernation_ops->enter();
+	/* We should never get here */
+	while (1);
 
 	/*
 	 * We don't need to reenable the nonboot CPUs or resume consoles, since
 	 * the system is going to be halted anyway.
 	 */
- Finish:
+ Platofrm_finish:
 	hibernation_ops->finish();
 
+	device_power_up(PMSG_RESTORE);
+
+ Unlock:
+	device_pm_unlock();
+
  Resume_devices:
 	entering_platform_hibernation = false;
 	device_resume(PMSG_RESTORE);
-- 
cgit v1.2.3


From 749b0afc3a9d90dda3319fd1464a3b32fc225361 Mon Sep 17 00:00:00 2001
From: "Rafael J. Wysocki" <rjw@sisk.pl>
Date: Mon, 16 Mar 2009 22:34:35 +0100
Subject: kexec: Change kexec jump code ordering

Change the ordering of the kexec jump code so that the nonboot CPUs
are disabled after calling device drivers' "late suspend" methods.

This change reflects the recent modifications of the power management
code that is also used by kexec jump.

Signed-off-by: Rafael J. Wysocki <rjw@sisk.pl>
Acked-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/kexec.c | 19 +++++++++----------
 1 file changed, 9 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kexec.c b/kernel/kexec.c
index dade9af6bf21..93eed85fe017 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1450,9 +1450,6 @@ int kernel_kexec(void)
 		error = device_suspend(PMSG_FREEZE);
 		if (error)
 			goto Resume_console;
-		error = disable_nonboot_cpus();
-		if (error)
-			goto Resume_devices;
 		device_pm_lock();
 		/* At this point, device_suspend() has been called,
 		 * but *not* device_power_down(). We *must*
@@ -1463,13 +1460,15 @@ int kernel_kexec(void)
 		 */
 		error = device_power_down(PMSG_FREEZE);
 		if (error)
-			goto Unlock_pm;
-
+			goto Resume_devices;
+		error = disable_nonboot_cpus();
+		if (error)
+			goto Enable_cpus;
 		local_irq_disable();
 		/* Suspend system devices */
 		error = sysdev_suspend(PMSG_FREEZE);
 		if (error)
-			goto Power_up_devices;
+			goto Enable_irqs;
 	} else
 #endif
 	{
@@ -1483,13 +1482,13 @@ int kernel_kexec(void)
 #ifdef CONFIG_KEXEC_JUMP
 	if (kexec_image->preserve_context) {
 		sysdev_resume();
- Power_up_devices:
+ Enable_irqs:
 		local_irq_enable();
-		device_power_up(PMSG_RESTORE);
- Unlock_pm:
-		device_pm_unlock();
+ Enable_cpus:
 		enable_nonboot_cpus();
+		device_power_up(PMSG_RESTORE);
  Resume_devices:
+		device_pm_unlock();
 		device_resume(PMSG_RESTORE);
  Resume_console:
 		resume_console();
-- 
cgit v1.2.3


From f69b17d7e745d8edd7c0d90390cbaa77e63c5ea3 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Fri, 20 Mar 2009 17:40:06 +0800
Subject: rcu: rcu_barrier VS cpu_hotplug: Ensure callbacks in dead cpu are
 migrated to online cpu

cpu hotplug may happen asynchronously, some rcu callbacks are maybe
still on dead cpu, rcu_barrier() also needs to wait for these rcu
callbacks to complete, so we must ensure callbacks in dead cpu are
migrated to online cpu.

Paul E. McKenney's review:

  Good stuff, Lai!!!  Simpler than any of the approaches that I was
  considering, and, better yet, independent of the underlying RCU
  implementation!!!

  I was initially worried that wake_up() might wake only one of two
  possible wait_event()s, namely rcu_barrier() and the CPU_POST_DEAD code,
  but the fact that wait_event() clears WQ_FLAG_EXCLUSIVE avoids that issue.
  I was also worried about the fact that different RCU implementations have
  different mappings of call_rcu(), call_rcu_bh(), and call_rcu_sched(), but
  this is OK as well because we just get an extra (harmless) callback in the
  case that they map together (for example, Classic RCU has call_rcu_sched()
  mapping to call_rcu()).

  Overlap of CPU-hotplug operations is prevented by cpu_add_remove_lock,
  and any stray callbacks that arrive (for example, from irq handlers
  running on the dying CPU) either are ahead of the CPU_DYING callbacks on
  the one hand (and thus accounted for), or happened after the rcu_barrier()
  started on the other (and thus don't need to be accounted for).

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <49C36476.1010400@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcupdate.c | 44 ++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

(limited to 'kernel')

diff --git a/kernel/rcupdate.c b/kernel/rcupdate.c
index cae8a059cf47..2c7b8457d0d2 100644
--- a/kernel/rcupdate.c
+++ b/kernel/rcupdate.c
@@ -122,6 +122,8 @@ static void rcu_barrier_func(void *type)
 	}
 }
 
+static inline void wait_migrated_callbacks(void);
+
 /*
  * Orchestrate the specified type of RCU barrier, waiting for all
  * RCU callbacks of the specified type to complete.
@@ -147,6 +149,7 @@ static void _rcu_barrier(enum rcu_barrier type)
 		complete(&rcu_barrier_completion);
 	wait_for_completion(&rcu_barrier_completion);
 	mutex_unlock(&rcu_barrier_mutex);
+	wait_migrated_callbacks();
 }
 
 /**
@@ -176,9 +179,50 @@ void rcu_barrier_sched(void)
 }
 EXPORT_SYMBOL_GPL(rcu_barrier_sched);
 
+static atomic_t rcu_migrate_type_count = ATOMIC_INIT(0);
+static struct rcu_head rcu_migrate_head[3];
+static DECLARE_WAIT_QUEUE_HEAD(rcu_migrate_wq);
+
+static void rcu_migrate_callback(struct rcu_head *notused)
+{
+	if (atomic_dec_and_test(&rcu_migrate_type_count))
+		wake_up(&rcu_migrate_wq);
+}
+
+static inline void wait_migrated_callbacks(void)
+{
+	wait_event(rcu_migrate_wq, !atomic_read(&rcu_migrate_type_count));
+}
+
+static int __cpuinit rcu_barrier_cpu_hotplug(struct notifier_block *self,
+		unsigned long action, void *hcpu)
+{
+	if (action == CPU_DYING) {
+		/*
+		 * preempt_disable() in on_each_cpu() prevents stop_machine(),
+		 * so when "on_each_cpu(rcu_barrier_func, (void *)type, 1);"
+		 * returns, all online cpus have queued rcu_barrier_func(),
+		 * and the dead cpu(if it exist) queues rcu_migrate_callback()s.
+		 *
+		 * These callbacks ensure _rcu_barrier() waits for all
+		 * RCU callbacks of the specified type to complete.
+		 */
+		atomic_set(&rcu_migrate_type_count, 3);
+		call_rcu_bh(rcu_migrate_head, rcu_migrate_callback);
+		call_rcu_sched(rcu_migrate_head + 1, rcu_migrate_callback);
+		call_rcu(rcu_migrate_head + 2, rcu_migrate_callback);
+	} else if (action == CPU_POST_DEAD) {
+		/* rcu_migrate_head is protected by cpu_add_remove_lock */
+		wait_migrated_callbacks();
+	}
+
+	return NOTIFY_OK;
+}
+
 void __init rcu_init(void)
 {
 	__rcu_init();
+	hotcpu_notifier(rcu_barrier_cpu_hotplug, 0);
 }
 
 void rcu_scheduler_starting(void)
-- 
cgit v1.2.3


From e180a6b7759a99a28cbcce3547c4c80822cb6c2a Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 31 Mar 2009 13:05:29 -0600
Subject: param: fix charp parameters set via sysfs

Impact: fix crash on reading from /sys/module/.../ieee80211_default_rc_algo

The module_param type "charp" simply sets a char * pointer in the
module to the parameter in the commandline string: this is why we keep
the (mangled) module command line around.  But when set via sysfs (as
about 11 charp parameters can be) this memory is freed on the way
out of the write().  Future reads hit random mem.

So we kstrdup instead: we have to check we're not in early commandline
parsing, and we have to note when we've used it so we can reliably
kfree the parameter when it's next overwritten, and also on module
unload.

(Thanks to Randy Dunlap for CONFIG_SYSFS=n fixes)

Reported-by: Sitsofe Wheeler <sitsofe@yahoo.com>
Diagnosed-by: Frederic Weisbecker <fweisbec@gmail.com>
Tested-by: Frederic Weisbecker <fweisbec@gmail.com>
Tested-by: Christof Schmitt <christof.schmitt@de.ibm.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/module.h      |  4 ++++
 include/linux/moduleparam.h | 10 ++++++++++
 kernel/module.c             | 14 ++++++++------
 kernel/params.c             | 26 +++++++++++++++++++++++++-
 4 files changed, 47 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/module.h b/include/linux/module.h
index 145a75528cc1..08e5e75d6122 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -248,6 +248,10 @@ struct module
 	const unsigned long *crcs;
 	unsigned int num_syms;
 
+	/* Kernel parameters. */
+	struct kernel_param *kp;
+	unsigned int num_kp;
+
 	/* GPL-only exported symbols. */
 	unsigned int num_gpl_syms;
 	const struct kernel_symbol *gpl_syms;
diff --git a/include/linux/moduleparam.h b/include/linux/moduleparam.h
index e4af3399ef48..a4f0b931846c 100644
--- a/include/linux/moduleparam.h
+++ b/include/linux/moduleparam.h
@@ -138,6 +138,16 @@ extern int parse_args(const char *name,
 		      unsigned num,
 		      int (*unknown)(char *param, char *val));
 
+/* Called by module remove. */
+#ifdef CONFIG_SYSFS
+extern void destroy_params(const struct kernel_param *params, unsigned num);
+#else
+static inline void destroy_params(const struct kernel_param *params,
+				  unsigned num)
+{
+}
+#endif /* !CONFIG_SYSFS */
+
 /* All the helper functions */
 /* The macros to do compile-time type checking stolen from Jakub
    Jelinek, who IIRC came up with this idea for the 2.4 module init code. */
diff --git a/kernel/module.c b/kernel/module.c
index f77ac320d0b5..b862fdb6a372 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1491,6 +1491,9 @@ static void free_module(struct module *mod)
 	/* Module unload stuff */
 	module_unload_free(mod);
 
+	/* Free any allocated parameters. */
+	destroy_params(mod->kp, mod->num_kp);
+
 	/* release any pointers to mcount in this module */
 	ftrace_release(mod->module_core, mod->core_size);
 
@@ -1898,8 +1901,7 @@ static noinline struct module *load_module(void __user *umod,
 	unsigned int symindex = 0;
 	unsigned int strindex = 0;
 	unsigned int modindex, versindex, infoindex, pcpuindex;
-	unsigned int num_kp, num_mcount;
-	struct kernel_param *kp;
+	unsigned int num_mcount;
 	struct module *mod;
 	long err = 0;
 	void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */
@@ -2144,8 +2146,8 @@ static noinline struct module *load_module(void __user *umod,
 
 	/* Now we've got everything in the final locations, we can
 	 * find optional sections. */
-	kp = section_objs(hdr, sechdrs, secstrings, "__param", sizeof(*kp),
-			  &num_kp);
+	mod->kp = section_objs(hdr, sechdrs, secstrings, "__param",
+			       sizeof(*mod->kp), &mod->num_kp);
 	mod->syms = section_objs(hdr, sechdrs, secstrings, "__ksymtab",
 				 sizeof(*mod->syms), &mod->num_syms);
 	mod->crcs = section_addr(hdr, sechdrs, secstrings, "__kcrctab");
@@ -2291,11 +2293,11 @@ static noinline struct module *load_module(void __user *umod,
 	 */
 	list_add_rcu(&mod->list, &modules);
 
-	err = parse_args(mod->name, mod->args, kp, num_kp, NULL);
+	err = parse_args(mod->name, mod->args, mod->kp, mod->num_kp, NULL);
 	if (err < 0)
 		goto unlink;
 
-	err = mod_sysfs_setup(mod, kp, num_kp);
+	err = mod_sysfs_setup(mod, mod->kp, mod->num_kp);
 	if (err < 0)
 		goto unlink;
 	add_sect_attrs(mod, hdr->e_shnum, secstrings, sechdrs);
diff --git a/kernel/params.c b/kernel/params.c
index a1e3025b19a9..de273ec85bd2 100644
--- a/kernel/params.c
+++ b/kernel/params.c
@@ -24,6 +24,9 @@
 #include <linux/err.h>
 #include <linux/slab.h>
 
+/* We abuse the high bits of "perm" to record whether we kmalloc'ed. */
+#define KPARAM_KMALLOCED	0x80000000
+
 #if 0
 #define DEBUGP printk
 #else
@@ -217,7 +220,19 @@ int param_set_charp(const char *val, struct kernel_param *kp)
 		return -ENOSPC;
 	}
 
-	*(char **)kp->arg = (char *)val;
+	if (kp->perm & KPARAM_KMALLOCED)
+		kfree(*(char **)kp->arg);
+
+	/* This is a hack.  We can't need to strdup in early boot, and we
+	 * don't need to; this mangled commandline is preserved. */
+	if (slab_is_available()) {
+		kp->perm |= KPARAM_KMALLOCED;
+		*(char **)kp->arg = kstrdup(val, GFP_KERNEL);
+		if (!kp->arg)
+			return -ENOMEM;
+	} else
+		*(const char **)kp->arg = val;
+
 	return 0;
 }
 
@@ -571,6 +586,15 @@ void module_param_sysfs_remove(struct module *mod)
 }
 #endif
 
+void destroy_params(const struct kernel_param *params, unsigned num)
+{
+	unsigned int i;
+
+	for (i = 0; i < num; i++)
+		if (params[i].perm & KPARAM_KMALLOCED)
+			kfree(*(char **)params[i].arg);
+}
+
 static void __init kernel_add_sysfs_param(const char *name,
 					  struct kernel_param *kparam,
 					  unsigned int name_skip)
-- 
cgit v1.2.3


From b10153fe31dde3805f8320b61ef147cebe379aee Mon Sep 17 00:00:00 2001
From: Américo Wang <xiyou.wangcong@gmail.com>
Date: Wed, 25 Mar 2009 00:07:19 +0800
Subject: kernel/module.c: fix an unused goto label

Impact: cleanup

Label 'free_init' is only used when defined(CONFIG_MODULE_UNLOAD) &&
defined(CONFIG_SMP), so move it inside to shut up gcc.

Signed-off-by: WANG Cong <xiyou.wangcong@gmail.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/module.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index b862fdb6a372..7af72bbe4cc0 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2319,8 +2319,8 @@ static noinline struct module *load_module(void __user *umod,
 	ftrace_release(mod->module_core, mod->core_size);
  free_unload:
 	module_unload_free(mod);
- free_init:
 #if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP)
+ free_init:
 	percpu_modfree(mod->refptr);
 #endif
 	module_free(mod, mod->module_init);
-- 
cgit v1.2.3


From 414fd31b2553aaf160ca9b9afe45aa0372b01b92 Mon Sep 17 00:00:00 2001
From: Tim Abbott <tabbott@mit.edu>
Date: Fri, 5 Dec 2008 19:03:56 -0500
Subject: module: Make find_symbol return a struct kernel_symbol

Impact: Cleanup, internal API change

Ksplice needs access to the kernel_symbol structure in order to support
modifications to the exported symbol table.

Cc: Anders Kaseorg <andersk@mit.edu>
Cc: Jeff Arnold <jbarnold@mit.edu>
Signed-off-by: Tim Abbott <tabbott@mit.edu>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> (bugfix and style)
---
 kernel/module.c | 75 ++++++++++++++++++++++++++++-----------------------------
 1 file changed, 37 insertions(+), 38 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 7af72bbe4cc0..2f0fddf3c114 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -283,7 +283,7 @@ struct find_symbol_arg {
 	/* Output */
 	struct module *owner;
 	const unsigned long *crc;
-	unsigned long value;
+	const struct kernel_symbol *sym;
 };
 
 static bool find_symbol_in_section(const struct symsearch *syms,
@@ -324,17 +324,17 @@ static bool find_symbol_in_section(const struct symsearch *syms,
 
 	fsa->owner = owner;
 	fsa->crc = symversion(syms->crcs, symnum);
-	fsa->value = syms->start[symnum].value;
+	fsa->sym = &syms->start[symnum];
 	return true;
 }
 
-/* Find a symbol, return value, (optional) crc and (optional) module
- * which owns it */
-static unsigned long find_symbol(const char *name,
-				 struct module **owner,
-				 const unsigned long **crc,
-				 bool gplok,
-				 bool warn)
+/* Find a symbol and return it, along with, (optional) crc and
+ * (optional) module which owns it */
+static const struct kernel_symbol *find_symbol(const char *name,
+					       struct module **owner,
+					       const unsigned long **crc,
+					       bool gplok,
+					       bool warn)
 {
 	struct find_symbol_arg fsa;
 
@@ -347,11 +347,11 @@ static unsigned long find_symbol(const char *name,
 			*owner = fsa.owner;
 		if (crc)
 			*crc = fsa.crc;
-		return fsa.value;
+		return fsa.sym;
 	}
 
 	DEBUGP("Failed to find symbol %s\n", name);
-	return -ENOENT;
+	return NULL;
 }
 
 /* Search for module by name: must hold module_mutex. */
@@ -894,7 +894,7 @@ void __symbol_put(const char *symbol)
 	struct module *owner;
 
 	preempt_disable();
-	if (IS_ERR_VALUE(find_symbol(symbol, &owner, NULL, true, false)))
+	if (!find_symbol(symbol, &owner, NULL, true, false))
 		BUG();
 	module_put(owner);
 	preempt_enable();
@@ -1057,7 +1057,7 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
 {
 	const unsigned long *crc;
 
-	if (IS_ERR_VALUE(find_symbol("struct_module", NULL, &crc, true, false)))
+	if (!find_symbol("struct_module", NULL, &crc, true, false))
 		BUG();
 	return check_version(sechdrs, versindex, "struct_module", mod, crc);
 }
@@ -1098,25 +1098,25 @@ static inline int same_magic(const char *amagic, const char *bmagic,
 
 /* Resolve a symbol for this module.  I.e. if we find one, record usage.
    Must be holding module_mutex. */
-static unsigned long resolve_symbol(Elf_Shdr *sechdrs,
-				    unsigned int versindex,
-				    const char *name,
-				    struct module *mod)
+static const struct kernel_symbol *resolve_symbol(Elf_Shdr *sechdrs,
+						  unsigned int versindex,
+						  const char *name,
+						  struct module *mod)
 {
 	struct module *owner;
-	unsigned long ret;
+	const struct kernel_symbol *sym;
 	const unsigned long *crc;
 
-	ret = find_symbol(name, &owner, &crc,
+	sym = find_symbol(name, &owner, &crc,
 			  !(mod->taints & (1 << TAINT_PROPRIETARY_MODULE)), true);
-	if (!IS_ERR_VALUE(ret)) {
-		/* use_module can fail due to OOM,
-		   or module initialization or unloading */
+	/* use_module can fail due to OOM,
+	   or module initialization or unloading */
+	if (sym) {
 		if (!check_version(sechdrs, versindex, name, mod, crc) ||
 		    !use_module(mod, owner))
-			ret = -EINVAL;
+			sym = NULL;
 	}
-	return ret;
+	return sym;
 }
 
 /*
@@ -1516,17 +1516,15 @@ static void free_module(struct module *mod)
 void *__symbol_get(const char *symbol)
 {
 	struct module *owner;
-	unsigned long value;
+	const struct kernel_symbol *sym;
 
 	preempt_disable();
-	value = find_symbol(symbol, &owner, NULL, true, true);
-	if (IS_ERR_VALUE(value))
-		value = 0;
-	else if (strong_try_module_get(owner))
-		value = 0;
+	sym = find_symbol(symbol, &owner, NULL, true, true);
+	if (sym && strong_try_module_get(owner))
+		sym = NULL;
 	preempt_enable();
 
-	return (void *)value;
+	return sym ? (void *)sym->value : NULL;
 }
 EXPORT_SYMBOL_GPL(__symbol_get);
 
@@ -1554,8 +1552,7 @@ static int verify_export_symbols(struct module *mod)
 
 	for (i = 0; i < ARRAY_SIZE(arr); i++) {
 		for (s = arr[i].sym; s < arr[i].sym + arr[i].num; s++) {
-			if (!IS_ERR_VALUE(find_symbol(s->name, &owner,
-						      NULL, true, false))) {
+			if (find_symbol(s->name, &owner, NULL, true, false)) {
 				printk(KERN_ERR
 				       "%s: exports duplicate symbol %s"
 				       " (owned by %s)\n",
@@ -1579,6 +1576,7 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
 	unsigned long secbase;
 	unsigned int i, n = sechdrs[symindex].sh_size / sizeof(Elf_Sym);
 	int ret = 0;
+	const struct kernel_symbol *ksym;
 
 	for (i = 1; i < n; i++) {
 		switch (sym[i].st_shndx) {
@@ -1598,13 +1596,14 @@ static int simplify_symbols(Elf_Shdr *sechdrs,
 			break;
 
 		case SHN_UNDEF:
-			sym[i].st_value
-			  = resolve_symbol(sechdrs, versindex,
-					   strtab + sym[i].st_name, mod);
-
+			ksym = resolve_symbol(sechdrs, versindex,
+					      strtab + sym[i].st_name, mod);
 			/* Ok if resolved.  */
-			if (!IS_ERR_VALUE(sym[i].st_value))
+			if (ksym) {
+				sym[i].st_value = ksym->value;
 				break;
+			}
+
 			/* Ok if weak.  */
 			if (ELF_ST_BIND(sym[i].st_info) == STB_WEAK)
 				break;
-- 
cgit v1.2.3


From e610499e2656e61975affd0af56b26eb73964c84 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 31 Mar 2009 13:05:31 -0600
Subject: module: __module_address

Impact: New API, cleanup

ksplice wants to know the bounds of a module, not just the module text.

It makes sense to have __module_address.  We then implement
is_module_address and __module_text_address in terms of this (and
change is_module_text_address() to bool while we're at it).

Also, add proper kerneldoc for them all.

Cc: Anders Kaseorg <andersk@mit.edu>
Cc: Jeff Arnold <jbarnold@mit.edu>
Cc: Tim Abbott <tabbott@mit.edu>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/module.h | 20 +++++++++----
 kernel/module.c        | 76 ++++++++++++++++++++++++++++++++++++++------------
 2 files changed, 73 insertions(+), 23 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/module.h b/include/linux/module.h
index 08e5e75d6122..fd1241e1416f 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -365,7 +365,9 @@ static inline int module_is_live(struct module *mod)
 /* Is this address in a module? (second is with no locks, for oops) */
 struct module *module_text_address(unsigned long addr);
 struct module *__module_text_address(unsigned long addr);
-int is_module_address(unsigned long addr);
+struct module *__module_address(unsigned long addr);
+bool is_module_address(unsigned long addr);
+bool is_module_text_address(unsigned long addr);
 
 static inline int within_module_core(unsigned long addr, struct module *mod)
 {
@@ -494,21 +496,29 @@ search_module_extables(unsigned long addr)
 	return NULL;
 }
 
-/* Is this address in a module? */
 static inline struct module *module_text_address(unsigned long addr)
 {
 	return NULL;
 }
 
-/* Is this address in a module? (don't take a lock, we're oopsing) */
+static inline struct module *__module_address(unsigned long addr)
+{
+	return NULL;
+}
+
 static inline struct module *__module_text_address(unsigned long addr)
 {
 	return NULL;
 }
 
-static inline int is_module_address(unsigned long addr)
+static inline bool is_module_address(unsigned long addr)
 {
-	return 0;
+	return false;
+}
+
+static inline bool is_module_text_address(unsigned long addr)
+{
+	return false;
 }
 
 /* Get/put a kernel symbol (calls should be symmetric) */
diff --git a/kernel/module.c b/kernel/module.c
index 2f0fddf3c114..bd15a94f91c1 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -76,7 +76,7 @@ static DECLARE_WAIT_QUEUE_HEAD(module_wq);
 
 static BLOCKING_NOTIFIER_HEAD(module_notify_list);
 
-/* Bounds of module allocation, for speeding __module_text_address */
+/* Bounds of module allocation, for speeding __module_address */
 static unsigned long module_addr_min = -1UL, module_addr_max = 0;
 
 int register_module_notifier(struct notifier_block * nb)
@@ -2745,29 +2745,31 @@ const struct exception_table_entry *search_module_extables(unsigned long addr)
 }
 
 /*
- * Is this a valid module address?
+ * is_module_address - is this address inside a module?
+ * @addr: the address to check.
+ *
+ * See is_module_text_address() if you simply want to see if the address
+ * is code (not data).
  */
-int is_module_address(unsigned long addr)
+bool is_module_address(unsigned long addr)
 {
-	struct module *mod;
+	bool ret;
 
 	preempt_disable();
-
-	list_for_each_entry_rcu(mod, &modules, list) {
-		if (within_module_core(addr, mod)) {
-			preempt_enable();
-			return 1;
-		}
-	}
-
+	ret = __module_address(addr) != NULL;
 	preempt_enable();
 
-	return 0;
+	return ret;
 }
 
-
-/* Is this a valid kernel address? */
-__notrace_funcgraph struct module *__module_text_address(unsigned long addr)
+/*
+ * __module_address - get the module which contains an address.
+ * @addr: the address.
+ *
+ * Must be called with preempt disabled or module mutex held so that
+ * module doesn't get freed during this.
+ */
+__notrace_funcgraph struct module *__module_address(unsigned long addr)
 {
 	struct module *mod;
 
@@ -2775,12 +2777,50 @@ __notrace_funcgraph struct module *__module_text_address(unsigned long addr)
 		return NULL;
 
 	list_for_each_entry_rcu(mod, &modules, list)
-		if (within(addr, mod->module_init, mod->init_text_size)
-		    || within(addr, mod->module_core, mod->core_text_size))
+		if (within_module_core(addr, mod)
+		    || within_module_init(addr, mod))
 			return mod;
 	return NULL;
 }
 
+/*
+ * is_module_text_address - is this address inside module code?
+ * @addr: the address to check.
+ *
+ * See is_module_address() if you simply want to see if the address is
+ * anywhere in a module.  See kernel_text_address() for testing if an
+ * address corresponds to kernel or module code.
+ */
+bool is_module_text_address(unsigned long addr)
+{
+	bool ret;
+
+	preempt_disable();
+	ret = __module_text_address(addr) != NULL;
+	preempt_enable();
+
+	return ret;
+}
+
+/*
+ * __module_text_address - get the module whose code contains an address.
+ * @addr: the address.
+ *
+ * Must be called with preempt disabled or module mutex held so that
+ * module doesn't get freed during this.
+ */
+struct module *__module_text_address(unsigned long addr)
+{
+	struct module *mod = __module_address(addr);
+	if (mod) {
+		/* Make sure it's within the text section. */
+		if (!within(addr, mod->module_init, mod->init_text_size)
+		    && !within(addr, mod->module_core, mod->core_text_size))
+			mod = NULL;
+	}
+	return mod;
+}
+
 struct module *module_text_address(unsigned long addr)
 {
 	struct module *mod;
-- 
cgit v1.2.3


From a6e6abd575fcbe6572ebc7a70ad616406d206fa8 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 31 Mar 2009 13:05:31 -0600
Subject: module: remove module_text_address()

Impact: Replace and remove risky (non-EXPORTed) API

module_text_address() returns a pointer to the module, which given locking
improvements in module.c, is useless except to test for NULL:

1) If the module can't go away, use __module_text_address.
2) Otherwise, just use is_module_text_address().

Cc: linux-mtd@lists.infradead.org
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 drivers/mtd/nand/nand_base.c |  4 ++--
 include/linux/module.h       |  7 -------
 kernel/extable.c             |  6 +++---
 kernel/module.c              | 17 ++++-------------
 4 files changed, 9 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c
index 0c3afccde8a2..5f71371eb1b0 100644
--- a/drivers/mtd/nand/nand_base.c
+++ b/drivers/mtd/nand/nand_base.c
@@ -2720,14 +2720,14 @@ int nand_scan_tail(struct mtd_info *mtd)
 	return chip->scan_bbt(mtd);
 }
 
-/* module_text_address() isn't exported, and it's mostly a pointless
+/* is_module_text_address() isn't exported, and it's mostly a pointless
    test if this is a module _anyway_ -- they'd have to try _really_ hard
    to call us from in-kernel code if the core NAND support is modular. */
 #ifdef MODULE
 #define caller_is_module() (1)
 #else
 #define caller_is_module() \
-	module_text_address((unsigned long)__builtin_return_address(0))
+	is_module_text_address((unsigned long)__builtin_return_address(0))
 #endif
 
 /**
diff --git a/include/linux/module.h b/include/linux/module.h
index fd1241e1416f..69761ce0dbf0 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -362,8 +362,6 @@ static inline int module_is_live(struct module *mod)
 	return mod->state != MODULE_STATE_GOING;
 }
 
-/* Is this address in a module? (second is with no locks, for oops) */
-struct module *module_text_address(unsigned long addr);
 struct module *__module_text_address(unsigned long addr);
 struct module *__module_address(unsigned long addr);
 bool is_module_address(unsigned long addr);
@@ -496,11 +494,6 @@ search_module_extables(unsigned long addr)
 	return NULL;
 }
 
-static inline struct module *module_text_address(unsigned long addr)
-{
-	return NULL;
-}
-
 static inline struct module *__module_address(unsigned long addr)
 {
 	return NULL;
diff --git a/kernel/extable.c b/kernel/extable.c
index e136ed8d82ba..384f0da8a03e 100644
--- a/kernel/extable.c
+++ b/kernel/extable.c
@@ -58,14 +58,14 @@ __notrace_funcgraph int __kernel_text_address(unsigned long addr)
 {
 	if (core_kernel_text(addr))
 		return 1;
-	return __module_text_address(addr) != NULL;
+	return is_module_text_address(addr);
 }
 
 int kernel_text_address(unsigned long addr)
 {
 	if (core_kernel_text(addr))
 		return 1;
-	return module_text_address(addr) != NULL;
+	return is_module_text_address(addr);
 }
 
 /*
@@ -81,5 +81,5 @@ int func_ptr_is_kernel_text(void *ptr)
 	addr = (unsigned long) dereference_function_descriptor(ptr);
 	if (core_kernel_text(addr))
 		return 1;
-	return module_text_address(addr) != NULL;
+	return is_module_text_address(addr);
 }
diff --git a/kernel/module.c b/kernel/module.c
index bd15a94f91c1..8ddca629e079 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -908,8 +908,10 @@ void symbol_put_addr(void *addr)
 	if (core_kernel_text((unsigned long)addr))
 		return;
 
-	if (!(modaddr = module_text_address((unsigned long)addr)))
-		BUG();
+	/* module_text_address is safe here: we're supposed to have reference
+	 * to module from symbol_get, so it can't go away. */
+	modaddr = __module_text_address((unsigned long)addr);
+	BUG_ON(!modaddr);
 	module_put(modaddr);
 }
 EXPORT_SYMBOL_GPL(symbol_put_addr);
@@ -2821,17 +2823,6 @@ struct module *__module_text_address(unsigned long addr)
 	return mod;
 }
 
-struct module *module_text_address(unsigned long addr)
-{
-	struct module *mod;
-
-	preempt_disable();
-	mod = __module_text_address(addr);
-	preempt_enable();
-
-	return mod;
-}
-
 /* Don't grab lock, we're oopsing. */
 void print_modules(void)
 {
-- 
cgit v1.2.3


From 75a66614db21007bcc8c37f9c5d5b922981387b9 Mon Sep 17 00:00:00 2001
From: Anders Kaseorg <andersk@mit.edu>
Date: Fri, 5 Dec 2008 19:03:58 -0500
Subject: Ksplice: Add functions for walking kallsyms symbols

Impact: New API

kallsyms_lookup_name only returns the first match that it finds.  Ksplice
needs information about all symbols with a given name in order to correctly
resolve local symbols.

kallsyms_on_each_symbol provides a generic mechanism for iterating over the
kallsyms table.

Cc: Jeff Arnold <jbarnold@mit.edu>
Cc: Tim Abbott <tabbott@mit.edu>
Signed-off-by: Anders Kaseorg <andersk@mit.edu>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/kallsyms.h | 15 +++++++++++++++
 include/linux/module.h   | 12 ++++++++++++
 kernel/kallsyms.c        | 19 +++++++++++++++++++
 kernel/module.c          | 19 +++++++++++++++++++
 4 files changed, 65 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/kallsyms.h b/include/linux/kallsyms.h
index f3fe34391d8e..792274269f2b 100644
--- a/include/linux/kallsyms.h
+++ b/include/linux/kallsyms.h
@@ -13,10 +13,17 @@
 #define KSYM_SYMBOL_LEN (sizeof("%s+%#lx/%#lx [%s]") + (KSYM_NAME_LEN - 1) + \
 			 2*(BITS_PER_LONG*3/10) + (MODULE_NAME_LEN - 1) + 1)
 
+struct module;
+
 #ifdef CONFIG_KALLSYMS
 /* Lookup the address for a symbol. Returns 0 if not found. */
 unsigned long kallsyms_lookup_name(const char *name);
 
+/* Call a function on each kallsyms symbol in the core kernel */
+int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
+				      unsigned long),
+			    void *data);
+
 extern int kallsyms_lookup_size_offset(unsigned long addr,
 				  unsigned long *symbolsize,
 				  unsigned long *offset);
@@ -43,6 +50,14 @@ static inline unsigned long kallsyms_lookup_name(const char *name)
 	return 0;
 }
 
+static inline int kallsyms_on_each_symbol(int (*fn)(void *, const char *,
+						    struct module *,
+						    unsigned long),
+					  void *data)
+{
+	return 0;
+}
+
 static inline int kallsyms_lookup_size_offset(unsigned long addr,
 					      unsigned long *symbolsize,
 					      unsigned long *offset)
diff --git a/include/linux/module.h b/include/linux/module.h
index 69761ce0dbf0..c3d3fc4ffb18 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -387,6 +387,10 @@ int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
 /* Look for this name: can be of form module:name. */
 unsigned long module_kallsyms_lookup_name(const char *name);
 
+int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
+					     struct module *, unsigned long),
+				   void *data);
+
 extern void __module_put_and_exit(struct module *mod, long code)
 	__attribute__((noreturn));
 #define module_put_and_exit(code) __module_put_and_exit(THIS_MODULE, code);
@@ -566,6 +570,14 @@ static inline unsigned long module_kallsyms_lookup_name(const char *name)
 	return 0;
 }
 
+static inline int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
+							   struct module *,
+							   unsigned long),
+						 void *data)
+{
+	return 0;
+}
+
 static inline int register_module_notifier(struct notifier_block * nb)
 {
 	/* no events will happen anyway, so this can always succeed */
diff --git a/kernel/kallsyms.c b/kernel/kallsyms.c
index 7b8b0f21a5b1..374faf9bfdc7 100644
--- a/kernel/kallsyms.c
+++ b/kernel/kallsyms.c
@@ -161,6 +161,25 @@ unsigned long kallsyms_lookup_name(const char *name)
 	return module_kallsyms_lookup_name(name);
 }
 
+int kallsyms_on_each_symbol(int (*fn)(void *, const char *, struct module *,
+				      unsigned long),
+			    void *data)
+{
+	char namebuf[KSYM_NAME_LEN];
+	unsigned long i;
+	unsigned int off;
+	int ret;
+
+	for (i = 0, off = 0; i < kallsyms_num_syms; i++) {
+		off = kallsyms_expand_symbol(off, namebuf);
+		ret = fn(data, namebuf, NULL, kallsyms_addresses[i]);
+		if (ret != 0)
+			return ret;
+	}
+	return module_kallsyms_on_each_symbol(fn, data);
+}
+EXPORT_SYMBOL_GPL(kallsyms_on_each_symbol);
+
 static unsigned long get_symbol_pos(unsigned long addr,
 				    unsigned long *symbolsize,
 				    unsigned long *offset)
diff --git a/kernel/module.c b/kernel/module.c
index 8ddca629e079..dd4389be9152 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -2612,6 +2612,25 @@ unsigned long module_kallsyms_lookup_name(const char *name)
 	preempt_enable();
 	return ret;
 }
+
+int module_kallsyms_on_each_symbol(int (*fn)(void *, const char *,
+					     struct module *, unsigned long),
+				   void *data)
+{
+	struct module *mod;
+	unsigned int i;
+	int ret;
+
+	list_for_each_entry(mod, &modules, list) {
+		for (i = 0; i < mod->num_symtab; i++) {
+			ret = fn(data, mod->strtab + mod->symtab[i].st_name,
+				 mod, mod->symtab[i].st_value);
+			if (ret != 0)
+				return ret;
+		}
+	}
+	return 0;
+}
 #endif /* CONFIG_KALLSYMS */
 
 static char *module_flags(struct module *mod, char *buf)
-- 
cgit v1.2.3


From c6b37801911d7f4663c99cad8aa230bc934cea82 Mon Sep 17 00:00:00 2001
From: Tim Abbott <tabbott@mit.edu>
Date: Fri, 5 Dec 2008 19:03:59 -0500
Subject: module: Export symbols needed for Ksplice

Impact: Expose some module.c symbols

Ksplice uses several functions from module.c in order to resolve
symbols and implement dependency handling.  Calling these functions
requires holding module_mutex, so it is exported.

(This is just the module part of a bigger add-exports patch from Tim).

Cc: Anders Kaseorg <andersk@mit.edu>
Cc: Jeff Arnold <jbarnold@mit.edu>
Signed-off-by: Tim Abbott <tabbott@mit.edu>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 include/linux/module.h | 28 ++++++++++++++++++++++++++++
 kernel/module.c        | 43 +++++++++++++++++++------------------------
 2 files changed, 47 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/module.h b/include/linux/module.h
index c3d3fc4ffb18..d246da0b0f8c 100644
--- a/include/linux/module.h
+++ b/include/linux/module.h
@@ -354,6 +354,8 @@ struct module
 #define MODULE_ARCH_INIT {}
 #endif
 
+extern struct mutex module_mutex;
+
 /* FIXME: It'd be nice to isolate modules during init, too, so they
    aren't used before they (may) fail.  But presently too much code
    (IDE & SCSI) require entry into the module during init.*/
@@ -379,6 +381,31 @@ static inline int within_module_init(unsigned long addr, struct module *mod)
 	       addr < (unsigned long)mod->module_init + mod->init_size;
 }
 
+/* Search for module by name: must hold module_mutex. */
+struct module *find_module(const char *name);
+
+struct symsearch {
+	const struct kernel_symbol *start, *stop;
+	const unsigned long *crcs;
+	enum {
+		NOT_GPL_ONLY,
+		GPL_ONLY,
+		WILL_BE_GPL_ONLY,
+	} licence;
+	bool unused;
+};
+
+/* Search for an exported symbol by name. */
+const struct kernel_symbol *find_symbol(const char *name,
+					struct module **owner,
+					const unsigned long **crc,
+					bool gplok,
+					bool warn);
+
+/* Walk the exported symbol table */
+bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner,
+			    unsigned int symnum, void *data), void *data);
+
 /* Returns 0 and fills in value, defined and namebuf, or -ERANGE if
    symnum out of range. */
 int module_get_kallsym(unsigned int symnum, unsigned long *value, char *type,
@@ -452,6 +479,7 @@ static inline void __module_get(struct module *module)
 #define symbol_put_addr(p) do { } while(0)
 
 #endif /* CONFIG_MODULE_UNLOAD */
+int use_module(struct module *a, struct module *b);
 
 /* This is a #define so the string doesn't get put in every .o file */
 #define module_name(mod)			\
diff --git a/kernel/module.c b/kernel/module.c
index dd4389be9152..5fd00766a4dc 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -68,7 +68,8 @@
 
 /* List of modules, protected by module_mutex or preempt_disable
  * (delete uses stop_machine/add uses RCU list operations). */
-static DEFINE_MUTEX(module_mutex);
+DEFINE_MUTEX(module_mutex);
+EXPORT_SYMBOL_GPL(module_mutex);
 static LIST_HEAD(modules);
 
 /* Waiting for a module to finish initializing? */
@@ -186,17 +187,6 @@ extern const unsigned long __start___kcrctab_unused_gpl[];
 #define symversion(base, idx) ((base != NULL) ? ((base) + (idx)) : NULL)
 #endif
 
-struct symsearch {
-	const struct kernel_symbol *start, *stop;
-	const unsigned long *crcs;
-	enum {
-		NOT_GPL_ONLY,
-		GPL_ONLY,
-		WILL_BE_GPL_ONLY,
-	} licence;
-	bool unused;
-};
-
 static bool each_symbol_in_section(const struct symsearch *arr,
 				   unsigned int arrsize,
 				   struct module *owner,
@@ -217,10 +207,8 @@ static bool each_symbol_in_section(const struct symsearch *arr,
 }
 
 /* Returns true as soon as fn returns true, otherwise false. */
-static bool each_symbol(bool (*fn)(const struct symsearch *arr,
-				   struct module *owner,
-				   unsigned int symnum, void *data),
-			void *data)
+bool each_symbol(bool (*fn)(const struct symsearch *arr, struct module *owner,
+			    unsigned int symnum, void *data), void *data)
 {
 	struct module *mod;
 	const struct symsearch arr[] = {
@@ -273,6 +261,7 @@ static bool each_symbol(bool (*fn)(const struct symsearch *arr,
 	}
 	return false;
 }
+EXPORT_SYMBOL_GPL(each_symbol);
 
 struct find_symbol_arg {
 	/* Input */
@@ -330,11 +319,11 @@ static bool find_symbol_in_section(const struct symsearch *syms,
 
 /* Find a symbol and return it, along with, (optional) crc and
  * (optional) module which owns it */
-static const struct kernel_symbol *find_symbol(const char *name,
-					       struct module **owner,
-					       const unsigned long **crc,
-					       bool gplok,
-					       bool warn)
+const struct kernel_symbol *find_symbol(const char *name,
+					struct module **owner,
+					const unsigned long **crc,
+					bool gplok,
+					bool warn)
 {
 	struct find_symbol_arg fsa;
 
@@ -353,9 +342,10 @@ static const struct kernel_symbol *find_symbol(const char *name,
 	DEBUGP("Failed to find symbol %s\n", name);
 	return NULL;
 }
+EXPORT_SYMBOL_GPL(find_symbol);
 
 /* Search for module by name: must hold module_mutex. */
-static struct module *find_module(const char *name)
+struct module *find_module(const char *name)
 {
 	struct module *mod;
 
@@ -365,6 +355,7 @@ static struct module *find_module(const char *name)
 	}
 	return NULL;
 }
+EXPORT_SYMBOL_GPL(find_module);
 
 #ifdef CONFIG_SMP
 
@@ -641,7 +632,7 @@ static int already_uses(struct module *a, struct module *b)
 }
 
 /* Module a uses b */
-static int use_module(struct module *a, struct module *b)
+int use_module(struct module *a, struct module *b)
 {
 	struct module_use *use;
 	int no_warn, err;
@@ -674,6 +665,7 @@ static int use_module(struct module *a, struct module *b)
 	no_warn = sysfs_create_link(b->holders_dir, &a->mkobj.kobj, a->name);
 	return 1;
 }
+EXPORT_SYMBOL_GPL(use_module);
 
 /* Clear the unload stuff of the module. */
 static void module_unload_free(struct module *mod)
@@ -951,10 +943,11 @@ static inline void module_unload_free(struct module *mod)
 {
 }
 
-static inline int use_module(struct module *a, struct module *b)
+int use_module(struct module *a, struct module *b)
 {
 	return strong_try_module_get(b) == 0;
 }
+EXPORT_SYMBOL_GPL(use_module);
 
 static inline void module_unload_init(struct module *mod)
 {
@@ -2803,6 +2796,7 @@ __notrace_funcgraph struct module *__module_address(unsigned long addr)
 			return mod;
 	return NULL;
 }
+EXPORT_SYMBOL_GPL(__module_address);
 
 /*
  * is_module_text_address - is this address inside module code?
@@ -2841,6 +2835,7 @@ struct module *__module_text_address(unsigned long addr)
 	}
 	return mod;
 }
+EXPORT_SYMBOL_GPL(__module_text_address);
 
 /* Don't grab lock, we're oopsing. */
 void print_modules(void)
-- 
cgit v1.2.3


From c6e665c8f0c18ab3686117905765b5139efd6ebd Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 31 Mar 2009 13:05:33 -0600
Subject: module: clarify the force-loading taint message.

Impact: Message cleanup

Two of three callers of try_to_force_load() are not because of a
missing version, so change the messages:

Old:
	<modname>: no version for "magic" found: kernel tainted.
New:
	<modname>: bad vermagic: kernel tainted.

Old:
	<modname>: no version for "nocrc" found: kernel tainted.
New:
	<modname>: no versions for exported symbols: kernel tainted.

Old:
	<modname>: no version for "<symname>" found: kernel tainted.
New:
	<modname>: <symname>: kernel tainted.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/module.c | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 5fd00766a4dc..599fc85bd2cc 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -990,12 +990,12 @@ static struct module_attribute *modinfo_attrs[] = {
 
 static const char vermagic[] = VERMAGIC_STRING;
 
-static int try_to_force_load(struct module *mod, const char *symname)
+static int try_to_force_load(struct module *mod, const char *reason)
 {
 #ifdef CONFIG_MODULE_FORCE_LOAD
 	if (!test_taint(TAINT_FORCED_MODULE))
-		printk("%s: no version for \"%s\" found: kernel tainted.\n",
-		       mod->name, symname);
+		printk(KERN_WARNING "%s: %s: kernel tainted.\n",
+		       mod->name, reason);
 	add_taint_module(mod, TAINT_FORCED_MODULE);
 	return 0;
 #else
@@ -2002,7 +2002,7 @@ static noinline struct module *load_module(void __user *umod,
 	modmagic = get_modinfo(sechdrs, infoindex, "vermagic");
 	/* This is allowed: modprobe --force will invalidate it. */
 	if (!modmagic) {
-		err = try_to_force_load(mod, "magic");
+		err = try_to_force_load(mod, "bad vermagic");
 		if (err)
 			goto free_hdr;
 	} else if (!same_magic(modmagic, vermagic, versindex)) {
@@ -2191,8 +2191,8 @@ static noinline struct module *load_module(void __user *umod,
 	    || (mod->num_unused_gpl_syms && !mod->unused_gpl_crcs)
 #endif
 		) {
-		printk(KERN_WARNING "%s: No versions for exported symbols.\n", mod->name);
-		err = try_to_force_load(mod, "nocrc");
+		err = try_to_force_load(mod,
+					"no versions for exported symbols");
 		if (err)
 			goto cleanup;
 	}
-- 
cgit v1.2.3


From 9cb610d8e35fe3ec95a2fe2030b02f85aeea83c1 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 31 Mar 2009 13:05:33 -0600
Subject: module: remove the SHF_ALLOC flag on the __versions section.

Impact: reduce kernel memory usage

This patch just takes off the SHF_ALLOC flag on __versions so we don't
keep them around after module load.

This saves about 7% of module memory if CONFIG_MODVERSIONS=y.

Cc: Shawn Bohrer <shawn.bohrer@gmail.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/module.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 599fc85bd2cc..784bf6d8f8c4 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1961,6 +1961,9 @@ static noinline struct module *load_module(void __user *umod,
 		if (strncmp(secstrings+sechdrs[i].sh_name, ".exit", 5) == 0)
 			sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC;
 #endif
+		/* Don't keep __versions around; it's just for loading. */
+		if (strcmp(secstrings + sechdrs[i].sh_name, "__versions") == 0)
+			sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC;
 	}
 
 	modindex = find_sec(hdr, sechdrs, secstrings,
-- 
cgit v1.2.3


From 8c8ef42aee8fcfb4128bb94c50d55c9f80ade525 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 31 Mar 2009 13:05:34 -0600
Subject: module: include other structures in module version check

With CONFIG_MODVERSIONS, we version 'struct module' using a dummy
export, but other things matter too:

1) 'struct modversion_info' determines the layout of the __versions section,
2) 'struct kernel_param' determines the layout of the __params section,
3) 'struct kernel_symbol' determines __ksymtab*.
4) 'struct marker' determines __markers.
5) 'struct tracepoint' determines __tracepoints.

So we rename 'struct_module' to 'module_layout' and include these in
the signature.  Now it's general we can add others later on without
confusion.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/module.c       | 18 +++++++++++++-----
 scripts/mod/modpost.c |  4 ++--
 2 files changed, 15 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 784bf6d8f8c4..e8cf636e614b 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1052,9 +1052,9 @@ static inline int check_modstruct_version(Elf_Shdr *sechdrs,
 {
 	const unsigned long *crc;
 
-	if (!find_symbol("struct_module", NULL, &crc, true, false))
+	if (!find_symbol("module_layout", NULL, &crc, true, false))
 		BUG();
-	return check_version(sechdrs, versindex, "struct_module", mod, crc);
+	return check_version(sechdrs, versindex, "module_layout", mod, crc);
 }
 
 /* First part is kernel version, which we ignore if module has crcs. */
@@ -2858,9 +2858,17 @@ void print_modules(void)
 }
 
 #ifdef CONFIG_MODVERSIONS
-/* Generate the signature for struct module here, too, for modversions. */
-void struct_module(struct module *mod) { return; }
-EXPORT_SYMBOL(struct_module);
+/* Generate the signature for all relevant module structures here.
+ * If these change, we don't want to try to parse the module. */
+void module_layout(struct module *mod,
+		   struct modversion_info *ver,
+		   struct kernel_param *kp,
+		   struct kernel_symbol *ks,
+		   struct marker *marker,
+		   struct tracepoint *tp)
+{
+}
+EXPORT_SYMBOL(module_layout);
 #endif
 
 #ifdef CONFIG_MARKERS
diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c
index 7e62303133dc..8cc70612984c 100644
--- a/scripts/mod/modpost.c
+++ b/scripts/mod/modpost.c
@@ -1607,12 +1607,12 @@ static void read_symbols(char *modname)
 
 	parse_elf_finish(&info);
 
-	/* Our trick to get versioning for struct_module - it's
+	/* Our trick to get versioning for module struct etc. - it's
 	 * never passed as an argument to an exported function, so
 	 * the automatic versioning doesn't pick it up, but it's really
 	 * important anyhow */
 	if (modversions)
-		mod->unres = alloc_symbol("struct_module", 0, mod->unres);
+		mod->unres = alloc_symbol("module_layout", 0, mod->unres);
 }
 
 #define SZ 500
-- 
cgit v1.2.3


From acae05156551fd7528fbb616271e672789388e3c Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sun, 8 Feb 2009 10:42:01 -0800
Subject: module: create a request_module_nowait()

There seems to be a common pattern in the kernel where drivers want to
call request_module() from inside a module_init() function. Currently
this would deadlock.

As a result, several drivers go through hoops like scheduling things via
kevent, or creating custom work queues (because kevent can deadlock on them).

This patch changes this to use a request_module_nowait() function macro instead,
which just fires the modprobe off but doesn't wait for it, and thus avoids the
original deadlock entirely.

On my laptop this already results in one less kernel thread running..

(Includes Jiri's patch to use enum umh_wait)

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au> (bool-ified)
Cc: Jiri Slaby <jirislaby@gmail.com>
---
 include/linux/kmod.h | 11 ++++++++---
 kernel/kmod.c        | 10 ++++++----
 2 files changed, 14 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/kmod.h b/include/linux/kmod.h
index 92213a9194e1..d5fa565086d1 100644
--- a/include/linux/kmod.h
+++ b/include/linux/kmod.h
@@ -29,10 +29,15 @@
 #ifdef CONFIG_MODULES
 /* modprobe exit status on success, -ve on error.  Return value
  * usually useless though. */
-extern int request_module(const char * name, ...) __attribute__ ((format (printf, 1, 2)));
-#define try_then_request_module(x, mod...) ((x) ?: (request_module(mod), (x)))
+extern int __request_module(bool wait, const char *name, ...) \
+	__attribute__((format(printf, 2, 3)));
+#define request_module(mod...) __request_module(true, mod)
+#define request_module_nowait(mod...) __request_module(false, mod)
+#define try_then_request_module(x, mod...) \
+	((x) ?: (__request_module(false, mod), (x)))
 #else
-static inline int request_module(const char * name, ...) { return -ENOSYS; }
+static inline int request_module(const char *name, ...) { return -ENOSYS; }
+static inline int request_module_nowait(const char *name, ...) { return -ENOSYS; }
 #define try_then_request_module(x, mod...) (x)
 #endif
 
diff --git a/kernel/kmod.c b/kernel/kmod.c
index f0c8f545180d..b750675251e5 100644
--- a/kernel/kmod.c
+++ b/kernel/kmod.c
@@ -50,7 +50,8 @@ static struct workqueue_struct *khelper_wq;
 char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe";
 
 /**
- * request_module - try to load a kernel module
+ * __request_module - try to load a kernel module
+ * @wait: wait (or not) for the operation to complete
  * @fmt: printf style format string for the name of the module
  * @...: arguments as specified in the format string
  *
@@ -63,7 +64,7 @@ char modprobe_path[KMOD_PATH_LEN] = "/sbin/modprobe";
  * If module auto-loading support is disabled then this function
  * becomes a no-operation.
  */
-int request_module(const char *fmt, ...)
+int __request_module(bool wait, const char *fmt, ...)
 {
 	va_list args;
 	char module_name[MODULE_NAME_LEN];
@@ -108,11 +109,12 @@ int request_module(const char *fmt, ...)
 		return -ENOMEM;
 	}
 
-	ret = call_usermodehelper(modprobe_path, argv, envp, 1);
+	ret = call_usermodehelper(modprobe_path, argv, envp,
+			wait ? UMH_WAIT_PROC : UMH_WAIT_EXEC);
 	atomic_dec(&kmod_concurrent);
 	return ret;
 }
-EXPORT_SYMBOL(request_module);
+EXPORT_SYMBOL(__request_module);
 #endif /* CONFIG_MODULES */
 
 struct subprocess_info {
-- 
cgit v1.2.3


From e91defa26c527ceeaff6266c55cdc7e17c9081a2 Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 31 Mar 2009 13:05:35 -0600
Subject: module: don't use stop_machine on module load

Kay Sievers <kay.sievers@vrfy.org> discovered that boot times are slowed
by about half a second because all the stop_machine_create() calls,
and he only probes about 40 modules (I have 125 loaded on this laptop).

We only do stop_machine_create() so we can unlink the module if
something goes wrong, but it's overkill (and buggy anyway: if
stop_machine_create() fails we still call stop_machine_destroy()).

Since we are only protecting against kallsyms (esp. oops) walking the
list, synchronize_sched() is sufficient (synchronize_rcu() is probably
sufficient, but we're not in a hurry).

Kay says of this patch:
	... no module takes more than 40 millisecs to link now, most of
	them are between 3 and 8 millisecs.

	That looks very different to the numbers without this patch
	and the otherwise same setup, where we get heavy noise in the
	traces and many delays of up to 200 millisecs until linking,
	most of them taking 30+ millisecs.

Tested-by: Kay Sievers <kay.sievers@vrfy.org>
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/module.c | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index e8cf636e614b..1a9a3986b136 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1912,12 +1912,6 @@ static noinline struct module *load_module(void __user *umod,
 	if (len > 64 * 1024 * 1024 || (hdr = vmalloc(len)) == NULL)
 		return ERR_PTR(-ENOMEM);
 
-	/* Create stop_machine threads since the error path relies on
-	 * a non-failing stop_machine call. */
-	err = stop_machine_create();
-	if (err)
-		goto free_hdr;
-
 	if (copy_from_user(hdr, umod, len) != 0) {
 		err = -EFAULT;
 		goto free_hdr;
@@ -2303,12 +2297,13 @@ static noinline struct module *load_module(void __user *umod,
 	/* Get rid of temporary copy */
 	vfree(hdr);
 
-	stop_machine_destroy();
 	/* Done! */
 	return mod;
 
  unlink:
-	stop_machine(__unlink_module, mod, NULL);
+	/* Unlink carefully: kallsyms could be walking list. */
+	list_del_rcu(&mod->list);
+	synchronize_sched();
 	module_arch_cleanup(mod);
  cleanup:
 	kobject_del(&mod->mkobj.kobj);
@@ -2331,7 +2326,6 @@ static noinline struct module *load_module(void __user *umod,
 	kfree(args);
  free_hdr:
 	vfree(hdr);
-	stop_machine_destroy();
 	return ERR_PTR(err);
 
  truncated:
-- 
cgit v1.2.3


From 49502677e11079c2e3e01867c922a894ce06a8be Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 31 Mar 2009 13:05:36 -0600
Subject: module: use strstarts()

Impact: minor cleanup.

I'm not going to neaten anyone else's code, but I'm happy to clean up
my own.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/module.c | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index 1a9a3986b136..f6e08b7cff7c 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1673,8 +1673,7 @@ static void layout_sections(struct module *mod,
 			if ((s->sh_flags & masks[m][0]) != masks[m][0]
 			    || (s->sh_flags & masks[m][1])
 			    || s->sh_entsize != ~0UL
-			    || strncmp(secstrings + s->sh_name,
-				       ".init", 5) == 0)
+			    || strstarts(secstrings + s->sh_name, ".init"))
 				continue;
 			s->sh_entsize = get_offset(mod, &mod->core_size, s, i);
 			DEBUGP("\t%s\n", secstrings + s->sh_name);
@@ -1691,8 +1690,7 @@ static void layout_sections(struct module *mod,
 			if ((s->sh_flags & masks[m][0]) != masks[m][0]
 			    || (s->sh_flags & masks[m][1])
 			    || s->sh_entsize != ~0UL
-			    || strncmp(secstrings + s->sh_name,
-				       ".init", 5) != 0)
+			    || !strstarts(secstrings + s->sh_name, ".init"))
 				continue;
 			s->sh_entsize = (get_offset(mod, &mod->init_size, s, i)
 					 | INIT_OFFSET_MASK);
@@ -1825,8 +1823,7 @@ static char elf_type(const Elf_Sym *sym,
 		else
 			return 'b';
 	}
-	if (strncmp(secstrings + sechdrs[sym->st_shndx].sh_name,
-		    ".debug", strlen(".debug")) == 0)
+	if (strstarts(secstrings + sechdrs[sym->st_shndx].sh_name, ".debug"))
 		return 'n';
 	return '?';
 }
@@ -1952,7 +1949,7 @@ static noinline struct module *load_module(void __user *umod,
 		}
 #ifndef CONFIG_MODULE_UNLOAD
 		/* Don't load .exit sections */
-		if (strncmp(secstrings+sechdrs[i].sh_name, ".exit", 5) == 0)
+		if (strstarts(secstrings+sechdrs[i].sh_name, ".exit"))
 			sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC;
 #endif
 		/* Don't keep __versions around; it's just for loading. */
-- 
cgit v1.2.3


From 7f1e2ca9f04b02794597f60e7b1d43f0a1317939 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 13 Mar 2009 12:21:27 +0100
Subject: hrtimer: fix rq->lock inversion (again)

It appears I inadvertly introduced rq->lock recursion to the
hrtimer_start() path when I delegated running already expired
timers to softirq context.

This patch fixes it by introducing a __hrtimer_start_range_ns()
method that will not use raise_softirq_irqoff() but
__raise_softirq_irqoff() which avoids the wakeup.

It then also changes schedule() to check for pending softirqs and
do the wakeup then, I'm not quite sure I like this last bit, nor
am I convinced its really needed.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: paulus@samba.org
LKML-Reference: <20090313112301.096138802@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/hrtimer.h   |  5 +++++
 include/linux/interrupt.h |  1 +
 kernel/hrtimer.c          | 55 +++++++++++++++++++++++++++++------------------
 kernel/sched.c            | 14 +++++++++---
 kernel/softirq.c          |  2 +-
 5 files changed, 52 insertions(+), 25 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index bd37078c2d7d..0d2f7c8a33d6 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -336,6 +336,11 @@ extern int hrtimer_start(struct hrtimer *timer, ktime_t tim,
 			 const enum hrtimer_mode mode);
 extern int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
 			unsigned long range_ns, const enum hrtimer_mode mode);
+extern int
+__hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
+			 unsigned long delta_ns,
+			 const enum hrtimer_mode mode, int wakeup);
+
 extern int hrtimer_cancel(struct hrtimer *timer);
 extern int hrtimer_try_to_cancel(struct hrtimer *timer);
 
diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index c68bffd182bb..4528bf70866a 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -294,6 +294,7 @@ extern void softirq_init(void);
 #define __raise_softirq_irqoff(nr) do { or_softirq_pending(1UL << (nr)); } while (0)
 extern void raise_softirq_irqoff(unsigned int nr);
 extern void raise_softirq(unsigned int nr);
+extern void wakeup_softirqd(void);
 
 /* This is the worklist that queues up per-cpu softirq work.
  *
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index f394d2a42ca3..cb8a15c19583 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -651,14 +651,20 @@ static inline void hrtimer_init_timer_hres(struct hrtimer *timer)
  * and expiry check is done in the hrtimer_interrupt or in the softirq.
  */
 static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
-					    struct hrtimer_clock_base *base)
+					    struct hrtimer_clock_base *base,
+					    int wakeup)
 {
 	if (base->cpu_base->hres_active && hrtimer_reprogram(timer, base)) {
-		spin_unlock(&base->cpu_base->lock);
-		raise_softirq_irqoff(HRTIMER_SOFTIRQ);
-		spin_lock(&base->cpu_base->lock);
+		if (wakeup) {
+			spin_unlock(&base->cpu_base->lock);
+			raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+			spin_lock(&base->cpu_base->lock);
+		} else
+			__raise_softirq_irqoff(HRTIMER_SOFTIRQ);
+
 		return 1;
 	}
+
 	return 0;
 }
 
@@ -703,7 +709,8 @@ static inline int hrtimer_is_hres_enabled(void) { return 0; }
 static inline int hrtimer_switch_to_hres(void) { return 0; }
 static inline void hrtimer_force_reprogram(struct hrtimer_cpu_base *base) { }
 static inline int hrtimer_enqueue_reprogram(struct hrtimer *timer,
-					    struct hrtimer_clock_base *base)
+					    struct hrtimer_clock_base *base,
+					    int wakeup)
 {
 	return 0;
 }
@@ -886,20 +893,9 @@ remove_hrtimer(struct hrtimer *timer, struct hrtimer_clock_base *base)
 	return 0;
 }
 
-/**
- * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
- * @timer:	the timer to be added
- * @tim:	expiry time
- * @delta_ns:	"slack" range for the timer
- * @mode:	expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
- *
- * Returns:
- *  0 on success
- *  1 when the timer was active
- */
-int
-hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_ns,
-			const enum hrtimer_mode mode)
+int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
+		unsigned long delta_ns, const enum hrtimer_mode mode,
+		int wakeup)
 {
 	struct hrtimer_clock_base *base, *new_base;
 	unsigned long flags;
@@ -940,12 +936,29 @@ hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, unsigned long delta_n
 	 * XXX send_remote_softirq() ?
 	 */
 	if (leftmost && new_base->cpu_base == &__get_cpu_var(hrtimer_bases))
-		hrtimer_enqueue_reprogram(timer, new_base);
+		hrtimer_enqueue_reprogram(timer, new_base, wakeup);
 
 	unlock_hrtimer_base(timer, &flags);
 
 	return ret;
 }
+
+/**
+ * hrtimer_start_range_ns - (re)start an hrtimer on the current CPU
+ * @timer:	the timer to be added
+ * @tim:	expiry time
+ * @delta_ns:	"slack" range for the timer
+ * @mode:	expiry mode: absolute (HRTIMER_ABS) or relative (HRTIMER_REL)
+ *
+ * Returns:
+ *  0 on success
+ *  1 when the timer was active
+ */
+int hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
+		unsigned long delta_ns, const enum hrtimer_mode mode)
+{
+	return __hrtimer_start_range_ns(timer, tim, delta_ns, mode, 1);
+}
 EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
 
 /**
@@ -961,7 +974,7 @@ EXPORT_SYMBOL_GPL(hrtimer_start_range_ns);
 int
 hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
 {
-	return hrtimer_start_range_ns(timer, tim, 0, mode);
+	return __hrtimer_start_range_ns(timer, tim, 0, mode, 1);
 }
 EXPORT_SYMBOL_GPL(hrtimer_start);
 
diff --git a/kernel/sched.c b/kernel/sched.c
index 196d48babbef..63256e3ede2a 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -231,13 +231,20 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b)
 
 	spin_lock(&rt_b->rt_runtime_lock);
 	for (;;) {
+		unsigned long delta;
+		ktime_t soft, hard;
+
 		if (hrtimer_active(&rt_b->rt_period_timer))
 			break;
 
 		now = hrtimer_cb_get_time(&rt_b->rt_period_timer);
 		hrtimer_forward(&rt_b->rt_period_timer, now, rt_b->rt_period);
-		hrtimer_start_expires(&rt_b->rt_period_timer,
-				HRTIMER_MODE_ABS);
+
+		soft = hrtimer_get_softexpires(&rt_b->rt_period_timer);
+		hard = hrtimer_get_expires(&rt_b->rt_period_timer);
+		delta = ktime_to_ns(ktime_sub(hard, soft));
+		__hrtimer_start_range_ns(&rt_b->rt_period_timer, soft, delta,
+				HRTIMER_MODE_ABS, 0);
 	}
 	spin_unlock(&rt_b->rt_runtime_lock);
 }
@@ -1146,7 +1153,8 @@ static __init void init_hrtick(void)
  */
 static void hrtick_start(struct rq *rq, u64 delay)
 {
-	hrtimer_start(&rq->hrtick_timer, ns_to_ktime(delay), HRTIMER_MODE_REL);
+	__hrtimer_start_range_ns(&rq->hrtick_timer, ns_to_ktime(delay), 0,
+			HRTIMER_MODE_REL, 0);
 }
 
 static inline void init_hrtick(void)
diff --git a/kernel/softirq.c b/kernel/softirq.c
index 487751604300..accc85197c49 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -58,7 +58,7 @@ static DEFINE_PER_CPU(struct task_struct *, ksoftirqd);
  * to the pending events, so lets the scheduler to balance
  * the softirq load for us.
  */
-static inline void wakeup_softirqd(void)
+void wakeup_softirqd(void)
 {
 	/* Interrupts are disabled: no need to stop preemption */
 	struct task_struct *tsk = __get_cpu_var(ksoftirqd);
-- 
cgit v1.2.3


From eedeeabdeeadb016b8c783e3620d06b98d0cb4e1 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 18 Mar 2009 12:38:47 +0100
Subject: lockdep: add stack dumps to asserts

Have a better idea about exactly which loc causes a lockdep
limit overflow. Often it's a bug or inefficiency in that
subsystem.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1237376327.5069.253.camel@laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/lockdep.c | 5 +++++
 1 file changed, 5 insertions(+)

(limited to 'kernel')

diff --git a/kernel/lockdep.c b/kernel/lockdep.c
index 981cd4854281..a288ae107b50 100644
--- a/kernel/lockdep.c
+++ b/kernel/lockdep.c
@@ -792,6 +792,7 @@ register_lock_class(struct lockdep_map *lock, unsigned int subclass, int force)
 
 		printk("BUG: MAX_LOCKDEP_KEYS too low!\n");
 		printk("turning off the locking correctness validator.\n");
+		dump_stack();
 		return NULL;
 	}
 	class = lock_classes + nr_lock_classes++;
@@ -855,6 +856,7 @@ static struct lock_list *alloc_list_entry(void)
 
 		printk("BUG: MAX_LOCKDEP_ENTRIES too low!\n");
 		printk("turning off the locking correctness validator.\n");
+		dump_stack();
 		return NULL;
 	}
 	return list_entries + nr_list_entries++;
@@ -1681,6 +1683,7 @@ cache_hit:
 
 		printk("BUG: MAX_LOCKDEP_CHAINS too low!\n");
 		printk("turning off the locking correctness validator.\n");
+		dump_stack();
 		return 0;
 	}
 	chain = lock_chains + nr_lock_chains++;
@@ -2540,6 +2543,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 		debug_locks_off();
 		printk("BUG: MAX_LOCKDEP_SUBCLASSES too low!\n");
 		printk("turning off the locking correctness validator.\n");
+		dump_stack();
 		return 0;
 	}
 
@@ -2636,6 +2640,7 @@ static int __lock_acquire(struct lockdep_map *lock, unsigned int subclass,
 		debug_locks_off();
 		printk("BUG: MAX_LOCK_DEPTH too low!\n");
 		printk("turning off the locking correctness validator.\n");
+		dump_stack();
 		return 0;
 	}
 
-- 
cgit v1.2.3


From 6c051ce0307526adec32a847f0daa1af2124f0a9 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 25 Mar 2009 17:18:56 +0800
Subject: blktrace: fix timestamp in binary output

I found the timestamp is wrong:

 # echo bin > trace_option
 # echo blk > current_tracer
 # cat trace_pipe | blkparse -i -
 8,0    0        0     0.000000000   504  A   W ...
 ...
 8,7    1        0     0.008534097     0  C   R ...
            (should be 8.534097xxx)

user-space blkparse expects the timestamp to be nanosecond.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Jens Axboe <jens.axboe@oracle.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 6fb274f5f34e..ee7a8bb8b1e8 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1168,7 +1168,7 @@ static int blk_trace_synthesize_old_trace(struct trace_iterator *iter)
 	const int offset = offsetof(struct blk_io_trace, sector);
 	struct blk_io_trace old = {
 		.magic	  = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION,
-		.time     = ns2usecs(iter->ts),
+		.time     = iter->ts,
 	};
 
 	if (!trace_seq_putmem(s, &old, offset))
-- 
cgit v1.2.3


From b5230b56ee6caeb27cedb7753c0c319646383bb4 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 25 Mar 2009 17:19:33 +0800
Subject: blktrace: fix a race when creating blk_tree_root in debugfs

t1                                t2
------                            ------
do_blk_trace_setup()              do_blk_trace_setup()
  if (!blk_tree_root) {
                                    if (!blk_tree_root)
    blk_tree_root = create_dir()
                                      blk_tree_root = create_dir();
                                      (now blk_tree_root == NULL)
  ...
  dir = create_dir(name, blk_tree_root);

Due to this race, t1 will create 'dir' in /debugfs but not /debugfs/block.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Jens Axboe <jens.axboe@oracle.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index ee7a8bb8b1e8..95f89faca73e 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -426,11 +426,15 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 
 	ret = -ENOENT;
 
+	mutex_lock(&blk_tree_mutex);
 	if (!blk_tree_root) {
 		blk_tree_root = debugfs_create_dir("block", NULL);
-		if (!blk_tree_root)
+		if (!blk_tree_root) {
+			mutex_unlock(&blk_tree_mutex);
 			goto err;
+		}
 	}
+	mutex_unlock(&blk_tree_mutex);
 
 	dir = debugfs_create_dir(buts->name, blk_tree_root);
 
-- 
cgit v1.2.3


From 5554720482a631702146a959db22fe417740e0a6 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Wed, 25 Mar 2009 17:21:26 +0800
Subject: blktrace: fix the original blktrace

Currently the original blktrace, which is using relay and is used via
ioctl, is broken. You can use ftrace to see the output of blktrace,
but user-space blktrace is unusable.

It's broken by "blktrace: add ftrace plugin"
(c71a896154119f4ca9e89d6078f5f63ad60ef199)

 -	if (unlikely(bt->trace_state != Blktrace_running))
 +	if (unlikely(bt->trace_state != Blktrace_running || !blk_tracer_enabled))
		return;

With this patch, both ioctl and ftrace can be used, but of course you
can't use both of them at the same time.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Jens Axboe <jens.axboe@oracle.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 95f89faca73e..a7f7ff5db38b 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -110,7 +110,7 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)
 	unsigned long flags;
 	char *buf;
 
-	if (blk_tr) {
+	if (blk_tracer_enabled) {
 		va_start(args, fmt);
 		ftrace_vprintk(fmt, args);
 		va_end(args);
@@ -169,7 +169,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 	pid_t pid;
 	int cpu, pc = 0;
 
-	if (unlikely(bt->trace_state != Blktrace_running ||
+	if (unlikely(bt->trace_state != Blktrace_running &&
 		     !blk_tracer_enabled))
 		return;
 
@@ -185,7 +185,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 		return;
 	cpu = raw_smp_processor_id();
 
-	if (blk_tr) {
+	if (blk_tracer_enabled) {
 		tracing_record_cmdline(current);
 
 		pc = preempt_count();
@@ -235,7 +235,7 @@ record_it:
 		if (pdu_len)
 			memcpy((void *) t + sizeof(*t), pdu_data, pdu_len);
 
-		if (blk_tr) {
+		if (blk_tracer_enabled) {
 			trace_buffer_unlock_commit(blk_tr, event, 0, pc);
 			return;
 		}
@@ -267,8 +267,7 @@ int blk_trace_remove(struct request_queue *q)
 	if (!bt)
 		return -EINVAL;
 
-	if (bt->trace_state == Blktrace_setup ||
-	    bt->trace_state == Blktrace_stopped)
+	if (bt->trace_state != Blktrace_running)
 		blk_trace_cleanup(bt);
 
 	return 0;
@@ -1273,7 +1272,6 @@ static int blk_trace_setup_queue(struct request_queue *q, dev_t dev)
 	bt->dev = dev;
 	bt->act_mask = (u16)-1;
 	bt->end_lba = -1ULL;
-	bt->trace_state = Blktrace_running;
 
 	old_bt = xchg(&q->blk_trace, bt);
 	if (old_bt != NULL) {
-- 
cgit v1.2.3


From eb08f8eb0673d9c1e62b69ad1b41593e73c40467 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Tue, 24 Mar 2009 16:05:27 +0800
Subject: blktrace: fix off-by-one bug

'what' is used as the index of array what2act, so it can't >= the array size.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Jens Axboe <jens.axboe@oracle.com>
Acked-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index a7f7ff5db38b..d43cdac7c754 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -1152,7 +1152,7 @@ static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
 	if (!trace_print_context(iter))
 		return TRACE_TYPE_PARTIAL_LINE;
 
-	if (unlikely(what == 0 || what > ARRAY_SIZE(what2act)))
+	if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act)))
 		ret = trace_seq_printf(s, "Bad pc action %x\n", what);
 	else {
 		const bool long_act = !!(trace_flags & TRACE_ITER_VERBOSE);
@@ -1199,7 +1199,7 @@ static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter)
 	t = (const struct blk_io_trace *)iter->ent;
 	what = t->action & ((1 << BLK_TC_SHIFT) - 1);
 
-	if (unlikely(what == 0 || what > ARRAY_SIZE(what2act)))
+	if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act)))
 		ret = trace_seq_printf(&iter->seq, "Bad pc action %x\n", what);
 	else {
 		const bool long_act = !!(trace_flags & TRACE_ITER_VERBOSE);
-- 
cgit v1.2.3


From 35ac51bfe4c293b67ce9f85082ba0b9bc6123c40 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 27 Mar 2009 10:19:46 +0800
Subject: blktrace: make classic output more classic

Impact: fix ftrace plugin timestamp output

In the classic user-space blktrace, the output timestamp is sec.nsec
not sec.usec.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Jens Axboe <jens.axboe@oracle.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index d43cdac7c754..5b28f0f119c5 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -994,8 +994,8 @@ static void get_pdu_remap(const struct trace_entry *ent,
 static int blk_log_action_iter(struct trace_iterator *iter, const char *act)
 {
 	char rwbs[6];
-	unsigned long long ts  = ns2usecs(iter->ts);
-	unsigned long usec_rem = do_div(ts, USEC_PER_SEC);
+	unsigned long long ts  = iter->ts;
+	unsigned long nsec_rem = do_div(ts, NSEC_PER_SEC);
 	unsigned secs	       = (unsigned long)ts;
 	const struct trace_entry *ent = iter->ent;
 	const struct blk_io_trace *t = (const struct blk_io_trace *)ent;
@@ -1003,9 +1003,9 @@ static int blk_log_action_iter(struct trace_iterator *iter, const char *act)
 	fill_rwbs(rwbs, t);
 
 	return trace_seq_printf(&iter->seq,
-				"%3d,%-3d %2d %5d.%06lu %5u %2s %3s ",
+				"%3d,%-3d %2d %5d.%09lu %5u %2s %3s ",
 				MAJOR(t->device), MINOR(t->device), iter->cpu,
-				secs, usec_rem, ent->pid, act, rwbs);
+				secs, nsec_rem, ent->pid, act, rwbs);
 }
 
 static int blk_log_action_seq(struct trace_seq *s, const struct blk_io_trace *t,
-- 
cgit v1.2.3


From 17ba97e347bec9bbc47a0877c7a098708982129d Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 27 Mar 2009 10:20:09 +0800
Subject: blktrace: fix blk_probes_ref chaos

Impact: fix mixed ioctl and ftrace-plugin blktrace use refcount bugs

ioctl-based blktrace allocates bt and registers tracepoints when
ioctl(BLKTRACESETUP), and do all cleanups when ioctl(BLKTRACETEARDOWN).

while ftrace-based blktrace allocates/frees bt when:
  # echo 1/0 > /sys/block/sda/sda1/trace/enable

and registers/unregisters tracepoints when:
  # echo blk/nop > /debugfs/tracing/current_tracer
or
  # echo 1/0 > /debugfs/tracing/tracing_enable

The separatation of allocation and registeration causes 2 problems:

  1. current user-space blktrace still calls ioctl(TEARDOWN) when
     ioctl(SETUP) failed:
       # echo 1 > /sys/block/sda/sda1/trace/enable
       # blktrace /dev/sda
         BLKTRACESETUP: Device or resource busy
         ^C
     and now blk_probes_ref == -1

  2. Another way to make blk_probes_ref == -1:
     # plugin sdb && mount sdb1
     # echo 1 > /sys/block/sdb/sdb1/trace/enable
     # remove sdb

This patch does the allocation and registeration when writing
sdaX/trace/enable.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Jens Axboe <jens.axboe@oracle.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 15 +++++++--------
 1 file changed, 7 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 5b28f0f119c5..8d6bd12aab10 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -478,7 +478,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 		goto err;
 	}
 
-	if (atomic_add_return(1, &blk_probes_ref) == 1)
+	if (atomic_inc_return(&blk_probes_ref) == 1)
 		blk_register_tracepoints();
 
 	return 0;
@@ -1091,8 +1091,6 @@ static void blk_tracer_print_header(struct seq_file *m)
 
 static void blk_tracer_start(struct trace_array *tr)
 {
-	if (atomic_add_return(1, &blk_probes_ref) == 1)
-		blk_register_tracepoints();
 	trace_flags &= ~TRACE_ITER_CONTEXT_INFO;
 }
 
@@ -1107,15 +1105,10 @@ static int blk_tracer_init(struct trace_array *tr)
 static void blk_tracer_stop(struct trace_array *tr)
 {
 	trace_flags |= TRACE_ITER_CONTEXT_INFO;
-	if (atomic_dec_and_test(&blk_probes_ref))
-		blk_unregister_tracepoints();
 }
 
 static void blk_tracer_reset(struct trace_array *tr)
 {
-	if (!atomic_read(&blk_probes_ref))
-		return;
-
 	blk_tracer_enabled = false;
 	blk_tracer_stop(tr);
 }
@@ -1254,6 +1247,9 @@ static int blk_trace_remove_queue(struct request_queue *q)
 	if (bt == NULL)
 		return -EINVAL;
 
+	if (atomic_dec_and_test(&blk_probes_ref))
+		blk_unregister_tracepoints();
+
 	kfree(bt);
 	return 0;
 }
@@ -1280,6 +1276,9 @@ static int blk_trace_setup_queue(struct request_queue *q, dev_t dev)
 		return -EBUSY;
 	}
 
+	if (atomic_inc_return(&blk_probes_ref) == 1)
+		blk_register_tracepoints();
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From ad5dd5493a55e462796e42e50a49e76df76fdb05 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 27 Mar 2009 10:20:24 +0800
Subject: blktrace: fix memory leak when freeing struct blk_io_trace

Impact: fix mixed ioctl and ftrace-plugin blktrace use memory leak

When mixing the use of ioctl-based blktrace and ftrace-based blktrace,
we can leak memory in this way:

  # btrace /dev/sda > /dev/null &
  # echo 0 > /sys/block/sda/sda1/trace/enable

now we leak bt->dropped_file, bt->msg_file, bt->rchan...

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Jens Axboe <jens.axboe@oracle.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 29 ++++++++++++-----------------
 1 file changed, 12 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 8d6bd12aab10..2f21d7761600 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -247,7 +247,7 @@ record_it:
 static struct dentry *blk_tree_root;
 static DEFINE_MUTEX(blk_tree_mutex);
 
-static void blk_trace_cleanup(struct blk_trace *bt)
+static void blk_trace_free(struct blk_trace *bt)
 {
 	debugfs_remove(bt->msg_file);
 	debugfs_remove(bt->dropped_file);
@@ -255,6 +255,11 @@ static void blk_trace_cleanup(struct blk_trace *bt)
 	free_percpu(bt->sequence);
 	free_percpu(bt->msg_data);
 	kfree(bt);
+}
+
+static void blk_trace_cleanup(struct blk_trace *bt)
+{
+	blk_trace_free(bt);
 	if (atomic_dec_and_test(&blk_probes_ref))
 		blk_unregister_tracepoints();
 }
@@ -410,11 +415,11 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 		if (buts->name[i] == '/')
 			buts->name[i] = '_';
 
-	ret = -ENOMEM;
 	bt = kzalloc(sizeof(*bt), GFP_KERNEL);
 	if (!bt)
-		goto err;
+		return -ENOMEM;
 
+	ret = -ENOMEM;
 	bt->sequence = alloc_percpu(unsigned long);
 	if (!bt->sequence)
 		goto err;
@@ -483,17 +488,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev,
 
 	return 0;
 err:
-	if (bt) {
-		if (bt->msg_file)
-			debugfs_remove(bt->msg_file);
-		if (bt->dropped_file)
-			debugfs_remove(bt->dropped_file);
-		free_percpu(bt->sequence);
-		free_percpu(bt->msg_data);
-		if (bt->rchan)
-			relay_close(bt->rchan);
-		kfree(bt);
-	}
+	blk_trace_free(bt);
 	return ret;
 }
 
@@ -1091,6 +1086,7 @@ static void blk_tracer_print_header(struct seq_file *m)
 
 static void blk_tracer_start(struct trace_array *tr)
 {
+	blk_tracer_enabled = true;
 	trace_flags &= ~TRACE_ITER_CONTEXT_INFO;
 }
 
@@ -1098,18 +1094,17 @@ static int blk_tracer_init(struct trace_array *tr)
 {
 	blk_tr = tr;
 	blk_tracer_start(tr);
-	blk_tracer_enabled = true;
 	return 0;
 }
 
 static void blk_tracer_stop(struct trace_array *tr)
 {
+	blk_tracer_enabled = false;
 	trace_flags |= TRACE_ITER_CONTEXT_INFO;
 }
 
 static void blk_tracer_reset(struct trace_array *tr)
 {
-	blk_tracer_enabled = false;
 	blk_tracer_stop(tr);
 }
 
@@ -1250,7 +1245,7 @@ static int blk_trace_remove_queue(struct request_queue *q)
 	if (atomic_dec_and_test(&blk_probes_ref))
 		blk_unregister_tracepoints();
 
-	kfree(bt);
+	blk_trace_free(bt);
 	return 0;
 }
 
-- 
cgit v1.2.3


From b6a4b0c3ad4c09c1d37b1040ac8e3ebd1016e10b Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 27 Mar 2009 10:21:23 +0800
Subject: blktrace: extract duplidate code

Impact: cleanup

blk_trace_event_print() and blk_tracer_print_line() share most of the code.

   text    data     bss     dec     hex filename
   8605     393      12    9010    2332 kernel/trace/blktrace.o.orig
   text    data     bss     dec     hex filename
   8555     393      12    8960    2300 kernel/trace/blktrace.o

This patch also prepares for the next patch, that prints out BLK_TN_MESSAGE.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Jens Axboe <jens.axboe@oracle.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 62 ++++++++++++++++++++++++-------------------------
 1 file changed, 30 insertions(+), 32 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 2f21d7761600..c103b0c20022 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -986,29 +986,31 @@ static void get_pdu_remap(const struct trace_entry *ent,
 	r->sector = be64_to_cpu(sector);
 }
 
-static int blk_log_action_iter(struct trace_iterator *iter, const char *act)
+typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act);
+
+static int blk_log_action_classic(struct trace_iterator *iter, const char *act)
 {
 	char rwbs[6];
 	unsigned long long ts  = iter->ts;
 	unsigned long nsec_rem = do_div(ts, NSEC_PER_SEC);
 	unsigned secs	       = (unsigned long)ts;
-	const struct trace_entry *ent = iter->ent;
-	const struct blk_io_trace *t = (const struct blk_io_trace *)ent;
+	const struct blk_io_trace *t = te_blk_io_trace(iter->ent);
 
 	fill_rwbs(rwbs, t);
 
 	return trace_seq_printf(&iter->seq,
 				"%3d,%-3d %2d %5d.%09lu %5u %2s %3s ",
 				MAJOR(t->device), MINOR(t->device), iter->cpu,
-				secs, nsec_rem, ent->pid, act, rwbs);
+				secs, nsec_rem, iter->ent->pid, act, rwbs);
 }
 
-static int blk_log_action_seq(struct trace_seq *s, const struct blk_io_trace *t,
-			      const char *act)
+static int blk_log_action(struct trace_iterator *iter, const char *act)
 {
 	char rwbs[6];
+	const struct blk_io_trace *t = te_blk_io_trace(iter->ent);
+
 	fill_rwbs(rwbs, t);
-	return trace_seq_printf(s, "%3d,%-3d %2s %3s ",
+	return trace_seq_printf(&iter->seq, "%3d,%-3d %2s %3s ",
 				MAJOR(t->device), MINOR(t->device), act, rwbs);
 }
 
@@ -1129,22 +1131,25 @@ static const struct {
 	[__BLK_TA_REMAP]	= {{  "A", "remap" },	   blk_log_remap },
 };
 
-static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
-					       int flags)
+static enum print_line_t print_one_line(struct trace_iterator *iter,
+					bool classic)
 {
 	struct trace_seq *s = &iter->seq;
-	const struct blk_io_trace *t = (struct blk_io_trace *)iter->ent;
-	const u16 what = t->action & ((1 << BLK_TC_SHIFT) - 1);
+	const struct blk_io_trace *t;
+	u16 what;
 	int ret;
+	bool long_act;
+	blk_log_action_t *log_action;
 
-	if (!trace_print_context(iter))
-		return TRACE_TYPE_PARTIAL_LINE;
+	t	   = te_blk_io_trace(iter->ent);
+	what	   = t->action & ((1 << BLK_TC_SHIFT) - 1);
+	long_act   = !!(trace_flags & TRACE_ITER_VERBOSE);
+	log_action = classic ? &blk_log_action_classic : &blk_log_action;
 
 	if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act)))
 		ret = trace_seq_printf(s, "Bad pc action %x\n", what);
 	else {
-		const bool long_act = !!(trace_flags & TRACE_ITER_VERBOSE);
-		ret = blk_log_action_seq(s, t, what2act[what].act[long_act]);
+		ret = log_action(iter, what2act[what].act[long_act]);
 		if (ret)
 			ret = what2act[what].print(s, iter->ent);
 	}
@@ -1152,6 +1157,15 @@ static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
 	return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
 }
 
+static enum print_line_t blk_trace_event_print(struct trace_iterator *iter,
+					       int flags)
+{
+	if (!trace_print_context(iter))
+		return TRACE_TYPE_PARTIAL_LINE;
+
+	return print_one_line(iter, false);
+}
+
 static int blk_trace_synthesize_old_trace(struct trace_iterator *iter)
 {
 	struct trace_seq *s = &iter->seq;
@@ -1177,26 +1191,10 @@ blk_trace_event_print_binary(struct trace_iterator *iter, int flags)
 
 static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter)
 {
-	const struct blk_io_trace *t;
-	u16 what;
-	int ret;
-
 	if (!(blk_tracer_flags.val & TRACE_BLK_OPT_CLASSIC))
 		return TRACE_TYPE_UNHANDLED;
 
-	t = (const struct blk_io_trace *)iter->ent;
-	what = t->action & ((1 << BLK_TC_SHIFT) - 1);
-
-	if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act)))
-		ret = trace_seq_printf(&iter->seq, "Bad pc action %x\n", what);
-	else {
-		const bool long_act = !!(trace_flags & TRACE_ITER_VERBOSE);
-		ret = blk_log_action_iter(iter, what2act[what].act[long_act]);
-		if (ret)
-			ret = what2act[what].print(&iter->seq, iter->ent);
-	}
-
-	return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
+	return print_one_line(iter, true);
 }
 
 static struct tracer blk_tracer __read_mostly = {
-- 
cgit v1.2.3


From 18cea4591a98817697017bcb056a848bae1205df Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 27 Mar 2009 10:21:54 +0800
Subject: blktrace: print out BLK_TN_MESSAGE properly

Impact: improve ftrace plugin output

Before this patch:

 # cat trace
         make-5383  [001]   741.240059:   8,7    P   N [make]
 __trace_note_message: cfq1074

 # echo 1 > options/blk_classic
 # cat trace
   8,7    1     0.692221252     0  C   W 130411392 + 1024 [0]
 Bad pc action 6361
 Bad pc action 283d

 # echo 0 > options/blk_classic
 # echo bin > trace_options
 # cat trace_pipe | blkparse -i -
 (can't parse messages generated by blk_add_trace_msg())

After this patch:
 # cat trace
      <idle>-0     [001]   187.600933:   8,7    C   W 145220224 + 8 [0]
      <idle>-0     [001]   187.600946:   8,7    m   N cfq1076 complete

 # echo 1 > options/blk_classic
 # cat trace
   8,7    1     0.256378996   238  I   W 113190728 + 8 [pdflush]
   8,7    1     0.256378998   238  m   N cfq1076 insert_request

 # echo 0 > options/blk_classic
 # echo bin > trace_options
 # cat trace_pipe | blkparse -i -
  8,7    1        0    22.973250293     0  C   W 102770576 + 8 [0]
  8,7    1        0    22.973259213     0  m   N cfq1076 complete

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Jens Axboe <jens.axboe@oracle.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 80 +++++++++++++++++++++++++++++++++++++------------
 1 file changed, 61 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index c103b0c20022..947c5b3f90c4 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -59,22 +59,39 @@ static void trace_note(struct blk_trace *bt, pid_t pid, int action,
 		       const void *data, size_t len)
 {
 	struct blk_io_trace *t;
+	struct ring_buffer_event *event = NULL;
+	int pc = 0;
+	int cpu = smp_processor_id();
+	bool blk_tracer = blk_tracer_enabled;
+
+	if (blk_tracer) {
+		pc = preempt_count();
+		event = trace_buffer_lock_reserve(blk_tr, TRACE_BLK,
+						  sizeof(*t) + len,
+						  0, pc);
+		if (!event)
+			return;
+		t = ring_buffer_event_data(event);
+		goto record_it;
+	}
 
 	if (!bt->rchan)
 		return;
 
 	t = relay_reserve(bt->rchan, sizeof(*t) + len);
 	if (t) {
-		const int cpu = smp_processor_id();
-
 		t->magic = BLK_IO_TRACE_MAGIC | BLK_IO_TRACE_VERSION;
 		t->time = ktime_to_ns(ktime_get());
+record_it:
 		t->device = bt->dev;
 		t->action = action;
 		t->pid = pid;
 		t->cpu = cpu;
 		t->pdu_len = len;
 		memcpy((void *) t + sizeof(*t), data, len);
+
+		if (blk_tracer)
+			trace_buffer_unlock_commit(blk_tr, event, 0, pc);
 	}
 }
 
@@ -110,14 +127,8 @@ void __trace_note_message(struct blk_trace *bt, const char *fmt, ...)
 	unsigned long flags;
 	char *buf;
 
-	if (blk_tracer_enabled) {
-		va_start(args, fmt);
-		ftrace_vprintk(fmt, args);
-		va_end(args);
-		return;
-	}
-
-	if (!bt->msg_data)
+	if (unlikely(bt->trace_state != Blktrace_running &&
+		     !blk_tracer_enabled))
 		return;
 
 	local_irq_save(flags);
@@ -168,9 +179,9 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 	unsigned long *sequence;
 	pid_t pid;
 	int cpu, pc = 0;
+	bool blk_tracer = blk_tracer_enabled;
 
-	if (unlikely(bt->trace_state != Blktrace_running &&
-		     !blk_tracer_enabled))
+	if (unlikely(bt->trace_state != Blktrace_running && !blk_tracer))
 		return;
 
 	what |= ddir_act[rw & WRITE];
@@ -185,7 +196,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes,
 		return;
 	cpu = raw_smp_processor_id();
 
-	if (blk_tracer_enabled) {
+	if (blk_tracer) {
 		tracing_record_cmdline(current);
 
 		pc = preempt_count();
@@ -235,7 +246,7 @@ record_it:
 		if (pdu_len)
 			memcpy((void *) t + sizeof(*t), pdu_data, pdu_len);
 
-		if (blk_tracer_enabled) {
+		if (blk_tracer) {
 			trace_buffer_unlock_commit(blk_tr, event, 0, pc);
 			return;
 		}
@@ -922,6 +933,11 @@ static void fill_rwbs(char *rwbs, const struct blk_io_trace *t)
 	int i = 0;
 	int tc = t->action >> BLK_TC_SHIFT;
 
+	if (t->action == BLK_TN_MESSAGE) {
+		rwbs[i++] = 'N';
+		goto out;
+	}
+
 	if (tc & BLK_TC_DISCARD)
 		rwbs[i++] = 'D';
 	else if (tc & BLK_TC_WRITE)
@@ -939,7 +955,7 @@ static void fill_rwbs(char *rwbs, const struct blk_io_trace *t)
 		rwbs[i++] = 'S';
 	if (tc & BLK_TC_META)
 		rwbs[i++] = 'M';
-
+out:
 	rwbs[i] = '\0';
 }
 
@@ -1074,6 +1090,17 @@ static int blk_log_split(struct trace_seq *s, const struct trace_entry *ent)
 				get_pdu_int(ent), cmd);
 }
 
+static int blk_log_msg(struct trace_seq *s, const struct trace_entry *ent)
+{
+	int ret;
+	const struct blk_io_trace *t = te_blk_io_trace(ent);
+
+	ret = trace_seq_putmem(s, t + 1, t->pdu_len);
+	if (ret)
+		return trace_seq_putc(s, '\n');
+	return ret;
+}
+
 /*
  * struct tracer operations
  */
@@ -1146,6 +1173,13 @@ static enum print_line_t print_one_line(struct trace_iterator *iter,
 	long_act   = !!(trace_flags & TRACE_ITER_VERBOSE);
 	log_action = classic ? &blk_log_action_classic : &blk_log_action;
 
+	if (t->action == BLK_TN_MESSAGE) {
+		ret = log_action(iter, long_act ? "message" : "m");
+		if (ret)
+			ret = blk_log_msg(s, iter->ent);
+		goto out;
+	}
+
 	if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act)))
 		ret = trace_seq_printf(s, "Bad pc action %x\n", what);
 	else {
@@ -1153,7 +1187,7 @@ static enum print_line_t print_one_line(struct trace_iterator *iter,
 		if (ret)
 			ret = what2act[what].print(s, iter->ent);
 	}
-
+out:
 	return ret ? TRACE_TYPE_HANDLED : TRACE_TYPE_PARTIAL_LINE;
 }
 
@@ -1253,11 +1287,16 @@ static int blk_trace_remove_queue(struct request_queue *q)
 static int blk_trace_setup_queue(struct request_queue *q, dev_t dev)
 {
 	struct blk_trace *old_bt, *bt = NULL;
+	int ret = -ENOMEM;
 
 	bt = kzalloc(sizeof(*bt), GFP_KERNEL);
 	if (!bt)
 		return -ENOMEM;
 
+	bt->msg_data = __alloc_percpu(BLK_TN_MAX_MSG, __alignof__(char));
+	if (!bt->msg_data)
+		goto free_bt;
+
 	bt->dev = dev;
 	bt->act_mask = (u16)-1;
 	bt->end_lba = -1ULL;
@@ -1265,14 +1304,17 @@ static int blk_trace_setup_queue(struct request_queue *q, dev_t dev)
 	old_bt = xchg(&q->blk_trace, bt);
 	if (old_bt != NULL) {
 		(void)xchg(&q->blk_trace, old_bt);
-		kfree(bt);
-		return -EBUSY;
+		ret = -EBUSY;
+		goto free_bt;
 	}
 
 	if (atomic_inc_return(&blk_probes_ref) == 1)
 		blk_register_tracepoints();
-
 	return 0;
+
+free_bt:
+	blk_trace_free(bt);
+	return ret;
 }
 
 /*
-- 
cgit v1.2.3


From bdd6df6af98ce7e70702edfb5fd5dbbd8d1b0453 Mon Sep 17 00:00:00 2001
From: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Date: Mon, 23 Mar 2009 15:12:22 +0200
Subject: tracing: provide trace_seq_reserve()

trace_seq_reserve() allows a caller to reserve space in a trace_seq and
write directly into it. This makes it easier to export binary data to
userspace via the tracing interface, by simply filling in a struct.

Signed-off-by: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_output.c | 13 +++++++++++++
 kernel/trace/trace_output.h |  1 +
 2 files changed, 14 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 19261fdd2455..6595074cbac5 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -167,6 +167,19 @@ int trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len)
 	return trace_seq_putmem(s, hex, j);
 }
 
+void *trace_seq_reserve(struct trace_seq *s, size_t len)
+{
+	void *ret;
+
+	if (len > ((PAGE_SIZE - 1) - s->len))
+		return NULL;
+
+	ret = s->buffer + s->len;
+	s->len += len;
+
+	return ret;
+}
+
 int trace_seq_path(struct trace_seq *s, struct path *path)
 {
 	unsigned char *p;
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 35c422fb51a9..0ae20b83eec8 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -33,6 +33,7 @@ int trace_seq_puts(struct trace_seq *s, const char *str);
 int trace_seq_putc(struct trace_seq *s, unsigned char c);
 int trace_seq_putmem(struct trace_seq *s, void *mem, size_t len);
 int trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len);
+void *trace_seq_reserve(struct trace_seq *s, size_t len);
 int trace_seq_path(struct trace_seq *s, struct path *path);
 int seq_print_userip_objs(const struct userstack_entry *entry,
 			  struct trace_seq *s, unsigned long sym_flags);
-- 
cgit v1.2.3


From f285901bb21355bb47106658ef14eeb6b8ed538f Mon Sep 17 00:00:00 2001
From: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Date: Mon, 23 Mar 2009 15:12:23 +0200
Subject: tracing: add missing 'extern' keywords to trace_output.h

Impact: cleanup

Many declarations within trace_output.h are missing the 'extern' keyword
in an inconsistent manner. This adds 'extern' where it should be.

Signed-off-by: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_output.h | 33 +++++++++++++++++----------------
 1 file changed, 17 insertions(+), 16 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 0ae20b83eec8..46fb9612d884 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -29,25 +29,26 @@ seq_print_ip_sym(struct trace_seq *s, unsigned long ip,
 		unsigned long sym_flags);
 extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
 				 size_t cnt);
-int trace_seq_puts(struct trace_seq *s, const char *str);
-int trace_seq_putc(struct trace_seq *s, unsigned char c);
-int trace_seq_putmem(struct trace_seq *s, void *mem, size_t len);
-int trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len);
-void *trace_seq_reserve(struct trace_seq *s, size_t len);
-int trace_seq_path(struct trace_seq *s, struct path *path);
-int seq_print_userip_objs(const struct userstack_entry *entry,
-			  struct trace_seq *s, unsigned long sym_flags);
-int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
-		      unsigned long ip, unsigned long sym_flags);
+extern int trace_seq_puts(struct trace_seq *s, const char *str);
+extern int trace_seq_putc(struct trace_seq *s, unsigned char c);
+extern int trace_seq_putmem(struct trace_seq *s, void *mem, size_t len);
+extern int trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len);
+extern void *trace_seq_reserve(struct trace_seq *s, size_t len);
+extern int trace_seq_path(struct trace_seq *s, struct path *path);
+extern int seq_print_userip_objs(const struct userstack_entry *entry,
+				 struct trace_seq *s, unsigned long sym_flags);
+extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm,
+			     unsigned long ip, unsigned long sym_flags);
 
-int trace_print_context(struct trace_iterator *iter);
-int trace_print_lat_context(struct trace_iterator *iter);
+extern int trace_print_context(struct trace_iterator *iter);
+extern int trace_print_lat_context(struct trace_iterator *iter);
 
-struct trace_event *ftrace_find_event(int type);
-int register_ftrace_event(struct trace_event *event);
-int unregister_ftrace_event(struct trace_event *event);
+extern struct trace_event *ftrace_find_event(int type);
+extern int register_ftrace_event(struct trace_event *event);
+extern int unregister_ftrace_event(struct trace_event *event);
 
-enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags);
+extern enum print_line_t trace_nop_print(struct trace_iterator *iter,
+					 int flags);
 
 #define MAX_MEMHEX_BYTES	8
 #define HEX_CHARS		(MAX_MEMHEX_BYTES*2 + 1)
-- 
cgit v1.2.3


From b14b70a6a4e394c9630bcde17e07d3bcdcbca27e Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 27 Mar 2009 10:21:00 +0800
Subject: trace: make argument 'mem' of trace_seq_putmem() const

Impact: fix build warning

I passed a const value to trace_seq_putmem(), and I got compile warning.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Jens Axboe <jens.axboe@oracle.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_output.c | 6 +++---
 kernel/trace/trace_output.h | 5 +++--
 2 files changed, 6 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index 6595074cbac5..d72b9a63b247 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -137,7 +137,7 @@ int trace_seq_putc(struct trace_seq *s, unsigned char c)
 	return 1;
 }
 
-int trace_seq_putmem(struct trace_seq *s, void *mem, size_t len)
+int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len)
 {
 	if (len > ((PAGE_SIZE - 1) - s->len))
 		return 0;
@@ -148,10 +148,10 @@ int trace_seq_putmem(struct trace_seq *s, void *mem, size_t len)
 	return len;
 }
 
-int trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len)
+int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, size_t len)
 {
 	unsigned char hex[HEX_CHARS];
-	unsigned char *data = mem;
+	const unsigned char *data = mem;
 	int i, j;
 
 #ifdef __BIG_ENDIAN
diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h
index 46fb9612d884..e0bde39c2dd9 100644
--- a/kernel/trace/trace_output.h
+++ b/kernel/trace/trace_output.h
@@ -31,8 +31,9 @@ extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf,
 				 size_t cnt);
 extern int trace_seq_puts(struct trace_seq *s, const char *str);
 extern int trace_seq_putc(struct trace_seq *s, unsigned char c);
-extern int trace_seq_putmem(struct trace_seq *s, void *mem, size_t len);
-extern int trace_seq_putmem_hex(struct trace_seq *s, void *mem, size_t len);
+extern int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len);
+extern int trace_seq_putmem_hex(struct trace_seq *s, const void *mem,
+				size_t len);
 extern void *trace_seq_reserve(struct trace_seq *s, size_t len);
 extern int trace_seq_path(struct trace_seq *s, struct path *path);
 extern int seq_print_userip_objs(const struct userstack_entry *entry,
-- 
cgit v1.2.3


From 11d06b2a1e5658f448a308aa3beb97bacd64a940 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 30 Mar 2009 05:45:36 -0400
Subject: Kill unsharing fs_struct in __set_personality()

That's a rudiment of altroot support.  I.e. it should've been buried
a long time ago.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/exec_domain.c | 22 ----------------------
 1 file changed, 22 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index 667c841c2952..cb8e9626c215 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -145,28 +145,6 @@ __set_personality(u_long personality)
 		return 0;
 	}
 
-	if (atomic_read(&current->fs->count) != 1) {
-		struct fs_struct *fsp, *ofsp;
-
-		fsp = copy_fs_struct(current->fs);
-		if (fsp == NULL) {
-			module_put(ep->module);
-			return -ENOMEM;
-		}
-
-		task_lock(current);
-		ofsp = current->fs;
-		current->fs = fsp;
-		task_unlock(current);
-
-		put_fs_struct(ofsp);
-	}
-
-	/*
-	 * At that point we are guaranteed to be the sole owner of
-	 * current->fs.
-	 */
-
 	current->personality = personality;
 	oep = current_thread_info()->exec_domain;
 	current_thread_info()->exec_domain = ep;
-- 
cgit v1.2.3


From 3e93cd671813e204c258f1e6c797959920cf7772 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 29 Mar 2009 19:00:13 -0400
Subject: Take fs_struct handling to new file (fs/fs_struct.c)

Pure code move; two new helper functions for nfsd and daemonize
(unshare_fs_struct() and daemonize_fs_struct() resp.; for now -
the same code as used to be in callers).  unshare_fs_struct()
exported (for nfsd, as copy_fs_struct()/exit_fs() used to be),
copy_fs_struct() and exit_fs() don't need exports anymore.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/Makefile               |   2 +-
 fs/fs_struct.c            | 141 ++++++++++++++++++++++++++++++++++++++++++++++
 fs/internal.h             |   6 ++
 fs/namei.c                |   7 ---
 fs/namespace.c            |  68 ----------------------
 fs/nfsd/nfssvc.c          |   7 +--
 include/linux/fs_struct.h |   2 +
 kernel/exit.c             |  31 +---------
 kernel/fork.c             |  29 +---------
 9 files changed, 155 insertions(+), 138 deletions(-)
 create mode 100644 fs/fs_struct.c

(limited to 'kernel')

diff --git a/fs/Makefile b/fs/Makefile
index 6e82a307bcd4..b5cd8e18dd9f 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -11,7 +11,7 @@ obj-y :=	open.o read_write.o file_table.o super.o \
 		attr.o bad_inode.o file.o filesystems.o namespace.o \
 		seq_file.o xattr.o libfs.o fs-writeback.o \
 		pnode.o drop_caches.o splice.o sync.o utimes.o \
-		stack.o
+		stack.o fs_struct.o
 
 ifeq ($(CONFIG_BLOCK),y)
 obj-y +=	buffer.o bio.o block_dev.o direct-io.o mpage.o ioprio.o
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
new file mode 100644
index 000000000000..36e0a123bbf3
--- /dev/null
+++ b/fs/fs_struct.c
@@ -0,0 +1,141 @@
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/path.h>
+#include <linux/slab.h>
+
+/*
+ * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values.
+ * It can block.
+ */
+void set_fs_root(struct fs_struct *fs, struct path *path)
+{
+	struct path old_root;
+
+	write_lock(&fs->lock);
+	old_root = fs->root;
+	fs->root = *path;
+	path_get(path);
+	write_unlock(&fs->lock);
+	if (old_root.dentry)
+		path_put(&old_root);
+}
+
+/*
+ * Replace the fs->{pwdmnt,pwd} with {mnt,dentry}. Put the old values.
+ * It can block.
+ */
+void set_fs_pwd(struct fs_struct *fs, struct path *path)
+{
+	struct path old_pwd;
+
+	write_lock(&fs->lock);
+	old_pwd = fs->pwd;
+	fs->pwd = *path;
+	path_get(path);
+	write_unlock(&fs->lock);
+
+	if (old_pwd.dentry)
+		path_put(&old_pwd);
+}
+
+void chroot_fs_refs(struct path *old_root, struct path *new_root)
+{
+	struct task_struct *g, *p;
+	struct fs_struct *fs;
+	int count = 0;
+
+	read_lock(&tasklist_lock);
+	do_each_thread(g, p) {
+		task_lock(p);
+		fs = p->fs;
+		if (fs) {
+			write_lock(&fs->lock);
+			if (fs->root.dentry == old_root->dentry
+			    && fs->root.mnt == old_root->mnt) {
+				path_get(new_root);
+				fs->root = *new_root;
+				count++;
+			}
+			if (fs->pwd.dentry == old_root->dentry
+			    && fs->pwd.mnt == old_root->mnt) {
+				path_get(new_root);
+				fs->pwd = *new_root;
+				count++;
+			}
+			write_unlock(&fs->lock);
+		}
+		task_unlock(p);
+	} while_each_thread(g, p);
+	read_unlock(&tasklist_lock);
+	while (count--)
+		path_put(old_root);
+}
+
+void put_fs_struct(struct fs_struct *fs)
+{
+	/* No need to hold fs->lock if we are killing it */
+	if (atomic_dec_and_test(&fs->count)) {
+		path_put(&fs->root);
+		path_put(&fs->pwd);
+		kmem_cache_free(fs_cachep, fs);
+	}
+}
+
+void exit_fs(struct task_struct *tsk)
+{
+	struct fs_struct * fs = tsk->fs;
+
+	if (fs) {
+		task_lock(tsk);
+		tsk->fs = NULL;
+		task_unlock(tsk);
+		put_fs_struct(fs);
+	}
+}
+
+struct fs_struct *copy_fs_struct(struct fs_struct *old)
+{
+	struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);
+	/* We don't need to lock fs - think why ;-) */
+	if (fs) {
+		atomic_set(&fs->count, 1);
+		rwlock_init(&fs->lock);
+		fs->umask = old->umask;
+		read_lock(&old->lock);
+		fs->root = old->root;
+		path_get(&old->root);
+		fs->pwd = old->pwd;
+		path_get(&old->pwd);
+		read_unlock(&old->lock);
+	}
+	return fs;
+}
+
+int unshare_fs_struct(void)
+{
+	struct fs_struct *fsp = copy_fs_struct(current->fs);
+	if (!fsp)
+		return -ENOMEM;
+	exit_fs(current);
+	current->fs = fsp;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(unshare_fs_struct);
+
+/* to be mentioned only in INIT_TASK */
+struct fs_struct init_fs = {
+	.count		= ATOMIC_INIT(1),
+	.lock		= __RW_LOCK_UNLOCKED(init_fs.lock),
+	.umask		= 0022,
+};
+
+void daemonize_fs_struct(void)
+{
+	struct fs_struct *fs;
+
+	exit_fs(current);	/* current->fs->count--; */
+	fs = &init_fs;
+	current->fs = fs;
+	atomic_inc(&fs->count);
+}
diff --git a/fs/internal.h b/fs/internal.h
index 53af885f1732..477a105f8df3 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -11,6 +11,7 @@
 
 struct super_block;
 struct linux_binprm;
+struct path;
 
 /*
  * block_dev.c
@@ -60,3 +61,8 @@ extern void umount_tree(struct vfsmount *, int, struct list_head *);
 extern struct vfsmount *copy_tree(struct vfsmount *, struct dentry *, int);
 
 extern void __init mnt_init(void);
+
+/*
+ * fs_struct.c
+ */
+extern void chroot_fs_refs(struct path *, struct path *);
diff --git a/fs/namei.c b/fs/namei.c
index d040ce11785d..4c65a6460138 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -2897,10 +2897,3 @@ EXPORT_SYMBOL(vfs_symlink);
 EXPORT_SYMBOL(vfs_unlink);
 EXPORT_SYMBOL(dentry_unhash);
 EXPORT_SYMBOL(generic_readlink);
-
-/* to be mentioned only in INIT_TASK */
-struct fs_struct init_fs = {
-	.count		= ATOMIC_INIT(1),
-	.lock		= __RW_LOCK_UNLOCKED(init_fs.lock),
-	.umask		= 0022,
-};
diff --git a/fs/namespace.c b/fs/namespace.c
index f7ec283ccfbb..1e56303c718e 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2092,74 +2092,6 @@ out1:
 	return retval;
 }
 
-/*
- * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values.
- * It can block. Requires the big lock held.
- */
-void set_fs_root(struct fs_struct *fs, struct path *path)
-{
-	struct path old_root;
-
-	write_lock(&fs->lock);
-	old_root = fs->root;
-	fs->root = *path;
-	path_get(path);
-	write_unlock(&fs->lock);
-	if (old_root.dentry)
-		path_put(&old_root);
-}
-
-/*
- * Replace the fs->{pwdmnt,pwd} with {mnt,dentry}. Put the old values.
- * It can block. Requires the big lock held.
- */
-void set_fs_pwd(struct fs_struct *fs, struct path *path)
-{
-	struct path old_pwd;
-
-	write_lock(&fs->lock);
-	old_pwd = fs->pwd;
-	fs->pwd = *path;
-	path_get(path);
-	write_unlock(&fs->lock);
-
-	if (old_pwd.dentry)
-		path_put(&old_pwd);
-}
-
-static void chroot_fs_refs(struct path *old_root, struct path *new_root)
-{
-	struct task_struct *g, *p;
-	struct fs_struct *fs;
-	int count = 0;
-
-	read_lock(&tasklist_lock);
-	do_each_thread(g, p) {
-		task_lock(p);
-		fs = p->fs;
-		if (fs) {
-			write_lock(&fs->lock);
-			if (fs->root.dentry == old_root->dentry
-			    && fs->root.mnt == old_root->mnt) {
-				path_get(new_root);
-				fs->root = *new_root;
-				count++;
-			}
-			if (fs->pwd.dentry == old_root->dentry
-			    && fs->pwd.mnt == old_root->mnt) {
-				path_get(new_root);
-				fs->pwd = *new_root;
-				count++;
-			}
-			write_unlock(&fs->lock);
-		}
-		task_unlock(p);
-	} while_each_thread(g, p);
-	read_unlock(&tasklist_lock);
-	while (count--)
-		path_put(old_root);
-}
-
 /*
  * pivot_root Semantics:
  * Moves the root file system of the current process to the directory put_old,
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 07e4f5d7baa8..144d69918614 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -404,7 +404,6 @@ static int
 nfsd(void *vrqstp)
 {
 	struct svc_rqst *rqstp = (struct svc_rqst *) vrqstp;
-	struct fs_struct *fsp;
 	int err, preverr = 0;
 
 	/* Lock module and set up kernel thread */
@@ -413,13 +412,11 @@ nfsd(void *vrqstp)
 	/* At this point, the thread shares current->fs
 	 * with the init process. We need to create files with a
 	 * umask of 0 instead of init's umask. */
-	fsp = copy_fs_struct(current->fs);
-	if (!fsp) {
+	if (unshare_fs_struct() < 0) {
 		printk("Unable to start nfsd thread: out of memory\n");
 		goto out;
 	}
-	exit_fs(current);
-	current->fs = fsp;
+
 	current->fs->umask = 0;
 
 	/*
diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h
index 18b467dbe278..298cef1c0793 100644
--- a/include/linux/fs_struct.h
+++ b/include/linux/fs_struct.h
@@ -20,5 +20,7 @@ extern void set_fs_root(struct fs_struct *, struct path *);
 extern void set_fs_pwd(struct fs_struct *, struct path *);
 extern struct fs_struct *copy_fs_struct(struct fs_struct *);
 extern void put_fs_struct(struct fs_struct *);
+extern void daemonize_fs_struct(void);
+extern int unshare_fs_struct(void);
 
 #endif /* _LINUX_FS_STRUCT_H */
diff --git a/kernel/exit.c b/kernel/exit.c
index 167e1e3ad7c6..ad8375758a79 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -429,7 +429,6 @@ EXPORT_SYMBOL(disallow_signal);
 void daemonize(const char *name, ...)
 {
 	va_list args;
-	struct fs_struct *fs;
 	sigset_t blocked;
 
 	va_start(args, name);
@@ -462,11 +461,7 @@ void daemonize(const char *name, ...)
 
 	/* Become as one with the init task */
 
-	exit_fs(current);	/* current->fs->count--; */
-	fs = init_task.fs;
-	current->fs = fs;
-	atomic_inc(&fs->count);
-
+	daemonize_fs_struct();
 	exit_files(current);
 	current->files = init_task.files;
 	atomic_inc(&current->files->count);
@@ -565,30 +560,6 @@ void exit_files(struct task_struct *tsk)
 	}
 }
 
-void put_fs_struct(struct fs_struct *fs)
-{
-	/* No need to hold fs->lock if we are killing it */
-	if (atomic_dec_and_test(&fs->count)) {
-		path_put(&fs->root);
-		path_put(&fs->pwd);
-		kmem_cache_free(fs_cachep, fs);
-	}
-}
-
-void exit_fs(struct task_struct *tsk)
-{
-	struct fs_struct * fs = tsk->fs;
-
-	if (fs) {
-		task_lock(tsk);
-		tsk->fs = NULL;
-		task_unlock(tsk);
-		put_fs_struct(fs);
-	}
-}
-
-EXPORT_SYMBOL_GPL(exit_fs);
-
 #ifdef CONFIG_MM_OWNER
 /*
  * Task p is exiting and it owned mm, lets find a new owner for it
diff --git a/kernel/fork.c b/kernel/fork.c
index 47c15840a381..05c02dc586b1 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -681,38 +681,13 @@ fail_nomem:
 	return retval;
 }
 
-static struct fs_struct *__copy_fs_struct(struct fs_struct *old)
-{
-	struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);
-	/* We don't need to lock fs - think why ;-) */
-	if (fs) {
-		atomic_set(&fs->count, 1);
-		rwlock_init(&fs->lock);
-		fs->umask = old->umask;
-		read_lock(&old->lock);
-		fs->root = old->root;
-		path_get(&old->root);
-		fs->pwd = old->pwd;
-		path_get(&old->pwd);
-		read_unlock(&old->lock);
-	}
-	return fs;
-}
-
-struct fs_struct *copy_fs_struct(struct fs_struct *old)
-{
-	return __copy_fs_struct(old);
-}
-
-EXPORT_SYMBOL_GPL(copy_fs_struct);
-
 static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
 {
 	if (clone_flags & CLONE_FS) {
 		atomic_inc(&current->fs->count);
 		return 0;
 	}
-	tsk->fs = __copy_fs_struct(current->fs);
+	tsk->fs = copy_fs_struct(current->fs);
 	if (!tsk->fs)
 		return -ENOMEM;
 	return 0;
@@ -1545,7 +1520,7 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
 
 	if ((unshare_flags & CLONE_FS) &&
 	    (fs && atomic_read(&fs->count) > 1)) {
-		*new_fsp = __copy_fs_struct(current->fs);
+		*new_fsp = copy_fs_struct(current->fs);
 		if (!*new_fsp)
 			return -ENOMEM;
 	}
-- 
cgit v1.2.3


From 498052bba55ecaff58db6a1436b0e25bfd75a7ff Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Mon, 30 Mar 2009 07:20:30 -0400
Subject: New locking/refcounting for fs_struct

* all changes of current->fs are done under task_lock and write_lock of
  old fs->lock
* refcount is not atomic anymore (same protection)
* its decrements are done when removing reference from current; at the
  same time we decide whether to free it.
* put_fs_struct() is gone
* new field - ->in_exec.  Set by check_unsafe_exec() if we are trying to do
  execve() and only subthreads share fs_struct.  Cleared when finishing exec
  (success and failure alike).  Makes CLONE_FS fail with -EAGAIN if set.
* check_unsafe_exec() may fail with -EAGAIN if another execve() from subthread
  is in progress.

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/compat.c               | 16 +++++++++--
 fs/exec.c                 | 31 +++++++++++++++++----
 fs/fs_struct.c            | 69 +++++++++++++++++++++++++++++++++--------------
 fs/internal.h             |  2 +-
 fs/proc/task_nommu.c      |  2 +-
 include/linux/fs_struct.h |  8 +++---
 kernel/fork.c             | 37 ++++++++++++++++++-------
 7 files changed, 121 insertions(+), 44 deletions(-)

(limited to 'kernel')

diff --git a/fs/compat.c b/fs/compat.c
index 55efdfebdf5a..baabf203b847 100644
--- a/fs/compat.c
+++ b/fs/compat.c
@@ -51,6 +51,7 @@
 #include <linux/poll.h>
 #include <linux/mm.h>
 #include <linux/eventpoll.h>
+#include <linux/fs_struct.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
@@ -1441,12 +1442,15 @@ int compat_do_execve(char * filename,
 	bprm->cred = prepare_exec_creds();
 	if (!bprm->cred)
 		goto out_unlock;
-	check_unsafe_exec(bprm);
+
+	retval = check_unsafe_exec(bprm);
+	if (retval)
+		goto out_unlock;
 
 	file = open_exec(filename);
 	retval = PTR_ERR(file);
 	if (IS_ERR(file))
-		goto out_unlock;
+		goto out_unmark;
 
 	sched_exec();
 
@@ -1488,6 +1492,9 @@ int compat_do_execve(char * filename,
 		goto out;
 
 	/* execve succeeded */
+	write_lock(&current->fs->lock);
+	current->fs->in_exec = 0;
+	write_unlock(&current->fs->lock);
 	current->in_execve = 0;
 	mutex_unlock(&current->cred_exec_mutex);
 	acct_update_integrals(current);
@@ -1506,6 +1513,11 @@ out_file:
 		fput(bprm->file);
 	}
 
+out_unmark:
+	write_lock(&current->fs->lock);
+	current->fs->in_exec = 0;
+	write_unlock(&current->fs->lock);
+
 out_unlock:
 	current->in_execve = 0;
 	mutex_unlock(&current->cred_exec_mutex);
diff --git a/fs/exec.c b/fs/exec.c
index c5128fbc9165..07a059664b73 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -1056,16 +1056,18 @@ EXPORT_SYMBOL(install_exec_creds);
  * - the caller must hold current->cred_exec_mutex to protect against
  *   PTRACE_ATTACH
  */
-void check_unsafe_exec(struct linux_binprm *bprm)
+int check_unsafe_exec(struct linux_binprm *bprm)
 {
 	struct task_struct *p = current, *t;
 	unsigned long flags;
 	unsigned n_fs, n_sighand;
+	int res = 0;
 
 	bprm->unsafe = tracehook_unsafe_exec(p);
 
 	n_fs = 1;
 	n_sighand = 1;
+	write_lock(&p->fs->lock);
 	lock_task_sighand(p, &flags);
 	for (t = next_thread(p); t != p; t = next_thread(t)) {
 		if (t->fs == p->fs)
@@ -1073,11 +1075,19 @@ void check_unsafe_exec(struct linux_binprm *bprm)
 		n_sighand++;
 	}
 
-	if (atomic_read(&p->fs->count) > n_fs ||
-	    atomic_read(&p->sighand->count) > n_sighand)
+	if (p->fs->users > n_fs ||
+	    atomic_read(&p->sighand->count) > n_sighand) {
 		bprm->unsafe |= LSM_UNSAFE_SHARE;
+	} else {
+		if (p->fs->in_exec)
+			res = -EAGAIN;
+		p->fs->in_exec = 1;
+	}
 
 	unlock_task_sighand(p, &flags);
+	write_unlock(&p->fs->lock);
+
+	return res;
 }
 
 /* 
@@ -1296,12 +1306,15 @@ int do_execve(char * filename,
 	bprm->cred = prepare_exec_creds();
 	if (!bprm->cred)
 		goto out_unlock;
-	check_unsafe_exec(bprm);
+
+	retval = check_unsafe_exec(bprm);
+	if (retval)
+		goto out_unlock;
 
 	file = open_exec(filename);
 	retval = PTR_ERR(file);
 	if (IS_ERR(file))
-		goto out_unlock;
+		goto out_unmark;
 
 	sched_exec();
 
@@ -1344,6 +1357,9 @@ int do_execve(char * filename,
 		goto out;
 
 	/* execve succeeded */
+	write_lock(&current->fs->lock);
+	current->fs->in_exec = 0;
+	write_unlock(&current->fs->lock);
 	current->in_execve = 0;
 	mutex_unlock(&current->cred_exec_mutex);
 	acct_update_integrals(current);
@@ -1362,6 +1378,11 @@ out_file:
 		fput(bprm->file);
 	}
 
+out_unmark:
+	write_lock(&current->fs->lock);
+	current->fs->in_exec = 0;
+	write_unlock(&current->fs->lock);
+
 out_unlock:
 	current->in_execve = 0;
 	mutex_unlock(&current->cred_exec_mutex);
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index 36e0a123bbf3..41cff72b377b 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -72,25 +72,27 @@ void chroot_fs_refs(struct path *old_root, struct path *new_root)
 		path_put(old_root);
 }
 
-void put_fs_struct(struct fs_struct *fs)
+void free_fs_struct(struct fs_struct *fs)
 {
-	/* No need to hold fs->lock if we are killing it */
-	if (atomic_dec_and_test(&fs->count)) {
-		path_put(&fs->root);
-		path_put(&fs->pwd);
-		kmem_cache_free(fs_cachep, fs);
-	}
+	path_put(&fs->root);
+	path_put(&fs->pwd);
+	kmem_cache_free(fs_cachep, fs);
 }
 
 void exit_fs(struct task_struct *tsk)
 {
-	struct fs_struct * fs = tsk->fs;
+	struct fs_struct *fs = tsk->fs;
 
 	if (fs) {
+		int kill;
 		task_lock(tsk);
+		write_lock(&fs->lock);
 		tsk->fs = NULL;
+		kill = !--fs->users;
+		write_unlock(&fs->lock);
 		task_unlock(tsk);
-		put_fs_struct(fs);
+		if (kill)
+			free_fs_struct(fs);
 	}
 }
 
@@ -99,7 +101,8 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old)
 	struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL);
 	/* We don't need to lock fs - think why ;-) */
 	if (fs) {
-		atomic_set(&fs->count, 1);
+		fs->users = 1;
+		fs->in_exec = 0;
 		rwlock_init(&fs->lock);
 		fs->umask = old->umask;
 		read_lock(&old->lock);
@@ -114,28 +117,54 @@ struct fs_struct *copy_fs_struct(struct fs_struct *old)
 
 int unshare_fs_struct(void)
 {
-	struct fs_struct *fsp = copy_fs_struct(current->fs);
-	if (!fsp)
+	struct fs_struct *fs = current->fs;
+	struct fs_struct *new_fs = copy_fs_struct(fs);
+	int kill;
+
+	if (!new_fs)
 		return -ENOMEM;
-	exit_fs(current);
-	current->fs = fsp;
+
+	task_lock(current);
+	write_lock(&fs->lock);
+	kill = !--fs->users;
+	current->fs = new_fs;
+	write_unlock(&fs->lock);
+	task_unlock(current);
+
+	if (kill)
+		free_fs_struct(fs);
+
 	return 0;
 }
 EXPORT_SYMBOL_GPL(unshare_fs_struct);
 
 /* to be mentioned only in INIT_TASK */
 struct fs_struct init_fs = {
-	.count		= ATOMIC_INIT(1),
+	.users		= 1,
 	.lock		= __RW_LOCK_UNLOCKED(init_fs.lock),
 	.umask		= 0022,
 };
 
 void daemonize_fs_struct(void)
 {
-	struct fs_struct *fs;
+	struct fs_struct *fs = current->fs;
+
+	if (fs) {
+		int kill;
+
+		task_lock(current);
 
-	exit_fs(current);	/* current->fs->count--; */
-	fs = &init_fs;
-	current->fs = fs;
-	atomic_inc(&fs->count);
+		write_lock(&init_fs.lock);
+		init_fs.users++;
+		write_unlock(&init_fs.lock);
+
+		write_lock(&fs->lock);
+		current->fs = &init_fs;
+		kill = !--fs->users;
+		write_unlock(&fs->lock);
+
+		task_unlock(current);
+		if (kill)
+			free_fs_struct(fs);
+	}
 }
diff --git a/fs/internal.h b/fs/internal.h
index 477a105f8df3..b4dac4fb6b61 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -44,7 +44,7 @@ extern void __init chrdev_init(void);
 /*
  * exec.c
  */
-extern void check_unsafe_exec(struct linux_binprm *);
+extern int check_unsafe_exec(struct linux_binprm *);
 
 /*
  * namespace.c
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 343ea1216bc8..6ca01052c5bc 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -49,7 +49,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
 	else
 		bytes += kobjsize(mm);
 	
-	if (current->fs && atomic_read(&current->fs->count) > 1)
+	if (current->fs && current->fs->users > 1)
 		sbytes += kobjsize(current->fs);
 	else
 		bytes += kobjsize(current->fs);
diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h
index 298cef1c0793..78a05bfcd8eb 100644
--- a/include/linux/fs_struct.h
+++ b/include/linux/fs_struct.h
@@ -4,12 +4,10 @@
 #include <linux/path.h>
 
 struct fs_struct {
-	atomic_t count;	/* This usage count is used by check_unsafe_exec() for
-			 * security checking purposes - therefore it may not be
-			 * incremented, except by clone(CLONE_FS).
-			 */
+	int users;
 	rwlock_t lock;
 	int umask;
+	int in_exec;
 	struct path root, pwd;
 };
 
@@ -19,7 +17,7 @@ extern void exit_fs(struct task_struct *);
 extern void set_fs_root(struct fs_struct *, struct path *);
 extern void set_fs_pwd(struct fs_struct *, struct path *);
 extern struct fs_struct *copy_fs_struct(struct fs_struct *);
-extern void put_fs_struct(struct fs_struct *);
+extern void free_fs_struct(struct fs_struct *);
 extern void daemonize_fs_struct(void);
 extern int unshare_fs_struct(void);
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 05c02dc586b1..51f138a131de 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -683,11 +683,19 @@ fail_nomem:
 
 static int copy_fs(unsigned long clone_flags, struct task_struct *tsk)
 {
+	struct fs_struct *fs = current->fs;
 	if (clone_flags & CLONE_FS) {
-		atomic_inc(&current->fs->count);
+		/* tsk->fs is already what we want */
+		write_lock(&fs->lock);
+		if (fs->in_exec) {
+			write_unlock(&fs->lock);
+			return -EAGAIN;
+		}
+		fs->users++;
+		write_unlock(&fs->lock);
 		return 0;
 	}
-	tsk->fs = copy_fs_struct(current->fs);
+	tsk->fs = copy_fs_struct(fs);
 	if (!tsk->fs)
 		return -ENOMEM;
 	return 0;
@@ -1518,12 +1526,16 @@ static int unshare_fs(unsigned long unshare_flags, struct fs_struct **new_fsp)
 {
 	struct fs_struct *fs = current->fs;
 
-	if ((unshare_flags & CLONE_FS) &&
-	    (fs && atomic_read(&fs->count) > 1)) {
-		*new_fsp = copy_fs_struct(current->fs);
-		if (!*new_fsp)
-			return -ENOMEM;
-	}
+	if (!(unshare_flags & CLONE_FS) || !fs)
+		return 0;
+
+	/* don't need lock here; in the worst case we'll do useless copy */
+	if (fs->users == 1)
+		return 0;
+
+	*new_fsp = copy_fs_struct(fs);
+	if (!*new_fsp)
+		return -ENOMEM;
 
 	return 0;
 }
@@ -1639,8 +1651,13 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags)
 
 		if (new_fs) {
 			fs = current->fs;
+			write_lock(&fs->lock);
 			current->fs = new_fs;
-			new_fs = fs;
+			if (--fs->users)
+				new_fs = NULL;
+			else
+				new_fs = fs;
+			write_unlock(&fs->lock);
 		}
 
 		if (new_mm) {
@@ -1679,7 +1696,7 @@ bad_unshare_cleanup_sigh:
 
 bad_unshare_cleanup_fs:
 	if (new_fs)
-		put_fs_struct(new_fs);
+		free_fs_struct(new_fs);
 
 bad_unshare_cleanup_thread:
 bad_unshare_out:
-- 
cgit v1.2.3


From 5ad4e53bd5406ee214ddc5a41f03f779b8b2d526 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 29 Mar 2009 19:50:06 -0400
Subject: Get rid of indirect include of fs_struct.h

Don't pull it in sched.h; very few files actually need it and those
can include directly.  sched.h itself only needs forward declaration
of struct fs_struct;

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 arch/cris/kernel/process.c    | 1 -
 fs/dcache.c                   | 1 +
 fs/exec.c                     | 1 +
 fs/fs_struct.c                | 1 +
 fs/namei.c                    | 1 +
 fs/namespace.c                | 1 +
 fs/open.c                     | 1 +
 fs/proc/base.c                | 1 +
 fs/proc/task_nommu.c          | 1 +
 include/linux/mnt_namespace.h | 2 ++
 include/linux/nsproxy.h       | 1 +
 include/linux/sched.h         | 3 ++-
 init/do_mounts.c              | 1 +
 kernel/auditsc.c              | 1 +
 kernel/exec_domain.c          | 1 +
 kernel/exit.c                 | 1 +
 kernel/fork.c                 | 1 +
 kernel/sys.c                  | 1 +
 security/tomoyo/realpath.c    | 1 +
 19 files changed, 20 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/arch/cris/kernel/process.c b/arch/cris/kernel/process.c
index 60816e876455..4df0b320d524 100644
--- a/arch/cris/kernel/process.c
+++ b/arch/cris/kernel/process.c
@@ -19,7 +19,6 @@
 #include <asm/system.h>
 #include <linux/module.h>
 #include <linux/spinlock.h>
-#include <linux/fs_struct.h>
 #include <linux/init_task.h>
 #include <linux/sched.h>
 #include <linux/fs.h>
diff --git a/fs/dcache.c b/fs/dcache.c
index 90bbd7e1b116..0dc4de21f088 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -32,6 +32,7 @@
 #include <linux/seqlock.h>
 #include <linux/swap.h>
 #include <linux/bootmem.h>
+#include <linux/fs_struct.h>
 #include "internal.h"
 
 int sysctl_vfs_cache_pressure __read_mostly = 100;
diff --git a/fs/exec.c b/fs/exec.c
index 614991bf0c87..052a961e41aa 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -53,6 +53,7 @@
 #include <linux/tracehook.h>
 #include <linux/kmod.h>
 #include <linux/fsnotify.h>
+#include <linux/fs_struct.h>
 
 #include <asm/uaccess.h>
 #include <asm/mmu_context.h>
diff --git a/fs/fs_struct.c b/fs/fs_struct.c
index 6ac219338670..eee059052db5 100644
--- a/fs/fs_struct.c
+++ b/fs/fs_struct.c
@@ -3,6 +3,7 @@
 #include <linux/fs.h>
 #include <linux/path.h>
 #include <linux/slab.h>
+#include <linux/fs_struct.h>
 
 /*
  * Replace the fs->{rootmnt,root} with {mnt,dentry}. Put the old values.
diff --git a/fs/namei.c b/fs/namei.c
index 964c0249444b..b8433ebfae05 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -32,6 +32,7 @@
 #include <linux/file.h>
 #include <linux/fcntl.h>
 #include <linux/device_cgroup.h>
+#include <linux/fs_struct.h>
 #include <asm/uaccess.h>
 
 #define ACC_MODE(x) ("\000\004\002\006"[(x)&O_ACCMODE])
diff --git a/fs/namespace.c b/fs/namespace.c
index 1e56303c718e..c6f54e4c4290 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -27,6 +27,7 @@
 #include <linux/ramfs.h>
 #include <linux/log2.h>
 #include <linux/idr.h>
+#include <linux/fs_struct.h>
 #include <asm/uaccess.h>
 #include <asm/unistd.h>
 #include "pnode.h"
diff --git a/fs/open.c b/fs/open.c
index 75b61677daaf..377eb25b6abf 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -29,6 +29,7 @@
 #include <linux/rcupdate.h>
 #include <linux/audit.h>
 #include <linux/falloc.h>
+#include <linux/fs_struct.h>
 
 int vfs_statfs(struct dentry *dentry, struct kstatfs *buf)
 {
diff --git a/fs/proc/base.c b/fs/proc/base.c
index e0afd326b688..f71559784bfb 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -80,6 +80,7 @@
 #include <linux/oom.h>
 #include <linux/elf.h>
 #include <linux/pid_namespace.h>
+#include <linux/fs_struct.h>
 #include "internal.h"
 
 /* NOTE:
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 6ca01052c5bc..253afc04484c 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -2,6 +2,7 @@
 #include <linux/mm.h>
 #include <linux/file.h>
 #include <linux/fdtable.h>
+#include <linux/fs_struct.h>
 #include <linux/mount.h>
 #include <linux/ptrace.h>
 #include <linux/seq_file.h>
diff --git a/include/linux/mnt_namespace.h b/include/linux/mnt_namespace.h
index 830bbcd449d6..3a059298cc19 100644
--- a/include/linux/mnt_namespace.h
+++ b/include/linux/mnt_namespace.h
@@ -22,6 +22,8 @@ struct proc_mounts {
 	int event;
 };
 
+struct fs_struct;
+
 extern struct mnt_namespace *copy_mnt_ns(unsigned long, struct mnt_namespace *,
 		struct fs_struct *);
 extern void __put_mnt_ns(struct mnt_namespace *ns);
diff --git a/include/linux/nsproxy.h b/include/linux/nsproxy.h
index afad7dec1b36..7b370c7cfeff 100644
--- a/include/linux/nsproxy.h
+++ b/include/linux/nsproxy.h
@@ -8,6 +8,7 @@ struct mnt_namespace;
 struct uts_namespace;
 struct ipc_namespace;
 struct pid_namespace;
+struct fs_struct;
 
 /*
  * A structure to contain pointers to all per-process
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 29df6374d2de..b4e065ea0de1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -68,7 +68,7 @@ struct sched_param {
 #include <linux/smp.h>
 #include <linux/sem.h>
 #include <linux/signal.h>
-#include <linux/fs_struct.h>
+#include <linux/path.h>
 #include <linux/compiler.h>
 #include <linux/completion.h>
 #include <linux/pid.h>
@@ -97,6 +97,7 @@ struct futex_pi_state;
 struct robust_list_head;
 struct bio;
 struct bts_tracer;
+struct fs_struct;
 
 /*
  * List of flags we want to share for kernel threads,
diff --git a/init/do_mounts.c b/init/do_mounts.c
index 8d4ff5afc1d8..dd7ee5f203f3 100644
--- a/init/do_mounts.c
+++ b/init/do_mounts.c
@@ -14,6 +14,7 @@
 #include <linux/fs.h>
 #include <linux/initrd.h>
 #include <linux/async.h>
+#include <linux/fs_struct.h>
 
 #include <linux/nfs_fs.h>
 #include <linux/nfs_fs_sb.h>
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 8cbddff6c283..2bfc64786765 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -66,6 +66,7 @@
 #include <linux/syscalls.h>
 #include <linux/inotify.h>
 #include <linux/capability.h>
+#include <linux/fs_struct.h>
 
 #include "audit.h"
 
diff --git a/kernel/exec_domain.c b/kernel/exec_domain.c
index cb8e9626c215..c35452cadded 100644
--- a/kernel/exec_domain.c
+++ b/kernel/exec_domain.c
@@ -18,6 +18,7 @@
 #include <linux/syscalls.h>
 #include <linux/sysctl.h>
 #include <linux/types.h>
+#include <linux/fs_struct.h>
 
 
 static void default_handler(int, struct pt_regs *);
diff --git a/kernel/exit.c b/kernel/exit.c
index ad8375758a79..b5d656845c90 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -46,6 +46,7 @@
 #include <linux/blkdev.h>
 #include <linux/task_io_accounting_ops.h>
 #include <linux/tracehook.h>
+#include <linux/fs_struct.h>
 #include <linux/init_task.h>
 #include <trace/sched.h>
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 51f138a131de..e82a14577a98 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -60,6 +60,7 @@
 #include <linux/tty.h>
 #include <linux/proc_fs.h>
 #include <linux/blkdev.h>
+#include <linux/fs_struct.h>
 #include <trace/sched.h>
 #include <linux/magic.h>
 
diff --git a/kernel/sys.c b/kernel/sys.c
index 37f458e6882a..ce182aaed204 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -34,6 +34,7 @@
 #include <linux/seccomp.h>
 #include <linux/cpu.h>
 #include <linux/ptrace.h>
+#include <linux/fs_struct.h>
 
 #include <linux/compat.h>
 #include <linux/syscalls.h>
diff --git a/security/tomoyo/realpath.c b/security/tomoyo/realpath.c
index d47f16b844b2..3bbe01a7a4b5 100644
--- a/security/tomoyo/realpath.c
+++ b/security/tomoyo/realpath.c
@@ -12,6 +12,7 @@
 #include <linux/types.h>
 #include <linux/mount.h>
 #include <linux/mnt_namespace.h>
+#include <linux/fs_struct.h>
 #include "common.h"
 #include "realpath.h"
 
-- 
cgit v1.2.3


From 2aad1b76e6b0cc5a2e5d9b95a9f356ddddbfa8a9 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 30 Mar 2009 11:11:28 -0400
Subject: function-graph: allow unregistering twice

Impact: fix to permanent disabling of function graph tracer

There should be nothing to prevent a tracer from unregistering a
function graph callback more than once. This can simplify error paths.

But currently, the counter does not account for mulitple unregistering
of the function graph callback. If it happens, the function graph
tracer will be permanently disabled.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ftrace.c | 4 ++++
 1 file changed, 4 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index 1752a63f37c0..f1ed080406c3 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -2719,6 +2719,9 @@ void unregister_ftrace_graph(void)
 {
 	mutex_lock(&ftrace_lock);
 
+	if (!unlikely(atomic_read(&ftrace_graph_active)))
+		goto out;
+
 	atomic_dec(&ftrace_graph_active);
 	unregister_trace_sched_switch(ftrace_graph_probe_sched_switch);
 	ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub;
@@ -2726,6 +2729,7 @@ void unregister_ftrace_graph(void)
 	ftrace_shutdown(FTRACE_STOP_FUNC_RET);
 	unregister_pm_notifier(&ftrace_suspend_notifier);
 
+ out:
 	mutex_unlock(&ftrace_lock);
 }
 
-- 
cgit v1.2.3


From 2e572895bf3203e881356a4039ab0fa428ed2639 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Mon, 30 Mar 2009 14:03:19 -0400
Subject: ring-buffer: do not remove reader page from list on ring buffer free

Impact: prevent possible memory leak

The reader page of the ring buffer is special. Although it points
into the ring buffer, it is not part of the actual buffer. It is
a page used by the reader to swap with a page in the ring buffer.
Once the swap is made, the new reader page is again outside the
buffer.

Even though the reader page points into the buffer, it is really
pointing to residual data. Note, this data is used by the reader.

              reader page
                  |
                  v
       (prev)   +---+    (next)
     +----------|   |----------+
     |          +---+          |
     v                         v
   +---+        +---+        +---+
-->|   |------->|   |------->|   |--->
<--|   |<-------|   |<-------|   |<---
   +---+        +---+        +---+

     ^            ^            ^
      \           |            /
       ------- Buffer---------

If we perform a list_del_init() on the reader page we will actually remove
the last page the reader swapped with and not the reader page itself.
This will cause that page to not be freed, and thus is a memory leak.

Luckily, the only user of the ring buffer so far is ftrace. And ftrace
will not free its ring buffer after it allocates it. There is no current
possible memory leak. But once there are other users, or if ftrace
dynamically creates and frees its ring buffer, then this would be a
memory leak.

This patch fixes the leak for future cases.

Signed-off-by: Steven Rostedt <srostedt@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/ring_buffer.c | 1 -
 1 file changed, 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index edce2ff38944..960cbf44c844 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -563,7 +563,6 @@ static void rb_free_cpu_buffer(struct ring_buffer_per_cpu *cpu_buffer)
 	struct list_head *head = &cpu_buffer->pages;
 	struct buffer_page *bpage, *tmp;
 
-	list_del_init(&cpu_buffer->reader_page->list);
 	free_buffer_page(cpu_buffer->reader_page);
 
 	list_for_each_entry_safe(bpage, tmp, head, list) {
-- 
cgit v1.2.3


From ee99c71c59f897436ec65debb99372b3146f9985 Mon Sep 17 00:00:00 2001
From: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Date: Tue, 31 Mar 2009 15:19:31 -0700
Subject: mm: introduce for_each_populated_zone() macro

Impact: cleanup

In almost cases, for_each_zone() is used with populated_zone().  It's
because almost function doesn't need memoryless node information.
Therefore, for_each_populated_zone() can help to make code simplify.

This patch has no functional change.

[akpm@linux-foundation.org: small cleanup]
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Reviewed-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/mmzone.h  |  8 ++++++++
 kernel/power/snapshot.c |  9 +++------
 kernel/power/swsusp.c   | 17 ++++++++---------
 mm/page_alloc.c         | 26 +++++---------------------
 mm/vmscan.c             |  4 +---
 mm/vmstat.c             | 11 ++---------
 6 files changed, 27 insertions(+), 48 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 1aca6cebbb78..26ef24076b76 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -806,6 +806,14 @@ extern struct zone *next_zone(struct zone *zone);
 	     zone;					\
 	     zone = next_zone(zone))
 
+#define for_each_populated_zone(zone)		        \
+	for (zone = (first_online_pgdat())->node_zones; \
+	     zone;					\
+	     zone = next_zone(zone))			\
+		if (!populated_zone(zone))		\
+			; /* do nothing */		\
+		else
+
 static inline struct zone *zonelist_zone(struct zoneref *zoneref)
 {
 	return zoneref->zone;
diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c
index f5fc2d7680f2..33e2e4a819f9 100644
--- a/kernel/power/snapshot.c
+++ b/kernel/power/snapshot.c
@@ -321,13 +321,10 @@ static int create_mem_extents(struct list_head *list, gfp_t gfp_mask)
 
 	INIT_LIST_HEAD(list);
 
-	for_each_zone(zone) {
+	for_each_populated_zone(zone) {
 		unsigned long zone_start, zone_end;
 		struct mem_extent *ext, *cur, *aux;
 
-		if (!populated_zone(zone))
-			continue;
-
 		zone_start = zone->zone_start_pfn;
 		zone_end = zone->zone_start_pfn + zone->spanned_pages;
 
@@ -804,8 +801,8 @@ static unsigned int count_free_highmem_pages(void)
 	struct zone *zone;
 	unsigned int cnt = 0;
 
-	for_each_zone(zone)
-		if (populated_zone(zone) && is_highmem(zone))
+	for_each_populated_zone(zone)
+		if (is_highmem(zone))
 			cnt += zone_page_state(zone, NR_FREE_PAGES);
 
 	return cnt;
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index a92c91451559..1ee6636414b2 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -229,17 +229,16 @@ int swsusp_shrink_memory(void)
 		size = count_data_pages() + PAGES_FOR_IO + SPARE_PAGES;
 		tmp = size;
 		size += highmem_size;
-		for_each_zone (zone)
-			if (populated_zone(zone)) {
-				tmp += snapshot_additional_pages(zone);
-				if (is_highmem(zone)) {
-					highmem_size -=
+		for_each_populated_zone(zone) {
+			tmp += snapshot_additional_pages(zone);
+			if (is_highmem(zone)) {
+				highmem_size -=
 					zone_page_state(zone, NR_FREE_PAGES);
-				} else {
-					tmp -= zone_page_state(zone, NR_FREE_PAGES);
-					tmp += zone->lowmem_reserve[ZONE_NORMAL];
-				}
+			} else {
+				tmp -= zone_page_state(zone, NR_FREE_PAGES);
+				tmp += zone->lowmem_reserve[ZONE_NORMAL];
 			}
+		}
 
 		if (highmem_size < 0)
 			highmem_size = 0;
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a3803ea8c27d..cbd532161f68 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -922,13 +922,10 @@ static void drain_pages(unsigned int cpu)
 	unsigned long flags;
 	struct zone *zone;
 
-	for_each_zone(zone) {
+	for_each_populated_zone(zone) {
 		struct per_cpu_pageset *pset;
 		struct per_cpu_pages *pcp;
 
-		if (!populated_zone(zone))
-			continue;
-
 		pset = zone_pcp(zone, cpu);
 
 		pcp = &pset->pcp;
@@ -1879,10 +1876,7 @@ void show_free_areas(void)
 	int cpu;
 	struct zone *zone;
 
-	for_each_zone(zone) {
-		if (!populated_zone(zone))
-			continue;
-
+	for_each_populated_zone(zone) {
 		show_node(zone);
 		printk("%s per-cpu:\n", zone->name);
 
@@ -1922,12 +1916,9 @@ void show_free_areas(void)
 		global_page_state(NR_PAGETABLE),
 		global_page_state(NR_BOUNCE));
 
-	for_each_zone(zone) {
+	for_each_populated_zone(zone) {
 		int i;
 
-		if (!populated_zone(zone))
-			continue;
-
 		show_node(zone);
 		printk("%s"
 			" free:%lukB"
@@ -1967,12 +1958,9 @@ void show_free_areas(void)
 		printk("\n");
 	}
 
-	for_each_zone(zone) {
+	for_each_populated_zone(zone) {
  		unsigned long nr[MAX_ORDER], flags, order, total = 0;
 
-		if (!populated_zone(zone))
-			continue;
-
 		show_node(zone);
 		printk("%s: ", zone->name);
 
@@ -2784,11 +2772,7 @@ static int __cpuinit process_zones(int cpu)
 
 	node_set_state(node, N_CPU);	/* this node has a cpu */
 
-	for_each_zone(zone) {
-
-		if (!populated_zone(zone))
-			continue;
-
+	for_each_populated_zone(zone) {
 		zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
 					 GFP_KERNEL, node);
 		if (!zone_pcp(zone, cpu))
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 1bca60f0c527..301f057fd115 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -2061,11 +2061,9 @@ static unsigned long shrink_all_zones(unsigned long nr_pages, int prio,
 	struct zone *zone;
 	unsigned long ret = 0;
 
-	for_each_zone(zone) {
+	for_each_populated_zone(zone) {
 		enum lru_list l;
 
-		if (!populated_zone(zone))
-			continue;
 		if (zone_is_all_unreclaimable(zone) && prio != DEF_PRIORITY)
 			continue;
 
diff --git a/mm/vmstat.c b/mm/vmstat.c
index 8cd81ea1ddc1..9826766f1274 100644
--- a/mm/vmstat.c
+++ b/mm/vmstat.c
@@ -135,11 +135,7 @@ static void refresh_zone_stat_thresholds(void)
 	int cpu;
 	int threshold;
 
-	for_each_zone(zone) {
-
-		if (!zone->present_pages)
-			continue;
-
+	for_each_populated_zone(zone) {
 		threshold = calculate_threshold(zone);
 
 		for_each_online_cpu(cpu)
@@ -301,12 +297,9 @@ void refresh_cpu_vm_stats(int cpu)
 	int i;
 	int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
 
-	for_each_zone(zone) {
+	for_each_populated_zone(zone) {
 		struct per_cpu_pageset *p;
 
-		if (!populated_zone(zone))
-			continue;
-
 		p = zone_pcp(zone, cpu);
 
 		for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
-- 
cgit v1.2.3


From 704503d836042d4a4c7685b7036e7de0418fbc0f Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Tue, 31 Mar 2009 15:23:18 -0700
Subject: mm: fix proc_dointvec_userhz_jiffies "breakage"

Addresses http://bugzilla.kernel.org/show_bug.cgi?id=9838

On i386, HZ=1000, jiffies_to_clock_t() converts time in a somewhat strange
way from the user's point of view:

	# echo 500 >/proc/sys/vm/dirty_writeback_centisecs
	# cat /proc/sys/vm/dirty_writeback_centisecs
	499

So, we have 5000 jiffies converted to only 499 clock ticks and reported
back.

TICK_NSEC = 999848
ACTHZ = 256039

Keeping in-kernel variable in units passed from userspace will fix issue
of course, but this probably won't be right for every sysctl.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/writeback.h |  4 ++--
 kernel/sysctl.c           |  2 +-
 mm/page-writeback.c       | 20 +++++++++++---------
 3 files changed, 14 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 7300ecdc480c..93445477f86a 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -109,8 +109,8 @@ extern int dirty_background_ratio;
 extern unsigned long dirty_background_bytes;
 extern int vm_dirty_ratio;
 extern unsigned long vm_dirty_bytes;
-extern int dirty_writeback_interval;
-extern int dirty_expire_interval;
+extern unsigned int dirty_writeback_interval;
+extern unsigned int dirty_expire_interval;
 extern int vm_highmem_is_dirtyable;
 extern int block_dump;
 extern int laptop_mode;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index c5ef44ff850f..2e490a389dd2 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1010,7 +1010,7 @@ static struct ctl_table vm_table[] = {
 		.data		= &dirty_expire_interval,
 		.maxlen		= sizeof(dirty_expire_interval),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_userhz_jiffies,
+		.proc_handler	= &proc_dointvec,
 	},
 	{
 		.ctl_name	= VM_NR_PDFLUSH_THREADS,
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 6aa92b03c747..30351f0063ac 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -92,14 +92,14 @@ int vm_dirty_ratio = 20;
 unsigned long vm_dirty_bytes;
 
 /*
- * The interval between `kupdate'-style writebacks, in jiffies
+ * The interval between `kupdate'-style writebacks
  */
-int dirty_writeback_interval = 5 * HZ;
+unsigned int dirty_writeback_interval = 5 * 100; /* sentiseconds */
 
 /*
- * The longest number of jiffies for which data is allowed to remain dirty
+ * The longest time for which data is allowed to remain dirty
  */
-int dirty_expire_interval = 30 * HZ;
+unsigned int dirty_expire_interval = 30 * 100; /* sentiseconds */
 
 /*
  * Flag that makes the machine dump writes/reads and block dirtyings.
@@ -770,9 +770,9 @@ static void wb_kupdate(unsigned long arg)
 
 	sync_supers();
 
-	oldest_jif = jiffies - dirty_expire_interval;
+	oldest_jif = jiffies - msecs_to_jiffies(dirty_expire_interval);
 	start_jif = jiffies;
-	next_jif = start_jif + dirty_writeback_interval;
+	next_jif = start_jif + msecs_to_jiffies(dirty_writeback_interval * 10);
 	nr_to_write = global_page_state(NR_FILE_DIRTY) +
 			global_page_state(NR_UNSTABLE_NFS) +
 			(inodes_stat.nr_inodes - inodes_stat.nr_unused);
@@ -801,9 +801,10 @@ static void wb_kupdate(unsigned long arg)
 int dirty_writeback_centisecs_handler(ctl_table *table, int write,
 	struct file *file, void __user *buffer, size_t *length, loff_t *ppos)
 {
-	proc_dointvec_userhz_jiffies(table, write, file, buffer, length, ppos);
+	proc_dointvec(table, write, file, buffer, length, ppos);
 	if (dirty_writeback_interval)
-		mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
+		mod_timer(&wb_timer, jiffies +
+			msecs_to_jiffies(dirty_writeback_interval * 10));
 	else
 		del_timer(&wb_timer);
 	return 0;
@@ -905,7 +906,8 @@ void __init page_writeback_init(void)
 {
 	int shift;
 
-	mod_timer(&wb_timer, jiffies + dirty_writeback_interval);
+	mod_timer(&wb_timer,
+		  jiffies + msecs_to_jiffies(dirty_writeback_interval * 10));
 	writeback_set_ratelimit();
 	register_cpu_notifier(&ratelimit_nb);
 
-- 
cgit v1.2.3


From a8af78982ff4c0b3731527b0217d286a343a3089 Mon Sep 17 00:00:00 2001
From: Magnus Damm <damm@igel.co.jp>
Date: Tue, 31 Mar 2009 15:23:37 -0700
Subject: pm: rework includes, remove arch ifdefs

Make the following header file changes:

 - remove arch ifdefs and asm/suspend.h from linux/suspend.h
 - add asm/suspend.h to disk.c (for arch_prepare_suspend())
 - add linux/io.h to swsusp.c (for ioremap())
 - x86 32/64 bit compile fixes

Signed-off-by: Magnus Damm <damm@igel.co.jp>
Cc: Paul Mundt <lethal@linux-sh.org>
Acked-by: "Rafael J. Wysocki" <rjw@sisk.pl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/asm-offsets_32.c | 1 +
 arch/x86/kernel/asm-offsets_64.c | 1 +
 arch/x86/power/cpu_32.c          | 1 +
 arch/x86/power/cpu_64.c          | 1 +
 arch/x86/power/hibernate_64.c    | 1 +
 include/linux/suspend.h          | 3 ---
 kernel/power/disk.c              | 1 +
 kernel/power/swsusp.c            | 1 +
 8 files changed, 7 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index fbf2f33e3080..5a6aa1c1162f 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -18,6 +18,7 @@
 #include <asm/thread_info.h>
 #include <asm/bootparam.h>
 #include <asm/elf.h>
+#include <asm/suspend.h>
 
 #include <xen/interface/xen.h>
 
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index 8793ab33e2c1..e72f062fb4b5 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -16,6 +16,7 @@
 #include <asm/thread_info.h>
 #include <asm/ia32.h>
 #include <asm/bootparam.h>
+#include <asm/suspend.h>
 
 #include <xen/interface/xen.h>
 
diff --git a/arch/x86/power/cpu_32.c b/arch/x86/power/cpu_32.c
index 274d06082f48..ce702c5b3a2c 100644
--- a/arch/x86/power/cpu_32.c
+++ b/arch/x86/power/cpu_32.c
@@ -12,6 +12,7 @@
 #include <asm/mtrr.h>
 #include <asm/mce.h>
 #include <asm/xcr.h>
+#include <asm/suspend.h>
 
 static struct saved_context saved_context;
 
diff --git a/arch/x86/power/cpu_64.c b/arch/x86/power/cpu_64.c
index e3b6cf70d62c..5343540f2607 100644
--- a/arch/x86/power/cpu_64.c
+++ b/arch/x86/power/cpu_64.c
@@ -15,6 +15,7 @@
 #include <asm/pgtable.h>
 #include <asm/mtrr.h>
 #include <asm/xcr.h>
+#include <asm/suspend.h>
 
 static void fix_processor_context(void);
 
diff --git a/arch/x86/power/hibernate_64.c b/arch/x86/power/hibernate_64.c
index 6dd000dd7933..65fdc86e923f 100644
--- a/arch/x86/power/hibernate_64.c
+++ b/arch/x86/power/hibernate_64.c
@@ -14,6 +14,7 @@
 #include <asm/page.h>
 #include <asm/pgtable.h>
 #include <asm/mtrr.h>
+#include <asm/suspend.h>
 
 /* References to section boundaries */
 extern const void __nosave_begin, __nosave_end;
diff --git a/include/linux/suspend.h b/include/linux/suspend.h
index c7d9bb1832ba..3e3a4364cbff 100644
--- a/include/linux/suspend.h
+++ b/include/linux/suspend.h
@@ -1,9 +1,6 @@
 #ifndef _LINUX_SUSPEND_H
 #define _LINUX_SUSPEND_H
 
-#if defined(CONFIG_X86) || defined(CONFIG_FRV) || defined(CONFIG_PPC32) || defined(CONFIG_PPC64)
-#include <asm/suspend.h>
-#endif
 #include <linux/swap.h>
 #include <linux/notifier.h>
 #include <linux/init.h>
diff --git a/kernel/power/disk.c b/kernel/power/disk.c
index e886d1332a10..f3db382c2b2d 100644
--- a/kernel/power/disk.c
+++ b/kernel/power/disk.c
@@ -22,6 +22,7 @@
 #include <linux/console.h>
 #include <linux/cpu.h>
 #include <linux/freezer.h>
+#include <asm/suspend.h>
 
 #include "power.h"
 
diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c
index 1ee6636414b2..78c35047586d 100644
--- a/kernel/power/swsusp.c
+++ b/kernel/power/swsusp.c
@@ -51,6 +51,7 @@
 #include <linux/highmem.h>
 #include <linux/time.h>
 #include <linux/rbtree.h>
+#include <linux/io.h>
 
 #include "power.h"
 
-- 
cgit v1.2.3


From 4ede816ac36e027db5fe0051ad9c73f76db63772 Mon Sep 17 00:00:00 2001
From: Davide Libenzi <davidel@xmailserver.org>
Date: Tue, 31 Mar 2009 15:24:20 -0700
Subject: epoll keyed wakeups: add __wake_up_locked_key() and
 __wake_up_sync_key()

This patchset introduces wakeup hints for some of the most popular (from
epoll POV) devices, so that epoll code can avoid spurious wakeups on its
waiters.

The problem with epoll is that the callback-based wakeups do not, ATM,
carry any information about the events the wakeup is related to.  So the
only choice epoll has (not being able to call f_op->poll() from inside the
callback), is to add the file* to a ready-list and resolve the real events
later on, at epoll_wait() (or its own f_op->poll()) time.  This can cause
spurious wakeups, since the wake_up() itself might be for an event the
caller is not interested into.

The rate of these spurious wakeup can be pretty high in case of many
network sockets being monitored.

By allowing devices to report the events the wakeups refer to (at least
the two major classes - POLLIN/POLLOUT), we are able to spare useless
wakeups by proper handling inside the epoll's poll callback.

Epoll will have in any case to call f_op->poll() on the file* later on,
since the change to be done in order to have the full event set sent via
wakeup, is too invasive for the way our f_op->poll() system works (the
full event set is calculated inside the poll function - there are too many
of them to even start thinking the change - also poll/select would need
change too).

Epoll is changed in a way that both devices which send event hints, and
the ones that don't, are correctly handled.  The former will gain some
efficiency though.

As a general rule for devices, would be to add an event mask by using
key-aware wakeup macros, when making up poll wait queues.  I tested it
(together with the epoll's poll fix patch Andrew has in -mm) and wakeups
for the supported devices are correctly filtered.

Test program available here:

http://www.xmailserver.org/epoll_test.c

This patch:

Nothing revolutionary here.  Just using the available "key" that our
wakeup core already support.  The __wake_up_locked_key() was no brainer,
since both __wake_up_locked() and __wake_up_locked_key() are thin wrappers
around __wake_up_common().

The __wake_up_sync() function had a body, so the choice was between
borrowing the body for __wake_up_sync_key() and calling it from
__wake_up_sync(), or make an inline and calling it from both.  I chose the
former since in most archs it all resolves to "mov $0, REG; jmp ADDR".

Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: David Miller <davem@davemloft.net>
Cc: William Lee Irwin III <wli@movementarian.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/wait.h |  7 +++++--
 kernel/sched.c       | 23 +++++++++++++++++++----
 2 files changed, 24 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/wait.h b/include/linux/wait.h
index a210ede73b56..0d2eeb03a718 100644
--- a/include/linux/wait.h
+++ b/include/linux/wait.h
@@ -135,8 +135,11 @@ static inline void __remove_wait_queue(wait_queue_head_t *head,
 void __wake_up_common(wait_queue_head_t *q, unsigned int mode,
 			int nr_exclusive, int sync, void *key);
 void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key);
-extern void __wake_up_locked(wait_queue_head_t *q, unsigned int mode);
-extern void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr);
+void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key);
+void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr,
+			void *key);
+void __wake_up_locked(wait_queue_head_t *q, unsigned int mode);
+void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr);
 void __wake_up_bit(wait_queue_head_t *, void *, int);
 int __wait_on_bit(wait_queue_head_t *, struct wait_bit_queue *, int (*)(void *), unsigned);
 int __wait_on_bit_lock(wait_queue_head_t *, struct wait_bit_queue *, int (*)(void *), unsigned);
diff --git a/kernel/sched.c b/kernel/sched.c
index 196d48babbef..73513f4e19df 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -5196,11 +5196,17 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
 	__wake_up_common(q, mode, 1, 0, NULL);
 }
 
+void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key)
+{
+	__wake_up_common(q, mode, 1, 0, key);
+}
+
 /**
- * __wake_up_sync - wake up threads blocked on a waitqueue.
+ * __wake_up_sync_key - wake up threads blocked on a waitqueue.
  * @q: the waitqueue
  * @mode: which threads
  * @nr_exclusive: how many wake-one or wake-many threads to wake up
+ * @key: opaque value to be passed to wakeup targets
  *
  * The sync wakeup differs that the waker knows that it will schedule
  * away soon, so while the target thread will be woken up, it will not
@@ -5209,8 +5215,8 @@ void __wake_up_locked(wait_queue_head_t *q, unsigned int mode)
  *
  * On UP it can prevent extra preemption.
  */
-void
-__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
+void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode,
+			int nr_exclusive, void *key)
 {
 	unsigned long flags;
 	int sync = 1;
@@ -5222,9 +5228,18 @@ __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
 		sync = 0;
 
 	spin_lock_irqsave(&q->lock, flags);
-	__wake_up_common(q, mode, nr_exclusive, sync, NULL);
+	__wake_up_common(q, mode, nr_exclusive, sync, key);
 	spin_unlock_irqrestore(&q->lock, flags);
 }
+EXPORT_SYMBOL_GPL(__wake_up_sync_key);
+
+/*
+ * __wake_up_sync - see __wake_up_sync_key()
+ */
+void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr_exclusive)
+{
+	__wake_up_sync_key(q, mode, nr_exclusive, NULL);
+}
 EXPORT_SYMBOL_GPL(__wake_up_sync);	/* For internal use only */
 
 /**
-- 
cgit v1.2.3


From 33e5d76979cf01e3834814fe0aea569d1d602c1a Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Thu, 2 Apr 2009 16:56:32 -0700
Subject: nommu: fix a number of issues with the per-MM VMA patch

Fix a number of issues with the per-MM VMA patch:

 (1) Make mmap_pages_allocated an atomic_long_t, just in case this is used on
     a NOMMU system with more than 2G pages.  Makes no difference on a 32-bit
     system.

 (2) Report vma->vm_pgoff * PAGE_SIZE as a 64-bit value, not a 32-bit value,
     lest it overflow.

 (3) Move the allocation of the vm_area_struct slab back for fork.c.

 (4) Use KMEM_CACHE() for both vm_area_struct and vm_region slabs.

 (5) Use BUG_ON() rather than if () BUG().

 (6) Make the default validate_nommu_regions() a static inline rather than a
     #define.

 (7) Make free_page_series()'s objection to pages with a refcount != 1 more
     informative.

 (8) Adjust the __put_nommu_region() banner comment to indicate that the
     semaphore must be held for writing.

 (9) Limit the number of warnings about munmaps of non-mmapped regions.

Reported-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: David Howells <dhowells@redhat.com>
Cc: Greg Ungerer <gerg@snapgear.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 fs/proc/meminfo.c    |  2 +-
 fs/proc/task_nommu.c |  4 ++--
 include/linux/mm.h   |  2 +-
 kernel/fork.c        |  1 +
 mm/mmap.c            |  3 ---
 mm/nommu.c           | 52 +++++++++++++++++++++++++---------------------------
 6 files changed, 30 insertions(+), 34 deletions(-)

(limited to 'kernel')

diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c
index 43d23948384a..74ea974f5ca6 100644
--- a/fs/proc/meminfo.c
+++ b/fs/proc/meminfo.c
@@ -120,7 +120,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v)
 		K(i.freeram-i.freehigh),
 #endif
 #ifndef CONFIG_MMU
-		K((unsigned long) atomic_read(&mmap_pages_allocated)),
+		K((unsigned long) atomic_long_read(&mmap_pages_allocated)),
 #endif
 		K(i.totalswap),
 		K(i.freeswap),
diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c
index 343ea1216bc8..370be0a2c909 100644
--- a/fs/proc/task_nommu.c
+++ b/fs/proc/task_nommu.c
@@ -136,14 +136,14 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma)
 	}
 
 	seq_printf(m,
-		   "%08lx-%08lx %c%c%c%c %08lx %02x:%02x %lu %n",
+		   "%08lx-%08lx %c%c%c%c %08llx %02x:%02x %lu %n",
 		   vma->vm_start,
 		   vma->vm_end,
 		   flags & VM_READ ? 'r' : '-',
 		   flags & VM_WRITE ? 'w' : '-',
 		   flags & VM_EXEC ? 'x' : '-',
 		   flags & VM_MAYSHARE ? flags & VM_SHARED ? 'S' : 's' : 'p',
-		   vma->vm_pgoff << PAGE_SHIFT,
+		   (unsigned long long) vma->vm_pgoff << PAGE_SHIFT,
 		   MAJOR(dev), MINOR(dev), ino, &len);
 
 	if (file) {
diff --git a/include/linux/mm.h b/include/linux/mm.h
index aeabe953ba4f..bff1f0d475c7 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1079,7 +1079,7 @@ static inline void setup_per_cpu_pageset(void) {}
 #endif
 
 /* nommu.c */
-extern atomic_t mmap_pages_allocated;
+extern atomic_long_t mmap_pages_allocated;
 
 /* prio_tree.c */
 void vma_prio_tree_add(struct vm_area_struct *, struct vm_area_struct *old);
diff --git a/kernel/fork.c b/kernel/fork.c
index 47c15840a381..51d1aa21483b 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1488,6 +1488,7 @@ void __init proc_caches_init(void)
 	mm_cachep = kmem_cache_create("mm_struct",
 			sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
 			SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+	vm_area_cachep = KMEM_CACHE(vm_area_struct, SLAB_PANIC);
 	mmap_init();
 }
 
diff --git a/mm/mmap.c b/mm/mmap.c
index 1abb9185a686..4a3841186c11 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2481,7 +2481,4 @@ void mm_drop_all_locks(struct mm_struct *mm)
  */
 void __init mmap_init(void)
 {
-	vm_area_cachep = kmem_cache_create("vm_area_struct",
-			sizeof(struct vm_area_struct), 0,
-			SLAB_PANIC, NULL);
 }
diff --git a/mm/nommu.c b/mm/nommu.c
index 2fcf47d449b4..72eda4aee2cb 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c
@@ -69,7 +69,7 @@ int sysctl_max_map_count = DEFAULT_MAX_MAP_COUNT;
 int sysctl_nr_trim_pages = 1; /* page trimming behaviour */
 int heap_stack_gap = 0;
 
-atomic_t mmap_pages_allocated;
+atomic_long_t mmap_pages_allocated;
 
 EXPORT_SYMBOL(mem_map);
 EXPORT_SYMBOL(num_physpages);
@@ -463,12 +463,7 @@ SYSCALL_DEFINE1(brk, unsigned long, brk)
  */
 void __init mmap_init(void)
 {
-	vm_region_jar = kmem_cache_create("vm_region_jar",
-					  sizeof(struct vm_region), 0,
-					  SLAB_PANIC, NULL);
-	vm_area_cachep = kmem_cache_create("vm_area_struct",
-					   sizeof(struct vm_area_struct), 0,
-					   SLAB_PANIC, NULL);
+	vm_region_jar = KMEM_CACHE(vm_region, SLAB_PANIC);
 }
 
 /*
@@ -486,27 +481,24 @@ static noinline void validate_nommu_regions(void)
 		return;
 
 	last = rb_entry(lastp, struct vm_region, vm_rb);
-	if (unlikely(last->vm_end <= last->vm_start))
-		BUG();
-	if (unlikely(last->vm_top < last->vm_end))
-		BUG();
+	BUG_ON(unlikely(last->vm_end <= last->vm_start));
+	BUG_ON(unlikely(last->vm_top < last->vm_end));
 
 	while ((p = rb_next(lastp))) {
 		region = rb_entry(p, struct vm_region, vm_rb);
 		last = rb_entry(lastp, struct vm_region, vm_rb);
 
-		if (unlikely(region->vm_end <= region->vm_start))
-			BUG();
-		if (unlikely(region->vm_top < region->vm_end))
-			BUG();
-		if (unlikely(region->vm_start < last->vm_top))
-			BUG();
+		BUG_ON(unlikely(region->vm_end <= region->vm_start));
+		BUG_ON(unlikely(region->vm_top < region->vm_end));
+		BUG_ON(unlikely(region->vm_start < last->vm_top));
 
 		lastp = p;
 	}
 }
 #else
-#define validate_nommu_regions() do {} while(0)
+static void validate_nommu_regions(void)
+{
+}
 #endif
 
 /*
@@ -563,16 +555,17 @@ static void free_page_series(unsigned long from, unsigned long to)
 		struct page *page = virt_to_page(from);
 
 		kdebug("- free %lx", from);
-		atomic_dec(&mmap_pages_allocated);
+		atomic_long_dec(&mmap_pages_allocated);
 		if (page_count(page) != 1)
-			kdebug("free page %p [%d]", page, page_count(page));
+			kdebug("free page %p: refcount not one: %d",
+			       page, page_count(page));
 		put_page(page);
 	}
 }
 
 /*
  * release a reference to a region
- * - the caller must hold the region semaphore, which this releases
+ * - the caller must hold the region semaphore for writing, which this releases
  * - the region may not have been added to the tree yet, in which case vm_top
  *   will equal vm_start
  */
@@ -1096,7 +1089,7 @@ static int do_mmap_private(struct vm_area_struct *vma,
 		goto enomem;
 
 	total = 1 << order;
-	atomic_add(total, &mmap_pages_allocated);
+	atomic_long_add(total, &mmap_pages_allocated);
 
 	point = rlen >> PAGE_SHIFT;
 
@@ -1107,7 +1100,7 @@ static int do_mmap_private(struct vm_area_struct *vma,
 			order = ilog2(total - point);
 			n = 1 << order;
 			kdebug("shave %lu/%lu @%lu", n, total - point, total);
-			atomic_sub(n, &mmap_pages_allocated);
+			atomic_long_sub(n, &mmap_pages_allocated);
 			total -= n;
 			set_page_refcounted(pages + total);
 			__free_pages(pages + total, order);
@@ -1536,10 +1529,15 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len)
 	/* find the first potentially overlapping VMA */
 	vma = find_vma(mm, start);
 	if (!vma) {
-		printk(KERN_WARNING
-		       "munmap of memory not mmapped by process %d (%s):"
-		       " 0x%lx-0x%lx\n",
-		       current->pid, current->comm, start, start + len - 1);
+		static int limit = 0;
+		if (limit < 5) {
+			printk(KERN_WARNING
+			       "munmap of memory not mmapped by process %d"
+			       " (%s): 0x%lx-0x%lx\n",
+			       current->pid, current->comm,
+			       start, start + len - 1);
+			limit++;
+		}
 		return -EINVAL;
 	}
 
-- 
cgit v1.2.3


From 6f2c55b843836d26528c56a0968689accaedbc67 Mon Sep 17 00:00:00 2001
From: Alexey Dobriyan <adobriyan@gmail.com>
Date: Thu, 2 Apr 2009 16:56:59 -0700
Subject: Simplify copy_thread()

First argument unused since 2.3.11.

[akpm@linux-foundation.org: coding-style fixes]
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: <linux-arch@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/alpha/kernel/process.c         | 2 +-
 arch/arm/kernel/process.c           | 2 +-
 arch/avr32/kernel/process.c         | 2 +-
 arch/blackfin/kernel/process.c      | 2 +-
 arch/cris/arch-v10/kernel/process.c | 2 +-
 arch/cris/arch-v32/kernel/process.c | 2 +-
 arch/frv/kernel/process.c           | 2 +-
 arch/h8300/kernel/process.c         | 2 +-
 arch/ia64/kernel/process.c          | 2 +-
 arch/m32r/kernel/process.c          | 2 +-
 arch/m68k/kernel/process.c          | 2 +-
 arch/m68knommu/kernel/process.c     | 2 +-
 arch/mips/kernel/process.c          | 2 +-
 arch/mn10300/kernel/process.c       | 2 +-
 arch/parisc/kernel/process.c        | 2 +-
 arch/powerpc/kernel/process.c       | 2 +-
 arch/s390/kernel/process.c          | 2 +-
 arch/sh/kernel/process_32.c         | 2 +-
 arch/sh/kernel/process_64.c         | 2 +-
 arch/sparc/kernel/process_32.c      | 2 +-
 arch/sparc/kernel/process_64.c      | 2 +-
 arch/um/kernel/process.c            | 2 +-
 arch/x86/kernel/process_32.c        | 2 +-
 arch/x86/kernel/process_64.c        | 2 +-
 arch/xtensa/kernel/process.c        | 2 +-
 include/linux/sched.h               | 3 ++-
 kernel/fork.c                       | 2 +-
 27 files changed, 28 insertions(+), 27 deletions(-)

(limited to 'kernel')

diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c
index 8d0097f10208..3a2fb7a02db4 100644
--- a/arch/alpha/kernel/process.c
+++ b/arch/alpha/kernel/process.c
@@ -272,7 +272,7 @@ alpha_vfork(struct pt_regs *regs)
  */
 
 int
-copy_thread(int nr, unsigned long clone_flags, unsigned long usp,
+copy_thread(unsigned long clone_flags, unsigned long usp,
 	    unsigned long unused,
 	    struct task_struct * p, struct pt_regs * regs)
 {
diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c
index 2de14e2afdc5..c3265a2e7cd4 100644
--- a/arch/arm/kernel/process.c
+++ b/arch/arm/kernel/process.c
@@ -301,7 +301,7 @@ void release_thread(struct task_struct *dead_task)
 asmlinkage void ret_from_fork(void) __asm__("ret_from_fork");
 
 int
-copy_thread(int nr, unsigned long clone_flags, unsigned long stack_start,
+copy_thread(unsigned long clone_flags, unsigned long stack_start,
 	    unsigned long stk_sz, struct task_struct *p, struct pt_regs *regs)
 {
 	struct thread_info *thread = task_thread_info(p);
diff --git a/arch/avr32/kernel/process.c b/arch/avr32/kernel/process.c
index 43ae555ecb33..1bbe1da54869 100644
--- a/arch/avr32/kernel/process.c
+++ b/arch/avr32/kernel/process.c
@@ -332,7 +332,7 @@ int dump_fpu(struct pt_regs *regs, elf_fpregset_t *fpu)
 
 asmlinkage void ret_from_fork(void);
 
-int copy_thread(int nr, unsigned long clone_flags, unsigned long usp,
+int copy_thread(unsigned long clone_flags, unsigned long usp,
 		unsigned long unused,
 		struct task_struct *p, struct pt_regs *regs)
 {
diff --git a/arch/blackfin/kernel/process.c b/arch/blackfin/kernel/process.c
index 33e2e8993f7f..f49427293ca1 100644
--- a/arch/blackfin/kernel/process.c
+++ b/arch/blackfin/kernel/process.c
@@ -193,7 +193,7 @@ asmlinkage int bfin_clone(struct pt_regs *regs)
 }
 
 int
-copy_thread(int nr, unsigned long clone_flags,
+copy_thread(unsigned long clone_flags,
 	    unsigned long usp, unsigned long topstk,
 	    struct task_struct *p, struct pt_regs *regs)
 {
diff --git a/arch/cris/arch-v10/kernel/process.c b/arch/cris/arch-v10/kernel/process.c
index bd9b3ff63f6c..c4c69cf721e5 100644
--- a/arch/cris/arch-v10/kernel/process.c
+++ b/arch/cris/arch-v10/kernel/process.c
@@ -115,7 +115,7 @@ int kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
  */
 asmlinkage void ret_from_fork(void);
 
-int copy_thread(int nr, unsigned long clone_flags, unsigned long usp,
+int copy_thread(unsigned long clone_flags, unsigned long usp,
 		unsigned long unused,
 		struct task_struct *p, struct pt_regs *regs)
 {
diff --git a/arch/cris/arch-v32/kernel/process.c b/arch/cris/arch-v32/kernel/process.c
index ced5b725d9bd..120e7f796fea 100644
--- a/arch/cris/arch-v32/kernel/process.c
+++ b/arch/cris/arch-v32/kernel/process.c
@@ -131,7 +131,7 @@ kernel_thread(int (*fn)(void *), void * arg, unsigned long flags)
 extern asmlinkage void ret_from_fork(void);
 
 int
-copy_thread(int nr, unsigned long clone_flags, unsigned long usp,
+copy_thread(unsigned long clone_flags, unsigned long usp,
 	unsigned long unused,
 	struct task_struct *p, struct pt_regs *regs)
 {
diff --git a/arch/frv/kernel/process.c b/arch/frv/kernel/process.c
index 9583a338e9d6..0de50df74970 100644
--- a/arch/frv/kernel/process.c
+++ b/arch/frv/kernel/process.c
@@ -204,7 +204,7 @@ void prepare_to_copy(struct task_struct *tsk)
 /*
  * set up the kernel stack and exception frames for a new process
  */
-int copy_thread(int nr, unsigned long clone_flags,
+int copy_thread(unsigned long clone_flags,
 		unsigned long usp, unsigned long topstk,
 		struct task_struct *p, struct pt_regs *regs)
 {
diff --git a/arch/h8300/kernel/process.c b/arch/h8300/kernel/process.c
index a8ef654a5a0b..e2f33d0f9969 100644
--- a/arch/h8300/kernel/process.c
+++ b/arch/h8300/kernel/process.c
@@ -191,7 +191,7 @@ asmlinkage int h8300_clone(struct pt_regs *regs)
 
 }
 
-int copy_thread(int nr, unsigned long clone_flags,
+int copy_thread(unsigned long clone_flags,
                 unsigned long usp, unsigned long topstk,
 		 struct task_struct * p, struct pt_regs * regs)
 {
diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c
index c57162705147..5d7c0e5b9e76 100644
--- a/arch/ia64/kernel/process.c
+++ b/arch/ia64/kernel/process.c
@@ -413,7 +413,7 @@ ia64_load_extra (struct task_struct *task)
  * so there is nothing to worry about.
  */
 int
-copy_thread (int nr, unsigned long clone_flags,
+copy_thread(unsigned long clone_flags,
 	     unsigned long user_stack_base, unsigned long user_stack_size,
 	     struct task_struct *p, struct pt_regs *regs)
 {
diff --git a/arch/m32r/kernel/process.c b/arch/m32r/kernel/process.c
index 7103d91e1a2f..3e876f0baebc 100644
--- a/arch/m32r/kernel/process.c
+++ b/arch/m32r/kernel/process.c
@@ -225,7 +225,7 @@ int dump_fpu(struct pt_regs *regs, elf_fpregset_t *fpu)
 	return 0; /* Task didn't use the fpu at all. */
 }
 
-int copy_thread(int nr, unsigned long clone_flags, unsigned long spu,
+int copy_thread(unsigned long clone_flags, unsigned long spu,
 	unsigned long unused, struct task_struct *tsk, struct pt_regs *regs)
 {
 	struct pt_regs *childregs = task_pt_regs(tsk);
diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c
index 632ce016014d..ec37fb56c127 100644
--- a/arch/m68k/kernel/process.c
+++ b/arch/m68k/kernel/process.c
@@ -233,7 +233,7 @@ asmlinkage int m68k_clone(struct pt_regs *regs)
 		       parent_tidptr, child_tidptr);
 }
 
-int copy_thread(int nr, unsigned long clone_flags, unsigned long usp,
+int copy_thread(unsigned long clone_flags, unsigned long usp,
 		 unsigned long unused,
 		 struct task_struct * p, struct pt_regs * regs)
 {
diff --git a/arch/m68knommu/kernel/process.c b/arch/m68knommu/kernel/process.c
index 3f2d7745f31e..1e96c6eb6312 100644
--- a/arch/m68knommu/kernel/process.c
+++ b/arch/m68knommu/kernel/process.c
@@ -199,7 +199,7 @@ asmlinkage int m68k_clone(struct pt_regs *regs)
         return do_fork(clone_flags, newsp, regs, 0, NULL, NULL);
 }
 
-int copy_thread(int nr, unsigned long clone_flags,
+int copy_thread(unsigned long clone_flags,
 		unsigned long usp, unsigned long topstk,
 		struct task_struct * p, struct pt_regs * regs)
 {
diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c
index ca2e4026ad20..1eaaa450e20c 100644
--- a/arch/mips/kernel/process.c
+++ b/arch/mips/kernel/process.c
@@ -99,7 +99,7 @@ void flush_thread(void)
 {
 }
 
-int copy_thread(int nr, unsigned long clone_flags, unsigned long usp,
+int copy_thread(unsigned long clone_flags, unsigned long usp,
 	unsigned long unused, struct task_struct *p, struct pt_regs *regs)
 {
 	struct thread_info *ti = task_thread_info(p);
diff --git a/arch/mn10300/kernel/process.c b/arch/mn10300/kernel/process.c
index b28c9a60445b..234cf344cdce 100644
--- a/arch/mn10300/kernel/process.c
+++ b/arch/mn10300/kernel/process.c
@@ -193,7 +193,7 @@ void prepare_to_copy(struct task_struct *tsk)
  * set up the kernel stack for a new thread and copy arch-specific thread
  * control information
  */
-int copy_thread(int nr, unsigned long clone_flags,
+int copy_thread(unsigned long clone_flags,
 		unsigned long c_usp, unsigned long ustk_size,
 		struct task_struct *p, struct pt_regs *kregs)
 {
diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c
index b80e02a4d81d..8aa591ed9127 100644
--- a/arch/parisc/kernel/process.c
+++ b/arch/parisc/kernel/process.c
@@ -263,7 +263,7 @@ sys_vfork(struct pt_regs *regs)
 }
 
 int
-copy_thread(int nr, unsigned long clone_flags, unsigned long usp,
+copy_thread(unsigned long clone_flags, unsigned long usp,
 	    unsigned long unused,	/* in ia64 this is "user_stack_size" */
 	    struct task_struct * p, struct pt_regs * pregs)
 {
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index eac064948780..7b44a33f03c2 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -598,7 +598,7 @@ void prepare_to_copy(struct task_struct *tsk)
 /*
  * Copy a thread..
  */
-int copy_thread(int nr, unsigned long clone_flags, unsigned long usp,
+int copy_thread(unsigned long clone_flags, unsigned long usp,
 		unsigned long unused, struct task_struct *p,
 		struct pt_regs *regs)
 {
diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c
index b48e961a38f6..a3acd8e60aff 100644
--- a/arch/s390/kernel/process.c
+++ b/arch/s390/kernel/process.c
@@ -160,7 +160,7 @@ void release_thread(struct task_struct *dead_task)
 {
 }
 
-int copy_thread(int nr, unsigned long clone_flags, unsigned long new_stackp,
+int copy_thread(unsigned long clone_flags, unsigned long new_stackp,
 		unsigned long unused,
 		struct task_struct *p, struct pt_regs *regs)
 {
diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c
index ddafbbbab2ab..694bc15f84fd 100644
--- a/arch/sh/kernel/process_32.c
+++ b/arch/sh/kernel/process_32.c
@@ -170,7 +170,7 @@ int dump_fpu(struct pt_regs *regs, elf_fpregset_t *fpu)
 
 asmlinkage void ret_from_fork(void);
 
-int copy_thread(int nr, unsigned long clone_flags, unsigned long usp,
+int copy_thread(unsigned long clone_flags, unsigned long usp,
 		unsigned long unused,
 		struct task_struct *p, struct pt_regs *regs)
 {
diff --git a/arch/sh/kernel/process_64.c b/arch/sh/kernel/process_64.c
index c90c7e5e5fee..96be839040f8 100644
--- a/arch/sh/kernel/process_64.c
+++ b/arch/sh/kernel/process_64.c
@@ -425,7 +425,7 @@ int dump_fpu(struct pt_regs *regs, elf_fpregset_t *fpu)
 
 asmlinkage void ret_from_fork(void);
 
-int copy_thread(int nr, unsigned long clone_flags, unsigned long usp,
+int copy_thread(unsigned long clone_flags, unsigned long usp,
 		unsigned long unused,
 		struct task_struct *p, struct pt_regs *regs)
 {
diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c
index f4bee35a1b46..2830b415e214 100644
--- a/arch/sparc/kernel/process_32.c
+++ b/arch/sparc/kernel/process_32.c
@@ -455,7 +455,7 @@ asmlinkage int sparc_do_fork(unsigned long clone_flags,
  */
 extern void ret_from_fork(void);
 
-int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
+int copy_thread(unsigned long clone_flags, unsigned long sp,
 		unsigned long unused,
 		struct task_struct *p, struct pt_regs *regs)
 {
diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c
index a73954b87f0a..4041f94e7724 100644
--- a/arch/sparc/kernel/process_64.c
+++ b/arch/sparc/kernel/process_64.c
@@ -561,7 +561,7 @@ asmlinkage long sparc_do_fork(unsigned long clone_flags,
  * Parent -->  %o0 == childs  pid, %o1 == 0
  * Child  -->  %o0 == parents pid, %o1 == 1
  */
-int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
+int copy_thread(unsigned long clone_flags, unsigned long sp,
 		unsigned long unused,
 		struct task_struct *p, struct pt_regs *regs)
 {
diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c
index a1c6d07cac3e..4a28a1568d85 100644
--- a/arch/um/kernel/process.c
+++ b/arch/um/kernel/process.c
@@ -179,7 +179,7 @@ void fork_handler(void)
 	userspace(&current->thread.regs.regs);
 }
 
-int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
+int copy_thread(unsigned long clone_flags, unsigned long sp,
 		unsigned long stack_top, struct task_struct * p,
 		struct pt_regs *regs)
 {
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 14014d766cad..76f8f84043a2 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -245,7 +245,7 @@ void prepare_to_copy(struct task_struct *tsk)
 	unlazy_fpu(tsk);
 }
 
-int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
+int copy_thread(unsigned long clone_flags, unsigned long sp,
 	unsigned long unused,
 	struct task_struct *p, struct pt_regs *regs)
 {
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index abb7e6a7f0c6..b751a41392b1 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -278,7 +278,7 @@ void prepare_to_copy(struct task_struct *tsk)
 	unlazy_fpu(tsk);
 }
 
-int copy_thread(int nr, unsigned long clone_flags, unsigned long sp,
+int copy_thread(unsigned long clone_flags, unsigned long sp,
 		unsigned long unused,
 	struct task_struct *p, struct pt_regs *regs)
 {
diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c
index 9185597eb6a0..031f36685710 100644
--- a/arch/xtensa/kernel/process.c
+++ b/arch/xtensa/kernel/process.c
@@ -172,7 +172,7 @@ void prepare_to_copy(struct task_struct *tsk)
  *       childregs.
  */
 
-int copy_thread(int nr, unsigned long clone_flags, unsigned long usp,
+int copy_thread(unsigned long clone_flags, unsigned long usp,
 		unsigned long unused,
                 struct task_struct * p, struct pt_regs * regs)
 {
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 481fad3a9b42..9186f8c5d5f2 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1975,7 +1975,8 @@ extern void mm_release(struct task_struct *, struct mm_struct *);
 /* Allocate a new mm structure and copy contents from tsk->mm */
 extern struct mm_struct *dup_mm(struct task_struct *tsk);
 
-extern int  copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *);
+extern int copy_thread(unsigned long, unsigned long, unsigned long,
+			struct task_struct *, struct pt_regs *);
 extern void flush_thread(void);
 extern void exit_thread(void);
 
diff --git a/kernel/fork.c b/kernel/fork.c
index 51d1aa21483b..d7eb727eb535 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1125,7 +1125,7 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 		goto bad_fork_cleanup_mm;
 	if ((retval = copy_io(clone_flags, p)))
 		goto bad_fork_cleanup_namespaces;
-	retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs);
+	retval = copy_thread(clone_flags, stack_start, stack_size, p, regs);
 	if (retval)
 		goto bad_fork_cleanup_io;
 
-- 
cgit v1.2.3


From 313e924c0852943e67335fad9d2608701f0dfe8e Mon Sep 17 00:00:00 2001
From: Grzegorz Nosek <root@localdomain.pl>
Date: Thu, 2 Apr 2009 16:57:23 -0700
Subject: cgroups: relax ns_can_attach checks to allow attaching to grandchild
 cgroups

The ns_proxy cgroup allows moving processes to child cgroups only one
level deep at a time.  This commit relaxes this restriction and makes it
possible to attach tasks directly to grandchild cgroups, e.g.:

($pid is in the root cgroup)
echo $pid > /cgroup/CG1/CG2/tasks

Previously this operation would fail with -EPERM and would have to be
performed as two steps:
echo $pid > /cgroup/CG1/tasks
echo $pid > /cgroup/CG1/CG2/tasks

Also, the target cgroup no longer needs to be empty to move a task there.

Signed-off-by: Grzegorz Nosek <root@localdomain.pl>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Reviewed-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/cgroup.h |  4 ++--
 kernel/cgroup.c        | 11 ++++++-----
 kernel/ns_cgroup.c     | 14 ++++----------
 3 files changed, 12 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index bb8feb9feccd..788c4964c142 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -348,8 +348,8 @@ int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen);
 
 int cgroup_task_count(const struct cgroup *cgrp);
 
-/* Return true if the cgroup is a descendant of the current cgroup */
-int cgroup_is_descendant(const struct cgroup *cgrp);
+/* Return true if cgrp is a descendant of the task's cgroup */
+int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task);
 
 /* Control Group subsystem type. See Documentation/cgroups.txt for details */
 
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index c500ca7239b2..27792bcb0758 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3084,18 +3084,19 @@ int cgroup_clone(struct task_struct *tsk, struct cgroup_subsys *subsys,
 }
 
 /**
- * cgroup_is_descendant - see if @cgrp is a descendant of current task's cgrp
+ * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp
  * @cgrp: the cgroup in question
+ * @task: the task in question
  *
- * See if @cgrp is a descendant of the current task's cgroup in
- * the appropriate hierarchy.
+ * See if @cgrp is a descendant of @task's cgroup in the appropriate
+ * hierarchy.
  *
  * If we are sending in dummytop, then presumably we are creating
  * the top cgroup in the subsystem.
  *
  * Called only by the ns (nsproxy) cgroup.
  */
-int cgroup_is_descendant(const struct cgroup *cgrp)
+int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task)
 {
 	int ret;
 	struct cgroup *target;
@@ -3105,7 +3106,7 @@ int cgroup_is_descendant(const struct cgroup *cgrp)
 		return 1;
 
 	get_first_subsys(cgrp, NULL, &subsys_id);
-	target = task_cgroup(current, subsys_id);
+	target = task_cgroup(task, subsys_id);
 	while (cgrp != target && cgrp!= cgrp->top_cgroup)
 		cgrp = cgrp->parent;
 	ret = (cgrp == target);
diff --git a/kernel/ns_cgroup.c b/kernel/ns_cgroup.c
index 78bc3fdac0d2..5aa854f9e5ae 100644
--- a/kernel/ns_cgroup.c
+++ b/kernel/ns_cgroup.c
@@ -34,7 +34,7 @@ int ns_cgroup_clone(struct task_struct *task, struct pid *pid)
 
 /*
  * Rules:
- *   1. you can only enter a cgroup which is a child of your current
+ *   1. you can only enter a cgroup which is a descendant of your current
  *     cgroup
  *   2. you can only place another process into a cgroup if
  *     a. you have CAP_SYS_ADMIN
@@ -45,21 +45,15 @@ int ns_cgroup_clone(struct task_struct *task, struct pid *pid)
 static int ns_can_attach(struct cgroup_subsys *ss,
 		struct cgroup *new_cgroup, struct task_struct *task)
 {
-	struct cgroup *orig;
-
 	if (current != task) {
 		if (!capable(CAP_SYS_ADMIN))
 			return -EPERM;
 
-		if (!cgroup_is_descendant(new_cgroup))
+		if (!cgroup_is_descendant(new_cgroup, current))
 			return -EPERM;
 	}
 
-	if (atomic_read(&new_cgroup->count) != 0)
-		return -EPERM;
-
-	orig = task_cgroup(task, ns_subsys_id);
-	if (orig && orig != new_cgroup->parent)
+	if (!cgroup_is_descendant(new_cgroup, task))
 		return -EPERM;
 
 	return 0;
@@ -77,7 +71,7 @@ static struct cgroup_subsys_state *ns_create(struct cgroup_subsys *ss,
 
 	if (!capable(CAP_SYS_ADMIN))
 		return ERR_PTR(-EPERM);
-	if (!cgroup_is_descendant(cgroup))
+	if (!cgroup_is_descendant(cgroup, current))
 		return ERR_PTR(-EPERM);
 
 	ns_cgroup = kzalloc(sizeof(*ns_cgroup), GFP_KERNEL);
-- 
cgit v1.2.3


From 38460b48d06440de46b34cb778bd6c4855030754 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Thu, 2 Apr 2009 16:57:25 -0700
Subject: cgroup: CSS ID support

Patch for Per-CSS(Cgroup Subsys State) ID and private hierarchy code.

This patch attaches unique ID to each css and provides following.

 - css_lookup(subsys, id)
   returns pointer to struct cgroup_subysys_state of id.
 - css_get_next(subsys, id, rootid, depth, foundid)
   returns the next css under "root" by scanning

When cgroup_subsys->use_id is set, an id for css is maintained.

The cgroup framework only parepares
	- css_id of root css for subsys
	- id is automatically attached at creation of css.
	- id is *not* freed automatically. Because the cgroup framework
	  don't know lifetime of cgroup_subsys_state.
	  free_css_id() function is provided. This must be called by subsys.

There are several reasons to develop this.
	- Saving space .... For example, memcg's swap_cgroup is array of
	  pointers to cgroup. But it is not necessary to be very fast.
	  By replacing pointers(8bytes per ent) to ID (2byes per ent), we can
	  reduce much amount of memory usage.

	- Scanning without lock.
	  CSS_ID provides "scan id under this ROOT" function. By this, scanning
	  css under root can be written without locks.
	  ex)
	  do {
		rcu_read_lock();
		next = cgroup_get_next(subsys, id, root, &found);
		/* check sanity of next here */
		css_tryget();
		rcu_read_unlock();
		id = found + 1
	 } while(...)

Characteristics:
	- Each css has unique ID under subsys.
	- Lifetime of ID is controlled by subsys.
	- css ID contains "ID" and "Depth in hierarchy" and stack of hierarchy
	- Allowed ID is 1-65535, ID 0 is UNUSED ID.

Design Choices:
	- scan-by-ID v.s. scan-by-tree-walk.
	  As /proc's pid scan does, scan-by-ID is robust when scanning is done
	  by following kind of routine.
	  scan -> rest a while(release a lock) -> conitunue from interrupted
	  memcg's hierarchical reclaim does this.

	- When subsys->use_id is set, # of css in the system is limited to
	  65535.

[bharata@linux.vnet.ibm.com: remove rcu_read_lock() from css_get_next()]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Paul Menage <menage@google.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: Bharata B Rao <bharata@linux.vnet.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/cgroup.h |  50 +++++++++
 include/linux/idr.h    |   1 +
 kernel/cgroup.c        | 286 ++++++++++++++++++++++++++++++++++++++++++++++++-
 lib/idr.c              |  46 ++++++++
 4 files changed, 382 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 788c4964c142..9a23bb098205 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -15,6 +15,7 @@
 #include <linux/cgroupstats.h>
 #include <linux/prio_heap.h>
 #include <linux/rwsem.h>
+#include <linux/idr.h>
 
 #ifdef CONFIG_CGROUPS
 
@@ -22,6 +23,7 @@ struct cgroupfs_root;
 struct cgroup_subsys;
 struct inode;
 struct cgroup;
+struct css_id;
 
 extern int cgroup_init_early(void);
 extern int cgroup_init(void);
@@ -63,6 +65,8 @@ struct cgroup_subsys_state {
 	atomic_t refcnt;
 
 	unsigned long flags;
+	/* ID for this css, if possible */
+	struct css_id *id;
 };
 
 /* bits in struct cgroup_subsys_state flags field */
@@ -373,6 +377,11 @@ struct cgroup_subsys {
 	int active;
 	int disabled;
 	int early_init;
+	/*
+	 * True if this subsys uses ID. ID is not available before cgroup_init()
+	 * (not available in early_init time.)
+	 */
+	bool use_id;
 #define MAX_CGROUP_TYPE_NAMELEN 32
 	const char *name;
 
@@ -395,6 +404,9 @@ struct cgroup_subsys {
 	 */
 	struct cgroupfs_root *root;
 	struct list_head sibling;
+	/* used when use_id == true */
+	struct idr idr;
+	spinlock_t id_lock;
 };
 
 #define SUBSYS(_x) extern struct cgroup_subsys _x ## _subsys;
@@ -450,6 +462,44 @@ void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it);
 int cgroup_scan_tasks(struct cgroup_scanner *scan);
 int cgroup_attach_task(struct cgroup *, struct task_struct *);
 
+/*
+ * CSS ID is ID for cgroup_subsys_state structs under subsys. This only works
+ * if cgroup_subsys.use_id == true. It can be used for looking up and scanning.
+ * CSS ID is assigned at cgroup allocation (create) automatically
+ * and removed when subsys calls free_css_id() function. This is because
+ * the lifetime of cgroup_subsys_state is subsys's matter.
+ *
+ * Looking up and scanning function should be called under rcu_read_lock().
+ * Taking cgroup_mutex()/hierarchy_mutex() is not necessary for following calls.
+ * But the css returned by this routine can be "not populated yet" or "being
+ * destroyed". The caller should check css and cgroup's status.
+ */
+
+/*
+ * Typically Called at ->destroy(), or somewhere the subsys frees
+ * cgroup_subsys_state.
+ */
+void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css);
+
+/* Find a cgroup_subsys_state which has given ID */
+
+struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id);
+
+/*
+ * Get a cgroup whose id is greater than or equal to id under tree of root.
+ * Returning a cgroup_subsys_state or NULL.
+ */
+struct cgroup_subsys_state *css_get_next(struct cgroup_subsys *ss, int id,
+		struct cgroup_subsys_state *root, int *foundid);
+
+/* Returns true if root is ancestor of cg */
+bool css_is_ancestor(struct cgroup_subsys_state *cg,
+		     struct cgroup_subsys_state *root);
+
+/* Get id and depth of css */
+unsigned short css_id(struct cgroup_subsys_state *css);
+unsigned short css_depth(struct cgroup_subsys_state *css);
+
 #else /* !CONFIG_CGROUPS */
 
 static inline int cgroup_init_early(void) { return 0; }
diff --git a/include/linux/idr.h b/include/linux/idr.h
index dd846df8cd32..e968db71e33a 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -106,6 +106,7 @@ int idr_get_new(struct idr *idp, void *ptr, int *id);
 int idr_get_new_above(struct idr *idp, void *ptr, int starting_id, int *id);
 int idr_for_each(struct idr *idp,
 		 int (*fn)(int id, void *p, void *data), void *data);
+void *idr_get_next(struct idr *idp, int *nextid);
 void *idr_replace(struct idr *idp, void *ptr, int id);
 void idr_remove(struct idr *idp, int id);
 void idr_remove_all(struct idr *idp);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 27792bcb0758..d3c521137425 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -94,7 +94,6 @@ struct cgroupfs_root {
 	char release_agent_path[PATH_MAX];
 };
 
-
 /*
  * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the
  * subsystems that are otherwise unattached - it never has more than a
@@ -102,6 +101,39 @@ struct cgroupfs_root {
  */
 static struct cgroupfs_root rootnode;
 
+/*
+ * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when
+ * cgroup_subsys->use_id != 0.
+ */
+#define CSS_ID_MAX	(65535)
+struct css_id {
+	/*
+	 * The css to which this ID points. This pointer is set to valid value
+	 * after cgroup is populated. If cgroup is removed, this will be NULL.
+	 * This pointer is expected to be RCU-safe because destroy()
+	 * is called after synchronize_rcu(). But for safe use, css_is_removed()
+	 * css_tryget() should be used for avoiding race.
+	 */
+	struct cgroup_subsys_state *css;
+	/*
+	 * ID of this css.
+	 */
+	unsigned short id;
+	/*
+	 * Depth in hierarchy which this ID belongs to.
+	 */
+	unsigned short depth;
+	/*
+	 * ID is freed by RCU. (and lookup routine is RCU safe.)
+	 */
+	struct rcu_head rcu_head;
+	/*
+	 * Hierarchy of CSS ID belongs to.
+	 */
+	unsigned short stack[0]; /* Array of Length (depth+1) */
+};
+
+
 /* The list of hierarchy roots */
 
 static LIST_HEAD(roots);
@@ -185,6 +217,8 @@ struct cg_cgroup_link {
 static struct css_set init_css_set;
 static struct cg_cgroup_link init_css_set_link;
 
+static int cgroup_subsys_init_idr(struct cgroup_subsys *ss);
+
 /* css_set_lock protects the list of css_set objects, and the
  * chain of tasks off each css_set.  Nests outside task->alloc_lock
  * due to cgroup_iter_start() */
@@ -567,6 +601,9 @@ static struct backing_dev_info cgroup_backing_dev_info = {
 	.capabilities	= BDI_CAP_NO_ACCT_AND_WRITEBACK,
 };
 
+static int alloc_css_id(struct cgroup_subsys *ss,
+			struct cgroup *parent, struct cgroup *child);
+
 static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
 {
 	struct inode *inode = new_inode(sb);
@@ -2327,6 +2364,17 @@ static int cgroup_populate_dir(struct cgroup *cgrp)
 		if (ss->populate && (err = ss->populate(ss, cgrp)) < 0)
 			return err;
 	}
+	/* This cgroup is ready now */
+	for_each_subsys(cgrp->root, ss) {
+		struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id];
+		/*
+		 * Update id->css pointer and make this css visible from
+		 * CSS ID functions. This pointer will be dereferened
+		 * from RCU-read-side without locks.
+		 */
+		if (css->id)
+			rcu_assign_pointer(css->id->css, css);
+	}
 
 	return 0;
 }
@@ -2338,6 +2386,7 @@ static void init_cgroup_css(struct cgroup_subsys_state *css,
 	css->cgroup = cgrp;
 	atomic_set(&css->refcnt, 1);
 	css->flags = 0;
+	css->id = NULL;
 	if (cgrp == dummytop)
 		set_bit(CSS_ROOT, &css->flags);
 	BUG_ON(cgrp->subsys[ss->subsys_id]);
@@ -2413,6 +2462,10 @@ static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
 			goto err_destroy;
 		}
 		init_cgroup_css(css, ss, cgrp);
+		if (ss->use_id)
+			if (alloc_css_id(ss, parent, cgrp))
+				goto err_destroy;
+		/* At error, ->destroy() callback has to free assigned ID. */
 	}
 
 	cgroup_lock_hierarchy(root);
@@ -2708,6 +2761,8 @@ int __init cgroup_init(void)
 		struct cgroup_subsys *ss = subsys[i];
 		if (!ss->early_init)
 			cgroup_init_subsys(ss);
+		if (ss->use_id)
+			cgroup_subsys_init_idr(ss);
 	}
 
 	/* Add init_css_set to the hash table */
@@ -3242,3 +3297,232 @@ static int __init cgroup_disable(char *str)
 	return 1;
 }
 __setup("cgroup_disable=", cgroup_disable);
+
+/*
+ * Functons for CSS ID.
+ */
+
+/*
+ *To get ID other than 0, this should be called when !cgroup_is_removed().
+ */
+unsigned short css_id(struct cgroup_subsys_state *css)
+{
+	struct css_id *cssid = rcu_dereference(css->id);
+
+	if (cssid)
+		return cssid->id;
+	return 0;
+}
+
+unsigned short css_depth(struct cgroup_subsys_state *css)
+{
+	struct css_id *cssid = rcu_dereference(css->id);
+
+	if (cssid)
+		return cssid->depth;
+	return 0;
+}
+
+bool css_is_ancestor(struct cgroup_subsys_state *child,
+		    struct cgroup_subsys_state *root)
+{
+	struct css_id *child_id = rcu_dereference(child->id);
+	struct css_id *root_id = rcu_dereference(root->id);
+
+	if (!child_id || !root_id || (child_id->depth < root_id->depth))
+		return false;
+	return child_id->stack[root_id->depth] == root_id->id;
+}
+
+static void __free_css_id_cb(struct rcu_head *head)
+{
+	struct css_id *id;
+
+	id = container_of(head, struct css_id, rcu_head);
+	kfree(id);
+}
+
+void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css)
+{
+	struct css_id *id = css->id;
+	/* When this is called before css_id initialization, id can be NULL */
+	if (!id)
+		return;
+
+	BUG_ON(!ss->use_id);
+
+	rcu_assign_pointer(id->css, NULL);
+	rcu_assign_pointer(css->id, NULL);
+	spin_lock(&ss->id_lock);
+	idr_remove(&ss->idr, id->id);
+	spin_unlock(&ss->id_lock);
+	call_rcu(&id->rcu_head, __free_css_id_cb);
+}
+
+/*
+ * This is called by init or create(). Then, calls to this function are
+ * always serialized (By cgroup_mutex() at create()).
+ */
+
+static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth)
+{
+	struct css_id *newid;
+	int myid, error, size;
+
+	BUG_ON(!ss->use_id);
+
+	size = sizeof(*newid) + sizeof(unsigned short) * (depth + 1);
+	newid = kzalloc(size, GFP_KERNEL);
+	if (!newid)
+		return ERR_PTR(-ENOMEM);
+	/* get id */
+	if (unlikely(!idr_pre_get(&ss->idr, GFP_KERNEL))) {
+		error = -ENOMEM;
+		goto err_out;
+	}
+	spin_lock(&ss->id_lock);
+	/* Don't use 0. allocates an ID of 1-65535 */
+	error = idr_get_new_above(&ss->idr, newid, 1, &myid);
+	spin_unlock(&ss->id_lock);
+
+	/* Returns error when there are no free spaces for new ID.*/
+	if (error) {
+		error = -ENOSPC;
+		goto err_out;
+	}
+	if (myid > CSS_ID_MAX)
+		goto remove_idr;
+
+	newid->id = myid;
+	newid->depth = depth;
+	return newid;
+remove_idr:
+	error = -ENOSPC;
+	spin_lock(&ss->id_lock);
+	idr_remove(&ss->idr, myid);
+	spin_unlock(&ss->id_lock);
+err_out:
+	kfree(newid);
+	return ERR_PTR(error);
+
+}
+
+static int __init cgroup_subsys_init_idr(struct cgroup_subsys *ss)
+{
+	struct css_id *newid;
+	struct cgroup_subsys_state *rootcss;
+
+	spin_lock_init(&ss->id_lock);
+	idr_init(&ss->idr);
+
+	rootcss = init_css_set.subsys[ss->subsys_id];
+	newid = get_new_cssid(ss, 0);
+	if (IS_ERR(newid))
+		return PTR_ERR(newid);
+
+	newid->stack[0] = newid->id;
+	newid->css = rootcss;
+	rootcss->id = newid;
+	return 0;
+}
+
+static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent,
+			struct cgroup *child)
+{
+	int subsys_id, i, depth = 0;
+	struct cgroup_subsys_state *parent_css, *child_css;
+	struct css_id *child_id, *parent_id = NULL;
+
+	subsys_id = ss->subsys_id;
+	parent_css = parent->subsys[subsys_id];
+	child_css = child->subsys[subsys_id];
+	depth = css_depth(parent_css) + 1;
+	parent_id = parent_css->id;
+
+	child_id = get_new_cssid(ss, depth);
+	if (IS_ERR(child_id))
+		return PTR_ERR(child_id);
+
+	for (i = 0; i < depth; i++)
+		child_id->stack[i] = parent_id->stack[i];
+	child_id->stack[depth] = child_id->id;
+	/*
+	 * child_id->css pointer will be set after this cgroup is available
+	 * see cgroup_populate_dir()
+	 */
+	rcu_assign_pointer(child_css->id, child_id);
+
+	return 0;
+}
+
+/**
+ * css_lookup - lookup css by id
+ * @ss: cgroup subsys to be looked into.
+ * @id: the id
+ *
+ * Returns pointer to cgroup_subsys_state if there is valid one with id.
+ * NULL if not. Should be called under rcu_read_lock()
+ */
+struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id)
+{
+	struct css_id *cssid = NULL;
+
+	BUG_ON(!ss->use_id);
+	cssid = idr_find(&ss->idr, id);
+
+	if (unlikely(!cssid))
+		return NULL;
+
+	return rcu_dereference(cssid->css);
+}
+
+/**
+ * css_get_next - lookup next cgroup under specified hierarchy.
+ * @ss: pointer to subsystem
+ * @id: current position of iteration.
+ * @root: pointer to css. search tree under this.
+ * @foundid: position of found object.
+ *
+ * Search next css under the specified hierarchy of rootid. Calling under
+ * rcu_read_lock() is necessary. Returns NULL if it reaches the end.
+ */
+struct cgroup_subsys_state *
+css_get_next(struct cgroup_subsys *ss, int id,
+	     struct cgroup_subsys_state *root, int *foundid)
+{
+	struct cgroup_subsys_state *ret = NULL;
+	struct css_id *tmp;
+	int tmpid;
+	int rootid = css_id(root);
+	int depth = css_depth(root);
+
+	if (!rootid)
+		return NULL;
+
+	BUG_ON(!ss->use_id);
+	/* fill start point for scan */
+	tmpid = id;
+	while (1) {
+		/*
+		 * scan next entry from bitmap(tree), tmpid is updated after
+		 * idr_get_next().
+		 */
+		spin_lock(&ss->id_lock);
+		tmp = idr_get_next(&ss->idr, &tmpid);
+		spin_unlock(&ss->id_lock);
+
+		if (!tmp)
+			break;
+		if (tmp->depth >= depth && tmp->stack[depth] == rootid) {
+			ret = rcu_dereference(tmp->css);
+			if (ret) {
+				*foundid = tmpid;
+				break;
+			}
+		}
+		/* continue to scan from next id */
+		tmpid = tmpid + 1;
+	}
+	return ret;
+}
+
diff --git a/lib/idr.c b/lib/idr.c
index dab4bca86f5d..80ca9aca038b 100644
--- a/lib/idr.c
+++ b/lib/idr.c
@@ -578,6 +578,52 @@ int idr_for_each(struct idr *idp,
 }
 EXPORT_SYMBOL(idr_for_each);
 
+/**
+ * idr_get_next - lookup next object of id to given id.
+ * @idp: idr handle
+ * @id:  pointer to lookup key
+ *
+ * Returns pointer to registered object with id, which is next number to
+ * given id.
+ */
+
+void *idr_get_next(struct idr *idp, int *nextidp)
+{
+	struct idr_layer *p, *pa[MAX_LEVEL];
+	struct idr_layer **paa = &pa[0];
+	int id = *nextidp;
+	int n, max;
+
+	/* find first ent */
+	n = idp->layers * IDR_BITS;
+	max = 1 << n;
+	p = rcu_dereference(idp->top);
+	if (!p)
+		return NULL;
+
+	while (id < max) {
+		while (n > 0 && p) {
+			n -= IDR_BITS;
+			*paa++ = p;
+			p = rcu_dereference(p->ary[(id >> n) & IDR_MASK]);
+		}
+
+		if (p) {
+			*nextidp = id;
+			return p;
+		}
+
+		id += 1 << n;
+		while (n < fls(id)) {
+			n += IDR_BITS;
+			p = *--paa;
+		}
+	}
+	return NULL;
+}
+
+
+
 /**
  * idr_replace - replace pointer for given id
  * @idp: idr handle
-- 
cgit v1.2.3


From ec64f51545fffbc4cb968f0cea56341a4b07e85a Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Thu, 2 Apr 2009 16:57:26 -0700
Subject: cgroup: fix frequent -EBUSY at rmdir

In following situation, with memory subsystem,

	/groupA use_hierarchy==1
		/01 some tasks
		/02 some tasks
		/03 some tasks
		/04 empty

When tasks under 01/02/03 hit limit on /groupA, hierarchical reclaim
is triggered and the kernel walks tree under groupA. In this case,
rmdir /groupA/04 fails with -EBUSY frequently because of temporal
refcnt from the kernel.

In general. cgroup can be rmdir'd if there are no children groups and
no tasks. Frequent fails of rmdir() is not useful to users.
(And the reason for -EBUSY is unknown to users.....in most cases)

This patch tries to modify above behavior, by
	- retries if css_refcnt is got by someone.
	- add "return value" to pre_destroy() and allows subsystem to
	  say "we're really busy!"

Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/cgroups/cgroups.txt |  6 ++-
 include/linux/cgroup.h            |  6 ++-
 kernel/cgroup.c                   | 81 ++++++++++++++++++++++++++++++++-------
 mm/memcontrol.c                   |  5 ++-
 4 files changed, 79 insertions(+), 19 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/cgroups/cgroups.txt b/Documentation/cgroups/cgroups.txt
index 93feb8444489..cdc46a501b85 100644
--- a/Documentation/cgroups/cgroups.txt
+++ b/Documentation/cgroups/cgroups.txt
@@ -476,11 +476,13 @@ cgroup->parent is still valid. (Note - can also be called for a
 newly-created cgroup if an error occurs after this subsystem's
 create() method has been called for the new cgroup).
 
-void pre_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp);
+int pre_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp);
 
 Called before checking the reference count on each subsystem. This may
 be useful for subsystems which have some extra references even if
-there are not tasks in the cgroup.
+there are not tasks in the cgroup. If pre_destroy() returns error code,
+rmdir() will fail with it. From this behavior, pre_destroy() can be
+called multiple times against a cgroup.
 
 int can_attach(struct cgroup_subsys *ss, struct cgroup *cgrp,
 	       struct task_struct *task)
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 9a23bb098205..7d824b80b3d7 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -135,6 +135,10 @@ enum {
 	CGRP_RELEASABLE,
 	/* Control Group requires release notifications to userspace */
 	CGRP_NOTIFY_ON_RELEASE,
+	/*
+	 * A thread in rmdir() is wating for this cgroup.
+	 */
+	CGRP_WAIT_ON_RMDIR,
 };
 
 struct cgroup {
@@ -360,7 +364,7 @@ int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task);
 struct cgroup_subsys {
 	struct cgroup_subsys_state *(*create)(struct cgroup_subsys *ss,
 						  struct cgroup *cgrp);
-	void (*pre_destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp);
+	int (*pre_destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp);
 	void (*destroy)(struct cgroup_subsys *ss, struct cgroup *cgrp);
 	int (*can_attach)(struct cgroup_subsys *ss,
 			  struct cgroup *cgrp, struct task_struct *tsk);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index d3c521137425..fc5e4a48582f 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -622,13 +622,18 @@ static struct inode *cgroup_new_inode(mode_t mode, struct super_block *sb)
  * Call subsys's pre_destroy handler.
  * This is called before css refcnt check.
  */
-static void cgroup_call_pre_destroy(struct cgroup *cgrp)
+static int cgroup_call_pre_destroy(struct cgroup *cgrp)
 {
 	struct cgroup_subsys *ss;
+	int ret = 0;
+
 	for_each_subsys(cgrp->root, ss)
-		if (ss->pre_destroy)
-			ss->pre_destroy(ss, cgrp);
-	return;
+		if (ss->pre_destroy) {
+			ret = ss->pre_destroy(ss, cgrp);
+			if (ret)
+				break;
+		}
+	return ret;
 }
 
 static void free_cgroup_rcu(struct rcu_head *obj)
@@ -722,6 +727,22 @@ static void cgroup_d_remove_dir(struct dentry *dentry)
 	remove_dir(dentry);
 }
 
+/*
+ * A queue for waiters to do rmdir() cgroup. A tasks will sleep when
+ * cgroup->count == 0 && list_empty(&cgroup->children) && subsys has some
+ * reference to css->refcnt. In general, this refcnt is expected to goes down
+ * to zero, soon.
+ *
+ * CGRP_WAIT_ON_RMDIR flag is modified under cgroup's inode->i_mutex;
+ */
+DECLARE_WAIT_QUEUE_HEAD(cgroup_rmdir_waitq);
+
+static void cgroup_wakeup_rmdir_waiters(const struct cgroup *cgrp)
+{
+	if (unlikely(test_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags)))
+		wake_up_all(&cgroup_rmdir_waitq);
+}
+
 static int rebind_subsystems(struct cgroupfs_root *root,
 			      unsigned long final_bits)
 {
@@ -1317,6 +1338,12 @@ int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk)
 	set_bit(CGRP_RELEASABLE, &oldcgrp->flags);
 	synchronize_rcu();
 	put_css_set(cg);
+
+	/*
+	 * wake up rmdir() waiter. the rmdir should fail since the cgroup
+	 * is no longer empty.
+	 */
+	cgroup_wakeup_rmdir_waiters(cgrp);
 	return 0;
 }
 
@@ -2608,9 +2635,11 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
 	struct cgroup *cgrp = dentry->d_fsdata;
 	struct dentry *d;
 	struct cgroup *parent;
+	DEFINE_WAIT(wait);
+	int ret;
 
 	/* the vfs holds both inode->i_mutex already */
-
+again:
 	mutex_lock(&cgroup_mutex);
 	if (atomic_read(&cgrp->count) != 0) {
 		mutex_unlock(&cgroup_mutex);
@@ -2626,17 +2655,39 @@ static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry)
 	 * Call pre_destroy handlers of subsys. Notify subsystems
 	 * that rmdir() request comes.
 	 */
-	cgroup_call_pre_destroy(cgrp);
+	ret = cgroup_call_pre_destroy(cgrp);
+	if (ret)
+		return ret;
 
 	mutex_lock(&cgroup_mutex);
 	parent = cgrp->parent;
-
-	if (atomic_read(&cgrp->count)
-	    || !list_empty(&cgrp->children)
-	    || !cgroup_clear_css_refs(cgrp)) {
+	if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) {
 		mutex_unlock(&cgroup_mutex);
 		return -EBUSY;
 	}
+	/*
+	 * css_put/get is provided for subsys to grab refcnt to css. In typical
+	 * case, subsystem has no reference after pre_destroy(). But, under
+	 * hierarchy management, some *temporal* refcnt can be hold.
+	 * To avoid returning -EBUSY to a user, waitqueue is used. If subsys
+	 * is really busy, it should return -EBUSY at pre_destroy(). wake_up
+	 * is called when css_put() is called and refcnt goes down to 0.
+	 */
+	set_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
+	prepare_to_wait(&cgroup_rmdir_waitq, &wait, TASK_INTERRUPTIBLE);
+
+	if (!cgroup_clear_css_refs(cgrp)) {
+		mutex_unlock(&cgroup_mutex);
+		schedule();
+		finish_wait(&cgroup_rmdir_waitq, &wait);
+		clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
+		if (signal_pending(current))
+			return -EINTR;
+		goto again;
+	}
+	/* NO css_tryget() can success after here. */
+	finish_wait(&cgroup_rmdir_waitq, &wait);
+	clear_bit(CGRP_WAIT_ON_RMDIR, &cgrp->flags);
 
 	spin_lock(&release_list_lock);
 	set_bit(CGRP_REMOVED, &cgrp->flags);
@@ -3194,10 +3245,12 @@ void __css_put(struct cgroup_subsys_state *css)
 {
 	struct cgroup *cgrp = css->cgroup;
 	rcu_read_lock();
-	if ((atomic_dec_return(&css->refcnt) == 1) &&
-	    notify_on_release(cgrp)) {
-		set_bit(CGRP_RELEASABLE, &cgrp->flags);
-		check_for_release(cgrp);
+	if (atomic_dec_return(&css->refcnt) == 1) {
+		if (notify_on_release(cgrp)) {
+			set_bit(CGRP_RELEASABLE, &cgrp->flags);
+			check_for_release(cgrp);
+		}
+		cgroup_wakeup_rmdir_waiters(cgrp);
 	}
 	rcu_read_unlock();
 }
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 8e4be9cb2a6a..8ffec674c5ac 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2272,11 +2272,12 @@ free_out:
 	return ERR_PTR(-ENOMEM);
 }
 
-static void mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
+static int mem_cgroup_pre_destroy(struct cgroup_subsys *ss,
 					struct cgroup *cont)
 {
 	struct mem_cgroup *mem = mem_cgroup_from_cont(cont);
-	mem_cgroup_force_empty(mem, false);
+
+	return mem_cgroup_force_empty(mem, false);
 }
 
 static void mem_cgroup_destroy(struct cgroup_subsys *ss,
-- 
cgit v1.2.3


From 66bdc9cfc77ba89a9ee6c82d28375b646ab4bb1d Mon Sep 17 00:00:00 2001
From: Jesper Juhl <jj@chaosbits.net>
Date: Thu, 2 Apr 2009 16:57:27 -0700
Subject: kernel/cgroup.c: kfree(NULL) is legal

Reduces object file size a bit:

Before:
$ size kernel/cgroup.o
   text    data     bss     dec     hex filename
  21593    7804    4924   34321    8611 kernel/cgroup.o
After:
$ size kernel/cgroup.o
   text    data     bss     dec     hex filename
  21537    7744    4924   34205    859d kernel/cgroup.o

Signed-off-by: Jesper Juhl <jj@chaosbits.net>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index fc5e4a48582f..9a6c2bfa1d9f 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -923,8 +923,7 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	if (opts.release_agent)
 		strcpy(root->release_agent_path, opts.release_agent);
  out_unlock:
-	if (opts.release_agent)
-		kfree(opts.release_agent);
+	kfree(opts.release_agent);
 	mutex_unlock(&cgroup_mutex);
 	mutex_unlock(&cgrp->dentry->d_inode->i_mutex);
 	return ret;
@@ -1027,15 +1026,13 @@ static int cgroup_get_sb(struct file_system_type *fs_type,
 	/* First find the desired set of subsystems */
 	ret = parse_cgroupfs_options(data, &opts);
 	if (ret) {
-		if (opts.release_agent)
-			kfree(opts.release_agent);
+		kfree(opts.release_agent);
 		return ret;
 	}
 
 	root = kzalloc(sizeof(*root), GFP_KERNEL);
 	if (!root) {
-		if (opts.release_agent)
-			kfree(opts.release_agent);
+		kfree(opts.release_agent);
 		return -ENOMEM;
 	}
 
-- 
cgit v1.2.3


From 099fca3225b39f7a3ed853036038054172b55581 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 2 Apr 2009 16:57:29 -0700
Subject: cgroups: show correct file mode

We have some read-only files and write-only files, but currently they are
all set to 0644, which is counter-intuitive and cause trouble for some
cgroup tools like libcgroup.

This patch adds 'mode' to struct cftype to allow cgroup subsys to set it's
own files' file mode, and for the most cases cft->mode can be default to 0
and cgroup will figure out proper mode.

Acked-by: Paul Menage <menage@google.com>
Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/cgroup.h |  5 +++++
 kernel/cgroup.c        | 38 ++++++++++++++++++++++++++++++++++----
 kernel/cpuset.c        |  1 +
 3 files changed, 40 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 7d824b80b3d7..b2816fba5306 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -258,6 +258,11 @@ struct cftype {
 	 */
 	char name[MAX_CFTYPE_NAME];
 	int private;
+	/*
+	 * If not 0, file mode is set to this value, otherwise it will
+	 * be figured out automatically
+	 */
+	mode_t mode;
 
 	/*
 	 * If non-zero, defines the maximum length of string that can
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 9a6c2bfa1d9f..fea11c5c990c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -1686,7 +1686,7 @@ static struct inode_operations cgroup_dir_inode_operations = {
 	.rename = cgroup_rename,
 };
 
-static int cgroup_create_file(struct dentry *dentry, int mode,
+static int cgroup_create_file(struct dentry *dentry, mode_t mode,
 				struct super_block *sb)
 {
 	static const struct dentry_operations cgroup_dops = {
@@ -1732,7 +1732,7 @@ static int cgroup_create_file(struct dentry *dentry, int mode,
  * @mode: mode to set on new directory.
  */
 static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
-				int mode)
+				mode_t mode)
 {
 	struct dentry *parent;
 	int error = 0;
@@ -1750,6 +1750,33 @@ static int cgroup_create_dir(struct cgroup *cgrp, struct dentry *dentry,
 	return error;
 }
 
+/**
+ * cgroup_file_mode - deduce file mode of a control file
+ * @cft: the control file in question
+ *
+ * returns cft->mode if ->mode is not 0
+ * returns S_IRUGO|S_IWUSR if it has both a read and a write handler
+ * returns S_IRUGO if it has only a read handler
+ * returns S_IWUSR if it has only a write hander
+ */
+static mode_t cgroup_file_mode(const struct cftype *cft)
+{
+	mode_t mode = 0;
+
+	if (cft->mode)
+		return cft->mode;
+
+	if (cft->read || cft->read_u64 || cft->read_s64 ||
+	    cft->read_map || cft->read_seq_string)
+		mode |= S_IRUGO;
+
+	if (cft->write || cft->write_u64 || cft->write_s64 ||
+	    cft->write_string || cft->trigger)
+		mode |= S_IWUSR;
+
+	return mode;
+}
+
 int cgroup_add_file(struct cgroup *cgrp,
 		       struct cgroup_subsys *subsys,
 		       const struct cftype *cft)
@@ -1757,6 +1784,7 @@ int cgroup_add_file(struct cgroup *cgrp,
 	struct dentry *dir = cgrp->dentry;
 	struct dentry *dentry;
 	int error;
+	mode_t mode;
 
 	char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 };
 	if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) {
@@ -1767,7 +1795,8 @@ int cgroup_add_file(struct cgroup *cgrp,
 	BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex));
 	dentry = lookup_one_len(name, dir, strlen(name));
 	if (!IS_ERR(dentry)) {
-		error = cgroup_create_file(dentry, 0644 | S_IFREG,
+		mode = cgroup_file_mode(cft);
+		error = cgroup_create_file(dentry, mode | S_IFREG,
 						cgrp->root->sb);
 		if (!error)
 			dentry->d_fsdata = (void *)cft;
@@ -2349,6 +2378,7 @@ static struct cftype files[] = {
 		.write_u64 = cgroup_tasks_write,
 		.release = cgroup_tasks_release,
 		.private = FILE_TASKLIST,
+		.mode = S_IRUGO | S_IWUSR,
 	},
 
 	{
@@ -2449,7 +2479,7 @@ static void cgroup_unlock_hierarchy(struct cgroupfs_root *root)
  * Must be called with the mutex on the parent inode held
  */
 static long cgroup_create(struct cgroup *parent, struct dentry *dentry,
-			     int mode)
+			     mode_t mode)
 {
 	struct cgroup *cgrp;
 	struct cgroupfs_root *root = parent->root;
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index f76db9dcaa05..ee5ec386aa8b 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1706,6 +1706,7 @@ static struct cftype files[] = {
 		.read_u64 = cpuset_read_u64,
 		.write_u64 = cpuset_write_u64,
 		.private = FILE_MEMORY_PRESSURE,
+		.mode = S_IRUGO,
 	},
 
 	{
-- 
cgit v1.2.3


From 0670e08bdfc67272f8c3087030417465629b8073 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 2 Apr 2009 16:57:30 -0700
Subject: cgroups: don't change release_agent when remount failed

Remount can fail in either case:
  - wrong mount options is specified, or option 'noprefix' is changed.
  - a to-be-added subsys is already mounted/active.

When using remount to change 'release_agent', for the above former failure
case, remount will return errno with release_agent unchanged, but for the
latter case, remount will return EBUSY with relase_agent changed, which is
unexpected I think:

 # mount -t cgroup -o cpu xxx /cgrp1
 # mount -t cgroup -o cpuset,release_agent=agent1 yyy /cgrp2
 # cat /cgrp2/release_agent
 agent1
 # mount -t cgroup -o remount,cpuset,noprefix,release_agent=agent2 yyy /cgrp2
 mount: /cgrp2 not mounted already, or bad option
 # cat /cgrp2/release_agent
 agent1     <-- ok
 # mount -t cgroup -o remount,cpu,cpuset,release_agent=agent2 yyy /cgrp2
 mount: /cgrp2 is busy
 # cat /cgrp2/release_agent
 agent2     <-- unexpected!

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index fea11c5c990c..f2a3f5c9936c 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -915,10 +915,11 @@ static int cgroup_remount(struct super_block *sb, int *flags, char *data)
 	}
 
 	ret = rebind_subsystems(root, opts.subsys_bits);
+	if (ret)
+		goto out_unlock;
 
 	/* (re)populate subsystem files */
-	if (!ret)
-		cgroup_populate_dir(cgrp);
+	cgroup_populate_dir(cgrp);
 
 	if (opts.release_agent)
 		strcpy(root->release_agent_path, opts.release_agent);
-- 
cgit v1.2.3


From d969fbe69e07fcceb0558b35d4c75eb046041c5e Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 2 Apr 2009 16:57:31 -0700
Subject: debug cgroup: remove unneeded cgroup_lock

Since we are in cgroup write handler, so the cgrp is valid, so we don't
have to hold cgroup_mutex when calling cgroup_task_count().  One similar
example is in cgroup_tasks_open().

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cgroup_debug.c | 2 --
 1 file changed, 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cgroup_debug.c b/kernel/cgroup_debug.c
index daca6209202d..0c92d797baa6 100644
--- a/kernel/cgroup_debug.c
+++ b/kernel/cgroup_debug.c
@@ -40,9 +40,7 @@ static u64 taskcount_read(struct cgroup *cont, struct cftype *cft)
 {
 	u64 count;
 
-	cgroup_lock();
 	count = cgroup_task_count(cont);
-	cgroup_unlock();
 	return count;
 }
 
-- 
cgit v1.2.3


From 0b7f569e45bb6be142d87017030669a6a7d327a1 Mon Sep 17 00:00:00 2001
From: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Date: Thu, 2 Apr 2009 16:57:38 -0700
Subject: memcg: fix OOM killer under memcg

This patch tries to fix OOM Killer problems caused by hierarchy.
Now, memcg itself has OOM KILL function (in oom_kill.c) and tries to
kill a task in memcg.

But, when hierarchy is used, it's broken and correct task cannot
be killed. For example, in following cgroup

	/groupA/	hierarchy=1, limit=1G,
		01	nolimit
		02	nolimit
All tasks' memory usage under /groupA, /groupA/01, groupA/02 is limited to
groupA's 1Gbytes but OOM Killer just kills tasks in groupA.

This patch provides makes the bad process be selected from all tasks
under hierarchy. BTW, currently, oom_jiffies is updated against groupA
in above case. oom_jiffies of tree should be updated.

To see how oom_jiffies is used, please check mem_cgroup_oom_called()
callers.

[akpm@linux-foundation.org: build fix]
[akpm@linux-foundation.org: const fix]
Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/cgroups/memcg_test.txt | 20 +++++++++++++++++++-
 include/linux/cgroup.h               |  2 +-
 kernel/cgroup.c                      |  2 +-
 mm/memcontrol.c                      | 30 ++++++++++++++++++++++++++++--
 4 files changed, 49 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/cgroups/memcg_test.txt b/Documentation/cgroups/memcg_test.txt
index 523a9c16c400..8a11caf417a0 100644
--- a/Documentation/cgroups/memcg_test.txt
+++ b/Documentation/cgroups/memcg_test.txt
@@ -1,5 +1,5 @@
 Memory Resource Controller(Memcg)  Implementation Memo.
-Last Updated: 2009/1/19
+Last Updated: 2009/1/20
 Base Kernel Version: based on 2.6.29-rc2.
 
 Because VM is getting complex (one of reasons is memcg...), memcg's behavior
@@ -360,3 +360,21 @@ Under below explanation, we assume CONFIG_MEM_RES_CTRL_SWAP=y.
 	# kill malloc task.
 
 	Of course, tmpfs v.s. swapoff test should be tested, too.
+
+ 9.8 OOM-Killer
+	Out-of-memory caused by memcg's limit will kill tasks under
+	the memcg. When hierarchy is used, a task under hierarchy
+	will be killed by the kernel.
+	In this case, panic_on_oom shouldn't be invoked and tasks
+	in other groups shouldn't be killed.
+
+	It's not difficult to cause OOM under memcg as following.
+	Case A) when you can swapoff
+	#swapoff -a
+	#echo 50M > /memory.limit_in_bytes
+	run 51M of malloc
+
+	Case B) when you use mem+swap limitation.
+	#echo 50M > memory.limit_in_bytes
+	#echo 50M > memory.memsw.limit_in_bytes
+	run 51M of malloc
diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index b2816fba5306..43763bd772b9 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -503,7 +503,7 @@ struct cgroup_subsys_state *css_get_next(struct cgroup_subsys *ss, int id,
 
 /* Returns true if root is ancestor of cg */
 bool css_is_ancestor(struct cgroup_subsys_state *cg,
-		     struct cgroup_subsys_state *root);
+		     const struct cgroup_subsys_state *root);
 
 /* Get id and depth of css */
 unsigned short css_id(struct cgroup_subsys_state *css);
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f2a3f5c9936c..382109b5baeb 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -3405,7 +3405,7 @@ unsigned short css_depth(struct cgroup_subsys_state *css)
 }
 
 bool css_is_ancestor(struct cgroup_subsys_state *child,
-		    struct cgroup_subsys_state *root)
+		    const struct cgroup_subsys_state *root)
 {
 	struct css_id *child_id = rcu_dereference(child->id);
 	struct css_id *root_id = rcu_dereference(root->id);
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6f6a575e77ad..025f8abfae2d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -295,6 +295,9 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
 static struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
 {
 	struct mem_cgroup *mem = NULL;
+
+	if (!mm)
+		return NULL;
 	/*
 	 * Because we have no locks, mm->owner's may be being moved to other
 	 * cgroup. We use css_tryget() here even if this looks
@@ -486,10 +489,20 @@ void mem_cgroup_move_lists(struct page *page,
 int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *mem)
 {
 	int ret;
+	struct mem_cgroup *curr = NULL;
 
 	task_lock(task);
-	ret = task->mm && mm_match_cgroup(task->mm, mem);
+	rcu_read_lock();
+	curr = try_get_mem_cgroup_from_mm(task->mm);
+	rcu_read_unlock();
 	task_unlock(task);
+	if (!curr)
+		return 0;
+	if (curr->use_hierarchy)
+		ret = css_is_ancestor(&curr->css, &mem->css);
+	else
+		ret = (curr == mem);
+	css_put(&curr->css);
 	return ret;
 }
 
@@ -820,6 +833,19 @@ bool mem_cgroup_oom_called(struct task_struct *task)
 	rcu_read_unlock();
 	return ret;
 }
+
+static int record_last_oom_cb(struct mem_cgroup *mem, void *data)
+{
+	mem->last_oom_jiffies = jiffies;
+	return 0;
+}
+
+static void record_last_oom(struct mem_cgroup *mem)
+{
+	mem_cgroup_walk_tree(mem, NULL, record_last_oom_cb);
+}
+
+
 /*
  * Unlike exported interface, "oom" parameter is added. if oom==true,
  * oom-killer can be invoked.
@@ -902,7 +928,7 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 				mutex_lock(&memcg_tasklist);
 				mem_cgroup_out_of_memory(mem_over_limit, gfp_mask);
 				mutex_unlock(&memcg_tasklist);
-				mem_over_limit->last_oom_jiffies = jiffies;
+				record_last_oom(mem_over_limit);
 			}
 			goto nomem;
 		}
-- 
cgit v1.2.3


From 0b4217b3fdddc4a58939720d3ed809537577d48b Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 2 Apr 2009 16:57:49 -0700
Subject: cpuset: fix possible races in cpu/memory hotplug

Change to cpuset->cpus_allowed and cpuset->mems_allowed should be protected
by callback_mutex, otherwise the reader may read wrong cpus/mems. This is
cpuset's lock rule.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cpuset.c | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index ee5ec386aa8b..31737957cb62 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2070,7 +2070,9 @@ static int cpuset_track_online_cpus(struct notifier_block *unused_nb,
 	}
 
 	cgroup_lock();
+	mutex_lock(&callback_mutex);
 	cpumask_copy(top_cpuset.cpus_allowed, cpu_online_mask);
+	mutex_unlock(&callback_mutex);
 	scan_for_empty_cpusets(&top_cpuset);
 	ndoms = generate_sched_domains(&doms, &attr);
 	cgroup_unlock();
@@ -2093,11 +2095,12 @@ static int cpuset_track_online_nodes(struct notifier_block *self,
 	cgroup_lock();
 	switch (action) {
 	case MEM_ONLINE:
-		top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
-		break;
 	case MEM_OFFLINE:
+		mutex_lock(&callback_mutex);
 		top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY];
-		scan_for_empty_cpusets(&top_cpuset);
+		mutex_unlock(&callback_mutex);
+		if (action == MEM_OFFLINE)
+			scan_for_empty_cpusets(&top_cpuset);
 		break;
 	default:
 		break;
-- 
cgit v1.2.3


From 3b6766fe668b83c8a03c6ed01bcc2ac77cbae848 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 2 Apr 2009 16:57:51 -0700
Subject: cpuset: rewrite update_tasks_nodemask()

This patch uses cgroup_scan_tasks() to rebind tasks' vmas to new cpuset's
mems_allowed.

Not only simplify the code largely, but also avoid allocating an array to
hold mm pointers of all the tasks in the cpuset.  This array can be big
(size > PAGESIZE) if we have lots of tasks in that cpuset, thus has a
chance to fail the allocation when under memory stress.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cpuset.c | 109 ++++++++++++++++++++------------------------------------
 1 file changed, 39 insertions(+), 70 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 31737957cb62..dca455e0482e 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1026,6 +1026,31 @@ static void cpuset_migrate_mm(struct mm_struct *mm, const nodemask_t *from,
 	mutex_unlock(&callback_mutex);
 }
 
+/*
+ * Rebind task's vmas to cpuset's new mems_allowed, and migrate pages to new
+ * nodes if memory_migrate flag is set. Called with cgroup_mutex held.
+ */
+static void cpuset_change_nodemask(struct task_struct *p,
+				   struct cgroup_scanner *scan)
+{
+	struct mm_struct *mm;
+	struct cpuset *cs;
+	int migrate;
+	const nodemask_t *oldmem = scan->data;
+
+	mm = get_task_mm(p);
+	if (!mm)
+		return;
+
+	cs = cgroup_cs(scan->cg);
+	migrate = is_memory_migrate(cs);
+
+	mpol_rebind_mm(mm, &cs->mems_allowed);
+	if (migrate)
+		cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
+	mmput(mm);
+}
+
 static void *cpuset_being_rebound;
 
 /**
@@ -1038,88 +1063,32 @@ static void *cpuset_being_rebound;
  */
 static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
 {
-	struct task_struct *p;
-	struct mm_struct **mmarray;
-	int i, n, ntasks;
-	int migrate;
-	int fudge;
-	struct cgroup_iter it;
 	int retval;
+	struct cgroup_scanner scan;
 
 	cpuset_being_rebound = cs;		/* causes mpol_dup() rebind */
 
-	fudge = 10;				/* spare mmarray[] slots */
-	fudge += cpumask_weight(cs->cpus_allowed);/* imagine 1 fork-bomb/cpu */
-	retval = -ENOMEM;
-
-	/*
-	 * Allocate mmarray[] to hold mm reference for each task
-	 * in cpuset cs.  Can't kmalloc GFP_KERNEL while holding
-	 * tasklist_lock.  We could use GFP_ATOMIC, but with a
-	 * few more lines of code, we can retry until we get a big
-	 * enough mmarray[] w/o using GFP_ATOMIC.
-	 */
-	while (1) {
-		ntasks = cgroup_task_count(cs->css.cgroup);  /* guess */
-		ntasks += fudge;
-		mmarray = kmalloc(ntasks * sizeof(*mmarray), GFP_KERNEL);
-		if (!mmarray)
-			goto done;
-		read_lock(&tasklist_lock);		/* block fork */
-		if (cgroup_task_count(cs->css.cgroup) <= ntasks)
-			break;				/* got enough */
-		read_unlock(&tasklist_lock);		/* try again */
-		kfree(mmarray);
-	}
-
-	n = 0;
-
-	/* Load up mmarray[] with mm reference for each task in cpuset. */
-	cgroup_iter_start(cs->css.cgroup, &it);
-	while ((p = cgroup_iter_next(cs->css.cgroup, &it))) {
-		struct mm_struct *mm;
-
-		if (n >= ntasks) {
-			printk(KERN_WARNING
-				"Cpuset mempolicy rebind incomplete.\n");
-			break;
-		}
-		mm = get_task_mm(p);
-		if (!mm)
-			continue;
-		mmarray[n++] = mm;
-	}
-	cgroup_iter_end(cs->css.cgroup, &it);
-	read_unlock(&tasklist_lock);
+	scan.cg = cs->css.cgroup;
+	scan.test_task = NULL;
+	scan.process_task = cpuset_change_nodemask;
+	scan.heap = NULL;
+	scan.data = (nodemask_t *)oldmem;
 
 	/*
-	 * Now that we've dropped the tasklist spinlock, we can
-	 * rebind the vma mempolicies of each mm in mmarray[] to their
-	 * new cpuset, and release that mm.  The mpol_rebind_mm()
-	 * call takes mmap_sem, which we couldn't take while holding
-	 * tasklist_lock.  Forks can happen again now - the mpol_dup()
-	 * cpuset_being_rebound check will catch such forks, and rebind
-	 * their vma mempolicies too.  Because we still hold the global
-	 * cgroup_mutex, we know that no other rebind effort will
-	 * be contending for the global variable cpuset_being_rebound.
+	 * The mpol_rebind_mm() call takes mmap_sem, which we couldn't
+	 * take while holding tasklist_lock.  Forks can happen - the
+	 * mpol_dup() cpuset_being_rebound check will catch such forks,
+	 * and rebind their vma mempolicies too.  Because we still hold
+	 * the global cgroup_mutex, we know that no other rebind effort
+	 * will be contending for the global variable cpuset_being_rebound.
 	 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
 	 * is idempotent.  Also migrate pages in each mm to new nodes.
 	 */
-	migrate = is_memory_migrate(cs);
-	for (i = 0; i < n; i++) {
-		struct mm_struct *mm = mmarray[i];
-
-		mpol_rebind_mm(mm, &cs->mems_allowed);
-		if (migrate)
-			cpuset_migrate_mm(mm, oldmem, &cs->mems_allowed);
-		mmput(mm);
-	}
+	retval = cgroup_scan_tasks(&scan);
 
 	/* We're done rebinding vmas to this cpuset's new mems_allowed. */
-	kfree(mmarray);
 	cpuset_being_rebound = NULL;
-	retval = 0;
-done:
+
 	return retval;
 }
 
-- 
cgit v1.2.3


From 010cfac4ca0f9e85f54ba2117a372e72f4fb9a60 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 2 Apr 2009 16:57:52 -0700
Subject: cpuset: avoid changing cpuset's mems when errno returned

When writing to cpuset.mems, cpuset has to update its mems_allowed before
calling update_tasks_nodemask(), but this function might return -ENOMEM.

To avoid this rare case, we allocate the memory before changing
mems_allowed, and then pass to update_tasks_nodemask().  Similar to what
update_cpumask() does.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cpuset.c | 25 ++++++++++++++++---------
 1 file changed, 16 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index dca455e0482e..3778a21a4662 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1057,13 +1057,15 @@ static void *cpuset_being_rebound;
  * update_tasks_nodemask - Update the nodemasks of tasks in the cpuset.
  * @cs: the cpuset in which each task's mems_allowed mask needs to be changed
  * @oldmem: old mems_allowed of cpuset cs
+ * @heap: if NULL, defer allocating heap memory to cgroup_scan_tasks()
  *
  * Called with cgroup_mutex held
- * Return 0 if successful, -errno if not.
+ * No return value. It's guaranteed that cgroup_scan_tasks() always returns 0
+ * if @heap != NULL.
  */
-static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
+static void update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem,
+				 struct ptr_heap *heap)
 {
-	int retval;
 	struct cgroup_scanner scan;
 
 	cpuset_being_rebound = cs;		/* causes mpol_dup() rebind */
@@ -1071,7 +1073,7 @@ static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
 	scan.cg = cs->css.cgroup;
 	scan.test_task = NULL;
 	scan.process_task = cpuset_change_nodemask;
-	scan.heap = NULL;
+	scan.heap = heap;
 	scan.data = (nodemask_t *)oldmem;
 
 	/*
@@ -1084,12 +1086,10 @@ static int update_tasks_nodemask(struct cpuset *cs, const nodemask_t *oldmem)
 	 * It's ok if we rebind the same mm twice; mpol_rebind_mm()
 	 * is idempotent.  Also migrate pages in each mm to new nodes.
 	 */
-	retval = cgroup_scan_tasks(&scan);
+	cgroup_scan_tasks(&scan);
 
 	/* We're done rebinding vmas to this cpuset's new mems_allowed. */
 	cpuset_being_rebound = NULL;
-
-	return retval;
 }
 
 /*
@@ -1110,6 +1110,7 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
 {
 	nodemask_t oldmem;
 	int retval;
+	struct ptr_heap heap;
 
 	/*
 	 * top_cpuset.mems_allowed tracks node_stats[N_HIGH_MEMORY];
@@ -1144,12 +1145,18 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs,
 	if (retval < 0)
 		goto done;
 
+	retval = heap_init(&heap, PAGE_SIZE, GFP_KERNEL, NULL);
+	if (retval < 0)
+		goto done;
+
 	mutex_lock(&callback_mutex);
 	cs->mems_allowed = trialcs->mems_allowed;
 	cs->mems_generation = cpuset_mems_generation++;
 	mutex_unlock(&callback_mutex);
 
-	retval = update_tasks_nodemask(cs, &oldmem);
+	update_tasks_nodemask(cs, &oldmem, &heap);
+
+	heap_free(&heap);
 done:
 	return retval;
 }
@@ -2003,7 +2010,7 @@ static void scan_for_empty_cpusets(struct cpuset *root)
 			remove_tasks_in_empty_cpuset(cp);
 		else {
 			update_tasks_cpumask(cp, NULL);
-			update_tasks_nodemask(cp, &oldmems);
+			update_tasks_nodemask(cp, &oldmems, NULL);
 		}
 	}
 }
-- 
cgit v1.2.3


From 7f81b1ae18416b457e4d5ff23f0bd598e8a42224 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 2 Apr 2009 16:57:53 -0700
Subject: cpuset: remove struct cpuset_hotplug_scanner

Use cgroup_scanner.data, instead of introducing cpuset_hotplug_scanner.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cpuset.c | 23 +++++++++--------------
 1 file changed, 9 insertions(+), 14 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 3778a21a4662..0619f109d38d 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -128,10 +128,6 @@ static inline struct cpuset *task_cs(struct task_struct *task)
 	return container_of(task_subsys_state(task, cpuset_subsys_id),
 			    struct cpuset, css);
 }
-struct cpuset_hotplug_scanner {
-	struct cgroup_scanner scan;
-	struct cgroup *to;
-};
 
 /* bits in struct cpuset flags field */
 typedef enum {
@@ -1890,10 +1886,9 @@ int __init cpuset_init(void)
 static void cpuset_do_move_task(struct task_struct *tsk,
 				struct cgroup_scanner *scan)
 {
-	struct cpuset_hotplug_scanner *chsp;
+	struct cgroup *new_cgroup = scan->data;
 
-	chsp = container_of(scan, struct cpuset_hotplug_scanner, scan);
-	cgroup_attach_task(chsp->to, tsk);
+	cgroup_attach_task(new_cgroup, tsk);
 }
 
 /**
@@ -1909,15 +1904,15 @@ static void cpuset_do_move_task(struct task_struct *tsk,
  */
 static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to)
 {
-	struct cpuset_hotplug_scanner scan;
+	struct cgroup_scanner scan;
 
-	scan.scan.cg = from->css.cgroup;
-	scan.scan.test_task = NULL; /* select all tasks in cgroup */
-	scan.scan.process_task = cpuset_do_move_task;
-	scan.scan.heap = NULL;
-	scan.to = to->css.cgroup;
+	scan.cg = from->css.cgroup;
+	scan.test_task = NULL; /* select all tasks in cgroup */
+	scan.process_task = cpuset_do_move_task;
+	scan.heap = NULL;
+	scan.data = to->css.cgroup;
 
-	if (cgroup_scan_tasks(&scan.scan))
+	if (cgroup_scan_tasks(&scan))
 		printk(KERN_ERR "move_member_tasks_to_cpuset: "
 				"cgroup_scan_tasks failed\n");
 }
-- 
cgit v1.2.3


From a1bc5a4eee990a1f290735c8694d0aebdad095fa Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Thu, 2 Apr 2009 16:57:54 -0700
Subject: cpusets: replace zone allowed functions with node allowed

The cpuset_zone_allowed() variants are actually only a function of the
zone's node.

Cc: Paul Menage <menage@google.com>
Acked-by: Christoph Lameter <cl@linux-foundation.org>
Cc: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/cpuset.h | 33 +++++++++++++++++++++++-----
 kernel/cpuset.c        | 59 +++++++++++++++++++++-----------------------------
 2 files changed, 52 insertions(+), 40 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h
index 2e0d79678deb..05ea1dd7d681 100644
--- a/include/linux/cpuset.h
+++ b/include/linux/cpuset.h
@@ -12,6 +12,7 @@
 #include <linux/cpumask.h>
 #include <linux/nodemask.h>
 #include <linux/cgroup.h>
+#include <linux/mm.h>
 
 #ifdef CONFIG_CPUSETS
 
@@ -29,19 +30,29 @@ void cpuset_init_current_mems_allowed(void);
 void cpuset_update_task_memory_state(void);
 int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask);
 
-extern int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask);
-extern int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask);
+extern int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask);
+extern int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask);
 
-static int inline cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
+static inline int cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
 {
 	return number_of_cpusets <= 1 ||
-		__cpuset_zone_allowed_softwall(z, gfp_mask);
+		__cpuset_node_allowed_softwall(node, gfp_mask);
 }
 
-static int inline cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask)
+static inline int cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
 {
 	return number_of_cpusets <= 1 ||
-		__cpuset_zone_allowed_hardwall(z, gfp_mask);
+		__cpuset_node_allowed_hardwall(node, gfp_mask);
+}
+
+static inline int cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
+{
+	return cpuset_node_allowed_softwall(zone_to_nid(z), gfp_mask);
+}
+
+static inline int cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask)
+{
+	return cpuset_node_allowed_hardwall(zone_to_nid(z), gfp_mask);
 }
 
 extern int cpuset_mems_allowed_intersects(const struct task_struct *tsk1,
@@ -112,6 +123,16 @@ static inline int cpuset_nodemask_valid_mems_allowed(nodemask_t *nodemask)
 	return 1;
 }
 
+static inline int cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
+{
+	return 1;
+}
+
+static inline int cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
+{
+	return 1;
+}
+
 static inline int cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
 {
 	return 1;
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 0619f109d38d..3ff910eb30d3 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -2181,26 +2181,24 @@ static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
 }
 
 /**
- * cpuset_zone_allowed_softwall - Can we allocate on zone z's memory node?
- * @z: is this zone on an allowed node?
+ * cpuset_node_allowed_softwall - Can we allocate on a memory node?
+ * @node: is this an allowed node?
  * @gfp_mask: memory allocation flags
  *
- * If we're in interrupt, yes, we can always allocate.  If
- * __GFP_THISNODE is set, yes, we can always allocate.  If zone
- * z's node is in our tasks mems_allowed, yes.  If it's not a
- * __GFP_HARDWALL request and this zone's nodes is in the nearest
- * hardwalled cpuset ancestor to this tasks cpuset, yes.
- * If the task has been OOM killed and has access to memory reserves
- * as specified by the TIF_MEMDIE flag, yes.
+ * If we're in interrupt, yes, we can always allocate.  If __GFP_THISNODE is
+ * set, yes, we can always allocate.  If node is in our task's mems_allowed,
+ * yes.  If it's not a __GFP_HARDWALL request and this node is in the nearest
+ * hardwalled cpuset ancestor to this task's cpuset, yes.  If the task has been
+ * OOM killed and has access to memory reserves as specified by the TIF_MEMDIE
+ * flag, yes.
  * Otherwise, no.
  *
- * If __GFP_HARDWALL is set, cpuset_zone_allowed_softwall()
- * reduces to cpuset_zone_allowed_hardwall().  Otherwise,
- * cpuset_zone_allowed_softwall() might sleep, and might allow a zone
- * from an enclosing cpuset.
+ * If __GFP_HARDWALL is set, cpuset_node_allowed_softwall() reduces to
+ * cpuset_node_allowed_hardwall().  Otherwise, cpuset_node_allowed_softwall()
+ * might sleep, and might allow a node from an enclosing cpuset.
  *
- * cpuset_zone_allowed_hardwall() only handles the simpler case of
- * hardwall cpusets, and never sleeps.
+ * cpuset_node_allowed_hardwall() only handles the simpler case of hardwall
+ * cpusets, and never sleeps.
  *
  * The __GFP_THISNODE placement logic is really handled elsewhere,
  * by forcibly using a zonelist starting at a specified node, and by
@@ -2239,20 +2237,17 @@ static const struct cpuset *nearest_hardwall_ancestor(const struct cpuset *cs)
  *	GFP_USER     - only nodes in current tasks mems allowed ok.
  *
  * Rule:
- *    Don't call cpuset_zone_allowed_softwall if you can't sleep, unless you
+ *    Don't call cpuset_node_allowed_softwall if you can't sleep, unless you
  *    pass in the __GFP_HARDWALL flag set in gfp_flag, which disables
  *    the code that might scan up ancestor cpusets and sleep.
  */
-
-int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
+int __cpuset_node_allowed_softwall(int node, gfp_t gfp_mask)
 {
-	int node;			/* node that zone z is on */
 	const struct cpuset *cs;	/* current cpuset ancestors */
 	int allowed;			/* is allocation in zone z allowed? */
 
 	if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
 		return 1;
-	node = zone_to_nid(z);
 	might_sleep_if(!(gfp_mask & __GFP_HARDWALL));
 	if (node_isset(node, current->mems_allowed))
 		return 1;
@@ -2281,15 +2276,15 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
 }
 
 /*
- * cpuset_zone_allowed_hardwall - Can we allocate on zone z's memory node?
- * @z: is this zone on an allowed node?
+ * cpuset_node_allowed_hardwall - Can we allocate on a memory node?
+ * @node: is this an allowed node?
  * @gfp_mask: memory allocation flags
  *
- * If we're in interrupt, yes, we can always allocate.
- * If __GFP_THISNODE is set, yes, we can always allocate.  If zone
- * z's node is in our tasks mems_allowed, yes.   If the task has been
- * OOM killed and has access to memory reserves as specified by the
- * TIF_MEMDIE flag, yes.  Otherwise, no.
+ * If we're in interrupt, yes, we can always allocate.  If __GFP_THISNODE is
+ * set, yes, we can always allocate.  If node is in our task's mems_allowed,
+ * yes.  If the task has been OOM killed and has access to memory reserves as
+ * specified by the TIF_MEMDIE flag, yes.
+ * Otherwise, no.
  *
  * The __GFP_THISNODE placement logic is really handled elsewhere,
  * by forcibly using a zonelist starting at a specified node, and by
@@ -2297,20 +2292,16 @@ int __cpuset_zone_allowed_softwall(struct zone *z, gfp_t gfp_mask)
  * any node on the zonelist except the first.  By the time any such
  * calls get to this routine, we should just shut up and say 'yes'.
  *
- * Unlike the cpuset_zone_allowed_softwall() variant, above,
- * this variant requires that the zone be in the current tasks
+ * Unlike the cpuset_node_allowed_softwall() variant, above,
+ * this variant requires that the node be in the current task's
  * mems_allowed or that we're in interrupt.  It does not scan up the
  * cpuset hierarchy for the nearest enclosing mem_exclusive cpuset.
  * It never sleeps.
  */
-
-int __cpuset_zone_allowed_hardwall(struct zone *z, gfp_t gfp_mask)
+int __cpuset_node_allowed_hardwall(int node, gfp_t gfp_mask)
 {
-	int node;			/* node that zone z is on */
-
 	if (in_interrupt() || (gfp_mask & __GFP_THISNODE))
 		return 1;
-	node = zone_to_nid(z);
 	if (node_isset(node, current->mems_allowed))
 		return 1;
 	/*
-- 
cgit v1.2.3


From db7f47cf4805e30decb0841764b21b7c4000f7dc Mon Sep 17 00:00:00 2001
From: Paul Menage <menage@google.com>
Date: Thu, 2 Apr 2009 16:57:55 -0700
Subject: cpusets: allow cpusets to be configured/built on non-SMP systems

Allow cpusets to be configured/built on non-SMP systems

Currently it's impossible to build cpusets under UML on x86-64, since
cpusets depends on SMP and x86-64 UML doesn't support SMP.

There's code in cpusets that doesn't depend on SMP.  This patch surrounds
the minimum amount of cpusets code with #ifdef CONFIG_SMP in order to
allow cpusets to build/run on UP systems (for testing purposes under UML).

Reviewed-by: Li Zefan <lizf@cn.fujitsu.com>
Signed-off-by: Paul Menage <menage@google.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 init/Kconfig    |  2 +-
 kernel/cpuset.c | 15 +++++++++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/init/Kconfig b/init/Kconfig
index 92d410603932..1398a14b0191 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -531,7 +531,7 @@ config CGROUP_DEVICE
 
 config CPUSETS
 	bool "Cpuset support"
-	depends on SMP && CGROUPS
+	depends on CGROUPS
 	help
 	  This option will let you create and manage CPUSETs which
 	  allow dynamically partitioning a system into sets of CPUs and
diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 3ff910eb30d3..2b93b50cbe4b 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -517,6 +517,7 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial)
 	return 0;
 }
 
+#ifdef CONFIG_SMP
 /*
  * Helper routine for generate_sched_domains().
  * Do cpusets a, b have overlapping cpus_allowed masks?
@@ -811,6 +812,18 @@ static void do_rebuild_sched_domains(struct work_struct *unused)
 
 	put_online_cpus();
 }
+#else /* !CONFIG_SMP */
+static void do_rebuild_sched_domains(struct work_struct *unused)
+{
+}
+
+static int generate_sched_domains(struct cpumask **domains,
+			struct sched_domain_attr **attributes)
+{
+	*domains = NULL;
+	return 1;
+}
+#endif /* CONFIG_SMP */
 
 static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains);
 
@@ -1164,8 +1177,10 @@ int current_cpuset_is_being_rebound(void)
 
 static int update_relax_domain_level(struct cpuset *cs, s64 val)
 {
+#ifdef CONFIG_SMP
 	if (val < -1 || val >= SD_LV_MAX)
 		return -EINVAL;
+#endif
 
 	if (val != cs->relax_domain_level) {
 		cs->relax_domain_level = val;
-- 
cgit v1.2.3


From 6d7b2f5f9e88902b19f91d0c8a7ef58a5455f1a2 Mon Sep 17 00:00:00 2001
From: David Rientjes <rientjes@google.com>
Date: Thu, 2 Apr 2009 16:57:57 -0700
Subject: cpusets: prevent PF_THREAD_BOUND tasks from attaching to non-root
 cpusets

Kthreads that have the PF_THREAD_BOUND bit set in their flags are bound to a
specific cpu.  Thus, their set of allowed cpus shall not change.

This patch prevents such threads from attaching to non-root cpusets.  They do
not have mempolicies that restrict them to a subset of system nodes and, since
their cpumask may never change, they cannot use any of the features of
cpusets.

The tasks will forever be a member of the root cpuset and will be returned
when listing the tasks attached to that cpuset.

Cc: Paul Menage <menage@google.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Dhaval Giani <dhaval@linux.vnet.ibm.com>
Signed-off-by: David Rientjes <rientjes@google.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/cpuset.c | 19 +++++++++++--------
 1 file changed, 11 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/cpuset.c b/kernel/cpuset.c
index 2b93b50cbe4b..026faccca869 100644
--- a/kernel/cpuset.c
+++ b/kernel/cpuset.c
@@ -1342,19 +1342,22 @@ static int cpuset_can_attach(struct cgroup_subsys *ss,
 			     struct cgroup *cont, struct task_struct *tsk)
 {
 	struct cpuset *cs = cgroup_cs(cont);
-	int ret = 0;
 
 	if (cpumask_empty(cs->cpus_allowed) || nodes_empty(cs->mems_allowed))
 		return -ENOSPC;
 
-	if (tsk->flags & PF_THREAD_BOUND) {
-		mutex_lock(&callback_mutex);
-		if (!cpumask_equal(&tsk->cpus_allowed, cs->cpus_allowed))
-			ret = -EINVAL;
-		mutex_unlock(&callback_mutex);
-	}
+	/*
+	 * Kthreads bound to specific cpus cannot be moved to a new cpuset; we
+	 * cannot change their cpu affinity and isolating such threads by their
+	 * set of allowed nodes is unnecessary.  Thus, cpusets are not
+	 * applicable for such threads.  This prevents checking for success of
+	 * set_cpus_allowed_ptr() on all attached tasks before cpus_allowed may
+	 * be changed.
+	 */
+	if (tsk->flags & PF_THREAD_BOUND)
+		return -EINVAL;
 
-	return ret < 0 ? ret : security_task_setscheduler(tsk, 0, NULL);
+	return security_task_setscheduler(tsk, 0, NULL);
 }
 
 static void cpuset_attach(struct cgroup_subsys *ss,
-- 
cgit v1.2.3


From 90bc8d8b1a38f1ab131a2399a202e1889db95de8 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 2 Apr 2009 16:57:58 -0700
Subject: do_wait: fix waiting for the group stop with the dead leader

do_wait(WSTOPPED) assumes that p->state must be == TASK_STOPPED, this is
not true if the leader is already dead.  Check SIGNAL_STOP_STOPPED instead
and use signal->group_exit_code.

Trivial test-case:

	void *tfunc(void *arg)
	{
		pause();
		return NULL;
	}

	int main(void)
	{
		pthread_t thr;
		pthread_create(&thr, NULL, tfunc, NULL);
		pthread_exit(NULL);
		return 0;
	}

It doesn't react to ^Z (and then to ^C or ^\). The task is stopped, but
bash can't see this.

The bug is very old, and it was reported multiple times. This patch was sent
more than a year ago (http://marc.info/?t=119713920000003) but it was ignored.

This change also fixes other oddities (but not all) in this area.  For
example, before this patch:

	$ sleep 100
	^Z
	[1]+  Stopped                 sleep 100
	$ strace -p `pidof sleep`
	Process 11442 attached - interrupt to quit

strace hangs in do_wait(), because ->exit_code was already consumed by
bash.  After this patch, strace happily proceeds:

	--- SIGTSTP (Stopped) @ 0 (0) ---
	restart_syscall(<... resuming interrupted call ...>

To me, this looks much more "natural" and correct.

Another example.  Let's suppose we have the main thread M and sub-thread
T, the process is stopped, and its parent did wait(WSTOPPED).  Now we can
ptrace T but not M.  This looks at least strange to me.

Imho, do_wait() should not confuse the per-thread ptrace stops with the
per-process job control stops.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Jan Kratochvil <jan.kratochvil@redhat.com>
Cc: Kaz Kylheku <kkylheku@gmail.com>
Cc: Michael Kerrisk <mtk.manpages@googlemail.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Ulrich Drepper <drepper@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 30 ++++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 167e1e3ad7c6..0c06b9efae3b 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1417,6 +1417,18 @@ static int wait_task_zombie(struct task_struct *p, int options,
 	return retval;
 }
 
+static int *task_stopped_code(struct task_struct *p, bool ptrace)
+{
+	if (ptrace) {
+		if (task_is_stopped_or_traced(p))
+			return &p->exit_code;
+	} else {
+		if (p->signal->flags & SIGNAL_STOP_STOPPED)
+			return &p->signal->group_exit_code;
+	}
+	return NULL;
+}
+
 /*
  * Handle sys_wait4 work for one task in state TASK_STOPPED.  We hold
  * read_lock(&tasklist_lock) on entry.  If we return zero, we still hold
@@ -1427,7 +1439,7 @@ static int wait_task_stopped(int ptrace, struct task_struct *p,
 			     int options, struct siginfo __user *infop,
 			     int __user *stat_addr, struct rusage __user *ru)
 {
-	int retval, exit_code, why;
+	int retval, exit_code, *p_code, why;
 	uid_t uid = 0; /* unneeded, required by compiler */
 	pid_t pid;
 
@@ -1437,22 +1449,16 @@ static int wait_task_stopped(int ptrace, struct task_struct *p,
 	exit_code = 0;
 	spin_lock_irq(&p->sighand->siglock);
 
-	if (unlikely(!task_is_stopped_or_traced(p)))
-		goto unlock_sig;
-
-	if (!ptrace && p->signal->group_stop_count > 0)
-		/*
-		 * A group stop is in progress and this is the group leader.
-		 * We won't report until all threads have stopped.
-		 */
+	p_code = task_stopped_code(p, ptrace);
+	if (unlikely(!p_code))
 		goto unlock_sig;
 
-	exit_code = p->exit_code;
+	exit_code = *p_code;
 	if (!exit_code)
 		goto unlock_sig;
 
 	if (!unlikely(options & WNOWAIT))
-		p->exit_code = 0;
+		*p_code = 0;
 
 	/* don't need the RCU readlock here as we're holding a spinlock */
 	uid = __task_cred(p)->uid;
@@ -1608,7 +1614,7 @@ static int wait_consider_task(struct task_struct *parent, int ptrace,
 	 */
 	*notask_error = 0;
 
-	if (task_is_stopped_or_traced(p))
+	if (task_stopped_code(p, ptrace))
 		return wait_task_stopped(ptrace, p, options,
 					 infop, stat_addr, ru);
 
-- 
cgit v1.2.3


From 43918f2bf4806675943416d539d9d5e4d585ebff Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 2 Apr 2009 16:58:00 -0700
Subject: signals: remove 'handler' parameter to tracehook functions

Container-init must behave like global-init to processes within the
container and hence it must be immune to unhandled fatal signals from
within the container (i.e SIG_DFL signals that terminate the process).

But the same container-init must behave like a normal process to processes
in ancestor namespaces and so if it receives the same fatal signal from a
process in ancestor namespace, the signal must be processed.

Implementing these semantics requires that send_signal() determine pid
namespace of the sender but since signals can originate from workqueues/
interrupt-handlers, determining pid namespace of sender may not always be
possible or safe.

This patchset implements the design/simplified semantics suggested by
Oleg Nesterov.  The simplified semantics for container-init are:

	- container-init must never be terminated by a signal from a
	  descendant process.

	- container-init must never be immune to SIGKILL from an ancestor
	  namespace (so a process in parent namespace must always be able
	  to terminate a descendant container).

	- container-init may be immune to unhandled fatal signals (like
	  SIGUSR1) even if they are from ancestor namespace. SIGKILL/SIGSTOP
	  are the only reliable signals to a container-init from ancestor
	  namespace.

This patch:

Based on an earlier patch submitted by Oleg Nesterov and comments from
Roland McGrath (http://lkml.org/lkml/2008/11/19/258).

The handler parameter is currently unused in the tracehook functions.
Besides, the tracehook functions are called with siglock held, so the
functions can check the handler if they later need to.

Removing the parameter simiplifies changes to sig_ignored() in a follow-on
patch.

Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Acked-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Daniel Lezcano <daniel.lezcano@free.fr>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/x86/kernel/ptrace.c  |  2 +-
 include/linux/tracehook.h | 13 ++++---------
 kernel/signal.c           |  6 +++---
 3 files changed, 8 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index 19378715f415..b7cc21bc6ae0 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -1455,6 +1455,6 @@ asmregparm void syscall_trace_leave(struct pt_regs *regs)
 	 * system call instruction.
 	 */
 	if (test_thread_flag(TIF_SINGLESTEP) &&
-	    tracehook_consider_fatal_signal(current, SIGTRAP, SIG_DFL))
+	    tracehook_consider_fatal_signal(current, SIGTRAP))
 		send_sigtrap(current, regs, 0, TRAP_BRKPT);
 }
diff --git a/include/linux/tracehook.h b/include/linux/tracehook.h
index 6186a789d6c7..eb4c6545b384 100644
--- a/include/linux/tracehook.h
+++ b/include/linux/tracehook.h
@@ -388,17 +388,14 @@ static inline void tracehook_signal_handler(int sig, siginfo_t *info,
  * tracehook_consider_ignored_signal - suppress short-circuit of ignored signal
  * @task:		task receiving the signal
  * @sig:		signal number being sent
- * @handler:		%SIG_IGN or %SIG_DFL
  *
  * Return zero iff tracing doesn't care to examine this ignored signal,
  * so it can short-circuit normal delivery and never even get queued.
- * Either @handler is %SIG_DFL and @sig's default is ignore, or it's %SIG_IGN.
  *
  * Called with @task->sighand->siglock held.
  */
 static inline int tracehook_consider_ignored_signal(struct task_struct *task,
-						    int sig,
-						    void __user *handler)
+						    int sig)
 {
 	return (task_ptrace(task) & PT_PTRACED) != 0;
 }
@@ -407,19 +404,17 @@ static inline int tracehook_consider_ignored_signal(struct task_struct *task,
  * tracehook_consider_fatal_signal - suppress special handling of fatal signal
  * @task:		task receiving the signal
  * @sig:		signal number being sent
- * @handler:		%SIG_DFL or %SIG_IGN
  *
  * Return nonzero to prevent special handling of this termination signal.
- * Normally @handler is %SIG_DFL.  It can be %SIG_IGN if @sig is ignored,
- * in which case force_sig() is about to reset it to %SIG_DFL.
+ * Normally handler for signal is %SIG_DFL.  It can be %SIG_IGN if @sig is
+ * ignored, in which case force_sig() is about to reset it to %SIG_DFL.
  * When this returns zero, this signal might cause a quick termination
  * that does not give the debugger a chance to intercept the signal.
  *
  * Called with or without @task->sighand->siglock held.
  */
 static inline int tracehook_consider_fatal_signal(struct task_struct *task,
-						  int sig,
-						  void __user *handler)
+						  int sig)
 {
 	return (task_ptrace(task) & PT_PTRACED) != 0;
 }
diff --git a/kernel/signal.c b/kernel/signal.c
index 1c8814481a11..92a1ab004498 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -74,7 +74,7 @@ static int sig_ignored(struct task_struct *t, int sig)
 	/*
 	 * Tracers may want to know about even ignored signals.
 	 */
-	return !tracehook_consider_ignored_signal(t, sig, handler);
+	return !tracehook_consider_ignored_signal(t, sig);
 }
 
 /*
@@ -318,7 +318,7 @@ int unhandled_signal(struct task_struct *tsk, int sig)
 		return 1;
 	if (handler != SIG_IGN && handler != SIG_DFL)
 		return 0;
-	return !tracehook_consider_fatal_signal(tsk, sig, handler);
+	return !tracehook_consider_fatal_signal(tsk, sig);
 }
 
 
@@ -777,7 +777,7 @@ static void complete_signal(int sig, struct task_struct *p, int group)
 	    !(signal->flags & (SIGNAL_UNKILLABLE | SIGNAL_GROUP_EXIT)) &&
 	    !sigismember(&t->real_blocked, sig) &&
 	    (sig == SIGKILL ||
-	     !tracehook_consider_fatal_signal(t, sig, SIG_DFL))) {
+	     !tracehook_consider_fatal_signal(t, sig))) {
 		/*
 		 * This signal will be fatal to the whole group.
 		 */
-- 
cgit v1.2.3


From f008faff0e2777c8b3fe853891b774ca465938d8 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 2 Apr 2009 16:58:02 -0700
Subject: signals: protect init from unwanted signals more

(This is a modified version of the patch submitted by Oleg Nesterov
http://lkml.org/lkml/2008/11/18/249 and tries to address comments that
came up in that discussion)

init ignores the SIG_DFL signals but we queue them anyway, including
SIGKILL.  This is mostly OK, the signal will be dropped silently when
dequeued, but the pending SIGKILL has 2 bad implications:

        - it implies fatal_signal_pending(), so we confuse things
          like wait_for_completion_killable/lock_page_killable.

        - for the sub-namespace inits, the pending SIGKILL can
          mask (legacy_queue) the subsequent SIGKILL from the
          parent namespace which must kill cinit reliably.
          (preparation, cinits don't have SIGNAL_UNKILLABLE yet)

The patch can't help when init is ptraced, but ptracing of init is not
"safe" anyway.

Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Acked-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Daniel Lezcano <daniel.lezcano@free.fr>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 92a1ab004498..8bf7a40e5c71 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -55,10 +55,21 @@ static int sig_handler_ignored(void __user *handler, int sig)
 		(handler == SIG_DFL && sig_kernel_ignore(sig));
 }
 
-static int sig_ignored(struct task_struct *t, int sig)
+static int sig_task_ignored(struct task_struct *t, int sig)
 {
 	void __user *handler;
 
+	handler = sig_handler(t, sig);
+
+	if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) &&
+			handler == SIG_DFL)
+		return 1;
+
+	return sig_handler_ignored(handler, sig);
+}
+
+static int sig_ignored(struct task_struct *t, int sig)
+{
 	/*
 	 * Blocked signals are never ignored, since the
 	 * signal handler may change by the time it is
@@ -67,8 +78,7 @@ static int sig_ignored(struct task_struct *t, int sig)
 	if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig))
 		return 0;
 
-	handler = sig_handler(t, sig);
-	if (!sig_handler_ignored(handler, sig))
+	if (!sig_task_ignored(t, sig))
 		return 0;
 
 	/*
-- 
cgit v1.2.3


From 7978b567d31555fc828b8f945c605ad29e117b22 Mon Sep 17 00:00:00 2001
From: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Date: Thu, 2 Apr 2009 16:58:04 -0700
Subject: signals: add from_ancestor_ns parameter to send_signal()

send_signal() (or its helper) needs to determine the pid namespace of the
sender.  But a signal sent via kill_pid_info_as_uid() comes from within
the kernel and send_signal() does not need to determine the pid namespace
of the sender.  So define a helper for send_signal() which takes an
additional parameter, 'from_ancestor_ns' and have kill_pid_info_as_uid()
use that helper directly.

The 'from_ancestor_ns' parameter will be used in a follow-on patch.

Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Daniel Lezcano <daniel.lezcano@free.fr>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 8bf7a40e5c71..7b6de962a1af 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -823,8 +823,8 @@ static inline int legacy_queue(struct sigpending *signals, int sig)
 	return (sig < SIGRTMIN) && sigismember(&signals->signal, sig);
 }
 
-static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
-			int group)
+static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
+			int group, int from_ancestor_ns)
 {
 	struct sigpending *pending;
 	struct sigqueue *q;
@@ -899,6 +899,12 @@ out_set:
 	return 0;
 }
 
+static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
+			int group)
+{
+	return __send_signal(sig, info, t, group, 0);
+}
+
 int print_fatal_signals;
 
 static void print_fatal_signal(struct pt_regs *regs, int signr)
@@ -1143,7 +1149,7 @@ int kill_pid_info_as_uid(int sig, struct siginfo *info, struct pid *pid,
 	if (sig && p->sighand) {
 		unsigned long flags;
 		spin_lock_irqsave(&p->sighand->siglock, flags);
-		ret = __group_send_sig_info(sig, info, p);
+		ret = __send_signal(sig, info, p, 1, 0);
 		spin_unlock_irqrestore(&p->sighand->siglock, flags);
 	}
 out_unlock:
-- 
cgit v1.2.3


From 921cf9f63089c7442d44083477620132f4cea066 Mon Sep 17 00:00:00 2001
From: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Date: Thu, 2 Apr 2009 16:58:05 -0700
Subject: signals: protect cinit from unblocked SIG_DFL signals

Drop early any SIG_DFL or SIG_IGN signals to container-init from within
the same container.  But queue SIGSTOP and SIGKILL to the container-init
if they are from an ancestor container.

Blocked, fatal signals (i.e when SIG_DFL is to terminate) from within the
container can still terminate the container-init.  That will be addressed
in the next patch.

Note:	To be bisect-safe, SIGNAL_UNKILLABLE will be set for container-inits
   	in a follow-on patch. Until then, this patch is just a preparatory
	step.

Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Daniel Lezcano <daniel.lezcano@free.fr>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 28 +++++++++++++++++++---------
 1 file changed, 19 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index 7b6de962a1af..fb19aae2363b 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -55,20 +55,21 @@ static int sig_handler_ignored(void __user *handler, int sig)
 		(handler == SIG_DFL && sig_kernel_ignore(sig));
 }
 
-static int sig_task_ignored(struct task_struct *t, int sig)
+static int sig_task_ignored(struct task_struct *t, int sig,
+		int from_ancestor_ns)
 {
 	void __user *handler;
 
 	handler = sig_handler(t, sig);
 
 	if (unlikely(t->signal->flags & SIGNAL_UNKILLABLE) &&
-			handler == SIG_DFL)
+			handler == SIG_DFL && !from_ancestor_ns)
 		return 1;
 
 	return sig_handler_ignored(handler, sig);
 }
 
-static int sig_ignored(struct task_struct *t, int sig)
+static int sig_ignored(struct task_struct *t, int sig, int from_ancestor_ns)
 {
 	/*
 	 * Blocked signals are never ignored, since the
@@ -78,7 +79,7 @@ static int sig_ignored(struct task_struct *t, int sig)
 	if (sigismember(&t->blocked, sig) || sigismember(&t->real_blocked, sig))
 		return 0;
 
-	if (!sig_task_ignored(t, sig))
+	if (!sig_task_ignored(t, sig, from_ancestor_ns))
 		return 0;
 
 	/*
@@ -634,7 +635,7 @@ static int check_kill_permission(int sig, struct siginfo *info,
  * Returns true if the signal should be actually delivered, otherwise
  * it should be dropped.
  */
-static int prepare_signal(int sig, struct task_struct *p)
+static int prepare_signal(int sig, struct task_struct *p, int from_ancestor_ns)
 {
 	struct signal_struct *signal = p->signal;
 	struct task_struct *t;
@@ -718,7 +719,7 @@ static int prepare_signal(int sig, struct task_struct *p)
 		}
 	}
 
-	return !sig_ignored(p, sig);
+	return !sig_ignored(p, sig, from_ancestor_ns);
 }
 
 /*
@@ -832,7 +833,8 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
 	trace_sched_signal_send(sig, t);
 
 	assert_spin_locked(&t->sighand->siglock);
-	if (!prepare_signal(sig, t))
+
+	if (!prepare_signal(sig, t, from_ancestor_ns))
 		return 0;
 
 	pending = group ? &t->signal->shared_pending : &t->pending;
@@ -902,7 +904,15 @@ out_set:
 static int send_signal(int sig, struct siginfo *info, struct task_struct *t,
 			int group)
 {
-	return __send_signal(sig, info, t, group, 0);
+	int from_ancestor_ns = 0;
+
+#ifdef CONFIG_PID_NS
+	if (!is_si_special(info) && SI_FROMUSER(info) &&
+			task_pid_nr_ns(current, task_active_pid_ns(t)) <= 0)
+		from_ancestor_ns = 1;
+#endif
+
+	return __send_signal(sig, info, t, group, from_ancestor_ns);
 }
 
 int print_fatal_signals;
@@ -1336,7 +1346,7 @@ int send_sigqueue(struct sigqueue *q, struct task_struct *t, int group)
 		goto ret;
 
 	ret = 1; /* the signal is ignored */
-	if (!prepare_signal(sig, t))
+	if (!prepare_signal(sig, t, 0))
 		goto out;
 
 	ret = 0;
-- 
cgit v1.2.3


From e4da026f980df125a4918c3bb9fe93185c7ef12a Mon Sep 17 00:00:00 2001
From: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Date: Thu, 2 Apr 2009 16:58:06 -0700
Subject: signals: zap_pid_ns_process() should use force_sig()

send_signal() assumes that signals with SEND_SIG_PRIV are generated from
within the same namespace.  So any nested container-init processes become
immune to the SIGKILL generated by kill_proc_info() in
zap_pid_ns_processes().

Use force_sig() in zap_pid_ns_processes() instead - force_sig() clears the
SIGNAL_UNKILLABLE flag ensuring the signal is processed by
container-inits.

Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Daniel Lezcano <daniel.lezcano@free.fr>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/pid_namespace.c | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index fab8ea86fac3..2d1001b4858d 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -152,6 +152,7 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
 {
 	int nr;
 	int rc;
+	struct task_struct *task;
 
 	/*
 	 * The last thread in the cgroup-init thread group is terminating.
@@ -169,7 +170,19 @@ void zap_pid_ns_processes(struct pid_namespace *pid_ns)
 	read_lock(&tasklist_lock);
 	nr = next_pidmap(pid_ns, 1);
 	while (nr > 0) {
-		kill_proc_info(SIGKILL, SEND_SIG_PRIV, nr);
+		rcu_read_lock();
+
+		/*
+		 * Use force_sig() since it clears SIGNAL_UNKILLABLE ensuring
+		 * any nested-container's init processes don't ignore the
+		 * signal
+		 */
+		task = pid_task(find_vpid(nr), PIDTYPE_PID);
+		if (task)
+			force_sig(SIGKILL, task);
+
+		rcu_read_unlock();
+
 		nr = next_pidmap(pid_ns, nr);
 	}
 	read_unlock(&tasklist_lock);
-- 
cgit v1.2.3


From b3bfa0cba867f23365b81658b47efd906830879b Mon Sep 17 00:00:00 2001
From: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Date: Thu, 2 Apr 2009 16:58:08 -0700
Subject: signals: protect cinit from blocked fatal signals

Normally SIG_DFL signals to global and container-init are dropped early.
But if a signal is blocked when it is posted, we cannot drop the signal
since the receiver may install a handler before unblocking the signal.
Once this signal is queued however, the receiver container-init has no way
of knowing if the signal was sent from an ancestor or descendant
namespace.  This patch ensures that contianer-init drops all SIG_DFL
signals in get_signal_to_deliver() except SIGKILL/SIGSTOP.

If SIGSTOP/SIGKILL originate from a descendant of container-init they are
never queued (i.e dropped in sig_ignored() in an earler patch).

If SIGSTOP/SIGKILL originate from parent namespace, the signal is queued
and container-init processes the signal.

IOW, if get_signal_to_deliver() sees a sig_kernel_only() signal for global
or container-init, the signal must have been generated internally or must
have come from an ancestor ns and we process the signal.

Further, the signal_group_exit() check was needed to cover the case of a
multi-threaded init sending SIGKILL to other threads when doing an exit()
or exec().  But since the new sig_kernel_only() check covers the SIGKILL,
the signal_group_exit() check is no longer needed and can be removed.

Finally, now that we have all pieces in place, set SIGNAL_UNKILLABLE for
container-inits.

Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Daniel Lezcano <daniel.lezcano@free.fr>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/fork.c   | 2 ++
 kernel/signal.c | 9 ++++++++-
 2 files changed, 10 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/fork.c b/kernel/fork.c
index d7eb727eb535..adbea16ec649 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -841,6 +841,8 @@ static int copy_signal(unsigned long clone_flags, struct task_struct *tsk)
 	atomic_set(&sig->live, 1);
 	init_waitqueue_head(&sig->wait_chldexit);
 	sig->flags = 0;
+	if (clone_flags & CLONE_NEWPID)
+		sig->flags |= SIGNAL_UNKILLABLE;
 	sig->group_exit_code = 0;
 	sig->group_exit_task = NULL;
 	sig->group_stop_count = 0;
diff --git a/kernel/signal.c b/kernel/signal.c
index fb19aae2363b..ba3da25f0eea 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -1870,9 +1870,16 @@ relock:
 
 		/*
 		 * Global init gets no signals it doesn't want.
+		 * Container-init gets no signals it doesn't want from same
+		 * container.
+		 *
+		 * Note that if global/container-init sees a sig_kernel_only()
+		 * signal here, the signal must have been generated internally
+		 * or must have come from an ancestor namespace. In either
+		 * case, the signal cannot be dropped.
 		 */
 		if (unlikely(signal->flags & SIGNAL_UNKILLABLE) &&
-		    !signal_group_exit(signal))
+				!sig_kernel_only(signr))
 			continue;
 
 		if (sig_kernel_stop(signr)) {
-- 
cgit v1.2.3


From 6588c1e3ff01418acafd938db0740e3477dc8cb7 Mon Sep 17 00:00:00 2001
From: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Date: Thu, 2 Apr 2009 16:58:09 -0700
Subject: signals: SI_USER: Masquerade si_pid when crossing pid ns boundary

When sending a signal to a descendant namespace, set ->si_pid to 0 since
the sender does not have a pid in the receiver's namespace.

Note:
	- If rt_sigqueueinfo() sets si_code to SI_USER when sending a
	  signal across a pid namespace boundary, the value in ->si_pid
	  will be cleared to 0.

Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Roland McGrath <roland@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Daniel Lezcano <daniel.lezcano@free.fr>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/signal.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/signal.c b/kernel/signal.c
index ba3da25f0eea..d8034737db4c 100644
--- a/kernel/signal.c
+++ b/kernel/signal.c
@@ -883,6 +883,8 @@ static int __send_signal(int sig, struct siginfo *info, struct task_struct *t,
 			break;
 		default:
 			copy_siginfo(&q->info, info);
+			if (from_ancestor_ns)
+				q->info.si_pid = 0;
 			break;
 		}
 	} else if (!is_si_special(info)) {
-- 
cgit v1.2.3


From 95c3eb76dc07fd81289888ffc42948196b34b444 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 2 Apr 2009 16:58:11 -0700
Subject: ptrace: kill __ptrace_detach(), fix ->exit_state check

Move the code from __ptrace_detach() to its single caller and kill this
helper.

Also, fix the ->exit_state check, we shouldn't wake up EXIT_DEAD tasks.
Actually, I think task_is_stopped_or_traced() makes more sense, but this
needs another patch.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Jerome Marchand <jmarchan@redhat.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/ptrace.c | 22 +++++++++-------------
 1 file changed, 9 insertions(+), 13 deletions(-)

(limited to 'kernel')

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index c9cf48b21f05..f62a568e84ec 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -235,16 +235,6 @@ out:
 	return retval;
 }
 
-static inline void __ptrace_detach(struct task_struct *child, unsigned int data)
-{
-	child->exit_code = data;
-	/* .. re-parent .. */
-	__ptrace_unlink(child);
-	/* .. and wake it up. */
-	if (child->exit_state != EXIT_ZOMBIE)
-		wake_up_process(child);
-}
-
 int ptrace_detach(struct task_struct *child, unsigned int data)
 {
 	if (!valid_signal(data))
@@ -254,10 +244,16 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
 	ptrace_disable(child);
 	clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
 
-	write_lock_irq(&tasklist_lock);
 	/* protect against de_thread()->release_task() */
-	if (child->ptrace)
-		__ptrace_detach(child, data);
+	write_lock_irq(&tasklist_lock);
+	if (child->ptrace) {
+		child->exit_code = data;
+
+		__ptrace_unlink(child);
+
+		if (!child->exit_state)
+			wake_up_process(child);
+	}
 	write_unlock_irq(&tasklist_lock);
 
 	return 0;
-- 
cgit v1.2.3


From 6d69cb87f05eef3b02370b2f7bae608ad2301a00 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 2 Apr 2009 16:58:12 -0700
Subject: ptrace: simplify ptrace_exit()->ignoring_children() path

ignoring_children() takes parent->sighand->siglock and checks
k_sigaction[SIGCHLD] atomically.  But this buys nothing, we can't get the
"really" wrong result even if we race with sigaction(SIGCHLD).  If we read
the "stale" sa_handler/sa_flags we can pretend it was changed right after
the check.

Remove spin_lock(->siglock), and kill "int ign" which caches the result of
ignoring_children() which becomes rather trivial.

Perhaps it makes sense to export this helper, do_notify_parent() can use
it too.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Jerome Marchand <jmarchan@redhat.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 25 ++++++++-----------------
 1 file changed, 8 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 0c06b9efae3b..7a8311422930 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -732,19 +732,15 @@ static void exit_mm(struct task_struct * tsk)
 }
 
 /*
- * Return nonzero if @parent's children should reap themselves.
- *
- * Called with write_lock_irq(&tasklist_lock) held.
+ * Called with irqs disabled, returns true if childs should reap themselves.
  */
-static int ignoring_children(struct task_struct *parent)
+static int ignoring_children(struct sighand_struct *sigh)
 {
 	int ret;
-	struct sighand_struct *psig = parent->sighand;
-	unsigned long flags;
-	spin_lock_irqsave(&psig->siglock, flags);
-	ret = (psig->action[SIGCHLD-1].sa.sa_handler == SIG_IGN ||
-	       (psig->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT));
-	spin_unlock_irqrestore(&psig->siglock, flags);
+	spin_lock(&sigh->siglock);
+	ret = (sigh->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) ||
+	      (sigh->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT);
+	spin_unlock(&sigh->siglock);
 	return ret;
 }
 
@@ -757,7 +753,6 @@ static int ignoring_children(struct task_struct *parent)
 static void ptrace_exit(struct task_struct *parent, struct list_head *dead)
 {
 	struct task_struct *p, *n;
-	int ign = -1;
 
 	list_for_each_entry_safe(p, n, &parent->ptraced, ptrace_entry) {
 		__ptrace_unlink(p);
@@ -779,12 +774,8 @@ static void ptrace_exit(struct task_struct *parent, struct list_head *dead)
 		if (!task_detached(p) && thread_group_empty(p)) {
 			if (!same_thread_group(p->real_parent, parent))
 				do_notify_parent(p, p->exit_signal);
-			else {
-				if (ign < 0)
-					ign = ignoring_children(parent);
-				if (ign)
-					p->exit_signal = -1;
-			}
+			else if (ignoring_children(parent->sighand))
+				p->exit_signal = -1;
 		}
 
 		if (task_detached(p)) {
-- 
cgit v1.2.3


From b1b4c6799fb59e710454bfe0ab477cb8523a8667 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 2 Apr 2009 16:58:13 -0700
Subject: ptrace: reintroduce __ptrace_detach() as a callee of ptrace_exit()

No functional changes, preparation for the next patch.

Move the "should we release this child" logic into the separate handler,
__ptrace_detach().

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Jerome Marchand <jmarchan@redhat.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 62 +++++++++++++++++++++++++++++++----------------------------
 1 file changed, 33 insertions(+), 29 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 7a8311422930..576eae233b53 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -744,6 +744,38 @@ static int ignoring_children(struct sighand_struct *sigh)
 	return ret;
 }
 
+/* Returns nonzero if the tracee should be released. */
+int __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
+{
+	__ptrace_unlink(p);
+
+	if (p->exit_state != EXIT_ZOMBIE)
+		return 0;
+	/*
+	 * If it's a zombie, our attachedness prevented normal
+	 * parent notification or self-reaping.  Do notification
+	 * now if it would have happened earlier.  If it should
+	 * reap itself we return true.
+	 *
+	 * If it's our own child, there is no notification to do.
+	 * But if our normal children self-reap, then this child
+	 * was prevented by ptrace and we must reap it now.
+	 */
+	if (!task_detached(p) && thread_group_empty(p)) {
+		if (!same_thread_group(p->real_parent, tracer))
+			do_notify_parent(p, p->exit_signal);
+		else if (ignoring_children(tracer->sighand))
+			p->exit_signal = -1;
+	}
+
+	if (!task_detached(p))
+		return 0;
+
+	/* Mark it as in the process of being reaped. */
+	p->exit_state = EXIT_DEAD;
+	return 1;
+}
+
 /*
  * Detach all tasks we were using ptrace on.
  * Any that need to be release_task'd are put on the @dead list.
@@ -755,36 +787,8 @@ static void ptrace_exit(struct task_struct *parent, struct list_head *dead)
 	struct task_struct *p, *n;
 
 	list_for_each_entry_safe(p, n, &parent->ptraced, ptrace_entry) {
-		__ptrace_unlink(p);
-
-		if (p->exit_state != EXIT_ZOMBIE)
-			continue;
-
-		/*
-		 * If it's a zombie, our attachedness prevented normal
-		 * parent notification or self-reaping.  Do notification
-		 * now if it would have happened earlier.  If it should
-		 * reap itself, add it to the @dead list.  We can't call
-		 * release_task() here because we already hold tasklist_lock.
-		 *
-		 * If it's our own child, there is no notification to do.
-		 * But if our normal children self-reap, then this child
-		 * was prevented by ptrace and we must reap it now.
-		 */
-		if (!task_detached(p) && thread_group_empty(p)) {
-			if (!same_thread_group(p->real_parent, parent))
-				do_notify_parent(p, p->exit_signal);
-			else if (ignoring_children(parent->sighand))
-				p->exit_signal = -1;
-		}
-
-		if (task_detached(p)) {
-			/*
-			 * Mark it as in the process of being reaped.
-			 */
-			p->exit_state = EXIT_DEAD;
+		if (__ptrace_detach(parent, p))
 			list_add(&p->ptrace_entry, dead);
-		}
 	}
 }
 
-- 
cgit v1.2.3


From 4576145c1ecdaaea9ef8976a48335206aa1ebf91 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 2 Apr 2009 16:58:14 -0700
Subject: ptrace: fix possible zombie leak on PTRACE_DETACH

When ptrace_detach() takes tasklist, the tracee can be SIGKILL'ed.  If it
has already passed exit_notify() we can leak a zombie, because a) ptracing
disables the auto-reaping logic, and b) ->real_parent was not notified
about the child's death.

ptrace_detach() should follow the ptrace_exit's logic, change the code
accordingly.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Jerome Marchand <jmarchan@redhat.com>
Cc: Roland McGrath <roland@redhat.com>
Tested-by: Denys Vlasenko <dvlasenk@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/ptrace.h | 1 +
 kernel/ptrace.c        | 9 +++++++--
 2 files changed, 8 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index 98b93ca4db06..1a2b0cb55535 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -94,6 +94,7 @@ extern void ptrace_notify(int exit_code);
 extern void __ptrace_link(struct task_struct *child,
 			  struct task_struct *new_parent);
 extern void __ptrace_unlink(struct task_struct *child);
+extern int __ptrace_detach(struct task_struct *tracer, struct task_struct *p);
 extern void ptrace_fork(struct task_struct *task, unsigned long clone_flags);
 #define PTRACE_MODE_READ   1
 #define PTRACE_MODE_ATTACH 2
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index f62a568e84ec..ee553b6ad125 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -237,6 +237,8 @@ out:
 
 int ptrace_detach(struct task_struct *child, unsigned int data)
 {
+	int dead = 0;
+
 	if (!valid_signal(data))
 		return -EIO;
 
@@ -244,18 +246,21 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
 	ptrace_disable(child);
 	clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
 
-	/* protect against de_thread()->release_task() */
 	write_lock_irq(&tasklist_lock);
+	/* protect against de_thread()->release_task() */
 	if (child->ptrace) {
 		child->exit_code = data;
 
-		__ptrace_unlink(child);
+		dead = __ptrace_detach(current, child);
 
 		if (!child->exit_state)
 			wake_up_process(child);
 	}
 	write_unlock_irq(&tasklist_lock);
 
+	if (unlikely(dead))
+		release_task(child);
+
 	return 0;
 }
 
-- 
cgit v1.2.3


From 0a967a044a777e8b9c739120927114ddc0094298 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 2 Apr 2009 16:58:15 -0700
Subject: reparent_thread: don't call kill_orphaned_pgrp() if task_detached()

If task_detached(p) == T, then either

  a) p is not the main thread, we will find the group leader on the
     ->children list.

or

  b) p is the group leader but its ->exit_state = EXIT_DEAD.  This
     can only happen when the last sub-thread has died, but in that case
     that thread has already called kill_orphaned_pgrp() from
     exit_notify().

In both cases kill_orphaned_pgrp() looks bogus.

Move the task_detached() check up and simplify the code, this is also
right from the "common sense" pov: we should do nothing with the detached
childs, except move them to the new parent's ->children list.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 576eae233b53..405e6877168b 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -818,6 +818,8 @@ static void reparent_thread(struct task_struct *p, struct task_struct *father)
 
 	list_move_tail(&p->sibling, &p->real_parent->children);
 
+	if (task_detached(p))
+		return;
 	/* If this is a threaded reparent there is no need to
 	 * notify anyone anything has happened.
 	 */
@@ -825,15 +827,13 @@ static void reparent_thread(struct task_struct *p, struct task_struct *father)
 		return;
 
 	/* We don't want people slaying init.  */
-	if (!task_detached(p))
-		p->exit_signal = SIGCHLD;
+	p->exit_signal = SIGCHLD;
 
 	/* If we'd notified the old parent about this child's death,
 	 * also notify the new parent.
 	 */
 	if (!ptrace_reparented(p) &&
-	    p->exit_state == EXIT_ZOMBIE &&
-	    !task_detached(p) && thread_group_empty(p))
+	    p->exit_state == EXIT_ZOMBIE && thread_group_empty(p))
 		do_notify_parent(p, p->exit_signal);
 
 	kill_orphaned_pgrp(p, father);
-- 
cgit v1.2.3


From b1442b055c154699a6a2c436f3352f71b6beede3 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 2 Apr 2009 16:58:16 -0700
Subject: reparent_thread: fix the "is it traced" check

reparent_thread() uses ptrace_reparented() to check whether this thread is
ptraced, in that case we should not notify the new parent.

But ptrace_reparented() is not exactly correct when the reparented thread
is traced by /sbin/init, because forget_original_parent() has already
changed ->real_parent.

Currently, the only problem is the false notification.  But with the next
patch the kernel crash in this (yes, pathological) case.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 405e6877168b..5be0a406faeb 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -832,7 +832,7 @@ static void reparent_thread(struct task_struct *p, struct task_struct *father)
 	/* If we'd notified the old parent about this child's death,
 	 * also notify the new parent.
 	 */
-	if (!ptrace_reparented(p) &&
+	if (!p->ptrace &&
 	    p->exit_state == EXIT_ZOMBIE && thread_group_empty(p))
 		do_notify_parent(p, p->exit_signal);
 
-- 
cgit v1.2.3


From 7f5d3652d469cdf9eb2365dfea7ce3fb9e1409cc Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 2 Apr 2009 16:58:17 -0700
Subject: reparent_thread: fix a zombie leak if /sbin/init ignores SIGCHLD

If /sbin/init ignores SIGCHLD and we re-parent a zombie, it is leaked.
reparent_thread() does do_notify_parent() which sets ->exit_signal = -1 in
this case.  This means that nobody except us can reap it, the detached
task is not visible to do_wait().

Change reparent_thread() to return a boolean (like __pthread_detach) to
indicate that the thread is dead and must be released.  Also change
forget_original_parent() to add the child to ptrace_dead list in this
case.

The naming becomes insane, the next patch does the cleanup.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 5be0a406faeb..3e09b7cb3b20 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -810,8 +810,11 @@ static void ptrace_exit_finish(struct task_struct *parent,
 	}
 }
 
-static void reparent_thread(struct task_struct *p, struct task_struct *father)
+/* Returns nonzero if the child should be released. */
+static int reparent_thread(struct task_struct *p, struct task_struct *father)
 {
+	int dead;
+
 	if (p->pdeath_signal)
 		/* We already hold the tasklist_lock here.  */
 		group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
@@ -819,12 +822,12 @@ static void reparent_thread(struct task_struct *p, struct task_struct *father)
 	list_move_tail(&p->sibling, &p->real_parent->children);
 
 	if (task_detached(p))
-		return;
+		return 0;
 	/* If this is a threaded reparent there is no need to
 	 * notify anyone anything has happened.
 	 */
 	if (same_thread_group(p->real_parent, father))
-		return;
+		return 0;
 
 	/* We don't want people slaying init.  */
 	p->exit_signal = SIGCHLD;
@@ -832,11 +835,19 @@ static void reparent_thread(struct task_struct *p, struct task_struct *father)
 	/* If we'd notified the old parent about this child's death,
 	 * also notify the new parent.
 	 */
+	dead = 0;
 	if (!p->ptrace &&
-	    p->exit_state == EXIT_ZOMBIE && thread_group_empty(p))
+	    p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
 		do_notify_parent(p, p->exit_signal);
+		if (task_detached(p)) {
+			p->exit_state = EXIT_DEAD;
+			dead = 1;
+		}
+	}
 
 	kill_orphaned_pgrp(p, father);
+
+	return dead;
 }
 
 /*
@@ -896,7 +907,8 @@ static void forget_original_parent(struct task_struct *father)
 			BUG_ON(p->ptrace);
 			p->parent = p->real_parent;
 		}
-		reparent_thread(p, father);
+		if (reparent_thread(p, father))
+			list_add(&p->ptrace_entry, &ptrace_dead);;
 	}
 
 	write_unlock_irq(&tasklist_lock);
-- 
cgit v1.2.3


From 39c626ae47c469abdfd30c6e42eff884931380d6 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 2 Apr 2009 16:58:18 -0700
Subject: forget_original_parent: split out the un-ptrace part

By discussion with Roland.

- Rename ptrace_exit() to exit_ptrace(), and change it to do all the
  necessary work with ->ptraced list by its own.

- Move this code from exit.c to ptrace.c

- Update the comment in ptrace_detach() to explain the rechecking of
  the child->ptrace.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: "Metzger, Markus T" <markus.t.metzger@intel.com>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/ptrace.h |  2 +-
 include/linux/sched.h  |  5 +++
 kernel/exit.c          | 95 ++++----------------------------------------------
 kernel/ptrace.c        | 78 +++++++++++++++++++++++++++++++++++++++--
 4 files changed, 88 insertions(+), 92 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h
index 1a2b0cb55535..67c15653fc23 100644
--- a/include/linux/ptrace.h
+++ b/include/linux/ptrace.h
@@ -94,7 +94,7 @@ extern void ptrace_notify(int exit_code);
 extern void __ptrace_link(struct task_struct *child,
 			  struct task_struct *new_parent);
 extern void __ptrace_unlink(struct task_struct *child);
-extern int __ptrace_detach(struct task_struct *tracer, struct task_struct *p);
+extern void exit_ptrace(struct task_struct *tracer);
 extern void ptrace_fork(struct task_struct *task, unsigned long clone_flags);
 #define PTRACE_MODE_READ   1
 #define PTRACE_MODE_ATTACH 2
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 9186f8c5d5f2..b47c94e7560b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -2061,6 +2061,11 @@ static inline int thread_group_empty(struct task_struct *p)
 #define delay_group_leader(p) \
 		(thread_group_leader(p) && !thread_group_empty(p))
 
+static inline int task_detached(struct task_struct *p)
+{
+	return p->exit_signal == -1;
+}
+
 /*
  * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring
  * subscriptions and synchronises with wait4().  Also used in procfs.  Also
diff --git a/kernel/exit.c b/kernel/exit.c
index 3e09b7cb3b20..506693dfdd4e 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -61,11 +61,6 @@ DEFINE_TRACE(sched_process_wait);
 
 static void exit_mm(struct task_struct * tsk);
 
-static inline int task_detached(struct task_struct *p)
-{
-	return p->exit_signal == -1;
-}
-
 static void __unhash_process(struct task_struct *p)
 {
 	nr_threads--;
@@ -731,85 +726,6 @@ static void exit_mm(struct task_struct * tsk)
 	mmput(mm);
 }
 
-/*
- * Called with irqs disabled, returns true if childs should reap themselves.
- */
-static int ignoring_children(struct sighand_struct *sigh)
-{
-	int ret;
-	spin_lock(&sigh->siglock);
-	ret = (sigh->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) ||
-	      (sigh->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT);
-	spin_unlock(&sigh->siglock);
-	return ret;
-}
-
-/* Returns nonzero if the tracee should be released. */
-int __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
-{
-	__ptrace_unlink(p);
-
-	if (p->exit_state != EXIT_ZOMBIE)
-		return 0;
-	/*
-	 * If it's a zombie, our attachedness prevented normal
-	 * parent notification or self-reaping.  Do notification
-	 * now if it would have happened earlier.  If it should
-	 * reap itself we return true.
-	 *
-	 * If it's our own child, there is no notification to do.
-	 * But if our normal children self-reap, then this child
-	 * was prevented by ptrace and we must reap it now.
-	 */
-	if (!task_detached(p) && thread_group_empty(p)) {
-		if (!same_thread_group(p->real_parent, tracer))
-			do_notify_parent(p, p->exit_signal);
-		else if (ignoring_children(tracer->sighand))
-			p->exit_signal = -1;
-	}
-
-	if (!task_detached(p))
-		return 0;
-
-	/* Mark it as in the process of being reaped. */
-	p->exit_state = EXIT_DEAD;
-	return 1;
-}
-
-/*
- * Detach all tasks we were using ptrace on.
- * Any that need to be release_task'd are put on the @dead list.
- *
- * Called with write_lock(&tasklist_lock) held.
- */
-static void ptrace_exit(struct task_struct *parent, struct list_head *dead)
-{
-	struct task_struct *p, *n;
-
-	list_for_each_entry_safe(p, n, &parent->ptraced, ptrace_entry) {
-		if (__ptrace_detach(parent, p))
-			list_add(&p->ptrace_entry, dead);
-	}
-}
-
-/*
- * Finish up exit-time ptrace cleanup.
- *
- * Called without locks.
- */
-static void ptrace_exit_finish(struct task_struct *parent,
-			       struct list_head *dead)
-{
-	struct task_struct *p, *n;
-
-	BUG_ON(!list_empty(&parent->ptraced));
-
-	list_for_each_entry_safe(p, n, dead, ptrace_entry) {
-		list_del_init(&p->ptrace_entry);
-		release_task(p);
-	}
-}
-
 /* Returns nonzero if the child should be released. */
 static int reparent_thread(struct task_struct *p, struct task_struct *father)
 {
@@ -894,12 +810,10 @@ static void forget_original_parent(struct task_struct *father)
 	struct task_struct *p, *n, *reaper;
 	LIST_HEAD(ptrace_dead);
 
+	exit_ptrace(father);
+
 	write_lock_irq(&tasklist_lock);
 	reaper = find_new_reaper(father);
-	/*
-	 * First clean up ptrace if we were using it.
-	 */
-	ptrace_exit(father, &ptrace_dead);
 
 	list_for_each_entry_safe(p, n, &father->children, sibling) {
 		p->real_parent = reaper;
@@ -914,7 +828,10 @@ static void forget_original_parent(struct task_struct *father)
 	write_unlock_irq(&tasklist_lock);
 	BUG_ON(!list_empty(&father->children));
 
-	ptrace_exit_finish(father, &ptrace_dead);
+	list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) {
+		list_del_init(&p->ptrace_entry);
+		release_task(p);
+	}
 }
 
 /*
diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index ee553b6ad125..f5a9fa5aafa1 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -235,9 +235,57 @@ out:
 	return retval;
 }
 
+/*
+ * Called with irqs disabled, returns true if childs should reap themselves.
+ */
+static int ignoring_children(struct sighand_struct *sigh)
+{
+	int ret;
+	spin_lock(&sigh->siglock);
+	ret = (sigh->action[SIGCHLD-1].sa.sa_handler == SIG_IGN) ||
+	      (sigh->action[SIGCHLD-1].sa.sa_flags & SA_NOCLDWAIT);
+	spin_unlock(&sigh->siglock);
+	return ret;
+}
+
+/*
+ * Called with tasklist_lock held for writing.
+ * Unlink a traced task, and clean it up if it was a traced zombie.
+ * Return true if it needs to be reaped with release_task().
+ * (We can't call release_task() here because we already hold tasklist_lock.)
+ *
+ * If it's a zombie, our attachedness prevented normal parent notification
+ * or self-reaping.  Do notification now if it would have happened earlier.
+ * If it should reap itself, return true.
+ *
+ * If it's our own child, there is no notification to do.
+ * But if our normal children self-reap, then this child
+ * was prevented by ptrace and we must reap it now.
+ */
+static bool __ptrace_detach(struct task_struct *tracer, struct task_struct *p)
+{
+	__ptrace_unlink(p);
+
+	if (p->exit_state == EXIT_ZOMBIE) {
+		if (!task_detached(p) && thread_group_empty(p)) {
+			if (!same_thread_group(p->real_parent, tracer))
+				do_notify_parent(p, p->exit_signal);
+			else if (ignoring_children(tracer->sighand))
+				p->exit_signal = -1;
+		}
+		if (task_detached(p)) {
+			/* Mark it as in the process of being reaped. */
+			p->exit_state = EXIT_DEAD;
+			return true;
+		}
+	}
+
+	return false;
+}
+
 int ptrace_detach(struct task_struct *child, unsigned int data)
 {
-	int dead = 0;
+	bool dead = false;
 
 	if (!valid_signal(data))
 		return -EIO;
@@ -247,7 +295,10 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
 	clear_tsk_thread_flag(child, TIF_SYSCALL_TRACE);
 
 	write_lock_irq(&tasklist_lock);
-	/* protect against de_thread()->release_task() */
+	/*
+	 * This child can be already killed. Make sure de_thread() or
+	 * our sub-thread doing do_wait() didn't do release_task() yet.
+	 */
 	if (child->ptrace) {
 		child->exit_code = data;
 
@@ -264,6 +315,29 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
 	return 0;
 }
 
+/*
+ * Detach all tasks we were using ptrace on.
+ */
+void exit_ptrace(struct task_struct *tracer)
+{
+	struct task_struct *p, *n;
+	LIST_HEAD(ptrace_dead);
+
+	write_lock_irq(&tasklist_lock);
+	list_for_each_entry_safe(p, n, &tracer->ptraced, ptrace_entry) {
+		if (__ptrace_detach(tracer, p))
+			list_add(&p->ptrace_entry, &ptrace_dead);
+	}
+	write_unlock_irq(&tasklist_lock);
+
+	BUG_ON(!list_empty(&tracer->ptraced));
+
+	list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) {
+		list_del_init(&p->ptrace_entry);
+		release_task(p);
+	}
+}
+
 int ptrace_readdata(struct task_struct *tsk, unsigned long src, char __user *dst, int len)
 {
 	int copied = 0;
-- 
cgit v1.2.3


From 5dfc80be73dd0c212d2e6dd8dbf5afa07e680bbe Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 2 Apr 2009 16:58:19 -0700
Subject: forget_original_parent: do not abuse child->ptrace_entry

By discussion with Roland.

- Use ->sibling instead of ->ptrace_entry to chain the need to be
  release_task'd childs. Nobody else can use ->sibling, this task
  is EXIT_DEAD and nobody can find it on its own list.

- rename ptrace_dead to dead_childs.

- Now that we don't have the "parallel" untrace code, change back
  reparent_thread() to return void, pass dead_childs as an argument.

Actually, I don't understand why do we notify /sbin/init when we
reparent a zombie, probably it is better to reap it unconditionally.

[akpm@linux-foundation.org: s/childs/children/]
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: "Metzger, Markus T" <markus.t.metzger@intel.com>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 87 ++++++++++++++++++++++++++++-------------------------------
 1 file changed, 41 insertions(+), 46 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 506693dfdd4e..029415d9f82e 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -726,46 +726,6 @@ static void exit_mm(struct task_struct * tsk)
 	mmput(mm);
 }
 
-/* Returns nonzero if the child should be released. */
-static int reparent_thread(struct task_struct *p, struct task_struct *father)
-{
-	int dead;
-
-	if (p->pdeath_signal)
-		/* We already hold the tasklist_lock here.  */
-		group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
-
-	list_move_tail(&p->sibling, &p->real_parent->children);
-
-	if (task_detached(p))
-		return 0;
-	/* If this is a threaded reparent there is no need to
-	 * notify anyone anything has happened.
-	 */
-	if (same_thread_group(p->real_parent, father))
-		return 0;
-
-	/* We don't want people slaying init.  */
-	p->exit_signal = SIGCHLD;
-
-	/* If we'd notified the old parent about this child's death,
-	 * also notify the new parent.
-	 */
-	dead = 0;
-	if (!p->ptrace &&
-	    p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
-		do_notify_parent(p, p->exit_signal);
-		if (task_detached(p)) {
-			p->exit_state = EXIT_DEAD;
-			dead = 1;
-		}
-	}
-
-	kill_orphaned_pgrp(p, father);
-
-	return dead;
-}
-
 /*
  * When we die, we re-parent all our children.
  * Try to give them to another thread in our thread
@@ -805,10 +765,46 @@ static struct task_struct *find_new_reaper(struct task_struct *father)
 	return pid_ns->child_reaper;
 }
 
+/*
+* Any that need to be release_task'd are put on the @dead list.
+ */
+static void reparent_thread(struct task_struct *father, struct task_struct *p,
+				struct list_head *dead)
+{
+	if (p->pdeath_signal)
+		group_send_sig_info(p->pdeath_signal, SEND_SIG_NOINFO, p);
+
+	list_move_tail(&p->sibling, &p->real_parent->children);
+
+	if (task_detached(p))
+		return;
+	/*
+	 * If this is a threaded reparent there is no need to
+	 * notify anyone anything has happened.
+	 */
+	if (same_thread_group(p->real_parent, father))
+		return;
+
+	/* We don't want people slaying init.  */
+	p->exit_signal = SIGCHLD;
+
+	/* If it has exited notify the new parent about this child's death. */
+	if (!p->ptrace &&
+	    p->exit_state == EXIT_ZOMBIE && thread_group_empty(p)) {
+		do_notify_parent(p, p->exit_signal);
+		if (task_detached(p)) {
+			p->exit_state = EXIT_DEAD;
+			list_move_tail(&p->sibling, dead);
+		}
+	}
+
+	kill_orphaned_pgrp(p, father);
+}
+
 static void forget_original_parent(struct task_struct *father)
 {
 	struct task_struct *p, *n, *reaper;
-	LIST_HEAD(ptrace_dead);
+	LIST_HEAD(dead_children);
 
 	exit_ptrace(father);
 
@@ -821,15 +817,14 @@ static void forget_original_parent(struct task_struct *father)
 			BUG_ON(p->ptrace);
 			p->parent = p->real_parent;
 		}
-		if (reparent_thread(p, father))
-			list_add(&p->ptrace_entry, &ptrace_dead);;
+		reparent_thread(father, p, &dead_children);
 	}
-
 	write_unlock_irq(&tasklist_lock);
+
 	BUG_ON(!list_empty(&father->children));
 
-	list_for_each_entry_safe(p, n, &ptrace_dead, ptrace_entry) {
-		list_del_init(&p->ptrace_entry);
+	list_for_each_entry_safe(p, n, &dead_children, sibling) {
+		list_del_init(&p->sibling);
 		release_task(p);
 	}
 }
-- 
cgit v1.2.3


From 95a3540da9c81a5987be810e1d9a83640a366bd5 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 2 Apr 2009 16:58:21 -0700
Subject: ptrace_detach: the wrong wakeup breaks the ERESTARTxxx logic

Another ancient bug. Consider this trivial test-case,

	int main(void)
	{
		int pid = fork();

		if (pid) {
			ptrace(PTRACE_ATTACH, pid, NULL, NULL);
			wait(NULL);
			ptrace(PTRACE_DETACH, pid, NULL, NULL);
		} else {
			pause();
			printf("WE HAVE A KERNEL BUG!!!\n");
		}

		return 0;
	}

the child must not "escape" for sys_pause(), but it can and this was seen
in practice.

This is because ptrace_detach does:

	if (!child->exit_state)
		wake_up_process(child);

this wakeup can happen after this child has already restarted sys_pause(),
because it gets another wakeup from ptrace_untrace().

With or without this patch, perhaps sys_pause() needs a fix.  But this
wakeup also breaks the SIGNAL_STOP_STOPPED logic in ptrace_untrace().

Remove this wakeup.  The caller saw this task in TASK_TRACED state, and
unless it was SIGKILL'ed in between __ptrace_unlink()->ptrace_untrace()
should handle this case correctly.  If it was SIGKILL'ed, we don't need to
wakup the dying tracee too.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Jerome Marchand <jmarchan@redhat.com>
Acked-by: Roland McGrath <roland@redhat.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/ptrace.c | 4 ----
 1 file changed, 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index f5a9fa5aafa1..296e8105863a 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -301,11 +301,7 @@ int ptrace_detach(struct task_struct *child, unsigned int data)
 	 */
 	if (child->ptrace) {
 		child->exit_code = data;
-
 		dead = __ptrace_detach(current, child);
-
-		if (!child->exit_state)
-			wake_up_process(child);
 	}
 	write_unlock_irq(&tasklist_lock);
 
-- 
cgit v1.2.3


From 1ee1184485df9c9a3503d3a684b911fb7c73d259 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 2 Apr 2009 16:58:23 -0700
Subject: ptrace_untrace: fix the SIGNAL_STOP_STOPPED check

This bug is ancient too. ptrace_untrace() must not resume the task
if the group stop in progress, we should set TASK_STOPPED instead.

Unfortunately, we still have problems here:

	- if the process/thread was traced, SIGNAL_STOP_STOPPED
	  does not necessary means this thread group is stopped.

	- ptrace breaks the bookkeeping of ->group_stop_count.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Jerome Marchand <jmarchan@redhat.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Denys Vlasenko <dvlasenk@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/ptrace.c | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/ptrace.c b/kernel/ptrace.c
index 296e8105863a..5105f5a6a2ce 100644
--- a/kernel/ptrace.c
+++ b/kernel/ptrace.c
@@ -60,11 +60,15 @@ static void ptrace_untrace(struct task_struct *child)
 {
 	spin_lock(&child->sighand->siglock);
 	if (task_is_traced(child)) {
-		if (child->signal->flags & SIGNAL_STOP_STOPPED) {
+		/*
+		 * If the group stop is completed or in progress,
+		 * this thread was already counted as stopped.
+		 */
+		if (child->signal->flags & SIGNAL_STOP_STOPPED ||
+		    child->signal->group_stop_count)
 			__set_task_state(child, TASK_STOPPED);
-		} else {
+		else
 			signal_wake_up(child, 1);
-		}
 	}
 	spin_unlock(&child->sighand->siglock);
 }
-- 
cgit v1.2.3


From 2355b70fd59cb5be7de2052a9edeee7afb7ff099 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Thu, 2 Apr 2009 16:58:24 -0700
Subject: workqueue: avoid recursion in run_workqueue()

1) lockdep will complain when run_workqueue() performs recursion.

2) The recursive implementation of run_workqueue() means that
   flush_workqueue() and its documentation are inconsistent.  This may
   hide deadlocks and other bugs.

3) The recursion in run_workqueue() will poison cwq->current_work, but
   flush_work() and __cancel_work_timer(), etcetera need a reliable
   cwq->current_work.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Eric Dumazet <dada1@cosmosbay.com>
Cc: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/workqueue.c | 41 +++++++++++------------------------------
 1 file changed, 11 insertions(+), 30 deletions(-)

(limited to 'kernel')

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 9aedd9fd825b..32f8e0d2bf5a 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -48,8 +48,6 @@ struct cpu_workqueue_struct {
 
 	struct workqueue_struct *wq;
 	struct task_struct *thread;
-
-	int run_depth;		/* Detect run_workqueue() recursion depth */
 } ____cacheline_aligned;
 
 /*
@@ -262,13 +260,6 @@ EXPORT_SYMBOL_GPL(queue_delayed_work_on);
 static void run_workqueue(struct cpu_workqueue_struct *cwq)
 {
 	spin_lock_irq(&cwq->lock);
-	cwq->run_depth++;
-	if (cwq->run_depth > 3) {
-		/* morton gets to eat his hat */
-		printk("%s: recursion depth exceeded: %d\n",
-			__func__, cwq->run_depth);
-		dump_stack();
-	}
 	while (!list_empty(&cwq->worklist)) {
 		struct work_struct *work = list_entry(cwq->worklist.next,
 						struct work_struct, entry);
@@ -311,7 +302,6 @@ static void run_workqueue(struct cpu_workqueue_struct *cwq)
 		spin_lock_irq(&cwq->lock);
 		cwq->current_work = NULL;
 	}
-	cwq->run_depth--;
 	spin_unlock_irq(&cwq->lock);
 }
 
@@ -368,29 +358,20 @@ static void insert_wq_barrier(struct cpu_workqueue_struct *cwq,
 
 static int flush_cpu_workqueue(struct cpu_workqueue_struct *cwq)
 {
-	int active;
+	int active = 0;
+	struct wq_barrier barr;
 
-	if (cwq->thread == current) {
-		/*
-		 * Probably keventd trying to flush its own queue. So simply run
-		 * it by hand rather than deadlocking.
-		 */
-		run_workqueue(cwq);
-		active = 1;
-	} else {
-		struct wq_barrier barr;
+	WARN_ON(cwq->thread == current);
 
-		active = 0;
-		spin_lock_irq(&cwq->lock);
-		if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) {
-			insert_wq_barrier(cwq, &barr, &cwq->worklist);
-			active = 1;
-		}
-		spin_unlock_irq(&cwq->lock);
-
-		if (active)
-			wait_for_completion(&barr.done);
+	spin_lock_irq(&cwq->lock);
+	if (!list_empty(&cwq->worklist) || cwq->current_work != NULL) {
+		insert_wq_barrier(cwq, &barr, &cwq->worklist);
+		active = 1;
 	}
+	spin_unlock_irq(&cwq->lock);
+
+	if (active)
+		wait_for_completion(&barr.done);
 
 	return active;
 }
-- 
cgit v1.2.3


From 11dea1900931ac73184b2f5163a13d24a4e572ea Mon Sep 17 00:00:00 2001
From: "Serge E. Hallyn" <serue@us.ibm.com>
Date: Thu, 2 Apr 2009 16:58:27 -0700
Subject: proc_sysctl: use CONFIG_PROC_SYSCTL around ipc and utsname
 proc_handlers

As pointed out by Cedric Le Goater (in response to Alexey's original
comment wrt mqns), ipc_sysctl.c and utsname_sysctl.c are using
CONFIG_PROC_FS, not CONFIG_PROC_SYSCTL, to determine whether to define
the proc_handlers.  Change that.

Signed-off-by: Serge E. Hallyn <serue@us.ibm.com>
Cc: Cedric Le Goater <clg@fr.ibm.com>
Acked-by: Alexey Dobriyan <adobriyan@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 ipc/ipc_sysctl.c        | 2 +-
 kernel/utsname_sysctl.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/ipc/ipc_sysctl.c b/ipc/ipc_sysctl.c
index 4a7a12c95abe..40eab7314aeb 100644
--- a/ipc/ipc_sysctl.c
+++ b/ipc/ipc_sysctl.c
@@ -26,7 +26,7 @@ static void *get_ipc(ctl_table *table)
 	return which;
 }
 
-#ifdef CONFIG_PROC_FS
+#ifdef CONFIG_PROC_SYSCTL
 static int proc_ipc_dointvec(ctl_table *table, int write, struct file *filp,
 	void __user *buffer, size_t *lenp, loff_t *ppos)
 {
diff --git a/kernel/utsname_sysctl.c b/kernel/utsname_sysctl.c
index 3b34b3545936..92359cc747a7 100644
--- a/kernel/utsname_sysctl.c
+++ b/kernel/utsname_sysctl.c
@@ -37,7 +37,7 @@ static void put_uts(ctl_table *table, int write, void *which)
 		up_write(&uts_sem);
 }
 
-#ifdef CONFIG_PROC_FS
+#ifdef CONFIG_PROC_SYSCTL
 /*
  *	Special case of dostring for the UTS structure. This has locks
  *	to observe. Should this be in kernel/sys.c ????
-- 
cgit v1.2.3


From 8e654fba4a376f436bdfe361fc5cdbc87ac09b35 Mon Sep 17 00:00:00 2001
From: Matthew Wilcox <matthew@wil.cx>
Date: Thu, 2 Apr 2009 16:58:33 -0700
Subject: sysctl: fix suid_dumpable and lease-break-time sysctls

Arne de Bruijn points out that commit
76fdbb25f963de5dc1e308325f0578a2f92b1c2d ("coredump masking: bound
suid_dumpable sysctl") mistakenly limits lease-break-time instead of
suid_dumpable.

Signed-off-by: Matthew Wilcox <matthew@wil.cx>
Reported-by: Arne de Bruijn <kernelbt@arbruijn.dds.nl>
Cc: Kawai, Hidehiro <hidehiro.kawai.ez@hitachi.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sysctl.c | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 2e490a389dd2..5ec4543dfc06 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -95,12 +95,9 @@ static int sixty = 60;
 static int neg_one = -1;
 #endif
 
-#if defined(CONFIG_MMU) && defined(CONFIG_FILE_LOCKING)
-static int two = 2;
-#endif
-
 static int zero;
 static int one = 1;
+static int two = 2;
 static unsigned long one_ul = 1;
 static int one_hundred = 100;
 
@@ -1373,10 +1370,7 @@ static struct ctl_table fs_table[] = {
 		.data		= &lease_break_time,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec_minmax,
-		.strategy	= &sysctl_intvec,
-		.extra1		= &zero,
-		.extra2		= &two,
+		.proc_handler	= &proc_dointvec,
 	},
 #endif
 #ifdef CONFIG_AIO
@@ -1417,7 +1411,10 @@ static struct ctl_table fs_table[] = {
 		.data		= &suid_dumpable,
 		.maxlen		= sizeof(int),
 		.mode		= 0644,
-		.proc_handler	= &proc_dointvec,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &zero,
+		.extra2		= &two,
 	},
 #if defined(CONFIG_BINFMT_MISC) || defined(CONFIG_BINFMT_MISC_MODULE)
 	{
-- 
cgit v1.2.3


From 2ae448efc87df6d328f5835969076c7f9fce59c3 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 2 Apr 2009 16:58:36 -0700
Subject: pids: improve get_task_pid() to fix the unsafe
 sys_wait4()->task_pgrp()

sys_wait4() does get_pid(task_pgrp(current)), this is not safe.  We can
add rcu lock/unlock around, but we already have get_task_pid() which can
be improved to handle the special pids in more reliable manner.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Louis Rilling <Louis.Rilling@kerlabs.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 2 +-
 kernel/pid.c  | 2 ++
 2 files changed, 3 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 029415d9f82e..384f09caf2ef 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -1737,7 +1737,7 @@ SYSCALL_DEFINE4(wait4, pid_t, upid, int __user *, stat_addr,
 		pid = find_get_pid(-upid);
 	} else if (upid == 0) {
 		type = PIDTYPE_PGID;
-		pid = get_pid(task_pgrp(current));
+		pid = get_task_pid(current, PIDTYPE_PGID);
 	} else /* upid > 0 */ {
 		type = PIDTYPE_PID;
 		pid = find_get_pid(upid);
diff --git a/kernel/pid.c b/kernel/pid.c
index 1b3586fe753a..6628abcc520e 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -403,6 +403,8 @@ struct pid *get_task_pid(struct task_struct *task, enum pid_type type)
 {
 	struct pid *pid;
 	rcu_read_lock();
+	if (type != PIDTYPE_PID)
+		task = task->group_leader;
 	pid = get_pid(task->pids[type].pid);
 	rcu_read_unlock();
 	return pid;
-- 
cgit v1.2.3


From 52ee2dfdd4f51cf422ea6a96a0846dc94244aa37 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 2 Apr 2009 16:58:38 -0700
Subject: pids: refactor vnr/nr_ns helpers to make them safe

Inho, the safety rules for vnr/nr_ns helpers are horrible and buggy.

task_pid_nr_ns(task) needs rcu/tasklist depending on task == current.

As for "special" pids, vnr/nr_ns helpers always need rcu.  However, if
task != current, they are unsafe even under rcu lock, we can't trust
task->group_leader without the special checks.

And almost every helper has a callsite which needs a fix.

Also, it is a bit annoying that the implementations of, say,
task_pgrp_vnr() and task_pgrp_nr_ns() are not "symmetrical".

This patch introduces the new helper, __task_pid_nr_ns(), which is always
safe to use, and turns all other helpers into the trivial wrappers.

After this I'll send another patch which converts task_tgid_xxx() as well,
they're are a bit special.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Louis Rilling <Louis.Rilling@kerlabs.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/sched.h | 27 ++++++++++++++++++++-------
 kernel/pid.c          | 31 ++++++++++++++++---------------
 2 files changed, 36 insertions(+), 22 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 722dd313bf8a..49df878a0cad 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1519,17 +1519,23 @@ struct pid_namespace;
  *
  * see also pid_nr() etc in include/linux/pid.h
  */
+pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
+			struct pid_namespace *ns);
 
 static inline pid_t task_pid_nr(struct task_struct *tsk)
 {
 	return tsk->pid;
 }
 
-pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns);
+static inline pid_t task_pid_nr_ns(struct task_struct *tsk,
+					struct pid_namespace *ns)
+{
+	return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns);
+}
 
 static inline pid_t task_pid_vnr(struct task_struct *tsk)
 {
-	return pid_vnr(task_pid(tsk));
+	return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL);
 }
 
 
@@ -1551,11 +1557,15 @@ static inline pid_t task_pgrp_nr(struct task_struct *tsk)
 	return tsk->signal->__pgrp;
 }
 
-pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns);
+static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk,
+					struct pid_namespace *ns)
+{
+	return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns);
+}
 
 static inline pid_t task_pgrp_vnr(struct task_struct *tsk)
 {
-	return pid_vnr(task_pgrp(tsk));
+	return __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL);
 }
 
 
@@ -1564,14 +1574,17 @@ static inline pid_t task_session_nr(struct task_struct *tsk)
 	return tsk->signal->__session;
 }
 
-pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns);
+static inline pid_t task_session_nr_ns(struct task_struct *tsk,
+					struct pid_namespace *ns)
+{
+	return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns);
+}
 
 static inline pid_t task_session_vnr(struct task_struct *tsk)
 {
-	return pid_vnr(task_session(tsk));
+	return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL);
 }
 
-
 /**
  * pid_alive - check that a task structure is not stale
  * @p: Task structure to be checked.
diff --git a/kernel/pid.c b/kernel/pid.c
index 6628abcc520e..b2e5f78fd281 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -452,11 +452,24 @@ pid_t pid_vnr(struct pid *pid)
 }
 EXPORT_SYMBOL_GPL(pid_vnr);
 
-pid_t task_pid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
+pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
+			struct pid_namespace *ns)
 {
-	return pid_nr_ns(task_pid(tsk), ns);
+	pid_t nr = 0;
+
+	rcu_read_lock();
+	if (!ns)
+		ns = current->nsproxy->pid_ns;
+	if (likely(pid_alive(task))) {
+		if (type != PIDTYPE_PID)
+			task = task->group_leader;
+		nr = pid_nr_ns(task->pids[type].pid, ns);
+	}
+	rcu_read_unlock();
+
+	return nr;
 }
-EXPORT_SYMBOL(task_pid_nr_ns);
+EXPORT_SYMBOL(__task_pid_nr_ns);
 
 pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
 {
@@ -464,18 +477,6 @@ pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
 }
 EXPORT_SYMBOL(task_tgid_nr_ns);
 
-pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
-{
-	return pid_nr_ns(task_pgrp(tsk), ns);
-}
-EXPORT_SYMBOL(task_pgrp_nr_ns);
-
-pid_t task_session_nr_ns(struct task_struct *tsk, struct pid_namespace *ns)
-{
-	return pid_nr_ns(task_session(tsk), ns);
-}
-EXPORT_SYMBOL(task_session_nr_ns);
-
 struct pid_namespace *task_active_pid_ns(struct task_struct *tsk)
 {
 	return ns_of_pid(task_pid(tsk));
-- 
cgit v1.2.3


From 1b0f7ffd0ea27cd3a0b9ca04e3df9522048c32a3 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Thu, 2 Apr 2009 16:58:39 -0700
Subject: pids: kill signal_struct-> __pgrp/__session and friends

We are wasting 2 words in signal_struct without any reason to implement
task_pgrp_nr() and task_session_nr().

task_session_nr() has no callers since
2e2ba22ea4fd4bb85f0fa37c521066db6775cbef, we can remove it.

task_pgrp_nr() is still (I believe wrongly) used in fs/autofsX and
fs/coda.

This patch reimplements task_pgrp_nr() via task_pgrp_nr_ns(), and kills
__pgrp/__session and the related helpers.

The change in drivers/char/tty_io.c is cosmetic, but hopefully makes sense
anyway.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Alan Cox <number6@the-village.bc.nu>		[tty parts]
Cc: Cedric Le Goater <clg@fr.ibm.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Eric Biederman <ebiederm@xmission.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Serge Hallyn <serue@us.ibm.com>
Cc: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 drivers/char/tty_io.c |  4 ++--
 include/linux/sched.h | 43 ++++++-------------------------------------
 kernel/exit.c         | 10 +++-------
 kernel/fork.c         |  2 --
 kernel/sys.c          |  4 +---
 5 files changed, 12 insertions(+), 51 deletions(-)

(limited to 'kernel')

diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c
index a44b701c5bba..66b99a2049e3 100644
--- a/drivers/char/tty_io.c
+++ b/drivers/char/tty_io.c
@@ -2681,7 +2681,7 @@ void __do_SAK(struct tty_struct *tty)
 	/* Kill the entire session */
 	do_each_pid_task(session, PIDTYPE_SID, p) {
 		printk(KERN_NOTICE "SAK: killed process %d"
-			" (%s): task_session_nr(p)==tty->session\n",
+			" (%s): task_session(p)==tty->session\n",
 			task_pid_nr(p), p->comm);
 		send_sig(SIGKILL, p, 1);
 	} while_each_pid_task(session, PIDTYPE_SID, p);
@@ -2691,7 +2691,7 @@ void __do_SAK(struct tty_struct *tty)
 	do_each_thread(g, p) {
 		if (p->signal->tty == tty) {
 			printk(KERN_NOTICE "SAK: killed process %d"
-			    " (%s): task_session_nr(p)==tty->session\n",
+			    " (%s): task_session(p)==tty->session\n",
 			    task_pid_nr(p), p->comm);
 			send_sig(SIGKILL, p, 1);
 			continue;
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 49df878a0cad..206ac003e8c0 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -547,25 +547,8 @@ struct signal_struct {
 
 	struct list_head cpu_timers[3];
 
-	/* job control IDs */
-
-	/*
-	 * pgrp and session fields are deprecated.
-	 * use the task_session_Xnr and task_pgrp_Xnr routines below
-	 */
-
-	union {
-		pid_t pgrp __deprecated;
-		pid_t __pgrp;
-	};
-
 	struct pid *tty_old_pgrp;
 
-	union {
-		pid_t session __deprecated;
-		pid_t __session;
-	};
-
 	/* boolean value for session group leader */
 	int leader;
 
@@ -1469,16 +1452,6 @@ static inline int rt_task(struct task_struct *p)
 	return rt_prio(p->prio);
 }
 
-static inline void set_task_session(struct task_struct *tsk, pid_t session)
-{
-	tsk->signal->__session = session;
-}
-
-static inline void set_task_pgrp(struct task_struct *tsk, pid_t pgrp)
-{
-	tsk->signal->__pgrp = pgrp;
-}
-
 static inline struct pid *task_pid(struct task_struct *task)
 {
 	return task->pids[PIDTYPE_PID].pid;
@@ -1552,11 +1525,6 @@ static inline pid_t task_tgid_vnr(struct task_struct *tsk)
 }
 
 
-static inline pid_t task_pgrp_nr(struct task_struct *tsk)
-{
-	return tsk->signal->__pgrp;
-}
-
 static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk,
 					struct pid_namespace *ns)
 {
@@ -1569,11 +1537,6 @@ static inline pid_t task_pgrp_vnr(struct task_struct *tsk)
 }
 
 
-static inline pid_t task_session_nr(struct task_struct *tsk)
-{
-	return tsk->signal->__session;
-}
-
 static inline pid_t task_session_nr_ns(struct task_struct *tsk,
 					struct pid_namespace *ns)
 {
@@ -1585,6 +1548,12 @@ static inline pid_t task_session_vnr(struct task_struct *tsk)
 	return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL);
 }
 
+/* obsolete, do not use */
+static inline pid_t task_pgrp_nr(struct task_struct *tsk)
+{
+	return task_pgrp_nr_ns(tsk, &init_pid_ns);
+}
+
 /**
  * pid_alive - check that a task structure is not stale
  * @p: Task structure to be checked.
diff --git a/kernel/exit.c b/kernel/exit.c
index 384f09caf2ef..3bec141c82f6 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -357,16 +357,12 @@ static void reparent_to_kthreadd(void)
 void __set_special_pids(struct pid *pid)
 {
 	struct task_struct *curr = current->group_leader;
-	pid_t nr = pid_nr(pid);
 
-	if (task_session(curr) != pid) {
+	if (task_session(curr) != pid)
 		change_pid(curr, PIDTYPE_SID, pid);
-		set_task_session(curr, nr);
-	}
-	if (task_pgrp(curr) != pid) {
+
+	if (task_pgrp(curr) != pid)
 		change_pid(curr, PIDTYPE_PGID, pid);
-		set_task_pgrp(curr, nr);
-	}
 }
 
 static void set_special_pids(struct pid *pid)
diff --git a/kernel/fork.c b/kernel/fork.c
index adbea16ec649..f74458231449 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1265,8 +1265,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 			p->signal->leader_pid = pid;
 			tty_kref_put(p->signal->tty);
 			p->signal->tty = tty_kref_get(current->signal->tty);
-			set_task_pgrp(p, task_pgrp_nr(current));
-			set_task_session(p, task_session_nr(current));
 			attach_pid(p, PIDTYPE_PGID, task_pgrp(current));
 			attach_pid(p, PIDTYPE_SID, task_session(current));
 			list_add_tail_rcu(&p->tasks, &init_task.tasks);
diff --git a/kernel/sys.c b/kernel/sys.c
index 37f458e6882a..742cefa527e6 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -1013,10 +1013,8 @@ SYSCALL_DEFINE2(setpgid, pid_t, pid, pid_t, pgid)
 	if (err)
 		goto out;
 
-	if (task_pgrp(p) != pgrp) {
+	if (task_pgrp(p) != pgrp)
 		change_pid(p, PIDTYPE_PGID, pgrp);
-		set_task_pgrp(p, pid_nr(pgrp));
-	}
 
 	err = 0;
 out:
-- 
cgit v1.2.3


From 04d491ab2a53008a1aa98ac09561768c7f3adda3 Mon Sep 17 00:00:00 2001
From: Neil Horman <nhorman@tuxdriver.com>
Date: Thu, 2 Apr 2009 16:58:57 -0700
Subject: kexec: add dmesg log symbols to /proc/vmcoreinfo lists

It would be nice to be able to extract the dmesg log from a vmcore file
without needing to keep the debug symbols for the running kernel handy all
the time.  We have a facility to do this in /proc/vmcore.  This patch adds
the log_buf and log_end symbols to the vmcoreinfo area so that tools (like
makedumpfile) can easily extract the dmesg logs from a vmcore image.

[akpm@linux-foundation.org: several fixes and cleanups]
[akpm@linux-foundation.org: fix unused log_buf_kexec_setup()]
[akpm@linux-foundation.org: build fix]
Signed-off-by: Neil Horman <nhorman@tuxdriver.com>
Cc: Simon Horman <horms@verge.net.au>
Acked-by: Vivek Goyal <vgoyal@redhat.com>
Cc: Neil Horman <nhorman@tuxdriver.com>
Cc: Simon Horman <horms@verge.net.au>
Cc: Vivek Goyal <vgoyal@redhat.com>
Cc: Randy Dunlap <randy.dunlap@oracle.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/kernel.h |  4 ++++
 kernel/kexec.c         |  1 +
 kernel/printk.c        | 19 +++++++++++++++++++
 3 files changed, 24 insertions(+)

(limited to 'kernel')

diff --git a/include/linux/kernel.h b/include/linux/kernel.h
index e720b0da7751..556d781e69fe 100644
--- a/include/linux/kernel.h
+++ b/include/linux/kernel.h
@@ -242,6 +242,7 @@ extern struct ratelimit_state printk_ratelimit_state;
 extern int printk_ratelimit(void);
 extern bool printk_timed_ratelimit(unsigned long *caller_jiffies,
 				   unsigned int interval_msec);
+void log_buf_kexec_setup(void);
 #else
 static inline int vprintk(const char *s, va_list args)
 	__attribute__ ((format (printf, 1, 0)));
@@ -253,6 +254,9 @@ static inline int printk_ratelimit(void) { return 0; }
 static inline bool printk_timed_ratelimit(unsigned long *caller_jiffies, \
 					  unsigned int interval_msec)	\
 		{ return false; }
+static inline void log_buf_kexec_setup(void)
+{
+}
 #endif
 
 extern int printk_needs_cpu(int cpu);
diff --git a/kernel/kexec.c b/kernel/kexec.c
index 93eed85fe017..589832aac41f 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -1409,6 +1409,7 @@ static int __init crash_save_vmcoreinfo_init(void)
 	VMCOREINFO_OFFSET(list_head, prev);
 	VMCOREINFO_OFFSET(vm_struct, addr);
 	VMCOREINFO_LENGTH(zone.free_area, MAX_ORDER);
+	log_buf_kexec_setup();
 	VMCOREINFO_LENGTH(free_area.free_list, MIGRATE_TYPES);
 	VMCOREINFO_NUMBER(NR_FREE_PAGES);
 	VMCOREINFO_NUMBER(PG_lru);
diff --git a/kernel/printk.c b/kernel/printk.c
index e3602d0755b0..a5f61a9acedb 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -32,6 +32,7 @@
 #include <linux/security.h>
 #include <linux/bootmem.h>
 #include <linux/syscalls.h>
+#include <linux/kexec.h>
 
 #include <asm/uaccess.h>
 
@@ -135,6 +136,24 @@ static char *log_buf = __log_buf;
 static int log_buf_len = __LOG_BUF_LEN;
 static unsigned logged_chars; /* Number of chars produced since last read+clear operation */
 
+#ifdef CONFIG_KEXEC
+/*
+ * This appends the listed symbols to /proc/vmcoreinfo
+ *
+ * /proc/vmcoreinfo is used by various utiilties, like crash and makedumpfile to
+ * obtain access to symbols that are otherwise very difficult to locate.  These
+ * symbols are specifically used so that utilities can access and extract the
+ * dmesg log from a vmcore file after a crash.
+ */
+void log_buf_kexec_setup(void)
+{
+	VMCOREINFO_SYMBOL(log_buf);
+	VMCOREINFO_SYMBOL(log_end);
+	VMCOREINFO_SYMBOL(log_buf_len);
+	VMCOREINFO_SYMBOL(logged_chars);
+}
+#endif
+
 static int __init log_buf_len_setup(char *str)
 {
 	unsigned size = memparse(str, &str);
-- 
cgit v1.2.3


From edb79a213223488735fae1d408f4c136e9ed25d6 Mon Sep 17 00:00:00 2001
From: Dmitri Vorobiev <dmitri.vorobiev@movial.com>
Date: Thu, 2 Apr 2009 16:58:58 -0700
Subject: kexec: vmcoreinfo_data[] can become static

The vmcoreinfo_data[] array is not used outside of kernel/kexec.c, and
can therefore become static. This patch adds the relevant keyword to the
definition of the array.

Noticed by sparse.

Signed-off-by: Dmitri Vorobiev <dmitri.vorobiev@movial.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kexec.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/kexec.c b/kernel/kexec.c
index 589832aac41f..5a758c6e4950 100644
--- a/kernel/kexec.c
+++ b/kernel/kexec.c
@@ -42,7 +42,7 @@
 note_buf_t* crash_notes;
 
 /* vmcoreinfo stuff */
-unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
+static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
 u32 vmcoreinfo_note[VMCOREINFO_NOTE_SIZE/4];
 size_t vmcoreinfo_size;
 size_t vmcoreinfo_max_size = sizeof(vmcoreinfo_data);
-- 
cgit v1.2.3


From 2c53d9109f077900e140edb8b766132ad93b81cc Mon Sep 17 00:00:00 2001
From: Aravind Srinivasan <raa.aars@gmail.com>
Date: Thu, 2 Apr 2009 16:58:59 -0700
Subject: relay: fix for possible loss/corruption of produced subbufs

Fix possible loss/corruption of produced subbufs in
relay_subbufs_consumed().

When buf->subbufs_produced wraps around after UINT_MAX and
buf->subbufs_consumed is still < UINT_MAX, the condition

	if (buf->subbufs_consumed > buf->subbufs_produced)

will be true even for certain valid values of subbufs_consumed.  This may
lead to loss or corruption of produced subbufs.

Signed-off-by: Aravind Srinivasan <raa.aars@gmail.com>
Cc: Tom Zanussi <tzanussi@gmail.com>
Cc: Tom Zanussi <zanussi@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/relay.c | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/relay.c b/kernel/relay.c
index 8f2179c8056f..e92db8c06acf 100644
--- a/kernel/relay.c
+++ b/kernel/relay.c
@@ -797,13 +797,15 @@ void relay_subbufs_consumed(struct rchan *chan,
 	if (!chan)
 		return;
 
-	if (cpu >= NR_CPUS || !chan->buf[cpu])
+	if (cpu >= NR_CPUS || !chan->buf[cpu] ||
+					subbufs_consumed > chan->n_subbufs)
 		return;
 
 	buf = chan->buf[cpu];
-	buf->subbufs_consumed += subbufs_consumed;
-	if (buf->subbufs_consumed > buf->subbufs_produced)
+	if (subbufs_consumed > buf->subbufs_produced - buf->subbufs_consumed)
 		buf->subbufs_consumed = buf->subbufs_produced;
+	else
+		buf->subbufs_consumed += subbufs_consumed;
 }
 EXPORT_SYMBOL_GPL(relay_subbufs_consumed);
 
-- 
cgit v1.2.3


From e8c158bb313c1df421eab7dc4299cd39cbbf5895 Mon Sep 17 00:00:00 2001
From: Robin Holt <holt@sgi.com>
Date: Thu, 2 Apr 2009 16:59:45 -0700
Subject: Factor out #ifdefs from kernel/spinlock.c to LOCK_CONTENDED_FLAGS

SGI has observed that on large systems, interrupts are not serviced for a
long period of time when waiting for a rwlock.  The following patch series
re-enables irqs while waiting for the lock, resembling the code which is
already there for spinlocks.

I only made the ia64 version, because the patch adds some overhead to the
fast path.  I assume there is currently no demand to have this for other
architectures, because the systems are not so large.  Of course, the
possibility to implement raw_{read|write}_lock_flags for any architecture
is still there.

This patch:

The new macro LOCK_CONTENDED_FLAGS expands to the correct implementation
depending on the config options, so that IRQ's are re-enabled when
possible, but they remain disabled if CONFIG_LOCKDEP is set.

Signed-off-by: Petr Tesarik <ptesarik@suse.cz>
Signed-off-by: Robin Holt <holt@sgi.com>
Cc: <linux-arch@vger.kernel.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: "Luck, Tony" <tony.luck@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 include/linux/lockdep.h | 17 +++++++++++++++++
 kernel/spinlock.c       | 12 ++----------
 2 files changed, 19 insertions(+), 10 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 5a58ea3e91e9..da5a5a1f4cd2 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -364,6 +364,23 @@ do {								\
 
 #endif /* CONFIG_LOCK_STAT */
 
+#ifdef CONFIG_LOCKDEP
+
+/*
+ * On lockdep we dont want the hand-coded irq-enable of
+ * _raw_*_lock_flags() code, because lockdep assumes
+ * that interrupts are not re-enabled during lock-acquire:
+ */
+#define LOCK_CONTENDED_FLAGS(_lock, try, lock, lockfl, flags) \
+	LOCK_CONTENDED((_lock), (try), (lock))
+
+#else /* CONFIG_LOCKDEP */
+
+#define LOCK_CONTENDED_FLAGS(_lock, try, lock, lockfl, flags) \
+	lockfl((_lock), (flags))
+
+#endif /* CONFIG_LOCKDEP */
+
 #ifdef CONFIG_GENERIC_HARDIRQS
 extern void early_init_irq_lock_class(void);
 #else
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 29ab20749dd3..7283c6dc2d59 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -299,16 +299,8 @@ unsigned long __lockfunc _spin_lock_irqsave_nested(spinlock_t *lock, int subclas
 	local_irq_save(flags);
 	preempt_disable();
 	spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_);
-	/*
-	 * On lockdep we dont want the hand-coded irq-enable of
-	 * _raw_spin_lock_flags() code, because lockdep assumes
-	 * that interrupts are not re-enabled during lock-acquire:
-	 */
-#ifdef CONFIG_LOCKDEP
-	LOCK_CONTENDED(lock, _raw_spin_trylock, _raw_spin_lock);
-#else
-	_raw_spin_lock_flags(lock, &flags);
-#endif
+	LOCK_CONTENDED_FLAGS(lock, _raw_spin_trylock, _raw_spin_lock,
+				_raw_spin_lock_flags, &flags);
 	return flags;
 }
 EXPORT_SYMBOL(_spin_lock_irqsave_nested);
-- 
cgit v1.2.3


From f5f7eac41db827a47b2163330eecd7bb55ae9f12 Mon Sep 17 00:00:00 2001
From: Robin Holt <holt@sgi.com>
Date: Thu, 2 Apr 2009 16:59:46 -0700
Subject: Allow rwlocks to re-enable interrupts

Pass the original flags to rwlock arch-code, so that it can re-enable
interrupts if implemented for that architecture.

Initially, make __raw_read_lock_flags and __raw_write_lock_flags stubs
which just do the same thing as non-flags variants.

Signed-off-by: Petr Tesarik <ptesarik@suse.cz>
Signed-off-by: Robin Holt <holt@sgi.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: <linux-arch@vger.kernel.org>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: "Luck, Tony" <tony.luck@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/alpha/include/asm/spinlock.h          | 3 +++
 arch/arm/include/asm/spinlock.h            | 3 +++
 arch/cris/include/arch-v32/arch/spinlock.h | 2 ++
 arch/ia64/include/asm/spinlock.h           | 3 +++
 arch/mips/include/asm/spinlock.h           | 2 ++
 arch/parisc/include/asm/spinlock.h         | 3 +++
 arch/powerpc/include/asm/spinlock.h        | 3 +++
 arch/s390/include/asm/spinlock.h           | 3 +++
 arch/sh/include/asm/spinlock.h             | 3 +++
 arch/sparc/include/asm/spinlock_32.h       | 2 ++
 arch/sparc/include/asm/spinlock_64.h       | 2 ++
 arch/x86/include/asm/spinlock.h            | 3 +++
 include/asm-m32r/spinlock.h                | 3 +++
 include/linux/spinlock.h                   | 6 ++++++
 kernel/spinlock.c                          | 6 ++++--
 15 files changed, 45 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/arch/alpha/include/asm/spinlock.h b/arch/alpha/include/asm/spinlock.h
index aeeb125f6851..e38fb95cb335 100644
--- a/arch/alpha/include/asm/spinlock.h
+++ b/arch/alpha/include/asm/spinlock.h
@@ -166,6 +166,9 @@ static inline void __raw_write_unlock(raw_rwlock_t * lock)
 	lock->lock = 0;
 }
 
+#define __raw_read_lock_flags(lock, flags) __raw_read_lock(lock)
+#define __raw_write_lock_flags(lock, flags) __raw_write_lock(lock)
+
 #define _raw_spin_relax(lock)	cpu_relax()
 #define _raw_read_relax(lock)	cpu_relax()
 #define _raw_write_relax(lock)	cpu_relax()
diff --git a/arch/arm/include/asm/spinlock.h b/arch/arm/include/asm/spinlock.h
index 2b41ebbfa7ff..c13681ac1ede 100644
--- a/arch/arm/include/asm/spinlock.h
+++ b/arch/arm/include/asm/spinlock.h
@@ -217,6 +217,9 @@ static inline int __raw_read_trylock(raw_rwlock_t *rw)
 /* read_can_lock - would read_trylock() succeed? */
 #define __raw_read_can_lock(x)		((x)->lock < 0x80000000)
 
+#define __raw_read_lock_flags(lock, flags) __raw_read_lock(lock)
+#define __raw_write_lock_flags(lock, flags) __raw_write_lock(lock)
+
 #define _raw_spin_relax(lock)	cpu_relax()
 #define _raw_read_relax(lock)	cpu_relax()
 #define _raw_write_relax(lock)	cpu_relax()
diff --git a/arch/cris/include/arch-v32/arch/spinlock.h b/arch/cris/include/arch-v32/arch/spinlock.h
index 0d5709b983a1..129756b96661 100644
--- a/arch/cris/include/arch-v32/arch/spinlock.h
+++ b/arch/cris/include/arch-v32/arch/spinlock.h
@@ -121,6 +121,8 @@ static  inline int __raw_write_trylock(raw_rwlock_t *rw)
 	return 1;
 }
 
+#define _raw_read_lock_flags(lock, flags) _raw_read_lock(lock)
+#define _raw_write_lock_flags(lock, flags) _raw_write_lock(lock)
 
 #define _raw_spin_relax(lock)	cpu_relax()
 #define _raw_read_relax(lock)	cpu_relax()
diff --git a/arch/ia64/include/asm/spinlock.h b/arch/ia64/include/asm/spinlock.h
index 0229fb95fb38..0a619618b2fa 100644
--- a/arch/ia64/include/asm/spinlock.h
+++ b/arch/ia64/include/asm/spinlock.h
@@ -213,6 +213,9 @@ static inline int __raw_read_trylock(raw_rwlock_t *x)
 	return (u32)ia64_cmpxchg4_acq((__u32 *)(x), new.word, old.word) == old.word;
 }
 
+#define __raw_read_lock_flags(lock, flags) __raw_read_lock(lock)
+#define __raw_write_lock_flags(lock, flags) __raw_write_lock(lock)
+
 #define _raw_spin_relax(lock)	cpu_relax()
 #define _raw_read_relax(lock)	cpu_relax()
 #define _raw_write_relax(lock)	cpu_relax()
diff --git a/arch/mips/include/asm/spinlock.h b/arch/mips/include/asm/spinlock.h
index 10e82441b496..5b60a09a0f08 100644
--- a/arch/mips/include/asm/spinlock.h
+++ b/arch/mips/include/asm/spinlock.h
@@ -480,6 +480,8 @@ static inline int __raw_write_trylock(raw_rwlock_t *rw)
 	return ret;
 }
 
+#define __raw_read_lock_flags(lock, flags) __raw_read_lock(lock)
+#define __raw_write_lock_flags(lock, flags) __raw_write_lock(lock)
 
 #define _raw_spin_relax(lock)	cpu_relax()
 #define _raw_read_relax(lock)	cpu_relax()
diff --git a/arch/parisc/include/asm/spinlock.h b/arch/parisc/include/asm/spinlock.h
index f3d2090a18dc..fae03e136fa8 100644
--- a/arch/parisc/include/asm/spinlock.h
+++ b/arch/parisc/include/asm/spinlock.h
@@ -187,6 +187,9 @@ static __inline__ int __raw_write_can_lock(raw_rwlock_t *rw)
 	return !rw->counter;
 }
 
+#define __raw_read_lock_flags(lock, flags) __raw_read_lock(lock)
+#define __raw_write_lock_flags(lock, flags) __raw_write_lock(lock)
+
 #define _raw_spin_relax(lock)	cpu_relax()
 #define _raw_read_relax(lock)	cpu_relax()
 #define _raw_write_relax(lock)	cpu_relax()
diff --git a/arch/powerpc/include/asm/spinlock.h b/arch/powerpc/include/asm/spinlock.h
index 36864364e601..c3b193121f81 100644
--- a/arch/powerpc/include/asm/spinlock.h
+++ b/arch/powerpc/include/asm/spinlock.h
@@ -287,6 +287,9 @@ static inline void __raw_write_unlock(raw_rwlock_t *rw)
 	rw->lock = 0;
 }
 
+#define __raw_read_lock_flags(lock, flags) __raw_read_lock(lock)
+#define __raw_write_lock_flags(lock, flags) __raw_write_lock(lock)
+
 #define _raw_spin_relax(lock)	__spin_yield(lock)
 #define _raw_read_relax(lock)	__rw_yield(lock)
 #define _raw_write_relax(lock)	__rw_yield(lock)
diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h
index df84ae96915f..f3861b09ebb0 100644
--- a/arch/s390/include/asm/spinlock.h
+++ b/arch/s390/include/asm/spinlock.h
@@ -172,6 +172,9 @@ static inline int __raw_write_trylock(raw_rwlock_t *rw)
 	return _raw_write_trylock_retry(rw);
 }
 
+#define __raw_read_lock_flags(lock, flags) __raw_read_lock(lock)
+#define __raw_write_lock_flags(lock, flags) __raw_write_lock(lock)
+
 #define _raw_read_relax(lock)	cpu_relax()
 #define _raw_write_relax(lock)	cpu_relax()
 
diff --git a/arch/sh/include/asm/spinlock.h b/arch/sh/include/asm/spinlock.h
index e793181d64da..60283565f89b 100644
--- a/arch/sh/include/asm/spinlock.h
+++ b/arch/sh/include/asm/spinlock.h
@@ -216,6 +216,9 @@ static inline int __raw_write_trylock(raw_rwlock_t *rw)
 	return (oldval > (RW_LOCK_BIAS - 1));
 }
 
+#define __raw_read_lock_flags(lock, flags) __raw_read_lock(lock)
+#define __raw_write_lock_flags(lock, flags) __raw_write_lock(lock)
+
 #define _raw_spin_relax(lock)	cpu_relax()
 #define _raw_read_relax(lock)	cpu_relax()
 #define _raw_write_relax(lock)	cpu_relax()
diff --git a/arch/sparc/include/asm/spinlock_32.h b/arch/sparc/include/asm/spinlock_32.h
index bf2d532593e3..46f91ab66a50 100644
--- a/arch/sparc/include/asm/spinlock_32.h
+++ b/arch/sparc/include/asm/spinlock_32.h
@@ -177,6 +177,8 @@ static inline int __read_trylock(raw_rwlock_t *rw)
 #define __raw_write_unlock(rw)	do { (rw)->lock = 0; } while(0)
 
 #define __raw_spin_lock_flags(lock, flags) __raw_spin_lock(lock)
+#define __raw_read_lock_flags(rw, flags)   __raw_read_lock(rw)
+#define __raw_write_lock_flags(rw, flags)  __raw_write_lock(rw)
 
 #define _raw_spin_relax(lock)	cpu_relax()
 #define _raw_read_relax(lock)	cpu_relax()
diff --git a/arch/sparc/include/asm/spinlock_64.h b/arch/sparc/include/asm/spinlock_64.h
index c4d274d330e9..f6b2b92ad8d2 100644
--- a/arch/sparc/include/asm/spinlock_64.h
+++ b/arch/sparc/include/asm/spinlock_64.h
@@ -211,9 +211,11 @@ static int inline __write_trylock(raw_rwlock_t *lock)
 }
 
 #define __raw_read_lock(p)	__read_lock(p)
+#define __raw_read_lock_flags(p, f) __read_lock(p)
 #define __raw_read_trylock(p)	__read_trylock(p)
 #define __raw_read_unlock(p)	__read_unlock(p)
 #define __raw_write_lock(p)	__write_lock(p)
+#define __raw_write_lock_flags(p, f) __write_lock(p)
 #define __raw_write_unlock(p)	__write_unlock(p)
 #define __raw_write_trylock(p)	__write_trylock(p)
 
diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h
index 3a5696656680..e5e6caffec87 100644
--- a/arch/x86/include/asm/spinlock.h
+++ b/arch/x86/include/asm/spinlock.h
@@ -295,6 +295,9 @@ static inline void __raw_write_unlock(raw_rwlock_t *rw)
 		     : "+m" (rw->lock) : "i" (RW_LOCK_BIAS) : "memory");
 }
 
+#define __raw_read_lock_flags(lock, flags) __raw_read_lock(lock)
+#define __raw_write_lock_flags(lock, flags) __raw_write_lock(lock)
+
 #define _raw_spin_relax(lock)	cpu_relax()
 #define _raw_read_relax(lock)	cpu_relax()
 #define _raw_write_relax(lock)	cpu_relax()
diff --git a/include/asm-m32r/spinlock.h b/include/asm-m32r/spinlock.h
index f5cfba81ee10..dded923883b2 100644
--- a/include/asm-m32r/spinlock.h
+++ b/include/asm-m32r/spinlock.h
@@ -316,6 +316,9 @@ static inline int __raw_write_trylock(raw_rwlock_t *lock)
 	return 0;
 }
 
+#define __raw_read_lock_flags(lock, flags) __raw_read_lock(lock)
+#define __raw_write_lock_flags(lock, flags) __raw_write_lock(lock)
+
 #define _raw_spin_relax(lock)	cpu_relax()
 #define _raw_read_relax(lock)	cpu_relax()
 #define _raw_write_relax(lock)	cpu_relax()
diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h
index a0c66a2e00ad..252b245cfcf4 100644
--- a/include/linux/spinlock.h
+++ b/include/linux/spinlock.h
@@ -153,9 +153,11 @@ do {								\
  extern int _raw_spin_trylock(spinlock_t *lock);
  extern void _raw_spin_unlock(spinlock_t *lock);
  extern void _raw_read_lock(rwlock_t *lock);
+#define _raw_read_lock_flags(lock, flags) _raw_read_lock(lock)
  extern int _raw_read_trylock(rwlock_t *lock);
  extern void _raw_read_unlock(rwlock_t *lock);
  extern void _raw_write_lock(rwlock_t *lock);
+#define _raw_write_lock_flags(lock, flags) _raw_write_lock(lock)
  extern int _raw_write_trylock(rwlock_t *lock);
  extern void _raw_write_unlock(rwlock_t *lock);
 #else
@@ -165,9 +167,13 @@ do {								\
 # define _raw_spin_trylock(lock)	__raw_spin_trylock(&(lock)->raw_lock)
 # define _raw_spin_unlock(lock)		__raw_spin_unlock(&(lock)->raw_lock)
 # define _raw_read_lock(rwlock)		__raw_read_lock(&(rwlock)->raw_lock)
+# define _raw_read_lock_flags(lock, flags) \
+		__raw_read_lock_flags(&(lock)->raw_lock, *(flags))
 # define _raw_read_trylock(rwlock)	__raw_read_trylock(&(rwlock)->raw_lock)
 # define _raw_read_unlock(rwlock)	__raw_read_unlock(&(rwlock)->raw_lock)
 # define _raw_write_lock(rwlock)	__raw_write_lock(&(rwlock)->raw_lock)
+# define _raw_write_lock_flags(lock, flags) \
+		__raw_write_lock_flags(&(lock)->raw_lock, *(flags))
 # define _raw_write_trylock(rwlock)	__raw_write_trylock(&(rwlock)->raw_lock)
 # define _raw_write_unlock(rwlock)	__raw_write_unlock(&(rwlock)->raw_lock)
 #endif
diff --git a/kernel/spinlock.c b/kernel/spinlock.c
index 7283c6dc2d59..7932653c4ebd 100644
--- a/kernel/spinlock.c
+++ b/kernel/spinlock.c
@@ -121,7 +121,8 @@ unsigned long __lockfunc _read_lock_irqsave(rwlock_t *lock)
 	local_irq_save(flags);
 	preempt_disable();
 	rwlock_acquire_read(&lock->dep_map, 0, 0, _RET_IP_);
-	LOCK_CONTENDED(lock, _raw_read_trylock, _raw_read_lock);
+	LOCK_CONTENDED_FLAGS(lock, _raw_read_trylock, _raw_read_lock,
+			     _raw_read_lock_flags, &flags);
 	return flags;
 }
 EXPORT_SYMBOL(_read_lock_irqsave);
@@ -151,7 +152,8 @@ unsigned long __lockfunc _write_lock_irqsave(rwlock_t *lock)
 	local_irq_save(flags);
 	preempt_disable();
 	rwlock_acquire(&lock->dep_map, 0, 0, _RET_IP_);
-	LOCK_CONTENDED(lock, _raw_write_trylock, _raw_write_lock);
+	LOCK_CONTENDED_FLAGS(lock, _raw_write_trylock, _raw_write_lock,
+			     _raw_write_lock_flags, &flags);
 	return flags;
 }
 EXPORT_SYMBOL(_write_lock_irqsave);
-- 
cgit v1.2.3


From b1f77b0581b8fd837acb4a973f7d5496cae6efee Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 13 Mar 2009 03:20:49 +0100
Subject: kmemtrace, rcu: fix linux/rcutree.h and linux/rcuclassic.h
 dependencies

Impact: build fix for all non-x86 architectures

We want to remove percpu.h from rcuclassic.h/rcutree.h (for upcoming
kmemtrace changes) but that would break the DECLARE_PER_CPU based
declarations in these files.

Move the quiescent counter management functions to their respective
RCU implementation .c files - they were slightly above the inlining
limit anyway.

Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Cc: paulmck@linux.vnet.ibm.com
LKML-Reference: <1237898630.25315.83.camel@penberg-laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/rcuclassic.h | 15 ++-------------
 include/linux/rcutree.h    | 26 ++------------------------
 kernel/rcuclassic.c        | 23 +++++++++++++++++++++--
 kernel/rcutree.c           | 28 ++++++++++++++++++++++++----
 4 files changed, 49 insertions(+), 43 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h
index 80044a4f3ab9..2d688b4461f7 100644
--- a/include/linux/rcuclassic.h
+++ b/include/linux/rcuclassic.h
@@ -108,25 +108,14 @@ struct rcu_data {
 	struct rcu_head barrier;
 };
 
-DECLARE_PER_CPU(struct rcu_data, rcu_data);
-DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
-
 /*
  * Increment the quiescent state counter.
  * The counter is a bit degenerated: We do not need to know
  * how many quiescent states passed, just if there was at least
  * one since the start of the grace period. Thus just a flag.
  */
-static inline void rcu_qsctr_inc(int cpu)
-{
-	struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
-	rdp->passed_quiesc = 1;
-}
-static inline void rcu_bh_qsctr_inc(int cpu)
-{
-	struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
-	rdp->passed_quiesc = 1;
-}
+extern void rcu_qsctr_inc(int cpu);
+extern void rcu_bh_qsctr_inc(int cpu);
 
 extern int rcu_pending(int cpu);
 extern int rcu_needs_cpu(int cpu);
diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h
index a722fb67bb2d..5d6f425260bc 100644
--- a/include/linux/rcutree.h
+++ b/include/linux/rcutree.h
@@ -236,30 +236,8 @@ struct rcu_state {
 #endif /* #ifdef CONFIG_NO_HZ */
 };
 
-extern struct rcu_state rcu_state;
-DECLARE_PER_CPU(struct rcu_data, rcu_data);
-
-extern struct rcu_state rcu_bh_state;
-DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
-
-/*
- * Increment the quiescent state counter.
- * The counter is a bit degenerated: We do not need to know
- * how many quiescent states passed, just if there was at least
- * one since the start of the grace period. Thus just a flag.
- */
-static inline void rcu_qsctr_inc(int cpu)
-{
-	struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
-	rdp->passed_quiesc = 1;
-	rdp->passed_quiesc_completed = rdp->completed;
-}
-static inline void rcu_bh_qsctr_inc(int cpu)
-{
-	struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
-	rdp->passed_quiesc = 1;
-	rdp->passed_quiesc_completed = rdp->completed;
-}
+extern void rcu_qsctr_inc(int cpu);
+extern void rcu_bh_qsctr_inc(int cpu);
 
 extern int rcu_pending(int cpu);
 extern int rcu_needs_cpu(int cpu);
diff --git a/kernel/rcuclassic.c b/kernel/rcuclassic.c
index 654c640a6b9c..0f2b0b311304 100644
--- a/kernel/rcuclassic.c
+++ b/kernel/rcuclassic.c
@@ -65,6 +65,7 @@ static struct rcu_ctrlblk rcu_ctrlblk = {
 	.lock = __SPIN_LOCK_UNLOCKED(&rcu_ctrlblk.lock),
 	.cpumask = CPU_BITS_NONE,
 };
+
 static struct rcu_ctrlblk rcu_bh_ctrlblk = {
 	.cur = -300,
 	.completed = -300,
@@ -73,8 +74,26 @@ static struct rcu_ctrlblk rcu_bh_ctrlblk = {
 	.cpumask = CPU_BITS_NONE,
 };
 
-DEFINE_PER_CPU(struct rcu_data, rcu_data) = { 0L };
-DEFINE_PER_CPU(struct rcu_data, rcu_bh_data) = { 0L };
+static DEFINE_PER_CPU(struct rcu_data, rcu_data);
+static DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
+
+/*
+ * Increment the quiescent state counter.
+ * The counter is a bit degenerated: We do not need to know
+ * how many quiescent states passed, just if there was at least
+ * one since the start of the grace period. Thus just a flag.
+ */
+void rcu_qsctr_inc(int cpu)
+{
+	struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
+	rdp->passed_quiesc = 1;
+}
+
+void rcu_bh_qsctr_inc(int cpu)
+{
+	struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
+	rdp->passed_quiesc = 1;
+}
 
 static int blimit = 10;
 static int qhimark = 10000;
diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index 97ce31579ec0..a2015edfe167 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -72,11 +72,31 @@ EXPORT_SYMBOL_GPL(rcu_lock_map);
 	.n_force_qs_ngp = 0, \
 }
 
-struct rcu_state rcu_state = RCU_STATE_INITIALIZER(rcu_state);
-DEFINE_PER_CPU(struct rcu_data, rcu_data);
+static struct rcu_state rcu_state = RCU_STATE_INITIALIZER(rcu_state);
+static DEFINE_PER_CPU(struct rcu_data, rcu_data);
 
-struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
-DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
+static struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
+static DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
+
+/*
+ * Increment the quiescent state counter.
+ * The counter is a bit degenerated: We do not need to know
+ * how many quiescent states passed, just if there was at least
+ * one since the start of the grace period. Thus just a flag.
+ */
+void rcu_qsctr_inc(int cpu)
+{
+	struct rcu_data *rdp = &per_cpu(rcu_data, cpu);
+	rdp->passed_quiesc = 1;
+	rdp->passed_quiesc_completed = rdp->completed;
+}
+
+void rcu_bh_qsctr_inc(int cpu)
+{
+	struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu);
+	rdp->passed_quiesc = 1;
+	rdp->passed_quiesc_completed = rdp->completed;
+}
 
 #ifdef CONFIG_NO_HZ
 DEFINE_PER_CPU(struct rcu_dynticks, rcu_dynticks) = {
-- 
cgit v1.2.3


From 6258c4fb59e77d748f7efc2c137ad420372edd07 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 25 Mar 2009 16:42:24 +0100
Subject: kmemtrace, rcu: fix rcu_tree_trace.c data structure dependencies

Impact: cleanup

We want to remove rcutree internals from the public rcutree.h file for
upcoming kmemtrace changes - but kernel/rcutree_trace.c depends on them.

Introduce kernel/rcutree.h for internal definitions. (Probably all
the other data types from include/linux/rcutree.h could be
moved here too - except rcu_data.)

Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Cc: paulmck@linux.vnet.ibm.com
LKML-Reference: <1237898630.25315.83.camel@penberg-laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/rcutree.c       |  8 ++++----
 kernel/rcutree.h       | 10 ++++++++++
 kernel/rcutree_trace.c |  2 ++
 3 files changed, 16 insertions(+), 4 deletions(-)
 create mode 100644 kernel/rcutree.h

(limited to 'kernel')

diff --git a/kernel/rcutree.c b/kernel/rcutree.c
index a2015edfe167..7f3266922572 100644
--- a/kernel/rcutree.c
+++ b/kernel/rcutree.c
@@ -72,11 +72,11 @@ EXPORT_SYMBOL_GPL(rcu_lock_map);
 	.n_force_qs_ngp = 0, \
 }
 
-static struct rcu_state rcu_state = RCU_STATE_INITIALIZER(rcu_state);
-static DEFINE_PER_CPU(struct rcu_data, rcu_data);
+struct rcu_state rcu_state = RCU_STATE_INITIALIZER(rcu_state);
+DEFINE_PER_CPU(struct rcu_data, rcu_data);
 
-static struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
-static DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
+struct rcu_state rcu_bh_state = RCU_STATE_INITIALIZER(rcu_bh_state);
+DEFINE_PER_CPU(struct rcu_data, rcu_bh_data);
 
 /*
  * Increment the quiescent state counter.
diff --git a/kernel/rcutree.h b/kernel/rcutree.h
new file mode 100644
index 000000000000..5e872bbf07f5
--- /dev/null
+++ b/kernel/rcutree.h
@@ -0,0 +1,10 @@
+
+/*
+ * RCU implementation internal declarations:
+ */
+extern struct rcu_state rcu_state;
+DECLARE_PER_CPU(struct rcu_data, rcu_data);
+
+extern struct rcu_state rcu_bh_state;
+DECLARE_PER_CPU(struct rcu_data, rcu_bh_data);
+
diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c
index d6db3e837826..4ee954f6a8d5 100644
--- a/kernel/rcutree_trace.c
+++ b/kernel/rcutree_trace.c
@@ -43,6 +43,8 @@
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
 
+#include "rcutree.h"
+
 static void print_one_rcu_data(struct seq_file *m, struct rcu_data *rdp)
 {
 	if (!rdp->beenonline)
-- 
cgit v1.2.3


From a979241c532f07c201fe94e0a632107268f02578 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 25 Mar 2009 16:42:24 +0100
Subject: kmemtrace, rcu: fix rcupreempt.c data structure dependencies

Impact: cleanup

We want to remove percpu.h from rcupreempt.h, but if we do that
the percpu primitives there wont build anymore. Move them to the
.c file instead.

Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Cc: paulmck@linux.vnet.ibm.com
LKML-Reference: <1237898630.25315.83.camel@penberg-laptop>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/rcupreempt.h | 51 ++++++++--------------------------------------
 kernel/rcupreempt.c        | 48 +++++++++++++++++++++++++++++++++++++++----
 2 files changed, 53 insertions(+), 46 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/rcupreempt.h b/include/linux/rcupreempt.h
index 74304b4538d8..3eb8fdd19445 100644
--- a/include/linux/rcupreempt.h
+++ b/include/linux/rcupreempt.h
@@ -40,30 +40,15 @@
 #include <linux/cpumask.h>
 #include <linux/seqlock.h>
 
-struct rcu_dyntick_sched {
-	int dynticks;
-	int dynticks_snap;
-	int sched_qs;
-	int sched_qs_snap;
-	int sched_dynticks_snap;
-};
-
-DECLARE_PER_CPU(struct rcu_dyntick_sched, rcu_dyntick_sched);
-
-static inline void rcu_qsctr_inc(int cpu)
-{
-	struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
-
-	rdssp->sched_qs++;
-}
-#define rcu_bh_qsctr_inc(cpu)
+extern void rcu_qsctr_inc(int cpu);
+static inline void rcu_bh_qsctr_inc(int cpu) { }
 
 /*
  * Someone might want to pass call_rcu_bh as a function pointer.
  * So this needs to just be a rename and not a macro function.
  *  (no parentheses)
  */
-#define call_rcu_bh	 	call_rcu
+#define call_rcu_bh		call_rcu
 
 /**
  * call_rcu_sched - Queue RCU callback for invocation after sched grace period.
@@ -117,30 +102,12 @@ extern struct rcupreempt_trace *rcupreempt_trace_cpu(int cpu);
 struct softirq_action;
 
 #ifdef CONFIG_NO_HZ
-
-static inline void rcu_enter_nohz(void)
-{
-	static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1);
-
-	smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
-	__get_cpu_var(rcu_dyntick_sched).dynticks++;
-	WARN_ON_RATELIMIT(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1, &rs);
-}
-
-static inline void rcu_exit_nohz(void)
-{
-	static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1);
-
-	__get_cpu_var(rcu_dyntick_sched).dynticks++;
-	smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
-	WARN_ON_RATELIMIT(!(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1),
-				&rs);
-}
-
-#else /* CONFIG_NO_HZ */
-#define rcu_enter_nohz()	do { } while (0)
-#define rcu_exit_nohz()		do { } while (0)
-#endif /* CONFIG_NO_HZ */
+extern void rcu_enter_nohz(void);
+extern void rcu_exit_nohz(void);
+#else
+# define rcu_enter_nohz()	do { } while (0)
+# define rcu_exit_nohz()	do { } while (0)
+#endif
 
 /*
  * A context switch is a grace period for rcupreempt synchronize_rcu()
diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c
index 5d59e850fb71..ce97a4df64d3 100644
--- a/kernel/rcupreempt.c
+++ b/kernel/rcupreempt.c
@@ -147,7 +147,51 @@ struct rcu_ctrlblk {
 	wait_queue_head_t sched_wq;	/* Place for rcu_sched to sleep. */
 };
 
+struct rcu_dyntick_sched {
+	int dynticks;
+	int dynticks_snap;
+	int sched_qs;
+	int sched_qs_snap;
+	int sched_dynticks_snap;
+};
+
+static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched) = {
+	.dynticks = 1,
+};
+
+void rcu_qsctr_inc(int cpu)
+{
+	struct rcu_dyntick_sched *rdssp = &per_cpu(rcu_dyntick_sched, cpu);
+
+	rdssp->sched_qs++;
+}
+
+#ifdef CONFIG_NO_HZ
+
+void rcu_enter_nohz(void)
+{
+	static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1);
+
+	smp_mb(); /* CPUs seeing ++ must see prior RCU read-side crit sects */
+	__get_cpu_var(rcu_dyntick_sched).dynticks++;
+	WARN_ON_RATELIMIT(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1, &rs);
+}
+
+void rcu_exit_nohz(void)
+{
+	static DEFINE_RATELIMIT_STATE(rs, 10 * HZ, 1);
+
+	__get_cpu_var(rcu_dyntick_sched).dynticks++;
+	smp_mb(); /* CPUs seeing ++ must see later RCU read-side crit sects */
+	WARN_ON_RATELIMIT(!(__get_cpu_var(rcu_dyntick_sched).dynticks & 0x1),
+				&rs);
+}
+
+#endif /* CONFIG_NO_HZ */
+
+
 static DEFINE_PER_CPU(struct rcu_data, rcu_data);
+
 static struct rcu_ctrlblk rcu_ctrlblk = {
 	.fliplock = __SPIN_LOCK_UNLOCKED(rcu_ctrlblk.fliplock),
 	.completed = 0,
@@ -427,10 +471,6 @@ static void __rcu_advance_callbacks(struct rcu_data *rdp)
 	}
 }
 
-DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_dyntick_sched, rcu_dyntick_sched) = {
-	.dynticks = 1,
-};
-
 #ifdef CONFIG_NO_HZ
 static DEFINE_PER_CPU(int, rcu_update_flag);
 
-- 
cgit v1.2.3


From ca2b84cb3c4a0d4d2143b46ec072cdff5d1b3b87 Mon Sep 17 00:00:00 2001
From: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Date: Mon, 23 Mar 2009 15:12:24 +0200
Subject: kmemtrace: use tracepoints

kmemtrace now uses tracepoints instead of markers. We no longer need to
use format specifiers to pass arguments.

Signed-off-by: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
[ folded: Use the new TP_PROTO and TP_ARGS to fix the build.     ]
[ folded: fix build when CONFIG_KMEMTRACE is disabled.           ]
[ folded: define tracepoints when CONFIG_TRACEPOINTS is enabled. ]
Signed-off-by: Pekka Enberg <penberg@cs.helsinki.fi>
LKML-Reference: <ae61c0f37156db8ec8dc0d5778018edde60a92e3.1237813499.git.eduard.munteanu@linux360.ro>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/slab_def.h  |  10 +--
 include/linux/slub_def.h  |  12 +--
 include/trace/kmemtrace.h |  92 +++++++++------------
 kernel/trace/kmemtrace.c  | 206 ++++++++++++++++++++++++++++++++--------------
 kernel/trace/trace.h      |   6 ++
 mm/slab.c                 |  24 +++---
 mm/slob.c                 |  28 +++----
 mm/slub.c                 |  30 +++----
 mm/util.c                 |  16 ++++
 9 files changed, 251 insertions(+), 173 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h
index f4523651fa42..5ac9b0bcaf9a 100644
--- a/include/linux/slab_def.h
+++ b/include/linux/slab_def.h
@@ -73,8 +73,8 @@ found:
 
 		ret = kmem_cache_alloc_notrace(cachep, flags);
 
-		kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC, _THIS_IP_, ret,
-				     size, slab_buffer_size(cachep), flags);
+		trace_kmalloc(_THIS_IP_, ret,
+			      size, slab_buffer_size(cachep), flags);
 
 		return ret;
 	}
@@ -128,9 +128,9 @@ found:
 
 		ret = kmem_cache_alloc_node_notrace(cachep, flags, node);
 
-		kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, _THIS_IP_,
-					  ret, size, slab_buffer_size(cachep),
-					  flags, node);
+		trace_kmalloc_node(_THIS_IP_, ret,
+				   size, slab_buffer_size(cachep),
+				   flags, node);
 
 		return ret;
 	}
diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h
index a1f90528e70b..5046f90c1171 100644
--- a/include/linux/slub_def.h
+++ b/include/linux/slub_def.h
@@ -233,8 +233,7 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags)
 	unsigned int order = get_order(size);
 	void *ret = (void *) __get_free_pages(flags | __GFP_COMP, order);
 
-	kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC, _THIS_IP_, ret,
-			     size, PAGE_SIZE << order, flags);
+	trace_kmalloc(_THIS_IP_, ret, size, PAGE_SIZE << order, flags);
 
 	return ret;
 }
@@ -255,9 +254,7 @@ static __always_inline void *kmalloc(size_t size, gfp_t flags)
 
 			ret = kmem_cache_alloc_notrace(s, flags);
 
-			kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC,
-					     _THIS_IP_, ret,
-					     size, s->size, flags);
+			trace_kmalloc(_THIS_IP_, ret, size, s->size, flags);
 
 			return ret;
 		}
@@ -296,9 +293,8 @@ static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node)
 
 		ret = kmem_cache_alloc_node_notrace(s, flags, node);
 
-		kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC,
-					  _THIS_IP_, ret,
-					  size, s->size, flags, node);
+		trace_kmalloc_node(_THIS_IP_, ret,
+				   size, s->size, flags, node);
 
 		return ret;
 	}
diff --git a/include/trace/kmemtrace.h b/include/trace/kmemtrace.h
index ad8b7857855a..28ee69f9cd46 100644
--- a/include/trace/kmemtrace.h
+++ b/include/trace/kmemtrace.h
@@ -9,65 +9,53 @@
 
 #ifdef __KERNEL__
 
+#include <linux/tracepoint.h>
 #include <linux/types.h>
-#include <linux/marker.h>
-
-enum kmemtrace_type_id {
-	KMEMTRACE_TYPE_KMALLOC = 0,	/* kmalloc() or kfree(). */
-	KMEMTRACE_TYPE_CACHE,		/* kmem_cache_*(). */
-	KMEMTRACE_TYPE_PAGES,		/* __get_free_pages() and friends. */
-};
 
 #ifdef CONFIG_KMEMTRACE
-
 extern void kmemtrace_init(void);
-
-extern void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id,
-					     unsigned long call_site,
-					     const void *ptr,
-					     size_t bytes_req,
-					     size_t bytes_alloc,
-					     gfp_t gfp_flags,
-					     int node);
-
-extern void kmemtrace_mark_free(enum kmemtrace_type_id type_id,
-				       unsigned long call_site,
-				       const void *ptr);
-
-#else /* CONFIG_KMEMTRACE */
-
+#else
 static inline void kmemtrace_init(void)
 {
 }
-
-static inline void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id,
-					     unsigned long call_site,
-					     const void *ptr,
-					     size_t bytes_req,
-					     size_t bytes_alloc,
-					     gfp_t gfp_flags,
-					     int node)
-{
-}
-
-static inline void kmemtrace_mark_free(enum kmemtrace_type_id type_id,
-				       unsigned long call_site,
-				       const void *ptr)
-{
-}
-
-#endif /* CONFIG_KMEMTRACE */
-
-static inline void kmemtrace_mark_alloc(enum kmemtrace_type_id type_id,
-					unsigned long call_site,
-					const void *ptr,
-					size_t bytes_req,
-					size_t bytes_alloc,
-					gfp_t gfp_flags)
-{
-	kmemtrace_mark_alloc_node(type_id, call_site, ptr,
-				  bytes_req, bytes_alloc, gfp_flags, -1);
-}
+#endif
+
+DECLARE_TRACE(kmalloc,
+	      TP_PROTO(unsigned long call_site,
+		      const void *ptr,
+		      size_t bytes_req,
+		      size_t bytes_alloc,
+		      gfp_t gfp_flags),
+	      TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags));
+DECLARE_TRACE(kmem_cache_alloc,
+	      TP_PROTO(unsigned long call_site,
+		      const void *ptr,
+		      size_t bytes_req,
+		      size_t bytes_alloc,
+		      gfp_t gfp_flags),
+	      TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags));
+DECLARE_TRACE(kmalloc_node,
+	      TP_PROTO(unsigned long call_site,
+		      const void *ptr,
+		      size_t bytes_req,
+		      size_t bytes_alloc,
+		      gfp_t gfp_flags,
+		      int node),
+	      TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node));
+DECLARE_TRACE(kmem_cache_alloc_node,
+	      TP_PROTO(unsigned long call_site,
+		      const void *ptr,
+		      size_t bytes_req,
+		      size_t bytes_alloc,
+		      gfp_t gfp_flags,
+		      int node),
+	      TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node));
+DECLARE_TRACE(kfree,
+	      TP_PROTO(unsigned long call_site, const void *ptr),
+	      TP_ARGS(call_site, ptr));
+DECLARE_TRACE(kmem_cache_free,
+	      TP_PROTO(unsigned long call_site, const void *ptr),
+	      TP_ARGS(call_site, ptr));
 
 #endif /* __KERNEL__ */
 
diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index ae201b3eda89..4f7b5db5dd06 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -10,6 +10,7 @@
 #include <linux/debugfs.h>
 #include <linux/fs.h>
 #include <linux/seq_file.h>
+#include <linux/tracepoint.h>
 #include <trace/kmemtrace.h>
 
 #include "trace.h"
@@ -29,10 +30,150 @@ static struct tracer_flags kmem_tracer_flags = {
 	.opts = kmem_opts
 };
 
-
-static bool kmem_tracing_enabled __read_mostly;
 static struct trace_array *kmemtrace_array;
 
+/* Trace allocations */
+static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id,
+				   unsigned long call_site,
+				   const void *ptr,
+				   size_t bytes_req,
+				   size_t bytes_alloc,
+				   gfp_t gfp_flags,
+				   int node)
+{
+	struct ring_buffer_event *event;
+	struct kmemtrace_alloc_entry *entry;
+	struct trace_array *tr = kmemtrace_array;
+
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
+	if (!event)
+		return;
+	entry	= ring_buffer_event_data(event);
+	tracing_generic_entry_update(&entry->ent, 0, 0);
+
+	entry->ent.type = TRACE_KMEM_ALLOC;
+	entry->call_site = call_site;
+	entry->ptr = ptr;
+	entry->bytes_req = bytes_req;
+	entry->bytes_alloc = bytes_alloc;
+	entry->gfp_flags = gfp_flags;
+	entry->node	=	node;
+
+	ring_buffer_unlock_commit(tr->buffer, event);
+
+	trace_wake_up();
+}
+
+static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
+				  unsigned long call_site,
+				  const void *ptr)
+{
+	struct ring_buffer_event *event;
+	struct kmemtrace_free_entry *entry;
+	struct trace_array *tr = kmemtrace_array;
+
+	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
+	if (!event)
+		return;
+	entry	= ring_buffer_event_data(event);
+	tracing_generic_entry_update(&entry->ent, 0, 0);
+
+	entry->ent.type = TRACE_KMEM_FREE;
+	entry->type_id	= type_id;
+	entry->call_site = call_site;
+	entry->ptr = ptr;
+
+	ring_buffer_unlock_commit(tr->buffer, event);
+
+	trace_wake_up();
+}
+
+static void kmemtrace_kmalloc(unsigned long call_site,
+			      const void *ptr,
+			      size_t bytes_req,
+			      size_t bytes_alloc,
+			      gfp_t gfp_flags)
+{
+	kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr,
+			bytes_req, bytes_alloc, gfp_flags, -1);
+}
+
+static void kmemtrace_kmem_cache_alloc(unsigned long call_site,
+				       const void *ptr,
+				       size_t bytes_req,
+				       size_t bytes_alloc,
+				       gfp_t gfp_flags)
+{
+	kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr,
+			bytes_req, bytes_alloc, gfp_flags, -1);
+}
+
+static void kmemtrace_kmalloc_node(unsigned long call_site,
+				   const void *ptr,
+				   size_t bytes_req,
+				   size_t bytes_alloc,
+				   gfp_t gfp_flags,
+				   int node)
+{
+	kmemtrace_alloc(KMEMTRACE_TYPE_KMALLOC, call_site, ptr,
+			bytes_req, bytes_alloc, gfp_flags, node);
+}
+
+static void kmemtrace_kmem_cache_alloc_node(unsigned long call_site,
+					    const void *ptr,
+					    size_t bytes_req,
+					    size_t bytes_alloc,
+					    gfp_t gfp_flags,
+					    int node)
+{
+	kmemtrace_alloc(KMEMTRACE_TYPE_CACHE, call_site, ptr,
+			bytes_req, bytes_alloc, gfp_flags, node);
+}
+
+static void kmemtrace_kfree(unsigned long call_site, const void *ptr)
+{
+	kmemtrace_free(KMEMTRACE_TYPE_KMALLOC, call_site, ptr);
+}
+
+static void kmemtrace_kmem_cache_free(unsigned long call_site, const void *ptr)
+{
+	kmemtrace_free(KMEMTRACE_TYPE_CACHE, call_site, ptr);
+}
+
+static int kmemtrace_start_probes(void)
+{
+	int err;
+
+	err = register_trace_kmalloc(kmemtrace_kmalloc);
+	if (err)
+		return err;
+	err = register_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc);
+	if (err)
+		return err;
+	err = register_trace_kmalloc_node(kmemtrace_kmalloc_node);
+	if (err)
+		return err;
+	err = register_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node);
+	if (err)
+		return err;
+	err = register_trace_kfree(kmemtrace_kfree);
+	if (err)
+		return err;
+	err = register_trace_kmem_cache_free(kmemtrace_kmem_cache_free);
+
+	return err;
+}
+
+static void kmemtrace_stop_probes(void)
+{
+	unregister_trace_kmalloc(kmemtrace_kmalloc);
+	unregister_trace_kmem_cache_alloc(kmemtrace_kmem_cache_alloc);
+	unregister_trace_kmalloc_node(kmemtrace_kmalloc_node);
+	unregister_trace_kmem_cache_alloc_node(kmemtrace_kmem_cache_alloc_node);
+	unregister_trace_kfree(kmemtrace_kfree);
+	unregister_trace_kmem_cache_free(kmemtrace_kmem_cache_free);
+}
+
 static int kmem_trace_init(struct trace_array *tr)
 {
 	int cpu;
@@ -41,14 +182,14 @@ static int kmem_trace_init(struct trace_array *tr)
 	for_each_cpu_mask(cpu, cpu_possible_map)
 		tracing_reset(tr, cpu);
 
-	kmem_tracing_enabled = true;
+	kmemtrace_start_probes();
 
 	return 0;
 }
 
 static void kmem_trace_reset(struct trace_array *tr)
 {
-	kmem_tracing_enabled = false;
+	kmemtrace_stop_probes();
 }
 
 static void kmemtrace_headers(struct seq_file *s)
@@ -260,63 +401,6 @@ static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter)
 	}
 }
 
-/* Trace allocations */
-void kmemtrace_mark_alloc_node(enum kmemtrace_type_id type_id,
-			     unsigned long call_site,
-			     const void *ptr,
-			     size_t bytes_req,
-			     size_t bytes_alloc,
-			     gfp_t gfp_flags,
-			     int node)
-{
-	struct ring_buffer_event *event;
-	struct kmemtrace_alloc_entry *entry;
-	struct trace_array *tr = kmemtrace_array;
-
-	if (!kmem_tracing_enabled)
-		return;
-
-	event = trace_buffer_lock_reserve(tr, TRACE_KMEM_ALLOC,
-					  sizeof(*entry), 0, 0);
-	if (!event)
-		return;
-	entry	= ring_buffer_event_data(event);
-
-	entry->call_site = call_site;
-	entry->ptr = ptr;
-	entry->bytes_req = bytes_req;
-	entry->bytes_alloc = bytes_alloc;
-	entry->gfp_flags = gfp_flags;
-	entry->node	=	node;
-
-	trace_buffer_unlock_commit(tr, event, 0, 0);
-}
-EXPORT_SYMBOL(kmemtrace_mark_alloc_node);
-
-void kmemtrace_mark_free(enum kmemtrace_type_id type_id,
-		       unsigned long call_site,
-		       const void *ptr)
-{
-	struct ring_buffer_event *event;
-	struct kmemtrace_free_entry *entry;
-	struct trace_array *tr = kmemtrace_array;
-
-	if (!kmem_tracing_enabled)
-		return;
-
-	event = trace_buffer_lock_reserve(tr, TRACE_KMEM_FREE,
-					  sizeof(*entry), 0, 0);
-	if (!event)
-		return;
-	entry	= ring_buffer_event_data(event);
-	entry->type_id	= type_id;
-	entry->call_site = call_site;
-	entry->ptr = ptr;
-
-	trace_buffer_unlock_commit(tr, event, 0, 0);
-}
-EXPORT_SYMBOL(kmemtrace_mark_free);
-
 static struct tracer kmem_tracer __read_mostly = {
 	.name		= "kmemtrace",
 	.init		= kmem_trace_init,
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index cb0ce3fc36d3..cbc168f1e43d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -182,6 +182,12 @@ struct trace_power {
 	struct power_trace	state_data;
 };
 
+enum kmemtrace_type_id {
+	KMEMTRACE_TYPE_KMALLOC = 0,	/* kmalloc() or kfree(). */
+	KMEMTRACE_TYPE_CACHE,		/* kmem_cache_*(). */
+	KMEMTRACE_TYPE_PAGES,		/* __get_free_pages() and friends. */
+};
+
 struct kmemtrace_alloc_entry {
 	struct trace_entry	ent;
 	enum kmemtrace_type_id type_id;
diff --git a/mm/slab.c b/mm/slab.c
index 9ec66c3e6ee0..fa00fd6a644d 100644
--- a/mm/slab.c
+++ b/mm/slab.c
@@ -3565,8 +3565,8 @@ void *kmem_cache_alloc(struct kmem_cache *cachep, gfp_t flags)
 {
 	void *ret = __cache_alloc(cachep, flags, __builtin_return_address(0));
 
-	kmemtrace_mark_alloc(KMEMTRACE_TYPE_CACHE, _RET_IP_, ret,
-			     obj_size(cachep), cachep->buffer_size, flags);
+	trace_kmem_cache_alloc(_RET_IP_, ret,
+			       obj_size(cachep), cachep->buffer_size, flags);
 
 	return ret;
 }
@@ -3627,9 +3627,9 @@ void *kmem_cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, int nodeid)
 	void *ret = __cache_alloc_node(cachep, flags, nodeid,
 				       __builtin_return_address(0));
 
-	kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_CACHE, _RET_IP_, ret,
-				  obj_size(cachep), cachep->buffer_size,
-				  flags, nodeid);
+	trace_kmem_cache_alloc_node(_RET_IP_, ret,
+				    obj_size(cachep), cachep->buffer_size,
+				    flags, nodeid);
 
 	return ret;
 }
@@ -3657,9 +3657,8 @@ __do_kmalloc_node(size_t size, gfp_t flags, int node, void *caller)
 		return cachep;
 	ret = kmem_cache_alloc_node_notrace(cachep, flags, node);
 
-	kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC,
-				  (unsigned long) caller, ret,
-				  size, cachep->buffer_size, flags, node);
+	trace_kmalloc_node((unsigned long) caller, ret,
+			   size, cachep->buffer_size, flags, node);
 
 	return ret;
 }
@@ -3709,9 +3708,8 @@ static __always_inline void *__do_kmalloc(size_t size, gfp_t flags,
 		return cachep;
 	ret = __cache_alloc(cachep, flags, caller);
 
-	kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC,
-			     (unsigned long) caller, ret,
-			     size, cachep->buffer_size, flags);
+	trace_kmalloc((unsigned long) caller, ret,
+		      size, cachep->buffer_size, flags);
 
 	return ret;
 }
@@ -3757,7 +3755,7 @@ void kmem_cache_free(struct kmem_cache *cachep, void *objp)
 	__cache_free(cachep, objp);
 	local_irq_restore(flags);
 
-	kmemtrace_mark_free(KMEMTRACE_TYPE_CACHE, _RET_IP_, objp);
+	trace_kmem_cache_free(_RET_IP_, objp);
 }
 EXPORT_SYMBOL(kmem_cache_free);
 
@@ -3785,7 +3783,7 @@ void kfree(const void *objp)
 	__cache_free(c, (void *)objp);
 	local_irq_restore(flags);
 
-	kmemtrace_mark_free(KMEMTRACE_TYPE_KMALLOC, _RET_IP_, objp);
+	trace_kfree(_RET_IP_, objp);
 }
 EXPORT_SYMBOL(kfree);
 
diff --git a/mm/slob.c b/mm/slob.c
index 4dd6516447f2..00003587ebfa 100644
--- a/mm/slob.c
+++ b/mm/slob.c
@@ -490,9 +490,8 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node)
 		*m = size;
 		ret = (void *)m + align;
 
-		kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC,
-					  _RET_IP_, ret,
-					  size, size + align, gfp, node);
+		trace_kmalloc_node(_RET_IP_, ret,
+				   size, size + align, gfp, node);
 	} else {
 		unsigned int order = get_order(size);
 
@@ -503,9 +502,8 @@ void *__kmalloc_node(size_t size, gfp_t gfp, int node)
 			page->private = size;
 		}
 
-		kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC,
-					  _RET_IP_, ret,
-					  size, PAGE_SIZE << order, gfp, node);
+		trace_kmalloc_node(_RET_IP_, ret,
+				   size, PAGE_SIZE << order, gfp, node);
 	}
 
 	return ret;
@@ -527,7 +525,7 @@ void kfree(const void *block)
 	} else
 		put_page(&sp->page);
 
-	kmemtrace_mark_free(KMEMTRACE_TYPE_KMALLOC, _RET_IP_, block);
+	trace_kfree(_RET_IP_, block);
 }
 EXPORT_SYMBOL(kfree);
 
@@ -599,16 +597,14 @@ void *kmem_cache_alloc_node(struct kmem_cache *c, gfp_t flags, int node)
 
 	if (c->size < PAGE_SIZE) {
 		b = slob_alloc(c->size, flags, c->align, node);
-		kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_CACHE,
-					  _RET_IP_, b, c->size,
-					  SLOB_UNITS(c->size) * SLOB_UNIT,
-					  flags, node);
+		trace_kmem_cache_alloc_node(_RET_IP_, b, c->size,
+					    SLOB_UNITS(c->size) * SLOB_UNIT,
+					    flags, node);
 	} else {
 		b = slob_new_pages(flags, get_order(c->size), node);
-		kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_CACHE,
-					  _RET_IP_, b, c->size,
-					  PAGE_SIZE << get_order(c->size),
-					  flags, node);
+		trace_kmem_cache_alloc_node(_RET_IP_, b, c->size,
+					    PAGE_SIZE << get_order(c->size),
+					    flags, node);
 	}
 
 	if (c->ctor)
@@ -646,7 +642,7 @@ void kmem_cache_free(struct kmem_cache *c, void *b)
 		__kmem_cache_free(b, c->size);
 	}
 
-	kmemtrace_mark_free(KMEMTRACE_TYPE_CACHE, _RET_IP_, b);
+	trace_kmem_cache_free(_RET_IP_, b);
 }
 EXPORT_SYMBOL(kmem_cache_free);
 
diff --git a/mm/slub.c b/mm/slub.c
index 7aaa121d0ea9..a98078bf738b 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1621,8 +1621,7 @@ void *kmem_cache_alloc(struct kmem_cache *s, gfp_t gfpflags)
 {
 	void *ret = slab_alloc(s, gfpflags, -1, _RET_IP_);
 
-	kmemtrace_mark_alloc(KMEMTRACE_TYPE_CACHE, _RET_IP_, ret,
-			     s->objsize, s->size, gfpflags);
+	trace_kmem_cache_alloc(_RET_IP_, ret, s->objsize, s->size, gfpflags);
 
 	return ret;
 }
@@ -1641,8 +1640,8 @@ void *kmem_cache_alloc_node(struct kmem_cache *s, gfp_t gfpflags, int node)
 {
 	void *ret = slab_alloc(s, gfpflags, node, _RET_IP_);
 
-	kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_CACHE, _RET_IP_, ret,
-				  s->objsize, s->size, gfpflags, node);
+	trace_kmem_cache_alloc_node(_RET_IP_, ret,
+				    s->objsize, s->size, gfpflags, node);
 
 	return ret;
 }
@@ -1767,7 +1766,7 @@ void kmem_cache_free(struct kmem_cache *s, void *x)
 
 	slab_free(s, page, x, _RET_IP_);
 
-	kmemtrace_mark_free(KMEMTRACE_TYPE_CACHE, _RET_IP_, x);
+	trace_kmem_cache_free(_RET_IP_, x);
 }
 EXPORT_SYMBOL(kmem_cache_free);
 
@@ -2702,8 +2701,7 @@ void *__kmalloc(size_t size, gfp_t flags)
 
 	ret = slab_alloc(s, flags, -1, _RET_IP_);
 
-	kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC, _RET_IP_, ret,
-			     size, s->size, flags);
+	trace_kmalloc(_RET_IP_, ret, size, s->size, flags);
 
 	return ret;
 }
@@ -2729,10 +2727,9 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
 	if (unlikely(size > SLUB_MAX_SIZE)) {
 		ret = kmalloc_large_node(size, flags, node);
 
-		kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC,
-					  _RET_IP_, ret,
-					  size, PAGE_SIZE << get_order(size),
-					  flags, node);
+		trace_kmalloc_node(_RET_IP_, ret,
+				   size, PAGE_SIZE << get_order(size),
+				   flags, node);
 
 		return ret;
 	}
@@ -2744,8 +2741,7 @@ void *__kmalloc_node(size_t size, gfp_t flags, int node)
 
 	ret = slab_alloc(s, flags, node, _RET_IP_);
 
-	kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, _RET_IP_, ret,
-				  size, s->size, flags, node);
+	trace_kmalloc_node(_RET_IP_, ret, size, s->size, flags, node);
 
 	return ret;
 }
@@ -2807,7 +2803,7 @@ void kfree(const void *x)
 	}
 	slab_free(page->slab, page, object, _RET_IP_);
 
-	kmemtrace_mark_free(KMEMTRACE_TYPE_KMALLOC, _RET_IP_, x);
+	trace_kfree(_RET_IP_, x);
 }
 EXPORT_SYMBOL(kfree);
 
@@ -3290,8 +3286,7 @@ void *__kmalloc_track_caller(size_t size, gfp_t gfpflags, unsigned long caller)
 	ret = slab_alloc(s, gfpflags, -1, caller);
 
 	/* Honor the call site pointer we recieved. */
-	kmemtrace_mark_alloc(KMEMTRACE_TYPE_KMALLOC, caller, ret, size,
-			     s->size, gfpflags);
+	trace_kmalloc(caller, ret, size, s->size, gfpflags);
 
 	return ret;
 }
@@ -3313,8 +3308,7 @@ void *__kmalloc_node_track_caller(size_t size, gfp_t gfpflags,
 	ret = slab_alloc(s, gfpflags, node, caller);
 
 	/* Honor the call site pointer we recieved. */
-	kmemtrace_mark_alloc_node(KMEMTRACE_TYPE_KMALLOC, caller, ret,
-				  size, s->size, gfpflags, node);
+	trace_kmalloc_node(caller, ret, size, s->size, gfpflags, node);
 
 	return ret;
 }
diff --git a/mm/util.c b/mm/util.c
index 7c122e49f769..2599e83eea17 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -4,6 +4,7 @@
 #include <linux/module.h>
 #include <linux/err.h>
 #include <linux/sched.h>
+#include <linux/tracepoint.h>
 #include <asm/uaccess.h>
 
 /**
@@ -236,3 +237,18 @@ int __attribute__((weak)) get_user_pages_fast(unsigned long start,
 	return ret;
 }
 EXPORT_SYMBOL_GPL(get_user_pages_fast);
+
+/* Tracepoints definitions. */
+DEFINE_TRACE(kmalloc);
+DEFINE_TRACE(kmem_cache_alloc);
+DEFINE_TRACE(kmalloc_node);
+DEFINE_TRACE(kmem_cache_alloc_node);
+DEFINE_TRACE(kfree);
+DEFINE_TRACE(kmem_cache_free);
+
+EXPORT_TRACEPOINT_SYMBOL(kmalloc);
+EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc);
+EXPORT_TRACEPOINT_SYMBOL(kmalloc_node);
+EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc_node);
+EXPORT_TRACEPOINT_SYMBOL(kfree);
+EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free);
-- 
cgit v1.2.3


From da2635a9854423b4aa3a5f0e4e6efcc39ac99004 Mon Sep 17 00:00:00 2001
From: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Date: Mon, 23 Mar 2009 15:12:25 +0200
Subject: kmemtrace: kmemtrace_alloc() must fill type_id

Impact: fix trace output

kmemtrace_alloc() was not filling type_id, which allowed garbage to make
it into tracing data.

Signed-off-by: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
LKML-Reference: <284dba2732a144849d5aa82258fe0de2ad8dcb0b.1237813499.git.eduard.munteanu@linux360.ro>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/kmemtrace.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index 4f7b5db5dd06..ae259f04ee39 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -52,6 +52,7 @@ static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id,
 	tracing_generic_entry_update(&entry->ent, 0, 0);
 
 	entry->ent.type = TRACE_KMEM_ALLOC;
+	entry->type_id = type_id;
 	entry->call_site = call_site;
 	entry->ptr = ptr;
 	entry->bytes_req = bytes_req;
-- 
cgit v1.2.3


From 42af9054c0eeed09ec58d13ec8bf52d225ebcfcc Mon Sep 17 00:00:00 2001
From: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
Date: Mon, 23 Mar 2009 15:12:26 +0200
Subject: kmemtrace: restore original tracing data binary format, improve ABI

When kmemtrace was ported to ftrace, the marker strings were taken as
an indication of how the traced data was being exposed to the userspace.
However, the actual format had been binary, not text.

This restores the original binary format, while also adding an origin CPU
field (since ftrace doesn't expose the data per-CPU to userspace), and
re-adding the timestamp field. It also drops arch-independent field
sizing where it didn't make sense, so pointers won't always be 64 bits
wide like they used to.

Signed-off-by: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
LKML-Reference: <161be9ca8a27b432c4a6ab79f47788c4521652ae.1237813499.git.eduard.munteanu@linux360.ro>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/kmemtrace.c | 82 ++++++++++++++++++++++++++++++++++--------------
 1 file changed, 58 insertions(+), 24 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index ae259f04ee39..d8c2d0c91b4c 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -208,47 +208,81 @@ static void kmemtrace_headers(struct seq_file *s)
 }
 
 /*
- * The two following functions give the original output from kmemtrace,
- * or something close to....perhaps they need some missing things
+ * The following functions give the original output from kmemtrace,
+ * plus the origin CPU, since reordering occurs in-kernel now.
  */
+
+#define KMEMTRACE_USER_ALLOC	0
+#define KMEMTRACE_USER_FREE	1
+
+struct kmemtrace_user_event {
+	u8		event_id;
+	u8		type_id;
+	u16		event_size;
+	u32		cpu;
+	u64		timestamp;
+	unsigned long	call_site;
+	unsigned long	ptr;
+};
+
+struct kmemtrace_user_event_alloc {
+	size_t		bytes_req;
+	size_t		bytes_alloc;
+	unsigned	gfp_flags;
+	int		node;
+};
+
 static enum print_line_t
-kmemtrace_print_alloc_original(struct trace_iterator *iter,
-				struct kmemtrace_alloc_entry *entry)
+kmemtrace_print_alloc_user(struct trace_iterator *iter,
+			   struct kmemtrace_alloc_entry *entry)
 {
 	struct trace_seq *s = &iter->seq;
-	int ret;
-
-	/* Taken from the old linux/kmemtrace.h */
-	ret = trace_seq_printf(s, "type_id %d call_site %lu ptr %lu "
-	  "bytes_req %lu bytes_alloc %lu gfp_flags %lu node %d\n",
-	   entry->type_id, entry->call_site, (unsigned long) entry->ptr,
-	   (unsigned long) entry->bytes_req, (unsigned long) entry->bytes_alloc,
-	   (unsigned long) entry->gfp_flags, entry->node);
+	struct kmemtrace_user_event *ev;
+	struct kmemtrace_user_event_alloc *ev_alloc;
 
-	if (!ret)
+	ev = trace_seq_reserve(s, sizeof(*ev));
+	if (!ev)
+		return TRACE_TYPE_PARTIAL_LINE;
+	ev->event_id = KMEMTRACE_USER_ALLOC;
+	ev->type_id = entry->type_id;
+	ev->event_size = sizeof(*ev) + sizeof(*ev_alloc);
+	ev->cpu = iter->cpu;
+	ev->timestamp = iter->ts;
+	ev->call_site = entry->call_site; 
+	ev->ptr = (unsigned long) entry->ptr;
+
+	ev_alloc = trace_seq_reserve(s, sizeof(*ev_alloc));
+	if (!ev_alloc)
 		return TRACE_TYPE_PARTIAL_LINE;
+	ev_alloc->bytes_req = entry->bytes_req;
+	ev_alloc->bytes_alloc = entry->bytes_alloc;
+	ev_alloc->gfp_flags = entry->gfp_flags;
+	ev_alloc->node = entry->node;
 
 	return TRACE_TYPE_HANDLED;
 }
 
 static enum print_line_t
-kmemtrace_print_free_original(struct trace_iterator *iter,
-				struct kmemtrace_free_entry *entry)
+kmemtrace_print_free_user(struct trace_iterator *iter,
+			  struct kmemtrace_free_entry *entry)
 {
 	struct trace_seq *s = &iter->seq;
-	int ret;
+	struct kmemtrace_user_event *ev;
 
-	/* Taken from the old linux/kmemtrace.h */
-	ret = trace_seq_printf(s, "type_id %d call_site %lu ptr %lu\n",
-	   entry->type_id, entry->call_site, (unsigned long) entry->ptr);
-
-	if (!ret)
+	ev = trace_seq_reserve(s, sizeof(*ev));
+	if (!ev)
 		return TRACE_TYPE_PARTIAL_LINE;
+	ev->event_id = KMEMTRACE_USER_FREE;
+	ev->type_id = entry->type_id;
+	ev->event_size = sizeof(*ev);
+	ev->cpu = iter->cpu;
+	ev->timestamp = iter->ts;
+	ev->call_site = entry->call_site; 
+	ev->ptr = (unsigned long) entry->ptr;
 
 	return TRACE_TYPE_HANDLED;
 }
 
-
 /* The two other following provide a more minimalistic output */
 static enum print_line_t
 kmemtrace_print_alloc_compress(struct trace_iterator *iter,
@@ -385,7 +419,7 @@ static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter)
 		if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)
 			return kmemtrace_print_alloc_compress(iter, field);
 		else
-			return kmemtrace_print_alloc_original(iter, field);
+			return kmemtrace_print_alloc_user(iter, field);
 	}
 
 	case TRACE_KMEM_FREE: {
@@ -394,7 +428,7 @@ static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter)
 		if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)
 			return kmemtrace_print_free_compress(iter, field);
 		else
-			return kmemtrace_print_free_original(iter, field);
+			return kmemtrace_print_free_user(iter, field);
 	}
 
 	default:
-- 
cgit v1.2.3


From c826e3cd0c931d60d548f2468122da570d145556 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 23 Mar 2009 16:14:13 +0100
Subject: kmemtrace: small cleanups

Cc: Eduard - Gabriel Munteanu <eduard.munteanu@linux360.ro>
LKML-Reference: <161be9ca8a27b432c4a6ab79f47788c4521652ae.1237813499.git.eduard.munteanu@linux360.ro>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/kmemtrace.c | 128 +++++++++++++++++++++++++----------------------
 1 file changed, 67 insertions(+), 61 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c
index d8c2d0c91b4c..5011f4d91e37 100644
--- a/kernel/trace/kmemtrace.c
+++ b/kernel/trace/kmemtrace.c
@@ -6,15 +6,16 @@
  * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
  */
 
-#include <linux/dcache.h>
+#include <linux/tracepoint.h>
+#include <linux/seq_file.h>
 #include <linux/debugfs.h>
+#include <linux/dcache.h>
 #include <linux/fs.h>
-#include <linux/seq_file.h>
-#include <linux/tracepoint.h>
+
 #include <trace/kmemtrace.h>
 
-#include "trace.h"
 #include "trace_output.h"
+#include "trace.h"
 
 /* Select an alternative, minimalistic output than the original one */
 #define TRACE_KMEM_OPT_MINIMAL	0x1
@@ -26,8 +27,8 @@ static struct tracer_opt kmem_opts[] = {
 };
 
 static struct tracer_flags kmem_tracer_flags = {
-	.val = 0,
-	.opts = kmem_opts
+	.val			= 0,
+	.opts			= kmem_opts
 };
 
 static struct trace_array *kmemtrace_array;
@@ -41,24 +42,25 @@ static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id,
 				   gfp_t gfp_flags,
 				   int node)
 {
-	struct ring_buffer_event *event;
-	struct kmemtrace_alloc_entry *entry;
 	struct trace_array *tr = kmemtrace_array;
+	struct kmemtrace_alloc_entry *entry;
+	struct ring_buffer_event *event;
 
 	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
 	if (!event)
 		return;
-	entry	= ring_buffer_event_data(event);
+
+	entry = ring_buffer_event_data(event);
 	tracing_generic_entry_update(&entry->ent, 0, 0);
 
-	entry->ent.type = TRACE_KMEM_ALLOC;
-	entry->type_id = type_id;
-	entry->call_site = call_site;
-	entry->ptr = ptr;
-	entry->bytes_req = bytes_req;
-	entry->bytes_alloc = bytes_alloc;
-	entry->gfp_flags = gfp_flags;
-	entry->node	=	node;
+	entry->ent.type		= TRACE_KMEM_ALLOC;
+	entry->type_id		= type_id;
+	entry->call_site	= call_site;
+	entry->ptr		= ptr;
+	entry->bytes_req	= bytes_req;
+	entry->bytes_alloc	= bytes_alloc;
+	entry->gfp_flags	= gfp_flags;
+	entry->node		= node;
 
 	ring_buffer_unlock_commit(tr->buffer, event);
 
@@ -69,9 +71,9 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
 				  unsigned long call_site,
 				  const void *ptr)
 {
-	struct ring_buffer_event *event;
-	struct kmemtrace_free_entry *entry;
 	struct trace_array *tr = kmemtrace_array;
+	struct kmemtrace_free_entry *entry;
+	struct ring_buffer_event *event;
 
 	event = ring_buffer_lock_reserve(tr->buffer, sizeof(*entry));
 	if (!event)
@@ -79,10 +81,10 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id,
 	entry	= ring_buffer_event_data(event);
 	tracing_generic_entry_update(&entry->ent, 0, 0);
 
-	entry->ent.type = TRACE_KMEM_FREE;
-	entry->type_id	= type_id;
-	entry->call_site = call_site;
-	entry->ptr = ptr;
+	entry->ent.type		= TRACE_KMEM_FREE;
+	entry->type_id		= type_id;
+	entry->call_site	= call_site;
+	entry->ptr		= ptr;
 
 	ring_buffer_unlock_commit(tr->buffer, event);
 
@@ -216,48 +218,50 @@ static void kmemtrace_headers(struct seq_file *s)
 #define KMEMTRACE_USER_FREE	1
 
 struct kmemtrace_user_event {
-	u8		event_id;
-	u8		type_id;
-	u16		event_size;
-	u32		cpu;
-	u64		timestamp;
-	unsigned long	call_site;
-	unsigned long	ptr;
+	u8			event_id;
+	u8			type_id;
+	u16			event_size;
+	u32			cpu;
+	u64			timestamp;
+	unsigned long		call_site;
+	unsigned long		ptr;
 };
 
 struct kmemtrace_user_event_alloc {
-	size_t		bytes_req;
-	size_t		bytes_alloc;
-	unsigned	gfp_flags;
-	int		node;
+	size_t			bytes_req;
+	size_t			bytes_alloc;
+	unsigned		gfp_flags;
+	int			node;
 };
 
 static enum print_line_t
 kmemtrace_print_alloc_user(struct trace_iterator *iter,
 			   struct kmemtrace_alloc_entry *entry)
 {
+	struct kmemtrace_user_event_alloc *ev_alloc;
 	struct trace_seq *s = &iter->seq;
 	struct kmemtrace_user_event *ev;
-	struct kmemtrace_user_event_alloc *ev_alloc;
 
 	ev = trace_seq_reserve(s, sizeof(*ev));
 	if (!ev)
 		return TRACE_TYPE_PARTIAL_LINE;
-	ev->event_id = KMEMTRACE_USER_ALLOC;
-	ev->type_id = entry->type_id;
-	ev->event_size = sizeof(*ev) + sizeof(*ev_alloc);
-	ev->cpu = iter->cpu;
-	ev->timestamp = iter->ts;
-	ev->call_site = entry->call_site; 
-	ev->ptr = (unsigned long) entry->ptr;
+
+	ev->event_id		= KMEMTRACE_USER_ALLOC;
+	ev->type_id		= entry->type_id;
+	ev->event_size		= sizeof(*ev) + sizeof(*ev_alloc);
+	ev->cpu			= iter->cpu;
+	ev->timestamp		= iter->ts;
+	ev->call_site		= entry->call_site;
+	ev->ptr			= (unsigned long)entry->ptr;
 
 	ev_alloc = trace_seq_reserve(s, sizeof(*ev_alloc));
 	if (!ev_alloc)
 		return TRACE_TYPE_PARTIAL_LINE;
-	ev_alloc->bytes_req = entry->bytes_req;
-	ev_alloc->bytes_alloc = entry->bytes_alloc;
-	ev_alloc->gfp_flags = entry->gfp_flags;
-	ev_alloc->node = entry->node;
+
+	ev_alloc->bytes_req	= entry->bytes_req;
+	ev_alloc->bytes_alloc	= entry->bytes_alloc;
+	ev_alloc->gfp_flags	= entry->gfp_flags;
+	ev_alloc->node		= entry->node;
 
 	return TRACE_TYPE_HANDLED;
 }
@@ -272,13 +276,14 @@ kmemtrace_print_free_user(struct trace_iterator *iter,
 	ev = trace_seq_reserve(s, sizeof(*ev));
 	if (!ev)
 		return TRACE_TYPE_PARTIAL_LINE;
-	ev->event_id = KMEMTRACE_USER_FREE;
-	ev->type_id = entry->type_id;
-	ev->event_size = sizeof(*ev);
-	ev->cpu = iter->cpu;
-	ev->timestamp = iter->ts;
-	ev->call_site = entry->call_site; 
-	ev->ptr = (unsigned long) entry->ptr;
+
+	ev->event_id		= KMEMTRACE_USER_FREE;
+	ev->type_id		= entry->type_id;
+	ev->event_size		= sizeof(*ev);
+	ev->cpu			= iter->cpu;
+	ev->timestamp		= iter->ts;
+	ev->call_site		= entry->call_site;
+	ev->ptr			= (unsigned long)entry->ptr;
 
 	return TRACE_TYPE_HANDLED;
 }
@@ -354,7 +359,7 @@ kmemtrace_print_alloc_compress(struct trace_iterator *iter,
 
 static enum print_line_t
 kmemtrace_print_free_compress(struct trace_iterator *iter,
-				struct kmemtrace_free_entry *entry)
+			      struct kmemtrace_free_entry *entry)
 {
 	struct trace_seq *s = &iter->seq;
 	int ret;
@@ -415,6 +420,7 @@ static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter)
 	switch (entry->type) {
 	case TRACE_KMEM_ALLOC: {
 		struct kmemtrace_alloc_entry *field;
+
 		trace_assign_type(field, entry);
 		if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)
 			return kmemtrace_print_alloc_compress(iter, field);
@@ -424,6 +430,7 @@ static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter)
 
 	case TRACE_KMEM_FREE: {
 		struct kmemtrace_free_entry *field;
+
 		trace_assign_type(field, entry);
 		if (kmem_tracer_flags.val & TRACE_KMEM_OPT_MINIMAL)
 			return kmemtrace_print_free_compress(iter, field);
@@ -437,12 +444,12 @@ static enum print_line_t kmemtrace_print_line(struct trace_iterator *iter)
 }
 
 static struct tracer kmem_tracer __read_mostly = {
-	.name		= "kmemtrace",
-	.init		= kmem_trace_init,
-	.reset		= kmem_trace_reset,
-	.print_line	= kmemtrace_print_line,
-	.print_header = kmemtrace_headers,
-	.flags		= &kmem_tracer_flags
+	.name			= "kmemtrace",
+	.init			= kmem_trace_init,
+	.reset			= kmem_trace_reset,
+	.print_line		= kmemtrace_print_line,
+	.print_header		= kmemtrace_headers,
+	.flags			= &kmem_tracer_flags
 };
 
 void kmemtrace_init(void)
@@ -454,5 +461,4 @@ static int __init init_kmem_tracer(void)
 {
 	return register_tracer(&kmem_tracer);
 }
-
 device_initcall(init_kmem_tracer);
-- 
cgit v1.2.3


From a4b3ada83d06554d307dd54abdc62b2e5648264a Mon Sep 17 00:00:00 2001
From: Carl Henrik Lunde <chlunde@ping.uio.no>
Date: Fri, 3 Apr 2009 14:27:15 +0200
Subject: blktrace: NUL-terminate user space messages

Impact: fix corrupted blkparse output

Make sure messages from user space are NUL-terminated strings,
otherwise we could dump random memory to the block trace file.

Additionally, I've limited the message to BLK_TN_MAX_MSG-1
characters, because the last character would be stripped by
vscnprintf anyway.

Signed-off-by: Carl Henrik Lunde <chlunde@ping.uio.no>
Cc: Li Zefan <lizf@cn.fujitsu.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: "Alan D. Brunelle" <alan.brunelle@hp.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <20090403122714.GT5178@kernel.dk>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 947c5b3f90c4..a400b861fad3 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -327,10 +327,10 @@ static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,
 	char *msg;
 	struct blk_trace *bt;
 
-	if (count > BLK_TN_MAX_MSG)
+	if (count > BLK_TN_MAX_MSG - 1)
 		return -EINVAL;
 
-	msg = kmalloc(count, GFP_KERNEL);
+	msg = kmalloc(count + 1, GFP_KERNEL);
 	if (msg == NULL)
 		return -ENOMEM;
 
@@ -339,6 +339,7 @@ static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,
 		return -EFAULT;
 	}
 
+	msg[count] = '\0';
 	bt = filp->private_data;
 	__trace_note_message(bt, "%s", msg);
 	kfree(msg);
-- 
cgit v1.2.3


From 7635b03adf3d7b84da7649b81efa91e6ebf11b85 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Fri, 3 Apr 2009 15:31:34 +0800
Subject: blktrace: small cleanup in blk_msg_write()

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: "Alan D. Brunelle" <alan.brunelle@hp.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
LKML-Reference: <49D5BB56.7000807@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/blktrace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index a400b861fad3..73d7860b72e2 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -327,7 +327,7 @@ static ssize_t blk_msg_write(struct file *filp, const char __user *buffer,
 	char *msg;
 	struct blk_trace *bt;
 
-	if (count > BLK_TN_MAX_MSG - 1)
+	if (count >= BLK_TN_MAX_MSG)
 		return -EINVAL;
 
 	msg = kmalloc(count + 1, GFP_KERNEL);
-- 
cgit v1.2.3


From e2494e1b42ebac402324105d57646489d19e2b01 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 2 Apr 2009 13:43:26 +0800
Subject: blktrace: fix pdu_len when tracing packet command requests

Impact: output all of packet commands - not just the first 4 / 8 bytes

Since commit d7e3c3249ef23b4617393c69fe464765b4ff1645 ("block: add
large command support"), struct request->cmd has been changed from
unsinged char cmd[BLK_MAX_CDB] to unsigned char *cmd.

v1 -> v2: by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>

- make sure rq->cmd_len is always intialized, and then we can use
  rq->cmd_len instead of BLK_MAX_CDB.

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: FUJITA Tomonori <fujita.tomonori@lab.ntt.co.jp>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Jens Axboe <jens.axboe@oracle.com>
LKML-Reference: <49D4507E.2060602@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 block/blk-core.c        | 1 +
 kernel/trace/blktrace.c | 2 +-
 2 files changed, 2 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/block/blk-core.c b/block/blk-core.c
index 29bcfac6c688..859879d0a0bf 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -132,6 +132,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
 	INIT_HLIST_NODE(&rq->hash);
 	RB_CLEAR_NODE(&rq->rb_node);
 	rq->cmd = rq->__cmd;
+	rq->cmd_len = BLK_MAX_CDB;
 	rq->tag = -1;
 	rq->ref_count = 1;
 }
diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c
index 73d7860b72e2..b32ff446c3fb 100644
--- a/kernel/trace/blktrace.c
+++ b/kernel/trace/blktrace.c
@@ -643,7 +643,7 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq,
 	if (blk_pc_request(rq)) {
 		what |= BLK_TC_ACT(BLK_TC_PC);
 		__blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors,
-				sizeof(rq->cmd), rq->cmd);
+				rq->cmd_len, rq->cmd);
 	} else  {
 		what |= BLK_TC_ACT(BLK_TC_FS);
 		__blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9,
-- 
cgit v1.2.3


From 07fe7cb7c7c179f473fd9c823348fd3eb5dad369 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 3 Apr 2009 16:42:35 +0100
Subject: Create a dynamically sized pool of threads for doing very slow work
 items

Create a dynamically sized pool of threads for doing very slow work items, such
as invoking mkdir() or rmdir() - things that may take a long time and may
sleep, holding mutexes/semaphores and hogging a thread, and are thus unsuitable
for workqueues.

The number of threads is always at least a settable minimum, but more are
started when there's more work to do, up to a limit.  Because of the nature of
the load, it's not suitable for a 1-thread-per-CPU type pool.  A system with
one CPU may well want several threads.

This is used by FS-Cache to do slow caching operations in the background, such
as looking up, creating or deleting cache objects.

Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Acked-by: Steve Dickson <steved@redhat.com>
Acked-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Tested-by: Daire Byrne <Daire.Byrne@framestore.com>
---
 include/linux/slow-work.h |  88 +++++++++++
 init/Kconfig              |  12 ++
 kernel/Makefile           |   1 +
 kernel/slow-work.c        | 388 ++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 489 insertions(+)
 create mode 100644 include/linux/slow-work.h
 create mode 100644 kernel/slow-work.c

(limited to 'kernel')

diff --git a/include/linux/slow-work.h b/include/linux/slow-work.h
new file mode 100644
index 000000000000..4dd754af393e
--- /dev/null
+++ b/include/linux/slow-work.h
@@ -0,0 +1,88 @@
+/* Worker thread pool for slow items, such as filesystem lookups or mkdirs
+ *
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#ifndef _LINUX_SLOW_WORK_H
+#define _LINUX_SLOW_WORK_H
+
+#ifdef CONFIG_SLOW_WORK
+
+struct slow_work;
+
+/*
+ * The operations used to support slow work items
+ */
+struct slow_work_ops {
+	/* get a ref on a work item
+	 * - return 0 if successful, -ve if not
+	 */
+	int (*get_ref)(struct slow_work *work);
+
+	/* discard a ref to a work item */
+	void (*put_ref)(struct slow_work *work);
+
+	/* execute a work item */
+	void (*execute)(struct slow_work *work);
+};
+
+/*
+ * A slow work item
+ * - A reference is held on the parent object by the thread pool when it is
+ *   queued
+ */
+struct slow_work {
+	unsigned long		flags;
+#define SLOW_WORK_PENDING	0	/* item pending (further) execution */
+#define SLOW_WORK_EXECUTING	1	/* item currently executing */
+#define SLOW_WORK_ENQ_DEFERRED	2	/* item enqueue deferred */
+#define SLOW_WORK_VERY_SLOW	3	/* item is very slow */
+	const struct slow_work_ops *ops; /* operations table for this item */
+	struct list_head	link;	/* link in queue */
+};
+
+/**
+ * slow_work_init - Initialise a slow work item
+ * @work: The work item to initialise
+ * @ops: The operations to use to handle the slow work item
+ *
+ * Initialise a slow work item.
+ */
+static inline void slow_work_init(struct slow_work *work,
+				  const struct slow_work_ops *ops)
+{
+	work->flags = 0;
+	work->ops = ops;
+	INIT_LIST_HEAD(&work->link);
+}
+
+/**
+ * slow_work_init - Initialise a very slow work item
+ * @work: The work item to initialise
+ * @ops: The operations to use to handle the slow work item
+ *
+ * Initialise a very slow work item.  This item will be restricted such that
+ * only a certain number of the pool threads will be able to execute items of
+ * this type.
+ */
+static inline void vslow_work_init(struct slow_work *work,
+				   const struct slow_work_ops *ops)
+{
+	work->flags = 1 << SLOW_WORK_VERY_SLOW;
+	work->ops = ops;
+	INIT_LIST_HEAD(&work->link);
+}
+
+extern int slow_work_enqueue(struct slow_work *work);
+extern int slow_work_register_user(void);
+extern void slow_work_unregister_user(void);
+
+
+#endif /* CONFIG_SLOW_WORK */
+#endif /* _LINUX_SLOW_WORK_H */
diff --git a/init/Kconfig b/init/Kconfig
index 1398a14b0191..236a79377b8e 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1014,6 +1014,18 @@ config MARKERS
 
 source "arch/Kconfig"
 
+config SLOW_WORK
+	default n
+	bool "Enable slow work thread pool"
+	help
+	  The slow work thread pool provides a number of dynamically allocated
+	  threads that can be used by the kernel to perform operations that
+	  take a relatively long time.
+
+	  An example of this would be CacheFiles doing a path lookup followed
+	  by a series of mkdirs and a create call, all of which have to touch
+	  disk.
+
 endmenu		# General setup
 
 config HAVE_GENERIC_DMA_COHERENT
diff --git a/kernel/Makefile b/kernel/Makefile
index e4791b3ba55d..bab1dffe37e9 100644
--- a/kernel/Makefile
+++ b/kernel/Makefile
@@ -93,6 +93,7 @@ obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o
 obj-$(CONFIG_FUNCTION_TRACER) += trace/
 obj-$(CONFIG_TRACING) += trace/
 obj-$(CONFIG_SMP) += sched_cpupri.o
+obj-$(CONFIG_SLOW_WORK) += slow-work.o
 
 ifneq ($(CONFIG_SCHED_OMIT_FRAME_POINTER),y)
 # According to Alan Modra <alan@linuxcare.com.au>, the -fno-omit-frame-pointer is
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
new file mode 100644
index 000000000000..5a7392734c82
--- /dev/null
+++ b/kernel/slow-work.c
@@ -0,0 +1,388 @@
+/* Worker thread pool for slow items, such as filesystem lookups or mkdirs
+ *
+ * Copyright (C) 2008 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public Licence
+ * as published by the Free Software Foundation; either version
+ * 2 of the Licence, or (at your option) any later version.
+ */
+
+#include <linux/module.h>
+#include <linux/slow-work.h>
+#include <linux/kthread.h>
+#include <linux/freezer.h>
+#include <linux/wait.h>
+#include <asm/system.h>
+
+/*
+ * The pool of threads has at least min threads in it as long as someone is
+ * using the facility, and may have as many as max.
+ *
+ * A portion of the pool may be processing very slow operations.
+ */
+static unsigned slow_work_min_threads = 2;
+static unsigned slow_work_max_threads = 4;
+static unsigned vslow_work_proportion = 50; /* % of threads that may process
+					     * very slow work */
+static atomic_t slow_work_thread_count;
+static atomic_t vslow_work_executing_count;
+
+/*
+ * The queues of work items and the lock governing access to them.  These are
+ * shared between all the CPUs.  It doesn't make sense to have per-CPU queues
+ * as the number of threads bears no relation to the number of CPUs.
+ *
+ * There are two queues of work items: one for slow work items, and one for
+ * very slow work items.
+ */
+static LIST_HEAD(slow_work_queue);
+static LIST_HEAD(vslow_work_queue);
+static DEFINE_SPINLOCK(slow_work_queue_lock);
+
+/*
+ * The thread controls.  A variable used to signal to the threads that they
+ * should exit when the queue is empty, a waitqueue used by the threads to wait
+ * for signals, and a completion set by the last thread to exit.
+ */
+static bool slow_work_threads_should_exit;
+static DECLARE_WAIT_QUEUE_HEAD(slow_work_thread_wq);
+static DECLARE_COMPLETION(slow_work_last_thread_exited);
+
+/*
+ * The number of users of the thread pool and its lock.  Whilst this is zero we
+ * have no threads hanging around, and when this reaches zero, we wait for all
+ * active or queued work items to complete and kill all the threads we do have.
+ */
+static int slow_work_user_count;
+static DEFINE_MUTEX(slow_work_user_lock);
+
+/*
+ * Calculate the maximum number of active threads in the pool that are
+ * permitted to process very slow work items.
+ *
+ * The answer is rounded up to at least 1, but may not equal or exceed the
+ * maximum number of the threads in the pool.  This means we always have at
+ * least one thread that can process slow work items, and we always have at
+ * least one thread that won't get tied up doing so.
+ */
+static unsigned slow_work_calc_vsmax(void)
+{
+	unsigned vsmax;
+
+	vsmax = atomic_read(&slow_work_thread_count) * vslow_work_proportion;
+	vsmax /= 100;
+	vsmax = max(vsmax, 1U);
+	return min(vsmax, slow_work_max_threads - 1);
+}
+
+/*
+ * Attempt to execute stuff queued on a slow thread.  Return true if we managed
+ * it, false if there was nothing to do.
+ */
+static bool slow_work_execute(void)
+{
+	struct slow_work *work = NULL;
+	unsigned vsmax;
+	bool very_slow;
+
+	vsmax = slow_work_calc_vsmax();
+
+	/* find something to execute */
+	spin_lock_irq(&slow_work_queue_lock);
+	if (!list_empty(&vslow_work_queue) &&
+	    atomic_read(&vslow_work_executing_count) < vsmax) {
+		work = list_entry(vslow_work_queue.next,
+				  struct slow_work, link);
+		if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags))
+			BUG();
+		list_del_init(&work->link);
+		atomic_inc(&vslow_work_executing_count);
+		very_slow = true;
+	} else if (!list_empty(&slow_work_queue)) {
+		work = list_entry(slow_work_queue.next,
+				  struct slow_work, link);
+		if (test_and_set_bit_lock(SLOW_WORK_EXECUTING, &work->flags))
+			BUG();
+		list_del_init(&work->link);
+		very_slow = false;
+	} else {
+		very_slow = false; /* avoid the compiler warning */
+	}
+	spin_unlock_irq(&slow_work_queue_lock);
+
+	if (!work)
+		return false;
+
+	if (!test_and_clear_bit(SLOW_WORK_PENDING, &work->flags))
+		BUG();
+
+	work->ops->execute(work);
+
+	if (very_slow)
+		atomic_dec(&vslow_work_executing_count);
+	clear_bit_unlock(SLOW_WORK_EXECUTING, &work->flags);
+
+	/* if someone tried to enqueue the item whilst we were executing it,
+	 * then it'll be left unenqueued to avoid multiple threads trying to
+	 * execute it simultaneously
+	 *
+	 * there is, however, a race between us testing the pending flag and
+	 * getting the spinlock, and between the enqueuer setting the pending
+	 * flag and getting the spinlock, so we use a deferral bit to tell us
+	 * if the enqueuer got there first
+	 */
+	if (test_bit(SLOW_WORK_PENDING, &work->flags)) {
+		spin_lock_irq(&slow_work_queue_lock);
+
+		if (!test_bit(SLOW_WORK_EXECUTING, &work->flags) &&
+		    test_and_clear_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags))
+			goto auto_requeue;
+
+		spin_unlock_irq(&slow_work_queue_lock);
+	}
+
+	work->ops->put_ref(work);
+	return true;
+
+auto_requeue:
+	/* we must complete the enqueue operation
+	 * - we transfer our ref on the item back to the appropriate queue
+	 * - don't wake another thread up as we're awake already
+	 */
+	if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
+		list_add_tail(&work->link, &vslow_work_queue);
+	else
+		list_add_tail(&work->link, &slow_work_queue);
+	spin_unlock_irq(&slow_work_queue_lock);
+	return true;
+}
+
+/**
+ * slow_work_enqueue - Schedule a slow work item for processing
+ * @work: The work item to queue
+ *
+ * Schedule a slow work item for processing.  If the item is already undergoing
+ * execution, this guarantees not to re-enter the execution routine until the
+ * first execution finishes.
+ *
+ * The item is pinned by this function as it retains a reference to it, managed
+ * through the item operations.  The item is unpinned once it has been
+ * executed.
+ *
+ * An item may hog the thread that is running it for a relatively large amount
+ * of time, sufficient, for example, to perform several lookup, mkdir, create
+ * and setxattr operations.  It may sleep on I/O and may sleep to obtain locks.
+ *
+ * Conversely, if a number of items are awaiting processing, it may take some
+ * time before any given item is given attention.  The number of threads in the
+ * pool may be increased to deal with demand, but only up to a limit.
+ *
+ * If SLOW_WORK_VERY_SLOW is set on the work item, then it will be placed in
+ * the very slow queue, from which only a portion of the threads will be
+ * allowed to pick items to execute.  This ensures that very slow items won't
+ * overly block ones that are just ordinarily slow.
+ *
+ * Returns 0 if successful, -EAGAIN if not.
+ */
+int slow_work_enqueue(struct slow_work *work)
+{
+	unsigned long flags;
+
+	BUG_ON(slow_work_user_count <= 0);
+	BUG_ON(!work);
+	BUG_ON(!work->ops);
+	BUG_ON(!work->ops->get_ref);
+
+	/* when honouring an enqueue request, we only promise that we will run
+	 * the work function in the future; we do not promise to run it once
+	 * per enqueue request
+	 *
+	 * we use the PENDING bit to merge together repeat requests without
+	 * having to disable IRQs and take the spinlock, whilst still
+	 * maintaining our promise
+	 */
+	if (!test_and_set_bit_lock(SLOW_WORK_PENDING, &work->flags)) {
+		spin_lock_irqsave(&slow_work_queue_lock, flags);
+
+		/* we promise that we will not attempt to execute the work
+		 * function in more than one thread simultaneously
+		 *
+		 * this, however, leaves us with a problem if we're asked to
+		 * enqueue the work whilst someone is executing the work
+		 * function as simply queueing the work immediately means that
+		 * another thread may try executing it whilst it is already
+		 * under execution
+		 *
+		 * to deal with this, we set the ENQ_DEFERRED bit instead of
+		 * enqueueing, and the thread currently executing the work
+		 * function will enqueue the work item when the work function
+		 * returns and it has cleared the EXECUTING bit
+		 */
+		if (test_bit(SLOW_WORK_EXECUTING, &work->flags)) {
+			set_bit(SLOW_WORK_ENQ_DEFERRED, &work->flags);
+		} else {
+			if (work->ops->get_ref(work) < 0)
+				goto cant_get_ref;
+			if (test_bit(SLOW_WORK_VERY_SLOW, &work->flags))
+				list_add_tail(&work->link, &vslow_work_queue);
+			else
+				list_add_tail(&work->link, &slow_work_queue);
+			wake_up(&slow_work_thread_wq);
+		}
+
+		spin_unlock_irqrestore(&slow_work_queue_lock, flags);
+	}
+	return 0;
+
+cant_get_ref:
+	spin_unlock_irqrestore(&slow_work_queue_lock, flags);
+	return -EAGAIN;
+}
+EXPORT_SYMBOL(slow_work_enqueue);
+
+/*
+ * Determine if there is slow work available for dispatch
+ */
+static inline bool slow_work_available(int vsmax)
+{
+	return !list_empty(&slow_work_queue) ||
+		(!list_empty(&vslow_work_queue) &&
+		 atomic_read(&vslow_work_executing_count) < vsmax);
+}
+
+/*
+ * Worker thread dispatcher
+ */
+static int slow_work_thread(void *_data)
+{
+	int vsmax;
+
+	DEFINE_WAIT(wait);
+
+	set_freezable();
+	set_user_nice(current, -5);
+
+	for (;;) {
+		vsmax = vslow_work_proportion;
+		vsmax *= atomic_read(&slow_work_thread_count);
+		vsmax /= 100;
+
+		prepare_to_wait(&slow_work_thread_wq, &wait,
+				TASK_INTERRUPTIBLE);
+		if (!freezing(current) &&
+		    !slow_work_threads_should_exit &&
+		    !slow_work_available(vsmax))
+			schedule();
+		finish_wait(&slow_work_thread_wq, &wait);
+
+		try_to_freeze();
+
+		vsmax = vslow_work_proportion;
+		vsmax *= atomic_read(&slow_work_thread_count);
+		vsmax /= 100;
+
+		if (slow_work_available(vsmax) && slow_work_execute()) {
+			cond_resched();
+			continue;
+		}
+
+		if (slow_work_threads_should_exit)
+			break;
+	}
+
+	if (atomic_dec_and_test(&slow_work_thread_count))
+		complete_and_exit(&slow_work_last_thread_exited, 0);
+	return 0;
+}
+
+/**
+ * slow_work_register_user - Register a user of the facility
+ *
+ * Register a user of the facility, starting up the initial threads if there
+ * aren't any other users at this point.  This will return 0 if successful, or
+ * an error if not.
+ */
+int slow_work_register_user(void)
+{
+	struct task_struct *p;
+	int loop;
+
+	mutex_lock(&slow_work_user_lock);
+
+	if (slow_work_user_count == 0) {
+		printk(KERN_NOTICE "Slow work thread pool: Starting up\n");
+		init_completion(&slow_work_last_thread_exited);
+
+		slow_work_threads_should_exit = false;
+
+		/* start the minimum number of threads */
+		for (loop = 0; loop < slow_work_min_threads; loop++) {
+			atomic_inc(&slow_work_thread_count);
+			p = kthread_run(slow_work_thread, NULL, "kslowd");
+			if (IS_ERR(p))
+				goto error;
+		}
+		printk(KERN_NOTICE "Slow work thread pool: Ready\n");
+	}
+
+	slow_work_user_count++;
+	mutex_unlock(&slow_work_user_lock);
+	return 0;
+
+error:
+	if (atomic_dec_and_test(&slow_work_thread_count))
+		complete(&slow_work_last_thread_exited);
+	if (loop > 0) {
+		printk(KERN_ERR "Slow work thread pool:"
+		       " Aborting startup on ENOMEM\n");
+		slow_work_threads_should_exit = true;
+		wake_up_all(&slow_work_thread_wq);
+		wait_for_completion(&slow_work_last_thread_exited);
+		printk(KERN_ERR "Slow work thread pool: Aborted\n");
+	}
+	mutex_unlock(&slow_work_user_lock);
+	return PTR_ERR(p);
+}
+EXPORT_SYMBOL(slow_work_register_user);
+
+/**
+ * slow_work_unregister_user - Unregister a user of the facility
+ *
+ * Unregister a user of the facility, killing all the threads if this was the
+ * last one.
+ */
+void slow_work_unregister_user(void)
+{
+	mutex_lock(&slow_work_user_lock);
+
+	BUG_ON(slow_work_user_count <= 0);
+
+	slow_work_user_count--;
+	if (slow_work_user_count == 0) {
+		printk(KERN_NOTICE "Slow work thread pool: Shutting down\n");
+		slow_work_threads_should_exit = true;
+		wake_up_all(&slow_work_thread_wq);
+		wait_for_completion(&slow_work_last_thread_exited);
+		printk(KERN_NOTICE "Slow work thread pool:"
+		       " Shut down complete\n");
+	}
+
+	mutex_unlock(&slow_work_user_lock);
+}
+EXPORT_SYMBOL(slow_work_unregister_user);
+
+/*
+ * Initialise the slow work facility
+ */
+static int __init init_slow_work(void)
+{
+	unsigned nr_cpus = num_possible_cpus();
+
+	if (nr_cpus > slow_work_max_threads)
+		slow_work_max_threads = nr_cpus;
+	return 0;
+}
+
+subsys_initcall(init_slow_work);
-- 
cgit v1.2.3


From 109d9272c423f46604d45fedfe87e21ee0b25180 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 3 Apr 2009 16:42:35 +0100
Subject: Make slow-work thread pool actually dynamic

Make the slow-work thread pool actually dynamic in the number of threads it
contains.  With this patch, it will both create additional threads when it has
extra work to do, and cull excess threads that aren't doing anything.

Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Acked-by: Steve Dickson <steved@redhat.com>
Acked-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Tested-by: Daire Byrne <Daire.Byrne@framestore.com>
---
 kernel/slow-work.c | 138 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 137 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index 5a7392734c82..454abb21c8bd 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -16,6 +16,14 @@
 #include <linux/wait.h>
 #include <asm/system.h>
 
+#define SLOW_WORK_CULL_TIMEOUT (5 * HZ)	/* cull threads 5s after running out of
+					 * things to do */
+#define SLOW_WORK_OOM_TIMEOUT (5 * HZ)	/* can't start new threads for 5s after
+					 * OOM */
+
+static void slow_work_cull_timeout(unsigned long);
+static void slow_work_oom_timeout(unsigned long);
+
 /*
  * The pool of threads has at least min threads in it as long as someone is
  * using the facility, and may have as many as max.
@@ -29,6 +37,12 @@ static unsigned vslow_work_proportion = 50; /* % of threads that may process
 static atomic_t slow_work_thread_count;
 static atomic_t vslow_work_executing_count;
 
+static bool slow_work_may_not_start_new_thread;
+static bool slow_work_cull; /* cull a thread due to lack of activity */
+static DEFINE_TIMER(slow_work_cull_timer, slow_work_cull_timeout, 0, 0);
+static DEFINE_TIMER(slow_work_oom_timer, slow_work_oom_timeout, 0, 0);
+static struct slow_work slow_work_new_thread; /* new thread starter */
+
 /*
  * The queues of work items and the lock governing access to them.  These are
  * shared between all the CPUs.  It doesn't make sense to have per-CPU queues
@@ -89,6 +103,14 @@ static bool slow_work_execute(void)
 
 	vsmax = slow_work_calc_vsmax();
 
+	/* see if we can schedule a new thread to be started if we're not
+	 * keeping up with the work */
+	if (!waitqueue_active(&slow_work_thread_wq) &&
+	    (!list_empty(&slow_work_queue) || !list_empty(&vslow_work_queue)) &&
+	    atomic_read(&slow_work_thread_count) < slow_work_max_threads &&
+	    !slow_work_may_not_start_new_thread)
+		slow_work_enqueue(&slow_work_new_thread);
+
 	/* find something to execute */
 	spin_lock_irq(&slow_work_queue_lock);
 	if (!list_empty(&vslow_work_queue) &&
@@ -242,6 +264,33 @@ cant_get_ref:
 }
 EXPORT_SYMBOL(slow_work_enqueue);
 
+/*
+ * Worker thread culling algorithm
+ */
+static bool slow_work_cull_thread(void)
+{
+	unsigned long flags;
+	bool do_cull = false;
+
+	spin_lock_irqsave(&slow_work_queue_lock, flags);
+
+	if (slow_work_cull) {
+		slow_work_cull = false;
+
+		if (list_empty(&slow_work_queue) &&
+		    list_empty(&vslow_work_queue) &&
+		    atomic_read(&slow_work_thread_count) >
+		    slow_work_min_threads) {
+			mod_timer(&slow_work_cull_timer,
+				  jiffies + SLOW_WORK_CULL_TIMEOUT);
+			do_cull = true;
+		}
+	}
+
+	spin_unlock_irqrestore(&slow_work_queue_lock, flags);
+	return do_cull;
+}
+
 /*
  * Determine if there is slow work available for dispatch
  */
@@ -273,7 +322,8 @@ static int slow_work_thread(void *_data)
 				TASK_INTERRUPTIBLE);
 		if (!freezing(current) &&
 		    !slow_work_threads_should_exit &&
-		    !slow_work_available(vsmax))
+		    !slow_work_available(vsmax) &&
+		    !slow_work_cull)
 			schedule();
 		finish_wait(&slow_work_thread_wq, &wait);
 
@@ -285,11 +335,20 @@ static int slow_work_thread(void *_data)
 
 		if (slow_work_available(vsmax) && slow_work_execute()) {
 			cond_resched();
+			if (list_empty(&slow_work_queue) &&
+			    list_empty(&vslow_work_queue) &&
+			    atomic_read(&slow_work_thread_count) >
+			    slow_work_min_threads)
+				mod_timer(&slow_work_cull_timer,
+					  jiffies + SLOW_WORK_CULL_TIMEOUT);
 			continue;
 		}
 
 		if (slow_work_threads_should_exit)
 			break;
+
+		if (slow_work_cull && slow_work_cull_thread())
+			break;
 	}
 
 	if (atomic_dec_and_test(&slow_work_thread_count))
@@ -297,6 +356,77 @@ static int slow_work_thread(void *_data)
 	return 0;
 }
 
+/*
+ * Handle thread cull timer expiration
+ */
+static void slow_work_cull_timeout(unsigned long data)
+{
+	slow_work_cull = true;
+	wake_up(&slow_work_thread_wq);
+}
+
+/*
+ * Get a reference on slow work thread starter
+ */
+static int slow_work_new_thread_get_ref(struct slow_work *work)
+{
+	return 0;
+}
+
+/*
+ * Drop a reference on slow work thread starter
+ */
+static void slow_work_new_thread_put_ref(struct slow_work *work)
+{
+}
+
+/*
+ * Start a new slow work thread
+ */
+static void slow_work_new_thread_execute(struct slow_work *work)
+{
+	struct task_struct *p;
+
+	if (slow_work_threads_should_exit)
+		return;
+
+	if (atomic_read(&slow_work_thread_count) >= slow_work_max_threads)
+		return;
+
+	if (!mutex_trylock(&slow_work_user_lock))
+		return;
+
+	slow_work_may_not_start_new_thread = true;
+	atomic_inc(&slow_work_thread_count);
+	p = kthread_run(slow_work_thread, NULL, "kslowd");
+	if (IS_ERR(p)) {
+		printk(KERN_DEBUG "Slow work thread pool: OOM\n");
+		if (atomic_dec_and_test(&slow_work_thread_count))
+			BUG(); /* we're running on a slow work thread... */
+		mod_timer(&slow_work_oom_timer,
+			  jiffies + SLOW_WORK_OOM_TIMEOUT);
+	} else {
+		/* ratelimit the starting of new threads */
+		mod_timer(&slow_work_oom_timer, jiffies + 1);
+	}
+
+	mutex_unlock(&slow_work_user_lock);
+}
+
+static const struct slow_work_ops slow_work_new_thread_ops = {
+	.get_ref	= slow_work_new_thread_get_ref,
+	.put_ref	= slow_work_new_thread_put_ref,
+	.execute	= slow_work_new_thread_execute,
+};
+
+/*
+ * post-OOM new thread start suppression expiration
+ */
+static void slow_work_oom_timeout(unsigned long data)
+{
+	slow_work_may_not_start_new_thread = false;
+}
+
 /**
  * slow_work_register_user - Register a user of the facility
  *
@@ -316,6 +446,10 @@ int slow_work_register_user(void)
 		init_completion(&slow_work_last_thread_exited);
 
 		slow_work_threads_should_exit = false;
+		slow_work_init(&slow_work_new_thread,
+			       &slow_work_new_thread_ops);
+		slow_work_may_not_start_new_thread = false;
+		slow_work_cull = false;
 
 		/* start the minimum number of threads */
 		for (loop = 0; loop < slow_work_min_threads; loop++) {
@@ -369,6 +503,8 @@ void slow_work_unregister_user(void)
 		       " Shut down complete\n");
 	}
 
+	del_timer_sync(&slow_work_cull_timer);
+
 	mutex_unlock(&slow_work_user_lock);
 }
 EXPORT_SYMBOL(slow_work_unregister_user);
-- 
cgit v1.2.3


From 12e22c5e4bc08ab4b05ac079fe40d9891c5e81a0 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 3 Apr 2009 16:42:35 +0100
Subject: Make the slow work pool configurable

Make the slow work pool configurable through /proc/sys/kernel/slow-work.

 (*) /proc/sys/kernel/slow-work/min-threads

     The minimum number of threads that should be in the pool as long as it is
     in use.  This may be anywhere between 2 and max-threads.

 (*) /proc/sys/kernel/slow-work/max-threads

     The maximum number of threads that should in the pool.  This may be
     anywhere between min-threads and 255 or NR_CPUS * 2, whichever is greater.

 (*) /proc/sys/kernel/slow-work/vslow-percentage

     The percentage of active threads in the pool that may be used to execute
     very slow work items.  This may be between 1 and 99.  The resultant number
     is bounded to between 1 and one fewer than the number of active threads.
     This ensures there is always at least one thread that can process very
     slow work items, and always at least one thread that won't.

Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Acked-by: Steve Dickson <steved@redhat.com>
Acked-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Tested-by: Daire Byrne <Daire.Byrne@framestore.com>
---
 include/linux/slow-work.h |   5 ++
 kernel/slow-work.c        | 118 +++++++++++++++++++++++++++++++++++++++++++++-
 kernel/sysctl.c           |   9 ++++
 3 files changed, 130 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/include/linux/slow-work.h b/include/linux/slow-work.h
index 4dd754af393e..8262809dfa8b 100644
--- a/include/linux/slow-work.h
+++ b/include/linux/slow-work.h
@@ -14,6 +14,8 @@
 
 #ifdef CONFIG_SLOW_WORK
 
+#include <linux/sysctl.h>
+
 struct slow_work;
 
 /*
@@ -83,6 +85,9 @@ extern int slow_work_enqueue(struct slow_work *work);
 extern int slow_work_register_user(void);
 extern void slow_work_unregister_user(void);
 
+#ifdef CONFIG_SYSCTL
+extern ctl_table slow_work_sysctls[];
+#endif
 
 #endif /* CONFIG_SLOW_WORK */
 #endif /* _LINUX_SLOW_WORK_H */
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index 454abb21c8bd..3f65900aa3cb 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -14,7 +14,6 @@
 #include <linux/kthread.h>
 #include <linux/freezer.h>
 #include <linux/wait.h>
-#include <asm/system.h>
 
 #define SLOW_WORK_CULL_TIMEOUT (5 * HZ)	/* cull threads 5s after running out of
 					 * things to do */
@@ -24,6 +23,14 @@
 static void slow_work_cull_timeout(unsigned long);
 static void slow_work_oom_timeout(unsigned long);
 
+#ifdef CONFIG_SYSCTL
+static int slow_work_min_threads_sysctl(struct ctl_table *, int, struct file *,
+					void __user *, size_t *, loff_t *);
+
+static int slow_work_max_threads_sysctl(struct ctl_table *, int , struct file *,
+					void __user *, size_t *, loff_t *);
+#endif
+
 /*
  * The pool of threads has at least min threads in it as long as someone is
  * using the facility, and may have as many as max.
@@ -34,6 +41,51 @@ static unsigned slow_work_min_threads = 2;
 static unsigned slow_work_max_threads = 4;
 static unsigned vslow_work_proportion = 50; /* % of threads that may process
 					     * very slow work */
+
+#ifdef CONFIG_SYSCTL
+static const int slow_work_min_min_threads = 2;
+static int slow_work_max_max_threads = 255;
+static const int slow_work_min_vslow = 1;
+static const int slow_work_max_vslow = 99;
+
+ctl_table slow_work_sysctls[] = {
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "min-threads",
+		.data		= &slow_work_min_threads,
+		.maxlen		= sizeof(unsigned),
+		.mode		= 0644,
+		.proc_handler	= slow_work_min_threads_sysctl,
+		.extra1		= (void *) &slow_work_min_min_threads,
+		.extra2		= &slow_work_max_threads,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "max-threads",
+		.data		= &slow_work_max_threads,
+		.maxlen		= sizeof(unsigned),
+		.mode		= 0644,
+		.proc_handler	= slow_work_max_threads_sysctl,
+		.extra1		= &slow_work_min_threads,
+		.extra2		= (void *) &slow_work_max_max_threads,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "vslow-percentage",
+		.data		= &vslow_work_proportion,
+		.maxlen		= sizeof(unsigned),
+		.mode		= 0644,
+		.proc_handler	= &proc_dointvec_minmax,
+		.extra1		= (void *) &slow_work_min_vslow,
+		.extra2		= (void *) &slow_work_max_vslow,
+	},
+	{ .ctl_name = 0 }
+};
+#endif
+
+/*
+ * The active state of the thread pool
+ */
 static atomic_t slow_work_thread_count;
 static atomic_t vslow_work_executing_count;
 
@@ -427,6 +479,64 @@ static void slow_work_oom_timeout(unsigned long data)
 	slow_work_may_not_start_new_thread = false;
 }
 
+#ifdef CONFIG_SYSCTL
+/*
+ * Handle adjustment of the minimum number of threads
+ */
+static int slow_work_min_threads_sysctl(struct ctl_table *table, int write,
+					struct file *filp, void __user *buffer,
+					size_t *lenp, loff_t *ppos)
+{
+	int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+	int n;
+
+	if (ret == 0) {
+		mutex_lock(&slow_work_user_lock);
+		if (slow_work_user_count > 0) {
+			/* see if we need to start or stop threads */
+			n = atomic_read(&slow_work_thread_count) -
+				slow_work_min_threads;
+
+			if (n < 0 && !slow_work_may_not_start_new_thread)
+				slow_work_enqueue(&slow_work_new_thread);
+			else if (n > 0)
+				mod_timer(&slow_work_cull_timer,
+					  jiffies + SLOW_WORK_CULL_TIMEOUT);
+		}
+		mutex_unlock(&slow_work_user_lock);
+	}
+
+	return ret;
+}
+
+/*
+ * Handle adjustment of the maximum number of threads
+ */
+static int slow_work_max_threads_sysctl(struct ctl_table *table, int write,
+					struct file *filp, void __user *buffer,
+					size_t *lenp, loff_t *ppos)
+{
+	int ret = proc_dointvec_minmax(table, write, filp, buffer, lenp, ppos);
+	int n;
+
+	if (ret == 0) {
+		mutex_lock(&slow_work_user_lock);
+		if (slow_work_user_count > 0) {
+			/* see if we need to stop threads */
+			n = slow_work_max_threads -
+				atomic_read(&slow_work_thread_count);
+
+			if (n < 0)
+				mod_timer(&slow_work_cull_timer,
+					  jiffies + SLOW_WORK_CULL_TIMEOUT);
+		}
+		mutex_unlock(&slow_work_user_lock);
+	}
+
+	return ret;
+}
+#endif /* CONFIG_SYSCTL */
+
 /**
  * slow_work_register_user - Register a user of the facility
  *
@@ -516,8 +626,12 @@ static int __init init_slow_work(void)
 {
 	unsigned nr_cpus = num_possible_cpus();
 
-	if (nr_cpus > slow_work_max_threads)
+	if (slow_work_max_threads < nr_cpus)
 		slow_work_max_threads = nr_cpus;
+#ifdef CONFIG_SYSCTL
+	if (slow_work_max_max_threads < nr_cpus * 2)
+		slow_work_max_max_threads = nr_cpus * 2;
+#endif
 	return 0;
 }
 
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 5ec4543dfc06..82350f8f04f6 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -48,6 +48,7 @@
 #include <linux/acpi.h>
 #include <linux/reboot.h>
 #include <linux/ftrace.h>
+#include <linux/slow-work.h>
 
 #include <asm/uaccess.h>
 #include <asm/processor.h>
@@ -897,6 +898,14 @@ static struct ctl_table kern_table[] = {
 		.proc_handler	= &scan_unevictable_handler,
 	},
 #endif
+#ifdef CONFIG_SLOW_WORK
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "slow-work",
+		.mode		= 0555,
+		.child		= slow_work_sysctls,
+	},
+#endif
 /*
  * NOTE: do not add new entries to this table unless you have read
  * Documentation/sysctl/ctl_unnumbered.txt
-- 
cgit v1.2.3


From 8f0aa2f25b31ba27db84259141e52ee6ec0d2820 Mon Sep 17 00:00:00 2001
From: David Howells <dhowells@redhat.com>
Date: Fri, 3 Apr 2009 16:42:35 +0100
Subject: Document the slow work thread pool

Document the slow work thread pool.

Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: Steve Dickson <steved@redhat.com>
Acked-by: Trond Myklebust <Trond.Myklebust@netapp.com>
Acked-by: Al Viro <viro@zeniv.linux.org.uk>
Tested-by: Daire Byrne <Daire.Byrne@framestore.com>
---
 Documentation/slow-work.txt | 174 ++++++++++++++++++++++++++++++++++++++++++++
 include/linux/slow-work.h   |   2 +
 kernel/slow-work.c          |   2 +
 3 files changed, 178 insertions(+)
 create mode 100644 Documentation/slow-work.txt

(limited to 'kernel')

diff --git a/Documentation/slow-work.txt b/Documentation/slow-work.txt
new file mode 100644
index 000000000000..ebc50f808ea4
--- /dev/null
+++ b/Documentation/slow-work.txt
@@ -0,0 +1,174 @@
+		     ====================================
+		     SLOW WORK ITEM EXECUTION THREAD POOL
+		     ====================================
+
+By: David Howells <dhowells@redhat.com>
+
+The slow work item execution thread pool is a pool of threads for performing
+things that take a relatively long time, such as making mkdir calls.
+Typically, when processing something, these items will spend a lot of time
+blocking a thread on I/O, thus making that thread unavailable for doing other
+work.
+
+The standard workqueue model is unsuitable for this class of work item as that
+limits the owner to a single thread or a single thread per CPU.  For some
+tasks, however, more threads - or fewer - are required.
+
+There is just one pool per system.  It contains no threads unless something
+wants to use it - and that something must register its interest first.  When
+the pool is active, the number of threads it contains is dynamic, varying
+between a maximum and minimum setting, depending on the load.
+
+
+====================
+CLASSES OF WORK ITEM
+====================
+
+This pool support two classes of work items:
+
+ (*) Slow work items.
+
+ (*) Very slow work items.
+
+The former are expected to finish much quicker than the latter.
+
+An operation of the very slow class may do a batch combination of several
+lookups, mkdirs, and a create for instance.
+
+An operation of the ordinarily slow class may, for example, write stuff or
+expand files, provided the time taken to do so isn't too long.
+
+Operations of both types may sleep during execution, thus tying up the thread
+loaned to it.
+
+
+THREAD-TO-CLASS ALLOCATION
+--------------------------
+
+Not all the threads in the pool are available to work on very slow work items.
+The number will be between one and one fewer than the number of active threads.
+This is configurable (see the "Pool Configuration" section).
+
+All the threads are available to work on ordinarily slow work items, but a
+percentage of the threads will prefer to work on very slow work items.
+
+The configuration ensures that at least one thread will be available to work on
+very slow work items, and at least one thread will be available that won't work
+on very slow work items at all.
+
+
+=====================
+USING SLOW WORK ITEMS
+=====================
+
+Firstly, a module or subsystem wanting to make use of slow work items must
+register its interest:
+
+	 int ret = slow_work_register_user();
+
+This will return 0 if successful, or a -ve error upon failure.
+
+
+Slow work items may then be set up by:
+
+ (1) Declaring a slow_work struct type variable:
+
+	#include <linux/slow-work.h>
+
+	struct slow_work myitem;
+
+ (2) Declaring the operations to be used for this item:
+
+	struct slow_work_ops myitem_ops = {
+		.get_ref = myitem_get_ref,
+		.put_ref = myitem_put_ref,
+		.execute = myitem_execute,
+	};
+
+     [*] For a description of the ops, see section "Item Operations".
+
+ (3) Initialising the item:
+
+	slow_work_init(&myitem, &myitem_ops);
+
+     or:
+
+	vslow_work_init(&myitem, &myitem_ops);
+
+     depending on its class.
+
+A suitably set up work item can then be enqueued for processing:
+
+	int ret = slow_work_enqueue(&myitem);
+
+This will return a -ve error if the thread pool is unable to gain a reference
+on the item, 0 otherwise.
+
+
+The items are reference counted, so there ought to be no need for a flush
+operation.  When all a module's slow work items have been processed, and the
+module has no further interest in the facility, it should unregister its
+interest:
+
+	slow_work_unregister_user();
+
+
+===============
+ITEM OPERATIONS
+===============
+
+Each work item requires a table of operations of type struct slow_work_ops.
+All members are required:
+
+ (*) Get a reference on an item:
+
+	int (*get_ref)(struct slow_work *work);
+
+     This allows the thread pool to attempt to pin an item by getting a
+     reference on it.  This function should return 0 if the reference was
+     granted, or a -ve error otherwise.  If an error is returned,
+     slow_work_enqueue() will fail.
+
+     The reference is held whilst the item is queued and whilst it is being
+     executed.  The item may then be requeued with the same reference held, or
+     the reference will be released.
+
+ (*) Release a reference on an item:
+
+	void (*put_ref)(struct slow_work *work);
+
+     This allows the thread pool to unpin an item by releasing the reference on
+     it.  The thread pool will not touch the item again once this has been
+     called.
+
+ (*) Execute an item:
+
+	void (*execute)(struct slow_work *work);
+
+     This should perform the work required of the item.  It may sleep, it may
+     perform disk I/O and it may wait for locks.
+
+
+==================
+POOL CONFIGURATION
+==================
+
+The slow-work thread pool has a number of configurables:
+
+ (*) /proc/sys/kernel/slow-work/min-threads
+
+     The minimum number of threads that should be in the pool whilst it is in
+     use.  This may be anywhere between 2 and max-threads.
+
+ (*) /proc/sys/kernel/slow-work/max-threads
+
+     The maximum number of threads that should in the pool.  This may be
+     anywhere between min-threads and 255 or NR_CPUS * 2, whichever is greater.
+
+ (*) /proc/sys/kernel/slow-work/vslow-percentage
+
+     The percentage of active threads in the pool that may be used to execute
+     very slow work items.  This may be between 1 and 99.  The resultant number
+     is bounded to between 1 and one fewer than the number of active threads.
+     This ensures there is always at least one thread that can process very
+     slow work items, and always at least one thread that won't.
diff --git a/include/linux/slow-work.h b/include/linux/slow-work.h
index 8262809dfa8b..85958277f83d 100644
--- a/include/linux/slow-work.h
+++ b/include/linux/slow-work.h
@@ -7,6 +7,8 @@
  * modify it under the terms of the GNU General Public Licence
  * as published by the Free Software Foundation; either version
  * 2 of the Licence, or (at your option) any later version.
+ *
+ * See Documentation/slow-work.txt
  */
 
 #ifndef _LINUX_SLOW_WORK_H
diff --git a/kernel/slow-work.c b/kernel/slow-work.c
index 3f65900aa3cb..cf2bc01186ef 100644
--- a/kernel/slow-work.c
+++ b/kernel/slow-work.c
@@ -7,6 +7,8 @@
  * modify it under the terms of the GNU General Public Licence
  * as published by the Free Software Foundation; either version
  * 2 of the Licence, or (at your option) any later version.
+ *
+ * See Documentation/slow-work.txt
  */
 
 #include <linux/module.h>
-- 
cgit v1.2.3


From ca96a895a6bae7efe7b11a35d9f43e6228467562 Mon Sep 17 00:00:00 2001
From: Jiri Pirko <jpirko@redhat.com>
Date: Fri, 9 Jan 2009 16:44:16 +0100
Subject: audit: EXECVE record - removed bogus newline

(updated)
Added hunk that changes the comment, the rest is the same.

EXECVE records contain a newline after every argument. auditd converts
"\n" to " " so you cannot see newlines even in raw logs, but they're
there nevertheless. If you're not using auditd, you need to work round
them. These '\n' chars are can be easily replaced by spaces when
creating record in kernel. Note there is no need for trailing '\n' in
an audit record.

record before this patch:
"type=EXECVE msg=audit(1231421801.566:31): argc=4 a0=\"./test\"\na1=\"a\"\na2=\"b\"\na3=\"c\"\n"

record after this patch:
"type=EXECVE msg=audit(1231421801.566:31): argc=4 a0=\"./test\" a1=\"a\" a2=\"b\" a3=\"c\""

Signed-off-by: Jiri Pirko <jpirko@redhat.com>
Acked-by: Eric Paris <eparis@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/auditsc.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 2bfc64786765..738c03695b79 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1024,7 +1024,7 @@ static int audit_log_single_execve_arg(struct audit_context *context,
 {
 	char arg_num_len_buf[12];
 	const char __user *tmp_p = p;
-	/* how many digits are in arg_num? 3 is the length of a=\n */
+	/* how many digits are in arg_num? 3 is the length of " a=" */
 	size_t arg_num_len = snprintf(arg_num_len_buf, 12, "%d", arg_num) + 3;
 	size_t len, len_left, to_send;
 	size_t max_execve_audit_len = MAX_EXECVE_AUDIT_LEN;
@@ -1110,7 +1110,7 @@ static int audit_log_single_execve_arg(struct audit_context *context,
 		 * so we can be sure nothing was lost.
 		 */
 		if ((i == 0) && (too_long))
-			audit_log_format(*ab, "a%d_len=%zu ", arg_num,
+			audit_log_format(*ab, " a%d_len=%zu", arg_num,
 					 has_cntl ? 2*len : len);
 
 		/*
@@ -1130,7 +1130,7 @@ static int audit_log_single_execve_arg(struct audit_context *context,
 		buf[to_send] = '\0';
 
 		/* actually log it */
-		audit_log_format(*ab, "a%d", arg_num);
+		audit_log_format(*ab, " a%d", arg_num);
 		if (too_long)
 			audit_log_format(*ab, "[%d]", i);
 		audit_log_format(*ab, "=");
@@ -1138,7 +1138,6 @@ static int audit_log_single_execve_arg(struct audit_context *context,
 			audit_log_n_hex(*ab, buf, to_send);
 		else
 			audit_log_format(*ab, "\"%s\"", buf);
-		audit_log_format(*ab, "\n");
 
 		p += to_send;
 		len_left -= to_send;
@@ -1166,7 +1165,7 @@ static void audit_log_execve_info(struct audit_context *context,
 
 	p = (const char __user *)axi->mm->arg_start;
 
-	audit_log_format(*ab, "argc=%d ", axi->argc);
+	audit_log_format(*ab, "argc=%d", axi->argc);
 
 	/*
 	 * we need some kernel buffer to hold the userspace args.  Just
-- 
cgit v1.2.3


From 6b96255998053a89f45c0855de954b71f5c3887b Mon Sep 17 00:00:00 2001
From: Randy Dunlap <randy.dunlap@oracle.com>
Date: Mon, 5 Jan 2009 13:41:13 -0800
Subject: auditsc: fix kernel-doc notation

Fix auditsc kernel-doc notation:

Warning(linux-2.6.28-git7//kernel/auditsc.c:2156): No description found for parameter 'attr'
Warning(linux-2.6.28-git7//kernel/auditsc.c:2156): Excess function parameter 'u_attr' description in '__audit_mq_open'
Warning(linux-2.6.28-git7//kernel/auditsc.c:2204): No description found for parameter 'notification'
Warning(linux-2.6.28-git7//kernel/auditsc.c:2204): Excess function parameter 'u_notification' description in '__audit_mq_notify'

Signed-off-by: Randy Dunlap <randy.dunlap@oracle.com>
cc:	Al Viro <viro@zeniv.linux.org.uk>
cc:	Eric Paris <eparis@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/auditsc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 738c03695b79..b344b86557a2 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2149,7 +2149,7 @@ int audit_set_loginuid(struct task_struct *task, uid_t loginuid)
  * __audit_mq_open - record audit data for a POSIX MQ open
  * @oflag: open flag
  * @mode: mode bits
- * @u_attr: queue attributes
+ * @attr: queue attributes
  *
  */
 void __audit_mq_open(int oflag, mode_t mode, struct mq_attr *attr)
@@ -2196,7 +2196,7 @@ void __audit_mq_sendrecv(mqd_t mqdes, size_t msg_len, unsigned int msg_prio,
 /**
  * __audit_mq_notify - record audit data for a POSIX MQ notify
  * @mqdes: MQ descriptor
- * @u_notification: Notification event
+ * @notification: Notification event
  *
  */
 
-- 
cgit v1.2.3


From c28bb7da74ab74a2860d652493aaff7de104d79e Mon Sep 17 00:00:00 2001
From: Zhenwen Xu <helight.xu@gmail.com>
Date: Thu, 12 Mar 2009 22:16:12 +0800
Subject: make the e->rule.xxx shorter in kernel auditfilter.c

make the e->rule.xxx shorter in kernel/auditfilter.c
--
---------------------------------
Zhenwen Xu - Open and Free
Home Page:	http://zhwen.org
My Studio:	http://dim4.cn

>From 99692dc640b278f1cb1a15646ce42f22e89c0f77 Mon Sep 17 00:00:00 2001
From: Zhenwen Xu <Helight.Xu@gmail.com>
Date: Thu, 12 Mar 2009 22:04:59 +0800
Subject: [PATCH] make the e->rule.xxx shorter in kernel/auditfilter.c

Signed-off-by: Zhenwen Xu <Helight.Xu@gmail.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/auditfilter.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/auditfilter.c b/kernel/auditfilter.c
index fbf24d121d97..a6fe71fd5d1b 100644
--- a/kernel/auditfilter.c
+++ b/kernel/auditfilter.c
@@ -135,18 +135,18 @@ static void audit_remove_watch(struct audit_watch *watch)
 static inline void audit_free_rule(struct audit_entry *e)
 {
 	int i;
-
+	struct audit_krule *erule = &e->rule;
 	/* some rules don't have associated watches */
-	if (e->rule.watch)
-		audit_put_watch(e->rule.watch);
-	if (e->rule.fields)
-		for (i = 0; i < e->rule.field_count; i++) {
-			struct audit_field *f = &e->rule.fields[i];
+	if (erule->watch)
+		audit_put_watch(erule->watch);
+	if (erule->fields)
+		for (i = 0; i < erule->field_count; i++) {
+			struct audit_field *f = &erule->fields[i];
 			kfree(f->lsm_str);
 			security_audit_rule_free(f->lsm_rule);
 		}
-	kfree(e->rule.fields);
-	kfree(e->rule.filterkey);
+	kfree(erule->fields);
+	kfree(erule->filterkey);
 	kfree(e);
 }
 
-- 
cgit v1.2.3


From b3897f567100d18e0597f638b911d23aa5e0dd23 Mon Sep 17 00:00:00 2001
From: Miloslav Trmac <mitr@redhat.com>
Date: Thu, 19 Mar 2009 09:48:27 -0400
Subject: Audit: fix handling of 'strings' with NULL characters

currently audit_log_n_untrustedstring() uses audit_string_contains_control()
to check if the 'string' has any control characters.  If the 'string' has an
embedded NULL audit_string_contains_control() will return that the data has
no control characters and will then pass the string to audit_log_n_string
with the total length, not the length up to the first NULL.
audit_log_n_string() does a memcpy of the entire length and so the actual
audit record emitted may then contain a NULL and then whatever random memory
is after the NULL.

Since we want to log the entire octet stream (if we can't trust the data
to be a string we can't trust that a NULL isn't actually a part of it)
we should just consider NULL as a control character.  If the caller is
certain they want to stop at the first NULL they should be using
audit_log_untrustedstring.

Signed-off-by: Eric Paris <eparis@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/audit.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index ce6d8ea3131e..fa3805516dff 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1382,7 +1382,7 @@ void audit_log_n_string(struct audit_buffer *ab, const char *string,
 int audit_string_contains_control(const char *string, size_t len)
 {
 	const unsigned char *p;
-	for (p = string; p < (const unsigned char *)string + len && *p; p++) {
+	for (p = string; p < (const unsigned char *)string + len; p++) {
 		if (*p == '"' || *p < 0x21 || *p > 0x7e)
 			return 1;
 	}
-- 
cgit v1.2.3


From 55ad2f8d340678397de5916b9cd960f17ebd7150 Mon Sep 17 00:00:00 2001
From: Miloslav Trmac <mitr@redhat.com>
Date: Thu, 19 Mar 2009 09:52:47 -0400
Subject: audit: ignore terminating NUL in AUDIT_USER_TTY messages

AUDIT_USER_TTY, like all other messages sent from user-space, is sent
NUL-terminated.  Unlike other user-space audit messages, which come only
from trusted sources, AUDIT_USER_TTY messages are processed using
audit_log_n_untrustedstring().

This patch modifies AUDIT_USER_TTY handling to ignore the trailing NUL
and use the "quoted_string" representation of the message if possible.

Signed-off-by: Miloslav Trmac <mitr@redhat.com>
Cc: Eric Paris <eparis@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Steve Grubb <sgrubb@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/audit.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index fa3805516dff..5560390cb0f5 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -766,6 +766,9 @@ static int audit_receive_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
 
 				audit_log_format(ab, " msg=");
 				size = nlmsg_len(nlh);
+				if (size > 0 &&
+				    ((unsigned char *)data)[size - 1] == '\0')
+					size--;
 				audit_log_n_untrustedstring(ab, data, size);
 			}
 			audit_set_pid(ab, pid);
-- 
cgit v1.2.3


From 6d208da89aabee8502debe842832ca0ab298d16d Mon Sep 17 00:00:00 2001
From: Paul Moore <paul.moore@hp.com>
Date: Wed, 1 Apr 2009 15:47:27 -0400
Subject: audit: Fix possible return value truncation in audit_get_context()

The audit subsystem treats syscall return codes as type long, unfortunately
the audit_get_context() function mistakenly converts the return code to an
int type in the parameters which could cause problems on systems where the
sizeof(int) != sizeof(long).

Signed-off-by: Paul Moore <paul.moore@hp.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/auditsc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index b344b86557a2..e821d626dfe6 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -752,7 +752,7 @@ static void audit_set_auditable(struct audit_context *ctx)
 
 static inline struct audit_context *audit_get_context(struct task_struct *tsk,
 						      int return_valid,
-						      int return_code)
+						      long return_code)
 {
 	struct audit_context *context = tsk->audit_context;
 
-- 
cgit v1.2.3


From 318b6d3d7ddbcad3d6867e630711b8a705d873d7 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 13 Jan 2009 17:32:40 -0500
Subject: audit: incorrect ref counting in audit tree tag_chunk

tag_chunk has bad exit paths in which the inotify ref counting is wrong.
At the top of the function we found &old_watch using  inotify_find_watch().
inotify_find_watch takes a reference to the watch.  This is never dropped
on an error path.

Signed-off-by: Eric Paris <eparis@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/audit_tree.c | 2 ++
 1 file changed, 2 insertions(+)

(limited to 'kernel')

diff --git a/kernel/audit_tree.c b/kernel/audit_tree.c
index 8ad9545b8db9..917ab9525568 100644
--- a/kernel/audit_tree.c
+++ b/kernel/audit_tree.c
@@ -385,6 +385,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 	mutex_lock(&inode->inotify_mutex);
 	if (inotify_clone_watch(&old->watch, &chunk->watch) < 0) {
 		mutex_unlock(&inode->inotify_mutex);
+		put_inotify_watch(&old->watch);
 		free_chunk(chunk);
 		return -ENOSPC;
 	}
@@ -394,6 +395,7 @@ static int tag_chunk(struct inode *inode, struct audit_tree *tree)
 		chunk->dead = 1;
 		inotify_evict_watch(&chunk->watch);
 		mutex_unlock(&inode->inotify_mutex);
+		put_inotify_watch(&old->watch);
 		put_inotify_watch(&chunk->watch);
 		return 0;
 	}
-- 
cgit v1.2.3


From 679173b724631f49e537a15fa48ea2000bdc1808 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Mon, 26 Jan 2009 18:09:45 -0500
Subject: audit: audit_set_auditable defined but not used
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

after 0590b9335a1c72a3f0defcc6231287f7817e07c8 audit_set_auditable() is now only
used by the audit tree code.  If CONFIG_AUDIT_TREE is unset it will be defined
but unused.  This patch simply moves the function inside a CONFIG_AUDIT_TREE
block.

cc1: warnings being treated as errors
/home/acme_unencrypted/git/linux-2.6-tip/kernel/auditsc.c:745: error: ‘audit_set_auditable’ defined but not used
make[2]: *** [kernel/auditsc.o] Error 1
make[1]: *** [kernel] Error 2
make[1]: *** Waiting for unfinished jobs....

Signed-off-by: Eric Paris <eparis@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/auditsc.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

(limited to 'kernel')

diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index e821d626dfe6..aa0428e08367 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -329,6 +329,14 @@ static int audit_match_filetype(struct audit_context *ctx, int which)
  */
 
 #ifdef CONFIG_AUDIT_TREE
+static void audit_set_auditable(struct audit_context *ctx)
+{
+	if (!ctx->prio) {
+		ctx->prio = 1;
+		ctx->current_state = AUDIT_RECORD_CONTEXT;
+	}
+}
+
 static int put_tree_ref(struct audit_context *ctx, struct audit_chunk *chunk)
 {
 	struct audit_tree_refs *p = ctx->trees;
@@ -742,14 +750,6 @@ void audit_filter_inodes(struct task_struct *tsk, struct audit_context *ctx)
 	rcu_read_unlock();
 }
 
-static void audit_set_auditable(struct audit_context *ctx)
-{
-	if (!ctx->prio) {
-		ctx->prio = 1;
-		ctx->current_state = AUDIT_RECORD_CONTEXT;
-	}
-}
-
 static inline struct audit_context *audit_get_context(struct task_struct *tsk,
 						      int return_valid,
 						      long return_code)
-- 
cgit v1.2.3


From def57543418a5f47debae28a0a9dea2effc11692 Mon Sep 17 00:00:00 2001
From: Eric Paris <eparis@redhat.com>
Date: Tue, 10 Mar 2009 18:00:14 -0400
Subject: Audit: remove spaces from audit_log_d_path

audit_log_d_path had spaces in the strings which would be emitted on the
error paths.  This patch simply replaces those spaces with an _ or removes
the needless spaces entirely.

Signed-off-by: Eric Paris <eparis@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 kernel/audit.c   | 4 ++--
 kernel/auditsc.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/audit.c b/kernel/audit.c
index 5560390cb0f5..9442c3533ba9 100644
--- a/kernel/audit.c
+++ b/kernel/audit.c
@@ -1440,13 +1440,13 @@ void audit_log_d_path(struct audit_buffer *ab, const char *prefix,
 	/* We will allow 11 spaces for ' (deleted)' to be appended */
 	pathname = kmalloc(PATH_MAX+11, ab->gfp_mask);
 	if (!pathname) {
-		audit_log_format(ab, "<no memory>");
+		audit_log_string(ab, "<no_memory>");
 		return;
 	}
 	p = d_path(path, pathname, PATH_MAX+11);
 	if (IS_ERR(p)) { /* Should never happen since we send PATH_MAX */
 		/* FIXME: can we save some information here? */
-		audit_log_format(ab, "<too long>");
+		audit_log_string(ab, "<too_long>");
 	} else
 		audit_log_untrustedstring(ab, p);
 	kfree(pathname);
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index aa0428e08367..7d6ac7c1f414 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -1478,7 +1478,7 @@ static void audit_log_exit(struct audit_context *context, struct task_struct *ts
 			case 0:
 				/* name was specified as a relative path and the
 				 * directory component is the cwd */
-				audit_log_d_path(ab, " name=", &context->pwd);
+				audit_log_d_path(ab, "name=", &context->pwd);
 				break;
 			default:
 				/* log the name's directory component */
-- 
cgit v1.2.3


From cd5f9a4c3199b090e91ea0064cb110985ba54814 Mon Sep 17 00:00:00 2001
From: Linus Torvalds <torvalds@linux-foundation.org>
Date: Mon, 6 Apr 2009 13:38:46 -0700
Subject: kernel/sysctl.c: avoid annoying warnings

Some of the limit constants are used only depending on some complex
configuration dependencies, yet it's not worth making the simple
variables depend on those configuration details.  Just mark them as
perhaps not being unused, and avoid the warning.

Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/sysctl.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 82350f8f04f6..b125e3387568 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -97,8 +97,8 @@ static int neg_one = -1;
 #endif
 
 static int zero;
-static int one = 1;
-static int two = 2;
+static int __maybe_unused one = 1;
+static int __maybe_unused two = 2;
 static unsigned long one_ul = 1;
 static int one_hundred = 100;
 
-- 
cgit v1.2.3


From 432870dab85a2f69dc417022646cb9a70acf7f94 Mon Sep 17 00:00:00 2001
From: Oleg Nesterov <oleg@redhat.com>
Date: Mon, 6 Apr 2009 16:16:02 +0200
Subject: exit_notify: kill the wrong capable(CAP_KILL) check

The CAP_KILL check in exit_notify() looks just wrong, kill it.

Whatever logic we have to reset ->exit_signal, the malicious user
can bypass it if it execs the setuid application before exiting.

Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Acked-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/exit.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/exit.c b/kernel/exit.c
index 6686ed1e4aa3..32cbf2607cb0 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -837,8 +837,7 @@ static void exit_notify(struct task_struct *tsk, int group_dead)
 	 */
 	if (tsk->exit_signal != SIGCHLD && !task_detached(tsk) &&
 	    (tsk->parent_exec_id != tsk->real_parent->self_exec_id ||
-	     tsk->self_exec_id != tsk->parent_exec_id) &&
-	    !capable(CAP_KILL))
+	     tsk->self_exec_id != tsk->parent_exec_id))
 		tsk->exit_signal = SIGCHLD;
 
 	signal = tracehook_notify_death(tsk, &cookie, group_dead);
-- 
cgit v1.2.3


From 2e45e77787c9d0720b046eb69856edf43b17e33e Mon Sep 17 00:00:00 2001
From: Rusty Russell <rusty@rustcorp.com.au>
Date: Tue, 7 Apr 2009 17:12:43 +0930
Subject: Revert "module: remove the SHF_ALLOC flag on the __versions section."

This reverts commit 9cb610d8e35fe3ec95a2fe2030b02f85aeea83c1.

This was an impressively stupid patch.  Firstly, we reset the SHF_ALLOC
flag lower down in the same function, so the patch was useless.  Even
better, find_sec() ignores sections with SHF_ALLOC not set, so
it breaks CONFIG_MODVERSIONS=y with CONFIG_MODULE_FORCE_LOAD=n, which
refuses to load the module since it can't find the __versions section.

Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
---
 kernel/module.c | 3 ---
 1 file changed, 3 deletions(-)

(limited to 'kernel')

diff --git a/kernel/module.c b/kernel/module.c
index c268a771595c..05f014efa32c 100644
--- a/kernel/module.c
+++ b/kernel/module.c
@@ -1952,9 +1952,6 @@ static noinline struct module *load_module(void __user *umod,
 		if (strstarts(secstrings+sechdrs[i].sh_name, ".exit"))
 			sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC;
 #endif
-		/* Don't keep __versions around; it's just for loading. */
-		if (strcmp(secstrings + sechdrs[i].sh_name, "__versions") == 0)
-			sechdrs[i].sh_flags &= ~(unsigned long)SHF_ALLOC;
 	}
 
 	modindex = find_sec(hdr, sechdrs, secstrings,
-- 
cgit v1.2.3


From 301fd748e2c81e78e74edbc694a64caa7b95dda2 Mon Sep 17 00:00:00 2001
From: Steven Rostedt <srostedt@redhat.com>
Date: Fri, 3 Apr 2009 11:12:23 -0400
Subject: tracing: remove CALLER_ADDR2 from wakeup tracer

Maneesh Soni was getting a crash when running the wakeup tracer.
We debugged it down to the recording of the function with the
CALLER_ADDR2 macro.  This is used to get the location of the caller
to schedule.

But the problem comes when schedule is called by assmebly. In the case
that Maneesh had, retint_careful would call schedule. But retint_careful
does not set up a proper frame pointer. CALLER_ADDR2 is defined as
__builtin_return_address(2). This produces the following assembly in
the wakeup tracer code.

   mov    0x0(%rbp),%rcx  <--- get the frame pointer of the caller
   mov    %r14d,%r8d
   mov    0xf2de8e(%rip),%rdi

   mov    0x8(%rcx),%rsi  <-- this is __builtin_return_address(1)
   mov    0x28(%rdi,%rax,8),%rbx

   mov    (%rcx),%rax  <-- get the frame pointer of the caller's caller
   mov    %r12,%rcx
   mov    0x8(%rax),%rdx <-- this is __builtin_return_address(2)

At the reading of 0x8(%rax) Maneesh's machine would take a fault.
The reason is that retint_careful did not set up the return address
and the content of %rax here was zero.

To verify this, I sent Maneesh a patch to create a frame pointer
in retint_careful. He ran the test again but this time he would take
the same type of fault from sysret_careful. The retint_careful was no
longer an issue, but there are other callers that still have issues.

Instead of adding frame pointers for all callers to schedule (in possibly
all archs), it is much safer to simply not use CALLER_ADDR2. This
loses out on knowing what called schedule, but the function tracer
will help there if needed.

Reported-by: Maneesh Soni <maneesh@in.ibm.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_sched_wakeup.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c
index 3c5ad6b2ec84..5bc00e8f153e 100644
--- a/kernel/trace/trace_sched_wakeup.c
+++ b/kernel/trace/trace_sched_wakeup.c
@@ -154,7 +154,7 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev,
 	if (unlikely(!tracer_enabled || next != wakeup_task))
 		goto out_unlock;
 
-	trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
+	trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc);
 	tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc);
 
 	/*
@@ -257,6 +257,12 @@ probe_wakeup(struct rq *rq, struct task_struct *p, int success)
 	data = wakeup_trace->data[wakeup_cpu];
 	data->preempt_timestamp = ftrace_now(cpu);
 	tracing_sched_wakeup_trace(wakeup_trace, p, current, flags, pc);
+
+	/*
+	 * We must be careful in using CALLER_ADDR2. But since wake_up
+	 * is not called by an assembly function  (where as schedule is)
+	 * it should be safe to use it here.
+	 */
 	trace_function(wakeup_trace, CALLER_ADDR1, CALLER_ADDR2, flags, pc);
 
 out_locked:
-- 
cgit v1.2.3


From cf8e3474654f20433aab9aa35826d43b5f245008 Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Mon, 30 Mar 2009 13:48:00 +0800
Subject: tracing: fix incorrect return type of ns2usecs()

Impact: fix time output bug in 32bits system

ns2usecs() returns 'long', it's incorrect.

(In i386)
...
          <idle>-0     [000]   521.442100: _spin_lock <-tick_do_update_jiffies64
          <idle>-0     [000]   521.442101: do_timer <-tick_do_update_jiffies64
          <idle>-0     [000]   521.442102: update_wall_time <-do_timer
          <idle>-0     [000]   521.442102: update_xtime_cache <-update_wall_time
....
(It always print the time less than 2200 seconds besides ...)
Because 'long' is 32bits in i386. ( (1<<31) useconds is about 2200 seconds)

...
          <idle>-0     [001] 4154502640.134759: rcu_bh_qsctr_inc <-__do_softirq
          <idle>-0     [001] 4154502640.134760: _local_bh_enable <-__do_softirq
          <idle>-0     [001] 4154502640.134761: idle_cpu <-irq_exit
...
(very large value)
Because 'long' is a signed type and it is 32bits in i386.

Changes in v2:
return 'unsigned long long' instead of 'cycle_t'

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
LKML-Reference: <49D05D10.4030009@cn.fujitsu.com>
Reported-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c        | 3 +--
 kernel/trace/trace.h        | 2 +-
 kernel/trace/trace_output.c | 2 +-
 3 files changed, 3 insertions(+), 4 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index a0174a40c563..457dd8c97e0d 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -147,8 +147,7 @@ static int __init set_ftrace_dump_on_oops(char *str)
 }
 __setup("ftrace_dump_on_oops", set_ftrace_dump_on_oops);
 
-long
-ns2usecs(cycle_t nsec)
+unsigned long long ns2usecs(cycle_t nsec)
 {
 	nsec += 500;
 	do_div(nsec, 1000);
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index cb0ce3fc36d3..0d81a4a2a4a5 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -596,7 +596,7 @@ extern int trace_selftest_startup_branch(struct tracer *trace,
 #endif /* CONFIG_FTRACE_STARTUP_TEST */
 
 extern void *head_page(struct trace_array_cpu *data);
-extern long ns2usecs(cycle_t nsec);
+extern unsigned long long ns2usecs(cycle_t nsec);
 extern int
 trace_vbprintk(unsigned long ip, const char *fmt, va_list args);
 extern int
diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c
index d72b9a63b247..64b54a59c55b 100644
--- a/kernel/trace/trace_output.c
+++ b/kernel/trace/trace_output.c
@@ -423,7 +423,7 @@ int trace_print_lat_context(struct trace_iterator *iter)
 
 		trace_find_cmdline(entry->pid, comm);
 
-		ret = trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx [%08lx]"
+		ret = trace_seq_printf(s, "%16s %5d %3d %d %08x %08lx [%08llx]"
 				       " %ld.%03ldms (+%ld.%03ldms): ", comm,
 				       entry->pid, iter->cpu, entry->flags,
 				       entry->preempt_count, iter->idx,
-- 
cgit v1.2.3


From 5f0c6c03c5fee91c02c696bc9bf4c0d41392abe7 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 27 Mar 2009 14:22:10 +0100
Subject: tracing/ftrace: fix missing include string.h

Building a kernel with tracing can raise the following warning on
tip/master:

kernel/trace/trace.c:1249: error: implicit declaration of function 'vbin_printf'

We are missing an include to string.h

Reported-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1238160130-7437-1-git-send-email-fweisbec@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 1 +
 1 file changed, 1 insertion(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 457dd8c97e0d..2230b46f9e1c 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -30,6 +30,7 @@
 #include <linux/percpu.h>
 #include <linux/splice.h>
 #include <linux/kdebug.h>
+#include <linux/string.h>
 #include <linux/ctype.h>
 #include <linux/init.h>
 #include <linux/poll.h>
-- 
cgit v1.2.3


From 8bcae09b93e7f96f700b6bb372c2b3f2b36636dc Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Tue, 31 Mar 2009 15:24:51 +0800
Subject: ftrace: Add check of sched_stopped for probe_sched_wakeup

The wakeup tracing in sched_switch does not stop when a user
disables tracing. This is because the probe_sched_wakeup() is missing
the check to prevent the wakeup from being traced.

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
LKML-Reference: <49D1C543.3010307@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_sched_switch.c | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'kernel')

diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c
index de35f200abd3..9117cea6f1ae 100644
--- a/kernel/trace/trace_sched_switch.c
+++ b/kernel/trace/trace_sched_switch.c
@@ -62,6 +62,9 @@ probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success)
 	pc = preempt_count();
 	tracing_record_cmdline(current);
 
+	if (sched_stopped)
+		return;
+
 	local_irq_save(flags);
 	cpu = raw_smp_processor_id();
 	data = ctx_trace->data[cpu];
-- 
cgit v1.2.3


From b0dfa978c7a1699fb3506fbfcba0b6a5c4bd17ae Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Wed, 1 Apr 2009 22:53:08 +0200
Subject: tracing/ftrace: alloc the started cpumask for the trace file

Impact: fix a crash while cat trace file

Currently we are using a cpumask to remind each cpu where a
trace occured. It lets us notice the user that a cpu just had
its first trace.

But on latest -tip we have the following crash once we cat the trace
file:

IP: [<c0270c4a>] print_trace_fmt+0x45/0xe7
*pde = 00000000
Oops: 0000 [#1] PREEMPT SMP
last sysfs file: /sys/class/net/eth0/carrier
Pid: 3897, comm: cat Not tainted (2.6.29-tip-02825-g0f22972-dirty #81)
EIP: 0060:[<c0270c4a>] EFLAGS: 00010297 CPU: 0
EIP is at print_trace_fmt+0x45/0xe7
EAX: 00000000 EBX: 00000000 ECX: c12d9e98 EDX: ccdb7010
ESI: d31f4000 EDI: 00322401 EBP: d31f3f10 ESP: d31f3efc
DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068
Process cat (pid: 3897, ti=d31f2000 task=d3b3cf20 task.ti=d31f2000)
Stack:
d31f4080 ccdb7010 d31f4000 d691fe70 ccdb7010 d31f3f24 c0270e5c d31f4000
d691fe70 d31f4000 d31f3f34 c02718e8 c12d9e98 d691fe70 d31f3f70 c02bfc33
00001000 09130000 d3b46e00 d691fe98 00000000 00000079 00000001 00000000
Call Trace:
[<c0270e5c>] ? print_trace_line+0x170/0x17c
[<c02718e8>] ? s_show+0xa7/0xbd
[<c02bfc33>] ? seq_read+0x24a/0x327
[<c02bf9e9>] ? seq_read+0x0/0x327
[<c02ab18b>] ? vfs_read+0x86/0xe1
[<c02ab289>] ? sys_read+0x40/0x65
[<c0202d8f>] ? sysenter_do_call+0x12/0x3c
Code: 00 00 00 89 45 ec f7 c7 00 20 00 00 89 55 f0 74 4e f6 86 98 10 00 00 02 74 45 8b 86 8c 10 00 00 8b 9e a8 10 00 00 e8 52 f3 ff ff <0f> a3 03 19 c0 85 c0 75 2b 8b 86 8c 10 00 00 8b 9e a8 10 00 00
EIP: [<c0270c4a>] print_trace_fmt+0x45/0xe7 SS:ESP 0068:d31f3efc
CR2: 0000000000000000
---[ end trace aa9cf38e5ebed9dd ]---

This is because we alloc the iter->started cpumask on tracing_pipe_open but
not on tracing_open.

It hadn't been noticed until now because we need to have ring buffer overruns
to activate the starting of cpu buffer detection.

Also, we need a check to not print the messagge for the first trace on the file.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <1238619188-6109-1-git-send-email-fweisbec@gmail.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 2230b46f9e1c..fc8c7d66832b 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -1632,7 +1632,11 @@ static void test_cpu_buff_start(struct trace_iterator *iter)
 		return;
 
 	cpumask_set_cpu(iter->cpu, iter->started);
-	trace_seq_printf(s, "##### CPU %u buffer started ####\n", iter->cpu);
+
+	/* Don't print started cpu buffer for the first entry of the trace */
+	if (iter->idx > 1)
+		trace_seq_printf(s, "##### CPU %u buffer started ####\n",
+				iter->cpu);
 }
 
 static enum print_line_t print_trace_fmt(struct trace_iterator *iter)
@@ -1867,6 +1871,11 @@ __tracing_open(struct inode *inode, struct file *file)
 	if (current_trace)
 		*iter->trace = *current_trace;
 
+	if (!alloc_cpumask_var(&iter->started, GFP_KERNEL))
+		goto fail;
+
+	cpumask_clear(iter->started);
+
 	if (current_trace && current_trace->print_max)
 		iter->tr = &max_tr;
 	else
@@ -1917,6 +1926,7 @@ __tracing_open(struct inode *inode, struct file *file)
 		if (iter->buffer_iter[cpu])
 			ring_buffer_read_finish(iter->buffer_iter[cpu]);
 	}
+	free_cpumask_var(iter->started);
  fail:
 	mutex_unlock(&trace_types_lock);
 	kfree(iter->trace);
@@ -1960,6 +1970,7 @@ static int tracing_release(struct inode *inode, struct file *file)
 
 	seq_release(inode, file);
 	mutex_destroy(&iter->mutex);
+	free_cpumask_var(iter->started);
 	kfree(iter->trace);
 	kfree(iter);
 	return 0;
-- 
cgit v1.2.3


From bc2b6871c17b3aff79fb14e1a1c06c5f5a187f76 Mon Sep 17 00:00:00 2001
From: Nikanth Karthikesan <knikanth@suse.de>
Date: Mon, 23 Mar 2009 11:58:31 +0530
Subject: Update /debug/tracing/README

Some of the tracers have been renamed, which was not updated in the in-kernel
run-time README file. Update it.

Signed-off-by: Nikanth Karthikesan <knikanth@suse.de>
LKML-Reference: <200903231158.32151.knikanth@suse.de>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index fc8c7d66832b..9d28476a9851 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -2369,9 +2369,9 @@ static const char readme_msg[] =
 	"# mkdir /debug\n"
 	"# mount -t debugfs nodev /debug\n\n"
 	"# cat /debug/tracing/available_tracers\n"
-	"wakeup preemptirqsoff preemptoff irqsoff ftrace sched_switch none\n\n"
+	"wakeup preemptirqsoff preemptoff irqsoff function sched_switch nop\n\n"
 	"# cat /debug/tracing/current_tracer\n"
-	"none\n"
+	"nop\n"
 	"# echo sched_switch > /debug/tracing/current_tracer\n"
 	"# cat /debug/tracing/current_tracer\n"
 	"sched_switch\n"
-- 
cgit v1.2.3


From 1bbe2a83ab68e5cf8c66c372c7cb3b51910c2cfe Mon Sep 17 00:00:00 2001
From: Zhaolei <zhaolei@cn.fujitsu.com>
Date: Fri, 3 Apr 2009 18:24:46 +0800
Subject: ftrace: Correct a text align for event format output

If we cat debugfs/tracing/events/ftrace/bprint/format, we'll see:
name: bprint
ID: 6
format:
	field:unsigned char common_type;	offset:0;	size:1;
	field:unsigned char common_flags;	offset:1;	size:1;
	field:unsigned char common_preempt_count;	offset:2;	size:1;
	field:int common_pid;	offset:4;	size:4;
	field:int common_tgid;	offset:8;	size:4;

	field:unsigned long ip;	offset:12;	size:4;
	field:char * fmt;	offset:16;	size:4;
	field: char buf;	offset:20;	size:0;

print fmt: "%08lx (%d) fmt:%p %s"

There is an inconsistent blank before char buf.

Signed-off-by: Zhao Lei <zhaolei@cn.fujitsu.com>
LKML-Reference: <49D5E3EE.70201@cn.fujitsu.com>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/trace/trace_export.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'kernel')

diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c
index 4d9952d3df50..07a22c33ebf3 100644
--- a/kernel/trace/trace_export.c
+++ b/kernel/trace/trace_export.c
@@ -40,7 +40,7 @@
 
 #undef TRACE_FIELD_ZERO_CHAR
 #define TRACE_FIELD_ZERO_CHAR(item)					\
-	ret = trace_seq_printf(s, "\tfield: char " #item ";\t"		\
+	ret = trace_seq_printf(s, "\tfield:char " #item ";\t"		\
 			       "offset:%u;\tsize:0;\n",			\
 			       (unsigned int)offsetof(typeof(field), item)); \
 	if (!ret)							\
-- 
cgit v1.2.3


From fafd688e4c0c34da0f3de909881117d374e4c7af Mon Sep 17 00:00:00 2001
From: Peter W Morreale <pmorreale@novell.com>
Date: Mon, 6 Apr 2009 19:00:29 -0700
Subject: mm: add /proc controls for pdflush threads

Add /proc entries to give the admin the ability to control the minimum and
maximum number of pdflush threads.  This allows finer control of pdflush
on both large and small machines.

The rationale is simply one size does not fit all.  Admins on large and/or
small systems may want to tune the min/max pdflush thread count to best
suit their needs.  Right now the min/max is hardcoded to 2/8.  While
probably a fair estimate for smaller machines, large machines with large
numbers of CPUs and large numbers of filesystems/block devices may benefit
from larger numbers of threads working on different block devices.

Even if the background flushing algorithm is radically changed, it is
still likely that multiple threads will be involved and admins would still
desire finer control on the min/max other than to have to recompile the
kernel.

The patch adds '/proc/sys/vm/nr_pdflush_threads_min' and
'/proc/sys/vm/nr_pdflush_threads_max' with r/w permissions.

The minimum value for nr_pdflush_threads_min is 1 and the maximum value is
the current value of nr_pdflush_threads_max.  This minimum is required
since additional thread creation is performed in a pdflush thread itself.

The minimum value for nr_pdflush_threads_max is the current value of
nr_pdflush_threads_min and the maximum value can be 1000.

Documentation/sysctl/vm.txt is also updated.

[akpm@linux-foundation.org: fix comment, fix whitespace, use __read_mostly]
Signed-off-by: Peter W Morreale <pmorreale@novell.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/sysctl/vm.txt | 28 ++++++++++++++++++++++++++++
 include/linux/writeback.h   |  2 ++
 kernel/sysctl.c             | 23 +++++++++++++++++++++++
 mm/pdflush.c                | 31 +++++++++++++++++++------------
 4 files changed, 72 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 3197fc83bc51..97c4b3284329 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -39,6 +39,8 @@ Currently, these files are in /proc/sys/vm:
 - nr_hugepages
 - nr_overcommit_hugepages
 - nr_pdflush_threads
+- nr_pdflush_threads_min
+- nr_pdflush_threads_max
 - nr_trim_pages         (only if CONFIG_MMU=n)
 - numa_zonelist_order
 - oom_dump_tasks
@@ -463,6 +465,32 @@ The default value is 0.
 
 ==============================================================
 
+nr_pdflush_threads_min
+
+This value controls the minimum number of pdflush threads.
+
+At boot time, the kernel will create and maintain 'nr_pdflush_threads_min'
+threads for the kernel's lifetime.
+
+The default value is 2.  The minimum value you can specify is 1, and
+the maximum value is the current setting of 'nr_pdflush_threads_max'.
+
+See 'nr_pdflush_threads_max' below for more information.
+
+==============================================================
+
+nr_pdflush_threads_max
+
+This value controls the maximum number of pdflush threads that can be
+created.  The pdflush algorithm will create a new pdflush thread (up to
+this maximum) if no pdflush threads have been available for >= 1 second.
+
+The default value is 8.  The minimum value you can specify is the
+current value of 'nr_pdflush_threads_min' and the
+maximum is 1000.
+
+==============================================================
+
 overcommit_memory:
 
 This value contains a flag that enables memory overcommitment.
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index 93445477f86a..9c1ed1fb6ddb 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -168,6 +168,8 @@ void writeback_set_ratelimit(void);
 /* pdflush.c */
 extern int nr_pdflush_threads;	/* Global so it can be exported to sysctl
 				   read-only. */
+extern int nr_pdflush_threads_max; /* Global so it can be exported to sysctl */
+extern int nr_pdflush_threads_min; /* Global so it can be exported to sysctl */
 
 
 #endif		/* WRITEBACK_H */
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index b125e3387568..72eb1a41dcab 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -101,6 +101,7 @@ static int __maybe_unused one = 1;
 static int __maybe_unused two = 2;
 static unsigned long one_ul = 1;
 static int one_hundred = 100;
+static int one_thousand = 1000;
 
 /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */
 static int maxolduid = 65535;
@@ -1026,6 +1027,28 @@ static struct ctl_table vm_table[] = {
 		.mode		= 0444 /* read-only*/,
 		.proc_handler	= &proc_dointvec,
 	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "nr_pdflush_threads_min",
+		.data		= &nr_pdflush_threads_min,
+		.maxlen		= sizeof nr_pdflush_threads_min,
+		.mode		= 0644 /* read-write */,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &one,
+		.extra2		= &nr_pdflush_threads_max,
+	},
+	{
+		.ctl_name	= CTL_UNNUMBERED,
+		.procname	= "nr_pdflush_threads_max",
+		.data		= &nr_pdflush_threads_max,
+		.maxlen		= sizeof nr_pdflush_threads_max,
+		.mode		= 0644 /* read-write */,
+		.proc_handler	= &proc_dointvec_minmax,
+		.strategy	= &sysctl_intvec,
+		.extra1		= &nr_pdflush_threads_min,
+		.extra2		= &one_thousand,
+	},
 	{
 		.ctl_name	= VM_SWAPPINESS,
 		.procname	= "swappiness",
diff --git a/mm/pdflush.c b/mm/pdflush.c
index 235ac440c44e..f2caf96993f8 100644
--- a/mm/pdflush.c
+++ b/mm/pdflush.c
@@ -57,6 +57,14 @@ static DEFINE_SPINLOCK(pdflush_lock);
  */
 int nr_pdflush_threads = 0;
 
+/*
+ * The max/min number of pdflush threads. R/W by sysctl at
+ * /proc/sys/vm/nr_pdflush_threads_max/min
+ */
+int nr_pdflush_threads_max __read_mostly = MAX_PDFLUSH_THREADS;
+int nr_pdflush_threads_min __read_mostly = MIN_PDFLUSH_THREADS;
+
+
 /*
  * The time at which the pdflush thread pool last went empty
  */
@@ -68,7 +76,7 @@ static unsigned long last_empty_jifs;
  * Thread pool management algorithm:
  * 
  * - The minimum and maximum number of pdflush instances are bound
- *   by MIN_PDFLUSH_THREADS and MAX_PDFLUSH_THREADS.
+ *   by nr_pdflush_threads_min and nr_pdflush_threads_max.
  * 
  * - If there have been no idle pdflush instances for 1 second, create
  *   a new one.
@@ -134,14 +142,13 @@ static int __pdflush(struct pdflush_work *my_work)
 		 * To throttle creation, we reset last_empty_jifs.
 		 */
 		if (time_after(jiffies, last_empty_jifs + 1 * HZ)) {
-			if (list_empty(&pdflush_list)) {
-				if (nr_pdflush_threads < MAX_PDFLUSH_THREADS) {
-					last_empty_jifs = jiffies;
-					nr_pdflush_threads++;
-					spin_unlock_irq(&pdflush_lock);
-					start_one_pdflush_thread();
-					spin_lock_irq(&pdflush_lock);
-				}
+			if (list_empty(&pdflush_list) &&
+			    nr_pdflush_threads < nr_pdflush_threads_max) {
+				last_empty_jifs = jiffies;
+				nr_pdflush_threads++;
+				spin_unlock_irq(&pdflush_lock);
+				start_one_pdflush_thread();
+				spin_lock_irq(&pdflush_lock);
 			}
 		}
 
@@ -153,7 +160,7 @@ static int __pdflush(struct pdflush_work *my_work)
 		 */
 		if (list_empty(&pdflush_list))
 			continue;
-		if (nr_pdflush_threads <= MIN_PDFLUSH_THREADS)
+		if (nr_pdflush_threads <= nr_pdflush_threads_min)
 			continue;
 		pdf = list_entry(pdflush_list.prev, struct pdflush_work, list);
 		if (time_after(jiffies, pdf->when_i_went_to_sleep + 1 * HZ)) {
@@ -259,9 +266,9 @@ static int __init pdflush_init(void)
 	 * Pre-set nr_pdflush_threads...  If we fail to create,
 	 * the count will be decremented.
 	 */
-	nr_pdflush_threads = MIN_PDFLUSH_THREADS;
+	nr_pdflush_threads = nr_pdflush_threads_min;
 
-	for (i = 0; i < MIN_PDFLUSH_THREADS; i++)
+	for (i = 0; i < nr_pdflush_threads_min; i++)
 		start_one_pdflush_thread();
 	return 0;
 }
-- 
cgit v1.2.3


From b918e5e60d775549478e4268155142156a95aa17 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Mon, 6 Apr 2009 19:00:58 -0700
Subject: kprobes: cleanup aggr_kprobe related code

Currently, kprobes can disable all probes at once, but can't disable it
individually (not unregister, just disable an kprobe, because
unregistering needs to wait for scheduler synchronization).  These patches
introduce APIs for on-the-fly per-probe disabling and re-enabling by
dis-arming/re-arming its breakpoint instruction.

This patch:

Change old_p to ap in add_new_kprobe() for readability, copy flags member
in add_aggr_kprobe(), and simplify the code flow of
register_aggr_kprobe().

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Acked-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kprobes.c | 60 ++++++++++++++++++++++++++++----------------------------
 1 file changed, 30 insertions(+), 30 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index 5016bfb682b9..a55bfadfd766 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -518,20 +518,20 @@ static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
 }
 
 /*
-* Add the new probe to old_p->list. Fail if this is the
+* Add the new probe to ap->list. Fail if this is the
 * second jprobe at the address - two jprobes can't coexist
 */
-static int __kprobes add_new_kprobe(struct kprobe *old_p, struct kprobe *p)
+static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
 {
 	if (p->break_handler) {
-		if (old_p->break_handler)
+		if (ap->break_handler)
 			return -EEXIST;
-		list_add_tail_rcu(&p->list, &old_p->list);
-		old_p->break_handler = aggr_break_handler;
+		list_add_tail_rcu(&p->list, &ap->list);
+		ap->break_handler = aggr_break_handler;
 	} else
-		list_add_rcu(&p->list, &old_p->list);
-	if (p->post_handler && !old_p->post_handler)
-		old_p->post_handler = aggr_post_handler;
+		list_add_rcu(&p->list, &ap->list);
+	if (p->post_handler && !ap->post_handler)
+		ap->post_handler = aggr_post_handler;
 	return 0;
 }
 
@@ -544,6 +544,7 @@ static inline void add_aggr_kprobe(struct kprobe *ap, struct kprobe *p)
 	copy_kprobe(p, ap);
 	flush_insn_slot(ap);
 	ap->addr = p->addr;
+	ap->flags = p->flags;
 	ap->pre_handler = aggr_pre_handler;
 	ap->fault_handler = aggr_fault_handler;
 	/* We don't care the kprobe which has gone. */
@@ -566,44 +567,43 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
 					  struct kprobe *p)
 {
 	int ret = 0;
-	struct kprobe *ap;
+	struct kprobe *ap = old_p;
 
-	if (kprobe_gone(old_p)) {
+	if (old_p->pre_handler != aggr_pre_handler) {
+		/* If old_p is not an aggr_probe, create new aggr_kprobe. */
+		ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL);
+		if (!ap)
+			return -ENOMEM;
+		add_aggr_kprobe(ap, old_p);
+	}
+
+	if (kprobe_gone(ap)) {
 		/*
 		 * Attempting to insert new probe at the same location that
 		 * had a probe in the module vaddr area which already
 		 * freed. So, the instruction slot has already been
 		 * released. We need a new slot for the new probe.
 		 */
-		ret = arch_prepare_kprobe(old_p);
+		ret = arch_prepare_kprobe(ap);
 		if (ret)
+			/*
+			 * Even if fail to allocate new slot, don't need to
+			 * free aggr_probe. It will be used next time, or
+			 * freed by unregister_kprobe.
+			 */
 			return ret;
-	}
-	if (old_p->pre_handler == aggr_pre_handler) {
-		copy_kprobe(old_p, p);
-		ret = add_new_kprobe(old_p, p);
-		ap = old_p;
-	} else {
-		ap = kzalloc(sizeof(struct kprobe), GFP_KERNEL);
-		if (!ap) {
-			if (kprobe_gone(old_p))
-				arch_remove_kprobe(old_p);
-			return -ENOMEM;
-		}
-		add_aggr_kprobe(ap, old_p);
-		copy_kprobe(ap, p);
-		ret = add_new_kprobe(ap, p);
-	}
-	if (kprobe_gone(old_p)) {
+		/* Clear gone flag to prevent allocating new slot again. */
+		ap->flags &= ~KPROBE_FLAG_GONE;
 		/*
 		 * If the old_p has gone, its breakpoint has been disarmed.
 		 * We have to arm it again after preparing real kprobes.
 		 */
-		ap->flags &= ~KPROBE_FLAG_GONE;
 		if (kprobe_enabled)
 			arch_arm_kprobe(ap);
 	}
-	return ret;
+
+	copy_kprobe(ap, p);
+	return add_new_kprobe(ap, p);
 }
 
 static int __kprobes in_kprobes_functions(unsigned long addr)
-- 
cgit v1.2.3


From 99081ab553d6a88dd6b3774af5eef94dbb8faad7 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Mon, 6 Apr 2009 19:00:59 -0700
Subject: kprobes: move EXPORT_SYMBOL_GPL just after function definitions

Clean up positions of EXPORT_SYMBOL_GPL in kernel/kprobes.c according to
checkpatch.pl.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Acked-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kprobes.c | 30 ++++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index a55bfadfd766..ca4c22d78cfd 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -722,6 +722,7 @@ out:
 
 	return ret;
 }
+EXPORT_SYMBOL_GPL(register_kprobe);
 
 /*
  * Unregister a kprobe without a scheduler synchronization.
@@ -803,11 +804,13 @@ int __kprobes register_kprobes(struct kprobe **kps, int num)
 	}
 	return ret;
 }
+EXPORT_SYMBOL_GPL(register_kprobes);
 
 void __kprobes unregister_kprobe(struct kprobe *p)
 {
 	unregister_kprobes(&p, 1);
 }
+EXPORT_SYMBOL_GPL(unregister_kprobe);
 
 void __kprobes unregister_kprobes(struct kprobe **kps, int num)
 {
@@ -826,6 +829,7 @@ void __kprobes unregister_kprobes(struct kprobe **kps, int num)
 		if (kps[i]->addr)
 			__unregister_kprobe_bottom(kps[i]);
 }
+EXPORT_SYMBOL_GPL(unregister_kprobes);
 
 static struct notifier_block kprobe_exceptions_nb = {
 	.notifier_call = kprobe_exceptions_notify,
@@ -865,16 +869,19 @@ int __kprobes register_jprobes(struct jprobe **jps, int num)
 	}
 	return ret;
 }
+EXPORT_SYMBOL_GPL(register_jprobes);
 
 int __kprobes register_jprobe(struct jprobe *jp)
 {
 	return register_jprobes(&jp, 1);
 }
+EXPORT_SYMBOL_GPL(register_jprobe);
 
 void __kprobes unregister_jprobe(struct jprobe *jp)
 {
 	unregister_jprobes(&jp, 1);
 }
+EXPORT_SYMBOL_GPL(unregister_jprobe);
 
 void __kprobes unregister_jprobes(struct jprobe **jps, int num)
 {
@@ -894,6 +901,7 @@ void __kprobes unregister_jprobes(struct jprobe **jps, int num)
 			__unregister_kprobe_bottom(&jps[i]->kp);
 	}
 }
+EXPORT_SYMBOL_GPL(unregister_jprobes);
 
 #ifdef CONFIG_KRETPROBES
 /*
@@ -987,6 +995,7 @@ int __kprobes register_kretprobe(struct kretprobe *rp)
 		free_rp_inst(rp);
 	return ret;
 }
+EXPORT_SYMBOL_GPL(register_kretprobe);
 
 int __kprobes register_kretprobes(struct kretprobe **rps, int num)
 {
@@ -1004,11 +1013,13 @@ int __kprobes register_kretprobes(struct kretprobe **rps, int num)
 	}
 	return ret;
 }
+EXPORT_SYMBOL_GPL(register_kretprobes);
 
 void __kprobes unregister_kretprobe(struct kretprobe *rp)
 {
 	unregister_kretprobes(&rp, 1);
 }
+EXPORT_SYMBOL_GPL(unregister_kretprobe);
 
 void __kprobes unregister_kretprobes(struct kretprobe **rps, int num)
 {
@@ -1030,24 +1041,30 @@ void __kprobes unregister_kretprobes(struct kretprobe **rps, int num)
 		}
 	}
 }
+EXPORT_SYMBOL_GPL(unregister_kretprobes);
 
 #else /* CONFIG_KRETPROBES */
 int __kprobes register_kretprobe(struct kretprobe *rp)
 {
 	return -ENOSYS;
 }
+EXPORT_SYMBOL_GPL(register_kretprobe);
 
 int __kprobes register_kretprobes(struct kretprobe **rps, int num)
 {
 	return -ENOSYS;
 }
+EXPORT_SYMBOL_GPL(register_kretprobes);
+
 void __kprobes unregister_kretprobe(struct kretprobe *rp)
 {
 }
+EXPORT_SYMBOL_GPL(unregister_kretprobe);
 
 void __kprobes unregister_kretprobes(struct kretprobe **rps, int num)
 {
 }
+EXPORT_SYMBOL_GPL(unregister_kretprobes);
 
 static int __kprobes pre_handler_kretprobe(struct kprobe *p,
 					   struct pt_regs *regs)
@@ -1418,16 +1435,5 @@ late_initcall(debugfs_kprobe_init);
 
 module_init(init_kprobes);
 
-EXPORT_SYMBOL_GPL(register_kprobe);
-EXPORT_SYMBOL_GPL(unregister_kprobe);
-EXPORT_SYMBOL_GPL(register_kprobes);
-EXPORT_SYMBOL_GPL(unregister_kprobes);
-EXPORT_SYMBOL_GPL(register_jprobe);
-EXPORT_SYMBOL_GPL(unregister_jprobe);
-EXPORT_SYMBOL_GPL(register_jprobes);
-EXPORT_SYMBOL_GPL(unregister_jprobes);
+/* defined in arch/.../kernel/kprobes.c */
 EXPORT_SYMBOL_GPL(jprobe_return);
-EXPORT_SYMBOL_GPL(register_kretprobe);
-EXPORT_SYMBOL_GPL(unregister_kretprobe);
-EXPORT_SYMBOL_GPL(register_kretprobes);
-EXPORT_SYMBOL_GPL(unregister_kretprobes);
-- 
cgit v1.2.3


From e579abeb58eb4b8d7321c6eb44dd9e2d0cbaebaa Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Mon, 6 Apr 2009 19:01:01 -0700
Subject: kprobes: rename kprobe_enabled to kprobes_all_disarmed

Rename kprobe_enabled to kprobes_all_disarmed and invert logic due to
avoiding naming confusion from per-probe disabling.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Acked-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 kernel/kprobes.c | 34 +++++++++++++++++-----------------
 1 file changed, 17 insertions(+), 17 deletions(-)

(limited to 'kernel')

diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index ca4c22d78cfd..dae198b68e97 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -68,7 +68,7 @@ static struct hlist_head kprobe_table[KPROBE_TABLE_SIZE];
 static struct hlist_head kretprobe_inst_table[KPROBE_TABLE_SIZE];
 
 /* NOTE: change this value only with kprobe_mutex held */
-static bool kprobe_enabled;
+static bool kprobes_all_disarmed;
 
 static DEFINE_MUTEX(kprobe_mutex);	/* Protects kprobe_table */
 static DEFINE_PER_CPU(struct kprobe *, kprobe_instance) = NULL;
@@ -598,7 +598,7 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
 		 * If the old_p has gone, its breakpoint has been disarmed.
 		 * We have to arm it again after preparing real kprobes.
 		 */
-		if (kprobe_enabled)
+		if (!kprobes_all_disarmed)
 			arch_arm_kprobe(ap);
 	}
 
@@ -709,7 +709,7 @@ int __kprobes register_kprobe(struct kprobe *p)
 	hlist_add_head_rcu(&p->hlist,
 		       &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
 
-	if (kprobe_enabled)
+	if (!kprobes_all_disarmed)
 		arch_arm_kprobe(p);
 
 out_unlock_text:
@@ -751,7 +751,7 @@ valid_p:
 		 * enabled and not gone - otherwise, the breakpoint would
 		 * already have been removed. We save on flushing icache.
 		 */
-		if (kprobe_enabled && !kprobe_gone(old_p)) {
+		if (!kprobes_all_disarmed && !kprobe_gone(old_p)) {
 			mutex_lock(&text_mutex);
 			arch_disarm_kprobe(p);
 			mutex_unlock(&text_mutex);
@@ -1190,8 +1190,8 @@ static int __init init_kprobes(void)
 		}
 	}
 
-	/* By default, kprobes are enabled */
-	kprobe_enabled = true;
+	/* By default, kprobes are armed */
+	kprobes_all_disarmed = false;
 
 	err = arch_init_kprobes();
 	if (!err)
@@ -1289,7 +1289,7 @@ static struct file_operations debugfs_kprobes_operations = {
 	.release        = seq_release,
 };
 
-static void __kprobes enable_all_kprobes(void)
+static void __kprobes arm_all_kprobes(void)
 {
 	struct hlist_head *head;
 	struct hlist_node *node;
@@ -1298,8 +1298,8 @@ static void __kprobes enable_all_kprobes(void)
 
 	mutex_lock(&kprobe_mutex);
 
-	/* If kprobes are already enabled, just return */
-	if (kprobe_enabled)
+	/* If kprobes are armed, just return */
+	if (!kprobes_all_disarmed)
 		goto already_enabled;
 
 	mutex_lock(&text_mutex);
@@ -1311,7 +1311,7 @@ static void __kprobes enable_all_kprobes(void)
 	}
 	mutex_unlock(&text_mutex);
 
-	kprobe_enabled = true;
+	kprobes_all_disarmed = false;
 	printk(KERN_INFO "Kprobes globally enabled\n");
 
 already_enabled:
@@ -1319,7 +1319,7 @@ already_enabled:
 	return;
 }
 
-static void __kprobes disable_all_kprobes(void)
+static void __kprobes disarm_all_kprobes(void)
 {
 	struct hlist_head *head;
 	struct hlist_node *node;
@@ -1328,11 +1328,11 @@ static void __kprobes disable_all_kprobes(void)
 
 	mutex_lock(&kprobe_mutex);
 
-	/* If kprobes are already disabled, just return */
-	if (!kprobe_enabled)
+	/* If kprobes are already disarmed, just return */
+	if (kprobes_all_disarmed)
 		goto already_disabled;
 
-	kprobe_enabled = false;
+	kprobes_all_disarmed = true;
 	printk(KERN_INFO "Kprobes globally disabled\n");
 	mutex_lock(&text_mutex);
 	for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
@@ -1364,7 +1364,7 @@ static ssize_t read_enabled_file_bool(struct file *file,
 {
 	char buf[3];
 
-	if (kprobe_enabled)
+	if (!kprobes_all_disarmed)
 		buf[0] = '1';
 	else
 		buf[0] = '0';
@@ -1387,12 +1387,12 @@ static ssize_t write_enabled_file_bool(struct file *file,
 	case 'y':
 	case 'Y':
 	case '1':
-		enable_all_kprobes();
+		arm_all_kprobes();
 		break;
 	case 'n':
 	case 'N':
 	case '0':
-		disable_all_kprobes();
+		disarm_all_kprobes();
 		break;
 	}
 
-- 
cgit v1.2.3


From de5bd88d5a5cce3cacea904d3503e5ebdb3852a2 Mon Sep 17 00:00:00 2001
From: Masami Hiramatsu <mhiramat@redhat.com>
Date: Mon, 6 Apr 2009 19:01:02 -0700
Subject: kprobes: support per-kprobe disabling

Add disable_kprobe() and enable_kprobe() to disable/enable kprobes
temporarily.

disable_kprobe() asynchronously disables probe handlers of specified
kprobe.  So, after calling it, some handlers can be called at a while.
enable_kprobe() enables specified kprobe.

aggr_pre_handler and aggr_post_handler check disabled probes.  On the
other hand aggr_break_handler and aggr_fault_handler don't check it
because these handlers will be called while executing pre or post handlers
and usually those help error handling.

Signed-off-by: Masami Hiramatsu <mhiramat@redhat.com>
Acked-by: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
Cc: David S. Miller <davem@davemloft.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 Documentation/kprobes.txt |  34 ++++++++--
 include/linux/kprobes.h   |  23 ++++++-
 kernel/kprobes.c          | 167 ++++++++++++++++++++++++++++++++++++++--------
 3 files changed, 191 insertions(+), 33 deletions(-)

(limited to 'kernel')

diff --git a/Documentation/kprobes.txt b/Documentation/kprobes.txt
index 48b3de90eb1e..f609af242d6c 100644
--- a/Documentation/kprobes.txt
+++ b/Documentation/kprobes.txt
@@ -212,7 +212,9 @@ hit, Kprobes calls kp->pre_handler.  After the probed instruction
 is single-stepped, Kprobe calls kp->post_handler.  If a fault
 occurs during execution of kp->pre_handler or kp->post_handler,
 or during single-stepping of the probed instruction, Kprobes calls
-kp->fault_handler.  Any or all handlers can be NULL.
+kp->fault_handler.  Any or all handlers can be NULL. If kp->flags
+is set KPROBE_FLAG_DISABLED, that kp will be registered but disabled,
+so, it's handlers aren't hit until calling enable_kprobe(kp).
 
 NOTE:
 1. With the introduction of the "symbol_name" field to struct kprobe,
@@ -363,6 +365,22 @@ probes) in the specified array, they clear the addr field of those
 incorrect probes. However, other probes in the array are
 unregistered correctly.
 
+4.7 disable_kprobe
+
+#include <linux/kprobes.h>
+int disable_kprobe(struct kprobe *kp);
+
+Temporarily disables the specified kprobe. You can enable it again by using
+enable_kprobe(). You must specify the kprobe which has been registered.
+
+4.8 enable_kprobe
+
+#include <linux/kprobes.h>
+int enable_kprobe(struct kprobe *kp);
+
+Enables kprobe which has been disabled by disable_kprobe(). You must specify
+the kprobe which has been registered.
+
 5. Kprobes Features and Limitations
 
 Kprobes allows multiple probes at the same address.  Currently,
@@ -500,10 +518,14 @@ the probe. If the probed function belongs to a module, the module name
 is also specified. Following columns show probe status. If the probe is on
 a virtual address that is no longer valid (module init sections, module
 virtual addresses that correspond to modules that've been unloaded),
-such probes are marked with [GONE].
+such probes are marked with [GONE]. If the probe is temporarily disabled,
+such probes are marked with [DISABLED].
 
-/debug/kprobes/enabled: Turn kprobes ON/OFF
+/debug/kprobes/enabled: Turn kprobes ON/OFF forcibly.
 
-Provides a knob to globally turn registered kprobes ON or OFF. By default,
-all kprobes are enabled. By echoing "0" to this file, all registered probes
-will be disarmed, till such time a "1" is echoed to this file.
+Provides a knob to globally and forcibly turn registered kprobes ON or OFF.
+By default, all kprobes are enabled. By echoing "0" to this file, all
+registered probes will be disarmed, till such time a "1" is echoed to this
+file. Note that this knob just disarms and arms all kprobes and doesn't
+change each probe's disabling state. This means that disabled kprobes (marked
+[DISABLED]) will be not enabled if you turn ON all kprobes by this knob.
diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h
index 39826a678364..1071cfddddc9 100644
--- a/include/linux/kprobes.h
+++ b/include/linux/kprobes.h
@@ -112,18 +112,28 @@ struct kprobe {
 	/* copy of the original instruction */
 	struct arch_specific_insn ainsn;
 
-	/* Indicates various status flags.  Protected by kprobe_mutex. */
+	/*
+	 * Indicates various status flags.
+	 * Protected by kprobe_mutex after this kprobe is registered.
+	 */
 	u32 flags;
 };
 
 /* Kprobe status flags */
 #define KPROBE_FLAG_GONE	1 /* breakpoint has already gone */
+#define KPROBE_FLAG_DISABLED	2 /* probe is temporarily disabled */
 
+/* Has this kprobe gone ? */
 static inline int kprobe_gone(struct kprobe *p)
 {
 	return p->flags & KPROBE_FLAG_GONE;
 }
 
+/* Is this kprobe disabled ? */
+static inline int kprobe_disabled(struct kprobe *p)
+{
+	return p->flags & (KPROBE_FLAG_DISABLED | KPROBE_FLAG_GONE);
+}
 /*
  * Special probe type that uses setjmp-longjmp type tricks to resume
  * execution at a specified entry with a matching prototype corresponding
@@ -283,6 +293,9 @@ void unregister_kretprobes(struct kretprobe **rps, int num);
 void kprobe_flush_task(struct task_struct *tk);
 void recycle_rp_inst(struct kretprobe_instance *ri, struct hlist_head *head);
 
+int disable_kprobe(struct kprobe *kp);
+int enable_kprobe(struct kprobe *kp);
+
 #else /* !CONFIG_KPROBES: */
 
 static inline int kprobes_built_in(void)
@@ -349,5 +362,13 @@ static inline void unregister_kretprobes(struct kretprobe **rps, int num)
 static inline void kprobe_flush_task(struct task_struct *tk)
 {
 }
+static inline int disable_kprobe(struct kprobe *kp)
+{
+	return -ENOSYS;
+}
+static inline int enable_kprobe(struct kprobe *kp)
+{
+	return -ENOSYS;
+}
 #endif /* CONFIG_KPROBES */
 #endif /* _LINUX_KPROBES_H */
diff --git a/kernel/kprobes.c b/kernel/kprobes.c
index dae198b68e97..a5e74ddee0e2 100644
--- a/kernel/kprobes.c
+++ b/kernel/kprobes.c
@@ -328,7 +328,7 @@ static int __kprobes aggr_pre_handler(struct kprobe *p, struct pt_regs *regs)
 	struct kprobe *kp;
 
 	list_for_each_entry_rcu(kp, &p->list, list) {
-		if (kp->pre_handler && !kprobe_gone(kp)) {
+		if (kp->pre_handler && likely(!kprobe_disabled(kp))) {
 			set_kprobe_instance(kp);
 			if (kp->pre_handler(kp, regs))
 				return 1;
@@ -344,7 +344,7 @@ static void __kprobes aggr_post_handler(struct kprobe *p, struct pt_regs *regs,
 	struct kprobe *kp;
 
 	list_for_each_entry_rcu(kp, &p->list, list) {
-		if (kp->post_handler && !kprobe_gone(kp)) {
+		if (kp->post_handler && likely(!kprobe_disabled(kp))) {
 			set_kprobe_instance(kp);
 			kp->post_handler(kp, regs, flags);
 			reset_kprobe_instance();
@@ -523,6 +523,7 @@ static inline void copy_kprobe(struct kprobe *old_p, struct kprobe *p)
 */
 static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
 {
+	BUG_ON(kprobe_gone(ap) || kprobe_gone(p));
 	if (p->break_handler) {
 		if (ap->break_handler)
 			return -EEXIST;
@@ -532,6 +533,13 @@ static int __kprobes add_new_kprobe(struct kprobe *ap, struct kprobe *p)
 		list_add_rcu(&p->list, &ap->list);
 	if (p->post_handler && !ap->post_handler)
 		ap->post_handler = aggr_post_handler;
+
+	if (kprobe_disabled(ap) && !kprobe_disabled(p)) {
+		ap->flags &= ~KPROBE_FLAG_DISABLED;
+		if (!kprobes_all_disarmed)
+			/* Arm the breakpoint again. */
+			arch_arm_kprobe(ap);
+	}
 	return 0;
 }
 
@@ -592,20 +600,36 @@ static int __kprobes register_aggr_kprobe(struct kprobe *old_p,
 			 * freed by unregister_kprobe.
 			 */
 			return ret;
-		/* Clear gone flag to prevent allocating new slot again. */
-		ap->flags &= ~KPROBE_FLAG_GONE;
+
 		/*
-		 * If the old_p has gone, its breakpoint has been disarmed.
-		 * We have to arm it again after preparing real kprobes.
+		 * Clear gone flag to prevent allocating new slot again, and
+		 * set disabled flag because it is not armed yet.
 		 */
-		if (!kprobes_all_disarmed)
-			arch_arm_kprobe(ap);
+		ap->flags = (ap->flags & ~KPROBE_FLAG_GONE)
+			    | KPROBE_FLAG_DISABLED;
 	}
 
 	copy_kprobe(ap, p);
 	return add_new_kprobe(ap, p);
 }
 
+/* Try to disable aggr_kprobe, and return 1 if succeeded.*/
+static int __kprobes try_to_disable_aggr_kprobe(struct kprobe *p)
+{
+	struct kprobe *kp;
+
+	list_for_each_entry_rcu(kp, &p->list, list) {
+		if (!kprobe_disabled(kp))
+			/*
+			 * There is an active probe on the list.
+			 * We can't disable aggr_kprobe.
+			 */
+			return 0;
+	}
+	p->flags |= KPROBE_FLAG_DISABLED;
+	return 1;
+}
+
 static int __kprobes in_kprobes_functions(unsigned long addr)
 {
 	struct kprobe_blackpoint *kb;
@@ -664,7 +688,9 @@ int __kprobes register_kprobe(struct kprobe *p)
 		return -EINVAL;
 	}
 
-	p->flags = 0;
+	/* User can pass only KPROBE_FLAG_DISABLED to register_kprobe */
+	p->flags &= KPROBE_FLAG_DISABLED;
+
 	/*
 	 * Check if are we probing a module.
 	 */
@@ -709,7 +735,7 @@ int __kprobes register_kprobe(struct kprobe *p)
 	hlist_add_head_rcu(&p->hlist,
 		       &kprobe_table[hash_ptr(p->addr, KPROBE_HASH_BITS)]);
 
-	if (!kprobes_all_disarmed)
+	if (!kprobes_all_disarmed && !kprobe_disabled(p))
 		arch_arm_kprobe(p);
 
 out_unlock_text:
@@ -724,25 +750,37 @@ out:
 }
 EXPORT_SYMBOL_GPL(register_kprobe);
 
-/*
- * Unregister a kprobe without a scheduler synchronization.
- */
-static int __kprobes __unregister_kprobe_top(struct kprobe *p)
+/* Check passed kprobe is valid and return kprobe in kprobe_table. */
+static struct kprobe * __kprobes __get_valid_kprobe(struct kprobe *p)
 {
 	struct kprobe *old_p, *list_p;
 
 	old_p = get_kprobe(p->addr);
 	if (unlikely(!old_p))
-		return -EINVAL;
+		return NULL;
 
 	if (p != old_p) {
 		list_for_each_entry_rcu(list_p, &old_p->list, list)
 			if (list_p == p)
 			/* kprobe p is a valid probe */
-				goto valid_p;
-		return -EINVAL;
+				goto valid;
+		return NULL;
 	}
-valid_p:
+valid:
+	return old_p;
+}
+
+/*
+ * Unregister a kprobe without a scheduler synchronization.
+ */
+static int __kprobes __unregister_kprobe_top(struct kprobe *p)
+{
+	struct kprobe *old_p, *list_p;
+
+	old_p = __get_valid_kprobe(p);
+	if (old_p == NULL)
+		return -EINVAL;
+
 	if (old_p == p ||
 	    (old_p->pre_handler == aggr_pre_handler &&
 	     list_is_singular(&old_p->list))) {
@@ -751,7 +789,7 @@ valid_p:
 		 * enabled and not gone - otherwise, the breakpoint would
 		 * already have been removed. We save on flushing icache.
 		 */
-		if (!kprobes_all_disarmed && !kprobe_gone(old_p)) {
+		if (!kprobes_all_disarmed && !kprobe_disabled(old_p)) {
 			mutex_lock(&text_mutex);
 			arch_disarm_kprobe(p);
 			mutex_unlock(&text_mutex);
@@ -769,6 +807,11 @@ valid_p:
 		}
 noclean:
 		list_del_rcu(&p->list);
+		if (!kprobe_disabled(old_p)) {
+			try_to_disable_aggr_kprobe(old_p);
+			if (!kprobes_all_disarmed && kprobe_disabled(old_p))
+				arch_disarm_kprobe(old_p);
+		}
 	}
 	return 0;
 }
@@ -1078,6 +1121,7 @@ static int __kprobes pre_handler_kretprobe(struct kprobe *p,
 static void __kprobes kill_kprobe(struct kprobe *p)
 {
 	struct kprobe *kp;
+
 	p->flags |= KPROBE_FLAG_GONE;
 	if (p->pre_handler == aggr_pre_handler) {
 		/*
@@ -1219,12 +1263,18 @@ static void __kprobes report_probe(struct seq_file *pi, struct kprobe *p,
 	else
 		kprobe_type = "k";
 	if (sym)
-		seq_printf(pi, "%p  %s  %s+0x%x  %s %s\n", p->addr, kprobe_type,
-			sym, offset, (modname ? modname : " "),
-			(kprobe_gone(p) ? "[GONE]" : ""));
+		seq_printf(pi, "%p  %s  %s+0x%x  %s %s%s\n",
+			p->addr, kprobe_type, sym, offset,
+			(modname ? modname : " "),
+			(kprobe_gone(p) ? "[GONE]" : ""),
+			((kprobe_disabled(p) && !kprobe_gone(p)) ?
+			 "[DISABLED]" : ""));
 	else
-		seq_printf(pi, "%p  %s  %p %s\n", p->addr, kprobe_type, p->addr,
-			(kprobe_gone(p) ? "[GONE]" : ""));
+		seq_printf(pi, "%p  %s  %p %s%s\n",
+			p->addr, kprobe_type, p->addr,
+			(kprobe_gone(p) ? "[GONE]" : ""),
+			((kprobe_disabled(p) && !kprobe_gone(p)) ?
+			 "[DISABLED]" : ""));
 }
 
 static void __kprobes *kprobe_seq_start(struct seq_file *f, loff_t *pos)
@@ -1289,6 +1339,71 @@ static struct file_operations debugfs_kprobes_operations = {
 	.release        = seq_release,
 };
 
+/* Disable one kprobe */
+int __kprobes disable_kprobe(struct kprobe *kp)
+{
+	int ret = 0;
+	struct kprobe *p;
+
+	mutex_lock(&kprobe_mutex);
+
+	/* Check whether specified probe is valid. */
+	p = __get_valid_kprobe(kp);
+	if (unlikely(p == NULL)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	/* If the probe is already disabled (or gone), just return */
+	if (kprobe_disabled(kp))
+		goto out;
+
+	kp->flags |= KPROBE_FLAG_DISABLED;
+	if (p != kp)
+		/* When kp != p, p is always enabled. */
+		try_to_disable_aggr_kprobe(p);
+
+	if (!kprobes_all_disarmed && kprobe_disabled(p))
+		arch_disarm_kprobe(p);
+out:
+	mutex_unlock(&kprobe_mutex);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(disable_kprobe);
+
+/* Enable one kprobe */
+int __kprobes enable_kprobe(struct kprobe *kp)
+{
+	int ret = 0;
+	struct kprobe *p;
+
+	mutex_lock(&kprobe_mutex);
+
+	/* Check whether specified probe is valid. */
+	p = __get_valid_kprobe(kp);
+	if (unlikely(p == NULL)) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (kprobe_gone(kp)) {
+		/* This kprobe has gone, we couldn't enable it. */
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (!kprobes_all_disarmed && kprobe_disabled(p))
+		arch_arm_kprobe(p);
+
+	p->flags &= ~KPROBE_FLAG_DISABLED;
+	if (p != kp)
+		kp->flags &= ~KPROBE_FLAG_DISABLED;
+out:
+	mutex_unlock(&kprobe_mutex);
+	return ret;
+}
+EXPORT_SYMBOL_GPL(enable_kprobe);
+
 static void __kprobes arm_all_kprobes(void)
 {
 	struct hlist_head *head;
@@ -1306,7 +1421,7 @@ static void __kprobes arm_all_kprobes(void)
 	for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
 		head = &kprobe_table[i];
 		hlist_for_each_entry_rcu(p, node, head, hlist)
-			if (!kprobe_gone(p))
+			if (!kprobe_disabled(p))
 				arch_arm_kprobe(p);
 	}
 	mutex_unlock(&text_mutex);
@@ -1338,7 +1453,7 @@ static void __kprobes disarm_all_kprobes(void)
 	for (i = 0; i < KPROBE_TABLE_SIZE; i++) {
 		head = &kprobe_table[i];
 		hlist_for_each_entry_rcu(p, node, head, hlist) {
-			if (!arch_trampoline_kprobe(p) && !kprobe_gone(p))
+			if (!arch_trampoline_kprobe(p) && !kprobe_disabled(p))
 				arch_disarm_kprobe(p);
 		}
 	}
-- 
cgit v1.2.3