From 0a02ad9331dd4db56c29c60db2e99c4daaad8a48 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 11 Sep 2009 12:12:54 +0200
Subject: perf: Add 'perf sched' tool

This turn-key tool allows scheduler measurements to be
conducted and the results be displayed numerically.

First baby step towards that goal: clone the new command off of
perf trace.

Fix a few other details along the way:

 - add (minimal) perf trace documentation

 - reorder a few places

 - list perf trace in the mainporcelain list as well
   as it's a very useful utility.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/Documentation/perf-sched.txt |  25 +++
 tools/perf/Documentation/perf-trace.txt |  25 +++
 tools/perf/Makefile                     |   1 +
 tools/perf/builtin-sched.c              | 297 ++++++++++++++++++++++++++++++++
 tools/perf/builtin.h                    |   5 +-
 tools/perf/command-list.txt             |   2 +
 tools/perf/perf.c                       |   1 +
 7 files changed, 354 insertions(+), 2 deletions(-)
 create mode 100644 tools/perf/Documentation/perf-sched.txt
 create mode 100644 tools/perf/Documentation/perf-trace.txt
 create mode 100644 tools/perf/builtin-sched.c
diff --git a/tools/perf/Documentation/perf-sched.txt b/tools/perf/Documentation/perf-sched.txt
new file mode 100644
index 000000000000..056320eecb3a
--- /dev/null
+++ b/tools/perf/Documentation/perf-sched.txt
@@ -0,0 +1,25 @@
+perf-sched(1)
+==============
+
+NAME
+----
+perf-sched - Read perf.data (created by perf record) and display sched output
+
+SYNOPSIS
+--------
+[verse]
+'perf sched' [-i <file> | --input=file] symbol_name
+
+DESCRIPTION
+-----------
+This command reads the input file and displays the latencies recorded.
+
+OPTIONS
+-------
+-D::
+--dump-raw-trace=::
+        Display verbose dump of the sched data.
+
+SEE ALSO
+--------
+linkperf:perf-record[1]
diff --git a/tools/perf/Documentation/perf-trace.txt b/tools/perf/Documentation/perf-trace.txt
new file mode 100644
index 000000000000..41ed75398ca9
--- /dev/null
+++ b/tools/perf/Documentation/perf-trace.txt
@@ -0,0 +1,25 @@
+perf-trace(1)
+==============
+
+NAME
+----
+perf-trace - Read perf.data (created by perf record) and display trace output
+
+SYNOPSIS
+--------
+[verse]
+'perf trace' [-i <file> | --input=file] symbol_name
+
+DESCRIPTION
+-----------
+This command reads the input file and displays the trace recorded.
+
+OPTIONS
+-------
+-D::
+--dump-raw-trace=::
+        Display verbose dump of the trace data.
+
+SEE ALSO
+--------
+linkperf:perf-record[1]
diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index 9f8d207a91bf..2cb8cc3f6772 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -376,6 +376,7 @@ LIB_OBJS += util/trace-event-info.o
 
 BUILTIN_OBJS += builtin-annotate.o
 BUILTIN_OBJS += builtin-help.o
+BUILTIN_OBJS += builtin-sched.o
 BUILTIN_OBJS += builtin-list.o
 BUILTIN_OBJS += builtin-record.o
 BUILTIN_OBJS += builtin-report.o
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
new file mode 100644
index 000000000000..60228d9179ac
--- /dev/null
+++ b/tools/perf/builtin-sched.c
@@ -0,0 +1,297 @@
+#include "builtin.h"
+
+#include "util/util.h"
+#include "util/cache.h"
+#include "util/symbol.h"
+#include "util/thread.h"
+#include "util/header.h"
+
+#include "util/parse-options.h"
+
+#include "perf.h"
+#include "util/debug.h"
+
+#include "util/trace-event.h"
+
+static char		const *input_name = "perf.data";
+static int		input;
+static unsigned long	page_size;
+static unsigned long	mmap_window = 32;
+
+static unsigned long	total = 0;
+static unsigned long	total_comm = 0;
+
+static struct rb_root	threads;
+static struct thread	*last_match;
+
+static struct perf_header *header;
+static u64		sample_type;
+
+
+static int
+process_comm_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	struct thread *thread;
+
+	thread = threads__findnew(event->comm.pid, &threads, &last_match);
+
+	dump_printf("%p [%p]: PERF_EVENT_COMM: %s:%d\n",
+		(void *)(offset + head),
+		(void *)(long)(event->header.size),
+		event->comm.comm, event->comm.pid);
+
+	if (thread == NULL ||
+	    thread__set_comm(thread, event->comm.comm)) {
+		dump_printf("problem processing PERF_EVENT_COMM, skipping event.\n");
+		return -1;
+	}
+	total_comm++;
+
+	return 0;
+}
+
+static int
+process_sample_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	char level;
+	int show = 0;
+	struct dso *dso = NULL;
+	struct thread *thread;
+	u64 ip = event->ip.ip;
+	u64 timestamp = -1;
+	u32 cpu = -1;
+	u64 period = 1;
+	void *more_data = event->ip.__more_data;
+	int cpumode;
+
+	thread = threads__findnew(event->ip.pid, &threads, &last_match);
+
+	if (sample_type & PERF_SAMPLE_TIME) {
+		timestamp = *(u64 *)more_data;
+		more_data += sizeof(u64);
+	}
+
+	if (sample_type & PERF_SAMPLE_CPU) {
+		cpu = *(u32 *)more_data;
+		more_data += sizeof(u32);
+		more_data += sizeof(u32); /* reserved */
+	}
+
+	if (sample_type & PERF_SAMPLE_PERIOD) {
+		period = *(u64 *)more_data;
+		more_data += sizeof(u64);
+	}
+
+	dump_printf("%p [%p]: PERF_EVENT_SAMPLE (IP, %d): %d/%d: %p period: %Ld\n",
+		(void *)(offset + head),
+		(void *)(long)(event->header.size),
+		event->header.misc,
+		event->ip.pid, event->ip.tid,
+		(void *)(long)ip,
+		(long long)period);
+
+	dump_printf(" ... thread: %s:%d\n", thread->comm, thread->pid);
+
+	if (thread == NULL) {
+		eprintf("problem processing %d event, skipping it.\n",
+			event->header.type);
+		return -1;
+	}
+
+	cpumode = event->header.misc & PERF_EVENT_MISC_CPUMODE_MASK;
+
+	if (cpumode == PERF_EVENT_MISC_KERNEL) {
+		show = SHOW_KERNEL;
+		level = 'k';
+
+		dso = kernel_dso;
+
+		dump_printf(" ...... dso: %s\n", dso->name);
+
+	} else if (cpumode == PERF_EVENT_MISC_USER) {
+
+		show = SHOW_USER;
+		level = '.';
+
+	} else {
+		show = SHOW_HV;
+		level = 'H';
+
+		dso = hypervisor_dso;
+
+		dump_printf(" ...... dso: [hypervisor]\n");
+	}
+
+	if (sample_type & PERF_SAMPLE_RAW) {
+		struct {
+			u32 size;
+			char data[0];
+		} *raw = more_data;
+
+		/*
+		 * FIXME: better resolve from pid from the struct trace_entry
+		 * field, although it should be the same than this perf
+		 * event pid
+		 */
+		print_event(cpu, raw->data, raw->size, timestamp, thread->comm);
+	}
+	total += period;
+
+	return 0;
+}
+
+static int
+process_event(event_t *event, unsigned long offset, unsigned long head)
+{
+	trace_event(event);
+
+	switch (event->header.type) {
+	case PERF_EVENT_MMAP ... PERF_EVENT_LOST:
+		return 0;
+
+	case PERF_EVENT_COMM:
+		return process_comm_event(event, offset, head);
+
+	case PERF_EVENT_EXIT ... PERF_EVENT_READ:
+		return 0;
+
+	case PERF_EVENT_SAMPLE:
+		return process_sample_event(event, offset, head);
+
+	case PERF_EVENT_MAX:
+	default:
+		return -1;
+	}
+
+	return 0;
+}
+
+static int __cmd_sched(void)
+{
+	int ret, rc = EXIT_FAILURE;
+	unsigned long offset = 0;
+	unsigned long head = 0;
+	struct stat perf_stat;
+	event_t *event;
+	uint32_t size;
+	char *buf;
+
+	trace_report();
+	register_idle_thread(&threads, &last_match);
+
+	input = open(input_name, O_RDONLY);
+	if (input < 0) {
+		perror("failed to open file");
+		exit(-1);
+	}
+
+	ret = fstat(input, &perf_stat);
+	if (ret < 0) {
+		perror("failed to stat file");
+		exit(-1);
+	}
+
+	if (!perf_stat.st_size) {
+		fprintf(stderr, "zero-sized file, nothing to do!\n");
+		exit(0);
+	}
+	header = perf_header__read(input);
+	head = header->data_offset;
+	sample_type = perf_header__sample_type(header);
+
+	if (!(sample_type & PERF_SAMPLE_RAW))
+		die("No trace sample to read. Did you call perf record "
+		    "without -R?");
+
+	if (load_kernel() < 0) {
+		perror("failed to load kernel symbols");
+		return EXIT_FAILURE;
+	}
+
+remap:
+	buf = (char *)mmap(NULL, page_size * mmap_window, PROT_READ,
+			   MAP_SHARED, input, offset);
+	if (buf == MAP_FAILED) {
+		perror("failed to mmap file");
+		exit(-1);
+	}
+
+more:
+	event = (event_t *)(buf + head);
+
+	size = event->header.size;
+	if (!size)
+		size = 8;
+
+	if (head + event->header.size >= page_size * mmap_window) {
+		unsigned long shift = page_size * (head / page_size);
+		int res;
+
+		res = munmap(buf, page_size * mmap_window);
+		assert(res == 0);
+
+		offset += shift;
+		head -= shift;
+		goto remap;
+	}
+
+	size = event->header.size;
+
+
+	if (!size || process_event(event, offset, head) < 0) {
+
+		/*
+		 * assume we lost track of the stream, check alignment, and
+		 * increment a single u64 in the hope to catch on again 'soon'.
+		 */
+
+		if (unlikely(head & 7))
+			head &= ~7ULL;
+
+		size = 8;
+	}
+
+	head += size;
+
+	if (offset + head < (unsigned long)perf_stat.st_size)
+		goto more;
+
+	rc = EXIT_SUCCESS;
+	close(input);
+
+	return rc;
+}
+
+static const char * const annotate_usage[] = {
+	"perf trace [<options>] <command>",
+	NULL
+};
+
+static const struct option options[] = {
+	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
+		    "dump raw trace in ASCII"),
+	OPT_BOOLEAN('v', "verbose", &verbose,
+		    "be more verbose (show symbol address, etc)"),
+	OPT_END()
+};
+
+int cmd_sched(int argc, const char **argv, const char *prefix __used)
+{
+	symbol__init();
+	page_size = getpagesize();
+
+	argc = parse_options(argc, argv, options, annotate_usage, 0);
+	if (argc) {
+		/*
+		 * Special case: if there's an argument left then assume tha
+		 * it's a symbol filter:
+		 */
+		if (argc > 1)
+			usage_with_options(annotate_usage, options);
+	}
+
+
+	setup_pager();
+
+	return __cmd_sched();
+}
diff --git a/tools/perf/builtin.h b/tools/perf/builtin.h
index 3a63e41fb44e..b09cadbd76b1 100644
--- a/tools/perf/builtin.h
+++ b/tools/perf/builtin.h
@@ -16,12 +16,13 @@ extern int check_pager_config(const char *cmd);
 
 extern int cmd_annotate(int argc, const char **argv, const char *prefix);
 extern int cmd_help(int argc, const char **argv, const char *prefix);
+extern int cmd_sched(int argc, const char **argv, const char *prefix);
+extern int cmd_list(int argc, const char **argv, const char *prefix);
 extern int cmd_record(int argc, const char **argv, const char *prefix);
 extern int cmd_report(int argc, const char **argv, const char *prefix);
 extern int cmd_stat(int argc, const char **argv, const char *prefix);
 extern int cmd_top(int argc, const char **argv, const char *prefix);
-extern int cmd_version(int argc, const char **argv, const char *prefix);
-extern int cmd_list(int argc, const char **argv, const char *prefix);
 extern int cmd_trace(int argc, const char **argv, const char *prefix);
+extern int cmd_version(int argc, const char **argv, const char *prefix);
 
 #endif
diff --git a/tools/perf/command-list.txt b/tools/perf/command-list.txt
index eebce30afbc0..3133c74729dd 100644
--- a/tools/perf/command-list.txt
+++ b/tools/perf/command-list.txt
@@ -4,7 +4,9 @@
 #
 perf-annotate			mainporcelain common
 perf-list			mainporcelain common
+perf-sched			mainporcelain common
 perf-record			mainporcelain common
 perf-report			mainporcelain common
 perf-stat			mainporcelain common
 perf-top			mainporcelain common
+perf-trace			mainporcelain common
diff --git a/tools/perf/perf.c b/tools/perf/perf.c
index fe4589dde950..c972d1c35489 100644
--- a/tools/perf/perf.c
+++ b/tools/perf/perf.c
@@ -293,6 +293,7 @@ static void handle_internal_command(int argc, const char **argv)
 		{ "annotate", cmd_annotate, 0 },
 		{ "version", cmd_version, 0 },
 		{ "trace", cmd_trace, 0 },
+		{ "sched", cmd_sched, 0 },
 	};
 	unsigned int i;
 	static const char ext[] = STRIP_EXTENSION;
-- 
cgit v1.2.3


From ec156764d424dd67283c2cd5e9f6f1b8388364ac Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 11 Sep 2009 12:12:54 +0200
Subject: perf sched: Import schedbench.c

Import the schedbench.c tool that i wrote some time ago to
simulate scheduler behavior but never finished. It's a good
basis for perf sched nevertheless.

Most of its guts are not hooked up to the perf event loop
yet - that will be done in the patches to come.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-sched.c          | 842 ++++++++++++++++++++++++++++++++++--
 tools/perf/util/trace-event-parse.c |  16 +-
 tools/perf/util/trace-event.h       |   2 +
 3 files changed, 827 insertions(+), 33 deletions(-)

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 60228d9179ac..c66e6a321371 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -12,22 +12,770 @@
 #include "util/debug.h"
 
 #include "util/trace-event.h"
+#include <sys/types.h>
 
-static char		const *input_name = "perf.data";
-static int		input;
-static unsigned long	page_size;
-static unsigned long	mmap_window = 32;
+static char			const *input_name = "perf.data";
+static int			input;
+static unsigned long		page_size;
+static unsigned long		mmap_window = 32;
 
-static unsigned long	total = 0;
-static unsigned long	total_comm = 0;
+static unsigned long		total_comm = 0;
 
-static struct rb_root	threads;
-static struct thread	*last_match;
+static struct rb_root		threads;
+static struct thread		*last_match;
 
-static struct perf_header *header;
-static u64		sample_type;
+static struct perf_header	*header;
+static u64			sample_type;
 
 
+/*
+ * Scheduler benchmarks
+ */
+#include <sys/resource.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/time.h>
+#include <sys/prctl.h>
+
+#include <linux/unistd.h>
+
+#include <semaphore.h>
+#include <pthread.h>
+#include <signal.h>
+#include <values.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <fcntl.h>
+#include <time.h>
+#include <math.h>
+
+#include <stdio.h>
+
+#define PR_SET_NAME	15               /* Set process name */
+
+#define BUG_ON(x)	assert(!(x))
+
+#define DEBUG		1
+
+typedef unsigned long long nsec_t;
+
+#define printk(x...)		do { printf(x); fflush(stdout); } while (0)
+
+nsec_t prev_printk;
+
+#define __dprintk(x,y...) do {						 \
+	nsec_t __now = get_nsecs(), __delta = __now - prev_printk;	 \
+									 \
+	prev_printk = __now;						 \
+									 \
+	printf("%.3f [%Ld] [%.3f]: " x, (double)__now/1e6, __now, (double)__delta/1e6, y);\
+} while (0)
+
+#if !DEBUG
+# define dprintk(x...)	do { } while (0)
+#else
+# define dprintk(x...)	__dprintk(x)
+#endif
+
+#define __DP()		__dprintk("parent: line %d\n", __LINE__)
+#define DP()		dprintk("parent: line %d\n", __LINE__)
+#define D()		dprintk("task %ld: line %d\n", this_task->nr, __LINE__)
+
+
+static nsec_t run_measurement_overhead;
+static nsec_t sleep_measurement_overhead;
+
+static nsec_t get_nsecs(void)
+{
+	struct timespec ts;
+
+	clock_gettime(CLOCK_MONOTONIC, &ts);
+
+	return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
+}
+
+static void burn_nsecs(nsec_t nsecs)
+{
+	nsec_t T0 = get_nsecs(), T1;
+
+	do {
+		T1 = get_nsecs();
+	} while (T1 + run_measurement_overhead < T0 + nsecs);
+}
+
+static void sleep_nsecs(nsec_t nsecs)
+{
+	struct timespec ts;
+
+	ts.tv_nsec = nsecs % 999999999;
+	ts.tv_sec = nsecs / 999999999;
+
+	nanosleep(&ts, NULL);
+}
+
+static void calibrate_run_measurement_overhead(void)
+{
+	nsec_t T0, T1, delta, min_delta = 1000000000ULL;
+	int i;
+
+	for (i = 0; i < 10; i++) {
+		T0 = get_nsecs();
+		burn_nsecs(0);
+		T1 = get_nsecs();
+		delta = T1-T0;
+		min_delta = min(min_delta, delta);
+	}
+	run_measurement_overhead = min_delta;
+
+	printk("run measurement overhead: %Ld nsecs\n", min_delta);
+}
+
+static void calibrate_sleep_measurement_overhead(void)
+{
+	nsec_t T0, T1, delta, min_delta = 1000000000ULL;
+	int i;
+
+	for (i = 0; i < 10; i++) {
+		T0 = get_nsecs();
+		sleep_nsecs(10000);
+		T1 = get_nsecs();
+		delta = T1-T0;
+		min_delta = min(min_delta, delta);
+	}
+	min_delta -= 10000;
+	sleep_measurement_overhead = min_delta;
+
+	printk("sleep measurement overhead: %Ld nsecs\n", min_delta);
+}
+
+#define COMM_LEN	20
+#define SYM_LEN		129
+
+#define MAX_PID		65536
+
+static unsigned long nr_tasks;
+
+struct sched_event;
+
+struct task_desc {
+	unsigned long		nr;
+	unsigned long		pid;
+	char			comm[COMM_LEN];
+
+	unsigned long		nr_events;
+	unsigned long		curr_event;
+	struct sched_event	**events;
+
+	pthread_t		thread;
+	sem_t			sleep_sem;
+
+	sem_t			ready_for_work;
+	sem_t			work_done_sem;
+
+	nsec_t			cpu_usage;
+};
+
+enum sched_event_type {
+	SCHED_EVENT_RUN,
+	SCHED_EVENT_SLEEP,
+	SCHED_EVENT_WAKEUP,
+};
+
+struct sched_event {
+	enum sched_event_type	type;
+	nsec_t			timestamp;
+	nsec_t			duration;
+	unsigned long		nr;
+	int			specific_wait;
+	sem_t			*wait_sem;
+	struct task_desc	*wakee;
+};
+
+static struct task_desc		*pid_to_task[MAX_PID];
+
+static struct task_desc		**tasks;
+
+static pthread_mutex_t		start_work_mutex = PTHREAD_MUTEX_INITIALIZER;
+static nsec_t			start_time;
+
+static pthread_mutex_t		work_done_wait_mutex = PTHREAD_MUTEX_INITIALIZER;
+
+static unsigned long		nr_run_events;
+static unsigned long		nr_sleep_events;
+static unsigned long		nr_wakeup_events;
+
+static unsigned long		nr_sleep_corrections;
+static unsigned long		nr_run_events_optimized;
+
+static struct sched_event *
+get_new_event(struct task_desc *task, nsec_t timestamp)
+{
+	struct sched_event *event = calloc(1, sizeof(*event));
+	unsigned long idx = task->nr_events;
+	size_t size;
+
+	event->timestamp = timestamp;
+	event->nr = idx;
+
+	task->nr_events++;
+	size = sizeof(struct sched_event *) * task->nr_events;
+	task->events = realloc(task->events, size);
+	BUG_ON(!task->events);
+
+	task->events[idx] = event;
+
+	return event;
+}
+
+static struct sched_event *last_event(struct task_desc *task)
+{
+	if (!task->nr_events)
+		return NULL;
+
+	return task->events[task->nr_events - 1];
+}
+
+static void
+add_sched_event_run(struct task_desc *task, nsec_t timestamp,
+		    unsigned long duration)
+{
+	struct sched_event *event, *curr_event = last_event(task);
+
+	/*
+ 	 * optimize an existing RUN event by merging this one
+ 	 * to it:
+ 	 */
+	if (curr_event && curr_event->type == SCHED_EVENT_RUN) {
+		nr_run_events_optimized++;
+		curr_event->duration += duration;
+		return;
+	}
+
+	event = get_new_event(task, timestamp);
+
+	event->type = SCHED_EVENT_RUN;
+	event->duration = duration;
+
+	nr_run_events++;
+}
+
+static unsigned long targetless_wakeups;
+static unsigned long multitarget_wakeups;
+
+static void
+add_sched_event_wakeup(struct task_desc *task, nsec_t timestamp,
+		       struct task_desc *wakee)
+{
+	struct sched_event *event, *wakee_event;
+
+	event = get_new_event(task, timestamp);
+	event->type = SCHED_EVENT_WAKEUP;
+	event->wakee = wakee;
+
+	wakee_event = last_event(wakee);
+	if (!wakee_event || wakee_event->type != SCHED_EVENT_SLEEP) {
+		targetless_wakeups++;
+		return;
+	}
+	if (wakee_event->wait_sem) {
+		multitarget_wakeups++;
+		return;
+	}
+
+	wakee_event->wait_sem = calloc(1, sizeof(*wakee_event->wait_sem));
+	sem_init(wakee_event->wait_sem, 0, 0);
+	wakee_event->specific_wait = 1;
+	event->wait_sem = wakee_event->wait_sem;
+
+	nr_wakeup_events++;
+}
+
+static void
+add_sched_event_sleep(struct task_desc *task, nsec_t timestamp,
+		      unsigned long uninterruptible __used)
+{
+	struct sched_event *event = get_new_event(task, timestamp);
+
+	event->type = SCHED_EVENT_SLEEP;
+
+	nr_sleep_events++;
+}
+
+static struct task_desc *register_pid(unsigned long pid, const char *comm)
+{
+	struct task_desc *task;
+
+	BUG_ON(pid >= MAX_PID);
+
+	task = pid_to_task[pid];
+
+	if (task)
+		return task;
+
+	task = calloc(1, sizeof(*task));
+	task->pid = pid;
+	task->nr = nr_tasks;
+	strcpy(task->comm, comm);
+	/*
+	 * every task starts in sleeping state - this gets ignored
+	 * if there's no wakeup pointing to this sleep state:
+	 */
+	add_sched_event_sleep(task, 0, 0);
+
+	pid_to_task[pid] = task;
+	nr_tasks++;
+	tasks = realloc(tasks, nr_tasks*sizeof(struct task_task *));
+	BUG_ON(!tasks);
+	tasks[task->nr] = task;
+
+	printk("registered task #%ld, PID %ld (%s)\n", nr_tasks, pid, comm);
+
+	return task;
+}
+
+
+static int first_trace_line = 1;
+
+static nsec_t first_timestamp;
+static nsec_t prev_timestamp;
+
+void parse_line(char *line);
+
+void parse_line(char *line)
+{
+	unsigned long param1 = 0, param2 = 0;
+	char comm[COMM_LEN], comm2[COMM_LEN];
+	unsigned long pid, pid2, timestamp0;
+	struct task_desc *task, *task2;
+	char func_str[SYM_LEN];
+	nsec_t timestamp;
+	int ret;
+
+	//"   <idle> 0     0D.s3    0us+: try_to_wake_up <events/0 9> (1 0)"
+	ret = sscanf(line, "%20s %5ld %*s %ldus%*c:"
+			   " %128s <%20s %ld> (%ld %ld)\n",
+		comm, &pid, &timestamp0,
+		func_str, comm2, &pid2, &param1, &param2);
+	dprintk("ret: %d\n", ret);
+	if (ret != 8)
+		return;
+
+	timestamp = timestamp0 * 1000LL;
+
+	if (first_trace_line) {
+		first_trace_line = 0;
+		first_timestamp = timestamp;
+	}
+
+	timestamp -= first_timestamp;
+	BUG_ON(timestamp < prev_timestamp);
+	prev_timestamp = timestamp;
+
+	dprintk("parsed: %s - %ld %Ld: %s - <%s %ld> (%ld %ld)\n",
+		comm,
+		pid,
+		timestamp, 
+		func_str,
+		comm2,
+		pid2,
+		param1,
+		param2);
+
+	task = register_pid(pid, comm);
+	task2 = register_pid(pid2, comm2);
+
+	if (!strcmp(func_str, "update_curr")) {
+		dprintk("%Ld: task %ld runs for %ld nsecs\n",
+			timestamp, task->nr, param1);
+		add_sched_event_run(task, timestamp, param1);
+	} else if (!strcmp(func_str, "try_to_wake_up")) {
+		dprintk("%Ld: task %ld wakes up task %ld\n",
+			timestamp, task->nr, task2->nr);
+		add_sched_event_wakeup(task, timestamp, task2);
+	} else if (!strcmp(func_str, "deactivate_task")) {
+		dprintk("%Ld: task %ld goes to sleep (uninterruptible: %ld)\n",
+			timestamp, task->nr, param1);
+		add_sched_event_sleep(task, timestamp, param1);
+	}
+}
+
+static void print_task_traces(void)
+{
+	struct task_desc *task;
+	unsigned long i;
+
+	for (i = 0; i < nr_tasks; i++) {
+		task = tasks[i];
+		printk("task %6ld (%20s:%10ld), nr_events: %ld\n",
+			task->nr, task->comm, task->pid, task->nr_events);
+	}
+}
+
+static void add_cross_task_wakeups(void)
+{
+	struct task_desc *task1, *task2;
+	unsigned long i, j;
+
+	for (i = 0; i < nr_tasks; i++) {
+		task1 = tasks[i];
+		j = i + 1;
+		if (j == nr_tasks)
+			j = 0;
+		task2 = tasks[j];
+		add_sched_event_wakeup(task1, 0, task2);
+	}
+}
+
+static void
+process_sched_event(struct task_desc *this_task, struct sched_event *event)
+{
+	int ret = 0;
+	nsec_t now;
+	long long delta;
+
+	now = get_nsecs();
+	delta = start_time + event->timestamp - now;
+
+	dprintk("task %ld, event #%ld, %Ld, delta: %.3f (%Ld)\n",
+		this_task->nr, event->nr, event->timestamp,
+		(double)delta/1e6, delta);
+
+	if (0 && delta > 0) {
+		dprintk("%.3f: task %ld FIX %.3f\n",
+			(double)event->timestamp/1e6,
+			this_task->nr,
+			(double)delta/1e6);
+		sleep_nsecs(start_time + event->timestamp - now);
+		nr_sleep_corrections++;
+	}
+
+	switch (event->type) {
+		case SCHED_EVENT_RUN:
+			dprintk("%.3f: task %ld RUN for %.3f\n",
+				(double)event->timestamp/1e6,
+				this_task->nr,
+				(double)event->duration/1e6);
+			burn_nsecs(event->duration);
+			break;
+		case SCHED_EVENT_SLEEP:
+			dprintk("%.3f: task %ld %s SLEEP\n",
+				(double)event->timestamp/1e6,
+				this_task->nr, event->wait_sem ? "" : "SKIP");
+			if (event->wait_sem)
+				ret = sem_wait(event->wait_sem);
+			BUG_ON(ret);
+			break;
+		case SCHED_EVENT_WAKEUP:
+			dprintk("%.3f: task %ld WAKEUP => task %ld\n",
+				(double)event->timestamp/1e6,
+				this_task->nr,
+				event->wakee->nr);
+			if (event->wait_sem)
+				ret = sem_post(event->wait_sem);
+			BUG_ON(ret);
+			break;
+		default:
+			BUG_ON(1);
+	}
+}
+
+static nsec_t get_cpu_usage_nsec_parent(void)
+{
+	struct rusage ru;
+	nsec_t sum;
+	int err;
+
+	err = getrusage(RUSAGE_SELF, &ru);
+	BUG_ON(err);
+
+	sum =  ru.ru_utime.tv_sec*1e9 + ru.ru_utime.tv_usec*1e3;
+	sum += ru.ru_stime.tv_sec*1e9 + ru.ru_stime.tv_usec*1e3;
+
+	return sum;
+}
+
+static nsec_t get_cpu_usage_nsec_self(void)
+{
+	char filename [] = "/proc/1234567890/sched";
+	unsigned long msecs, nsecs;
+	char *line = NULL;
+	nsec_t total = 0;
+	size_t len = 0;
+	ssize_t chars;
+	FILE *file;
+	int ret;
+
+	sprintf(filename, "/proc/%d/sched", getpid());
+	file = fopen(filename, "r");
+	BUG_ON(!file);
+
+	while ((chars = getline(&line, &len, file)) != -1) {
+		dprintk("got line with length %zu :\n", chars);
+		dprintk("%s", line);
+		ret = sscanf(line, "se.sum_exec_runtime : %ld.%06ld\n",
+			&msecs, &nsecs);
+		if (ret == 2) {
+			total = msecs*1e6 + nsecs;
+			dprintk("total: (%ld.%06ld) %Ld\n",
+				msecs, nsecs, total);
+			break;
+		}
+	}
+	if (line)
+		free(line);
+	fclose(file);
+
+	return total;
+}
+
+static void *thread_func(void *ctx)
+{
+	struct task_desc *this_task = ctx;
+	nsec_t cpu_usage_0, cpu_usage_1;
+	unsigned long i, ret;
+	char comm2[22];
+
+	dprintk("task %ld started up.\n", this_task->nr);
+	sprintf(comm2, ":%s", this_task->comm);
+	prctl(PR_SET_NAME, comm2);
+
+again:
+	ret = sem_post(&this_task->ready_for_work);
+	BUG_ON(ret);
+	D();
+	ret = pthread_mutex_lock(&start_work_mutex);
+	BUG_ON(ret);
+	ret = pthread_mutex_unlock(&start_work_mutex);
+	BUG_ON(ret);
+	D();
+
+	cpu_usage_0 = get_cpu_usage_nsec_self();
+
+	for (i = 0; i < this_task->nr_events; i++) {
+		this_task->curr_event = i;
+		process_sched_event(this_task, this_task->events[i]);
+	}
+
+	cpu_usage_1 = get_cpu_usage_nsec_self();
+	this_task->cpu_usage = cpu_usage_1 - cpu_usage_0;
+
+	dprintk("task %ld cpu usage: %0.3f msecs\n",
+		this_task->nr, (double)this_task->cpu_usage / 1e6);
+
+	D();
+	ret = sem_post(&this_task->work_done_sem);
+	BUG_ON(ret);
+	D();
+
+	ret = pthread_mutex_lock(&work_done_wait_mutex);
+	BUG_ON(ret);
+	ret = pthread_mutex_unlock(&work_done_wait_mutex);
+	BUG_ON(ret);
+	D();
+
+	goto again;
+}
+
+static void create_tasks(void)
+{
+	struct task_desc *task;
+	pthread_attr_t attr;
+	unsigned long i;
+	int err;
+
+	err = pthread_attr_init(&attr);
+	BUG_ON(err);
+	err = pthread_attr_setstacksize(&attr, (size_t)(16*1024));
+	BUG_ON(err);
+	err = pthread_mutex_lock(&start_work_mutex);
+	BUG_ON(err);
+	err = pthread_mutex_lock(&work_done_wait_mutex);
+	BUG_ON(err);
+	for (i = 0; i < nr_tasks; i++) {
+		task = tasks[i];
+		sem_init(&task->sleep_sem, 0, 0);
+		sem_init(&task->ready_for_work, 0, 0);
+		sem_init(&task->work_done_sem, 0, 0);
+		task->curr_event = 0;
+		err = pthread_create(&task->thread, &attr, thread_func, task);
+		BUG_ON(err);
+	}
+}
+
+static nsec_t cpu_usage;
+static nsec_t runavg_cpu_usage;
+static nsec_t parent_cpu_usage;
+static nsec_t runavg_parent_cpu_usage;
+
+static void wait_for_tasks(void)
+{
+	nsec_t cpu_usage_0, cpu_usage_1;
+	struct task_desc *task;
+	unsigned long i, ret;
+
+	DP();
+	start_time = get_nsecs();
+	DP();
+	cpu_usage = 0;
+	pthread_mutex_unlock(&work_done_wait_mutex);
+
+	for (i = 0; i < nr_tasks; i++) {
+		task = tasks[i];
+		ret = sem_wait(&task->ready_for_work);
+		BUG_ON(ret);
+		sem_init(&task->ready_for_work, 0, 0);
+	}
+	ret = pthread_mutex_lock(&work_done_wait_mutex);
+	BUG_ON(ret);
+
+	cpu_usage_0 = get_cpu_usage_nsec_parent();
+
+	pthread_mutex_unlock(&start_work_mutex);
+
+#if 0
+	for (i = 0; i < nr_tasks; i++) {
+		unsigned long missed;
+
+		task = tasks[i];
+		while (task->curr_event + 1 < task->nr_events) {
+			dprintk("parent waiting for %ld (%ld != %ld)\n",
+				i, task->curr_event, task->nr_events);
+			sleep_nsecs(100000000);
+		}
+		missed = task->nr_events - 1 - task->curr_event;
+		if (missed)
+			printk("task %ld missed events: %ld\n", i, missed);
+		ret = sem_post(&task->sleep_sem);
+		BUG_ON(ret);
+	}
+#endif
+	DP();
+	for (i = 0; i < nr_tasks; i++) {
+		task = tasks[i];
+		ret = sem_wait(&task->work_done_sem);
+		BUG_ON(ret);
+		sem_init(&task->work_done_sem, 0, 0);
+		cpu_usage += task->cpu_usage;
+		task->cpu_usage = 0;
+	}
+
+	cpu_usage_1 = get_cpu_usage_nsec_parent();
+	if (!runavg_cpu_usage)
+		runavg_cpu_usage = cpu_usage;
+	runavg_cpu_usage = (runavg_cpu_usage*9 + cpu_usage)/10;
+
+	parent_cpu_usage = cpu_usage_1 - cpu_usage_0;
+	if (!runavg_parent_cpu_usage)
+		runavg_parent_cpu_usage = parent_cpu_usage;
+	runavg_parent_cpu_usage = (runavg_parent_cpu_usage*9 +
+				   parent_cpu_usage)/10;
+
+	ret = pthread_mutex_lock(&start_work_mutex);
+	BUG_ON(ret);
+
+	for (i = 0; i < nr_tasks; i++) {
+		task = tasks[i];
+		sem_init(&task->sleep_sem, 0, 0);
+		task->curr_event = 0;
+	}
+}
+
+static int __cmd_sched(void);
+
+static void parse_trace(void)
+{
+	__cmd_sched();
+
+	printk("nr_run_events:        %ld\n", nr_run_events);
+	printk("nr_sleep_events:      %ld\n", nr_sleep_events);
+	printk("nr_wakeup_events:     %ld\n", nr_wakeup_events);
+
+	if (targetless_wakeups)
+		printk("target-less wakeups:  %ld\n", targetless_wakeups);
+	if (multitarget_wakeups)
+		printk("multi-target wakeups: %ld\n", multitarget_wakeups);
+	if (nr_run_events_optimized)
+		printk("run events optimized: %ld\n",
+			nr_run_events_optimized);
+}
+
+static unsigned long nr_runs;
+static nsec_t sum_runtime;
+static nsec_t sum_fluct;
+static nsec_t run_avg;
+
+static void run_one_test(void)
+{
+	nsec_t T0, T1, delta, avg_delta, fluct, std_dev;
+
+	T0 = get_nsecs();
+	wait_for_tasks();
+	T1 = get_nsecs();
+
+	delta = T1 - T0;
+	sum_runtime += delta;
+	nr_runs++;
+
+	avg_delta = sum_runtime / nr_runs;
+	if (delta < avg_delta)
+		fluct = avg_delta - delta;
+	else
+		fluct = delta - avg_delta;
+	sum_fluct += fluct;
+	std_dev = sum_fluct / nr_runs / sqrt(nr_runs);
+	if (!run_avg)
+		run_avg = delta;
+	run_avg = (run_avg*9 + delta)/10;
+
+	printk("#%-3ld: %0.3f, ",
+		nr_runs, (double)delta/1000000.0);
+
+#if 0
+	printk("%0.2f +- %0.2f, ",
+		(double)avg_delta/1e6, (double)std_dev/1e6);
+#endif
+	printk("ravg: %0.2f, ",
+		(double)run_avg/1e6);
+
+	printk("cpu: %0.2f / %0.2f",
+		(double)cpu_usage/1e6, (double)runavg_cpu_usage/1e6);
+
+#if 0
+	/*
+ 	 * rusage statistics done by the parent, these are less
+ 	 * accurate than the sum_exec_runtime based statistics:
+ 	 */
+	printk(" [%0.2f / %0.2f]",
+		(double)parent_cpu_usage/1e6,
+		(double)runavg_parent_cpu_usage/1e6);
+#endif
+
+	printk("\n");
+
+	if (nr_sleep_corrections)
+		printk(" (%ld sleep corrections)\n", nr_sleep_corrections);
+	nr_sleep_corrections = 0;
+}
+
+static void test_calibrations(void)
+{
+	nsec_t T0, T1;
+
+	T0 = get_nsecs();
+	burn_nsecs(1e6);
+	T1 = get_nsecs();
+
+	printk("the run test took %Ld nsecs\n", T1-T0);
+
+	T0 = get_nsecs();
+	sleep_nsecs(1e6);
+	T1 = get_nsecs();
+
+	printk("the sleep test took %Ld nsecs\n", T1-T0);
+}
+
 static int
 process_comm_event(event_t *event, unsigned long offset, unsigned long head)
 {
@@ -50,6 +798,46 @@ process_comm_event(event_t *event, unsigned long offset, unsigned long head)
 	return 0;
 }
 
+static void process_sched_wakeup_event(struct event *event,
+		  int cpu __used, u64 timestamp __used, struct thread *thread __used)
+{
+	printf("sched_wakeup event %p\n", event);
+}
+
+static void process_sched_switch_event(struct event *event,
+		  int cpu __used, u64 timestamp __used, struct thread *thread __used)
+{
+	printf("sched_switch event %p\n", event);
+}
+
+static void
+process_raw_event(event_t *raw_event, void *more_data,
+		  int cpu, u64 timestamp, struct thread *thread)
+{
+	struct {
+		u32 size;
+		char data[0];
+	} *raw = more_data;
+	struct event *event;
+	int type;
+
+	type = trace_parse_common_type(raw->data);
+	event = trace_find_event(type);
+
+	/*
+	 * FIXME: better resolve from pid from the struct trace_entry
+	 * field, although it should be the same than this perf
+	 * event pid
+	 */
+	printf("id %d, type: %d, event: %s\n",
+		raw_event->header.type, type, event->name);
+
+	if (!strcmp(event->name, "sched_switch"))
+		process_sched_switch_event(event, cpu, timestamp, thread);
+	if (!strcmp(event->name, "sched_wakeup"))
+		process_sched_wakeup_event(event, cpu, timestamp, thread);
+}
+
 static int
 process_sample_event(event_t *event, unsigned long offset, unsigned long head)
 {
@@ -122,20 +910,8 @@ process_sample_event(event_t *event, unsigned long offset, unsigned long head)
 		dump_printf(" ...... dso: [hypervisor]\n");
 	}
 
-	if (sample_type & PERF_SAMPLE_RAW) {
-		struct {
-			u32 size;
-			char data[0];
-		} *raw = more_data;
-
-		/*
-		 * FIXME: better resolve from pid from the struct trace_entry
-		 * field, although it should be the same than this perf
-		 * event pid
-		 */
-		print_event(cpu, raw->data, raw->size, timestamp, thread->comm);
-	}
-	total += period;
+	if (sample_type & PERF_SAMPLE_RAW)
+		process_raw_event(event, more_data, cpu, timestamp, thread);
 
 	return 0;
 }
@@ -277,6 +1053,8 @@ static const struct option options[] = {
 
 int cmd_sched(int argc, const char **argv, const char *prefix __used)
 {
+	long nr_iterations = LONG_MAX, i;
+
 	symbol__init();
 	page_size = getpagesize();
 
@@ -293,5 +1071,19 @@ int cmd_sched(int argc, const char **argv, const char *prefix __used)
 
 	setup_pager();
 
-	return __cmd_sched();
+	calibrate_run_measurement_overhead();
+	calibrate_sleep_measurement_overhead();
+
+	test_calibrations();
+
+	parse_trace();
+	print_task_traces();
+	add_cross_task_wakeups();
+
+	create_tasks();
+	printk("------------------------------------------------------------\n");
+	for (i = 0; i < nr_iterations; i++)
+		run_one_test();
+
+	return 0;
 }
diff --git a/tools/perf/util/trace-event-parse.c b/tools/perf/util/trace-event-parse.c
index 629e602d9405..16cf2d51c4e1 100644
--- a/tools/perf/util/trace-event-parse.c
+++ b/tools/perf/util/trace-event-parse.c
@@ -1799,7 +1799,7 @@ static int get_common_info(const char *type, int *offset, int *size)
 	return 0;
 }
 
-static int parse_common_type(void *data)
+int trace_parse_common_type(void *data)
 {
 	static int type_offset;
 	static int type_size;
@@ -1832,7 +1832,7 @@ static int parse_common_pid(void *data)
 	return read_size(data + pid_offset, pid_size);
 }
 
-static struct event *find_event(int id)
+struct event *trace_find_event(int id)
 {
 	struct event *event;
 
@@ -2420,8 +2420,8 @@ get_return_for_leaf(int cpu, int cur_pid, unsigned long long cur_func,
 	int type;
 	int pid;
 
-	type = parse_common_type(next->data);
-	event = find_event(type);
+	type = trace_parse_common_type(next->data);
+	event = trace_find_event(type);
 	if (!event)
 		return NULL;
 
@@ -2502,8 +2502,8 @@ print_graph_entry_leaf(struct event *event, void *data, struct record *ret_rec)
 	int type;
 	int i;
 
-	type = parse_common_type(ret_rec->data);
-	ret_event = find_event(type);
+	type = trace_parse_common_type(ret_rec->data);
+	ret_event = trace_find_event(type);
 
 	field = find_field(ret_event, "rettime");
 	if (!field)
@@ -2696,9 +2696,9 @@ void print_event(int cpu, void *data, int size, unsigned long long nsecs,
 	nsecs -= secs * NSECS_PER_SEC;
 	usecs = nsecs / NSECS_PER_USEC;
 
-	type = parse_common_type(data);
+	type = trace_parse_common_type(data);
 
-	event = find_event(type);
+	event = trace_find_event(type);
 	if (!event)
 		die("ug! no event found for type %d", type);
 
diff --git a/tools/perf/util/trace-event.h b/tools/perf/util/trace-event.h
index 420294a5773e..bc81612fd244 100644
--- a/tools/perf/util/trace-event.h
+++ b/tools/perf/util/trace-event.h
@@ -234,6 +234,8 @@ extern int header_page_data_offset;
 extern int header_page_data_size;
 
 int parse_header_page(char *buf, unsigned long size);
+int trace_parse_common_type(void *data);
+struct event *trace_find_event(int id);
 
 void read_tracing_data(struct perf_counter_attr *pattrs, int nb_counters);
 
-- 
cgit v1.2.3


From fbf9482911825f965829567aea8acff3bbc5279c Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 11 Sep 2009 12:12:54 +0200
Subject: perf sched: Implement the scheduling workload replay engine

Integrate the schedbench.c bits with the raw trace events
that we get from the perf machinery, and activate the
workload replayer/simulator.

Example of a captured 'make -j' workload:

$ perf sched

  run measurement overhead: 90 nsecs
  sleep measurement overhead: 2724743 nsecs
  the run test took 1000081 nsecs
  the sleep test took 2981111 nsecs
  version = 0.5
  ...
  nr_run_events:        70
  nr_sleep_events:      66
  nr_wakeup_events:     9
  target-less wakeups:  71
  multi-target wakeups: 47
  run events optimized: 139
  task      0 (                perf:      6607), nr_events: 2
  task      1 (                perf:      6608), nr_events: 6
  task      2 (                    :         0), nr_events: 1
  task      3 (                make:      6609), nr_events: 5
  task      4 (                  sh:      6610), nr_events: 4
  task      5 (                make:      6611), nr_events: 6
  task      6 (                  sh:      6612), nr_events: 4
  task      7 (                make:      6613), nr_events: 5
  task      8 (        migration/11:        25), nr_events: 1
  task      9 (        migration/13:        29), nr_events: 1
  task     10 (        migration/15:        33), nr_events: 1
  task     11 (         migration/9:        21), nr_events: 1
  task     12 (                  sh:      6614), nr_events: 4
  task     13 (                make:      6615), nr_events: 5
  task     14 (                  sh:      6616), nr_events: 4
  task     15 (                make:      6617), nr_events: 7
  task     16 (         migration/3:         9), nr_events: 1
  task     17 (         migration/5:        13), nr_events: 1
  task     18 (         migration/7:        17), nr_events: 1
  task     19 (         migration/1:         5), nr_events: 1
  task     20 (                  sh:      6618), nr_events: 4
  task     21 (                make:      6619), nr_events: 5
  task     22 (                  sh:      6620), nr_events: 4
  task     23 (                make:      6621), nr_events: 10
  task     24 (                  sh:      6623), nr_events: 3
  task     25 (                 gcc:      6624), nr_events: 4
  task     26 (                 gcc:      6625), nr_events: 4
  task     27 (                 gcc:      6626), nr_events: 5
  task     28 (            collect2:      6627), nr_events: 5
  task     29 (                  sh:      6622), nr_events: 1
  task     30 (                make:      6628), nr_events: 7
  task     31 (                  sh:      6630), nr_events: 4
  task     32 (                 gcc:      6631), nr_events: 4
  task     33 (                  sh:      6629), nr_events: 1
  task     34 (                 gcc:      6632), nr_events: 4
  task     35 (                 gcc:      6633), nr_events: 4
  task     36 (            collect2:      6634), nr_events: 4
  task     37 (                make:      6635), nr_events: 8
  task     38 (                  sh:      6637), nr_events: 4
  task     39 (                  sh:      6636), nr_events: 1
  task     40 (                 gcc:      6638), nr_events: 4
  task     41 (                 gcc:      6639), nr_events: 4
  task     42 (                 gcc:      6640), nr_events: 4
  task     43 (            collect2:      6641), nr_events: 4
  task     44 (                make:      6642), nr_events: 6
  task     45 (                  sh:      6643), nr_events: 5
  task     46 (                  sh:      6644), nr_events: 3
  task     47 (                  sh:      6645), nr_events: 4
  task     48 (                make:      6646), nr_events: 6
  task     49 (                  sh:      6647), nr_events: 3
  task     50 (                make:      6648), nr_events: 5
  task     51 (                  sh:      6649), nr_events: 5
  task     52 (                  sh:      6650), nr_events: 6
  task     53 (                make:      6651), nr_events: 4
  task     54 (                make:      6652), nr_events: 5
  task     55 (                make:      6653), nr_events: 4
  task     56 (                make:      6654), nr_events: 4
  task     57 (                make:      6655), nr_events: 5
  task     58 (                  sh:      6656), nr_events: 4
  task     59 (                 gcc:      6657), nr_events: 9
  task     60 (         ksoftirqd/3:        10), nr_events: 1
  task     61 (                 gcc:      6658), nr_events: 4
  task     62 (                make:      6659), nr_events: 5
  task     63 (                  sh:      6660), nr_events: 3
  task     64 (                 gcc:      6661), nr_events: 5
  task     65 (            collect2:      6662), nr_events: 4
  ------------------------------------------------------------
  #1  : 256.745, ravg: 256.74, cpu: 0.00 / 0.00
  #2  : 439.372, ravg: 275.01, cpu: 0.00 / 0.00
  #3  : 411.971, ravg: 288.70, cpu: 0.00 / 0.00
  #4  : 385.500, ravg: 298.38, cpu: 0.00 / 0.00
  #5  : 366.526, ravg: 305.20, cpu: 0.00 / 0.00
  #6  : 381.281, ravg: 312.81, cpu: 0.00 / 0.00
  #7  : 410.756, ravg: 322.60, cpu: 0.00 / 0.00
  #8  : 368.009, ravg: 327.14, cpu: 0.00 / 0.00
  #9  : 408.098, ravg: 335.24, cpu: 0.00 / 0.00
  #10 : 368.582, ravg: 338.57, cpu: 0.00 / 0.00

I.e. we successfully analyzed the trace, replayed it
via real threads and measured the replayed workload's
scheduling properties.

This is how it looked like in 'top' output:

   PID USER      PR  NI  VIRT  RES  SHR S %CPU %MEM    TIME+  COMMAND
  7164 mingo     20   0 1434m 8080  888 R 57.0  0.1   0:02.04 :perf
  7165 mingo     20   0 1434m 8080  888 R 41.8  0.1   0:01.52 :perf
  7228 mingo     20   0 1434m 8080  888 R 39.8  0.1   0:01.44 :gcc
  7225 mingo     20   0 1434m 8080  888 R 33.8  0.1   0:01.26 :gcc
  7202 mingo     20   0 1434m 8080  888 R 31.2  0.1   0:01.16 :sh
  7222 mingo     20   0 1434m 8080  888 R 25.2  0.1   0:00.96 :sh
  7211 mingo     20   0 1434m 8080  888 R 21.9  0.1   0:00.82 :sh
  7213 mingo     20   0 1434m 8080  888 D 19.2  0.1   0:00.74 :sh
  7194 mingo     20   0 1434m 8080  888 D 18.6  0.1   0:00.72 :make

There's still various kinks in it - more patches to come.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-sched.c | 152 +++++++++++++++++++++++++++++++++++++++------
 1 file changed, 134 insertions(+), 18 deletions(-)

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index c66e6a321371..6ec4f51d536b 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -57,7 +57,7 @@ static u64			sample_type;
 
 #define BUG_ON(x)	assert(!(x))
 
-#define DEBUG		1
+#define DEBUG		0
 
 typedef unsigned long long nsec_t;
 
@@ -238,15 +238,14 @@ static struct sched_event *last_event(struct task_desc *task)
 }
 
 static void
-add_sched_event_run(struct task_desc *task, nsec_t timestamp,
-		    unsigned long duration)
+add_sched_event_run(struct task_desc *task, nsec_t timestamp, u64 duration)
 {
 	struct sched_event *event, *curr_event = last_event(task);
 
 	/*
- 	 * optimize an existing RUN event by merging this one
- 	 * to it:
- 	 */
+	 * optimize an existing RUN event by merging this one
+	 * to it:
+	 */
 	if (curr_event && curr_event->type == SCHED_EVENT_RUN) {
 		nr_run_events_optimized++;
 		curr_event->duration += duration;
@@ -376,7 +375,7 @@ void parse_line(char *line)
 	dprintk("parsed: %s - %ld %Ld: %s - <%s %ld> (%ld %ld)\n",
 		comm,
 		pid,
-		timestamp, 
+		timestamp,
 		func_str,
 		comm2,
 		pid2,
@@ -429,7 +428,7 @@ static void add_cross_task_wakeups(void)
 }
 
 static void
-process_sched_event(struct task_desc *this_task, struct sched_event *event)
+process_sched_event(struct task_desc *this_task __used, struct sched_event *event)
 {
 	int ret = 0;
 	nsec_t now;
@@ -744,9 +743,9 @@ static void run_one_test(void)
 
 #if 0
 	/*
- 	 * rusage statistics done by the parent, these are less
- 	 * accurate than the sum_exec_runtime based statistics:
- 	 */
+	 * rusage statistics done by the parent, these are less
+	 * accurate than the sum_exec_runtime based statistics:
+	 */
 	printk(" [%0.2f / %0.2f]",
 		(double)parent_cpu_usage/1e6,
 		(double)runavg_parent_cpu_usage/1e6);
@@ -798,16 +797,128 @@ process_comm_event(event_t *event, unsigned long offset, unsigned long head)
 	return 0;
 }
 
-static void process_sched_wakeup_event(struct event *event,
+struct trace_wakeup_event {
+	u32 size;
+
+	u16 common_type;
+	u8 common_flags;
+	u8 common_preempt_count;
+	u32 common_pid;
+	u32 common_tgid;
+
+	char comm[16];
+	u32 pid;
+
+	u32 prio;
+	u32 success;
+	u32 cpu;
+};
+
+static void
+process_sched_wakeup_event(struct trace_wakeup_event *wakeup_event, struct event *event,
 		  int cpu __used, u64 timestamp __used, struct thread *thread __used)
 {
+	struct task_desc *waker, *wakee;
+
 	printf("sched_wakeup event %p\n", event);
+
+	printf(" ... pid %d woke up %s/%d\n",
+		wakeup_event->common_pid,
+		wakeup_event->comm,
+		wakeup_event->pid);
+
+	waker = register_pid(wakeup_event->common_pid, "<unknown>");
+	wakee = register_pid(wakeup_event->pid, wakeup_event->comm);
+
+	add_sched_event_wakeup(waker, timestamp, wakee);
 }
 
-static void process_sched_switch_event(struct event *event,
+struct trace_switch_event {
+	u32 size;
+
+	u16 common_type;
+	u8 common_flags;
+	u8 common_preempt_count;
+	u32 common_pid;
+	u32 common_tgid;
+
+	char prev_comm[16];
+	u32 prev_pid;
+	u32 prev_prio;
+	u64 prev_state;
+	char next_comm[16];
+	u32 next_pid;
+	u32 next_prio;
+};
+
+#define MAX_CPUS 4096
+
+unsigned long cpu_last_switched[MAX_CPUS];
+
+static void
+process_sched_switch_event(struct trace_switch_event *switch_event, struct event *event,
 		  int cpu __used, u64 timestamp __used, struct thread *thread __used)
 {
+	struct task_desc *prev, *next;
+	u64 timestamp0;
+	s64 delta;
+
 	printf("sched_switch event %p\n", event);
+	if (cpu >= MAX_CPUS || cpu < 0)
+		return;
+
+	timestamp0 = cpu_last_switched[cpu];
+	if (timestamp0)
+		delta = timestamp - timestamp0;
+	else
+		delta = 0;
+
+	if (delta < 0)
+		die("hm, delta: %Ld < 0 ?\n", delta);
+
+	printf(" ... switch from %s/%d to %s/%d [ran %Ld nsecs]\n",
+		switch_event->prev_comm, switch_event->prev_pid,
+		switch_event->next_comm, switch_event->next_pid,
+		delta);
+
+	prev = register_pid(switch_event->prev_pid, switch_event->prev_comm);
+	next = register_pid(switch_event->next_pid, switch_event->next_comm);
+
+	cpu_last_switched[cpu] = timestamp;
+
+	add_sched_event_run(prev, timestamp, delta);
+}
+
+struct trace_fork_event {
+	u32 size;
+
+	u16 common_type;
+	u8 common_flags;
+	u8 common_preempt_count;
+	u32 common_pid;
+	u32 common_tgid;
+
+	char parent_comm[16];
+	u32 parent_pid;
+	char child_comm[16];
+	u32 child_pid;
+};
+
+static void
+process_sched_fork_event(struct trace_fork_event *fork_event, struct event *event,
+		  int cpu __used, u64 timestamp __used, struct thread *thread __used)
+{
+	printf("sched_fork event %p\n", event);
+	printf("... parent: %s/%d\n", fork_event->parent_comm, fork_event->parent_pid);
+	printf("...  child: %s/%d\n", fork_event->child_comm, fork_event->child_pid);
+	register_pid(fork_event->parent_pid, fork_event->parent_comm);
+	register_pid(fork_event->child_pid, fork_event->child_comm);
+}
+
+static void process_sched_exit_event(struct event *event,
+		  int cpu __used, u64 timestamp __used, struct thread *thread __used)
+{
+	printf("sched_exit event %p\n", event);
 }
 
 static void
@@ -833,9 +944,15 @@ process_raw_event(event_t *raw_event, void *more_data,
 		raw_event->header.type, type, event->name);
 
 	if (!strcmp(event->name, "sched_switch"))
-		process_sched_switch_event(event, cpu, timestamp, thread);
+		process_sched_switch_event(more_data, event, cpu, timestamp, thread);
 	if (!strcmp(event->name, "sched_wakeup"))
-		process_sched_wakeup_event(event, cpu, timestamp, thread);
+		process_sched_wakeup_event(more_data, event, cpu, timestamp, thread);
+	if (!strcmp(event->name, "sched_wakeup_new"))
+		process_sched_wakeup_event(more_data, event, cpu, timestamp, thread);
+	if (!strcmp(event->name, "sched_process_fork"))
+		process_sched_fork_event(more_data, event, cpu, timestamp, thread);
+	if (!strcmp(event->name, "sched_process_exit"))
+		process_sched_exit_event(event, cpu, timestamp, thread);
 }
 
 static int
@@ -1053,7 +1170,7 @@ static const struct option options[] = {
 
 int cmd_sched(int argc, const char **argv, const char *prefix __used)
 {
-	long nr_iterations = LONG_MAX, i;
+	long nr_iterations = 10, i;
 
 	symbol__init();
 	page_size = getpagesize();
@@ -1068,8 +1185,7 @@ int cmd_sched(int argc, const char **argv, const char *prefix __used)
 			usage_with_options(annotate_usage, options);
 	}
 
-
-	setup_pager();
+//	setup_pager();
 
 	calibrate_run_measurement_overhead();
 	calibrate_sleep_measurement_overhead();
-- 
cgit v1.2.3


From ad236fd23b6d6372dcacd549983cce051d2ccff6 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 11 Sep 2009 12:12:54 +0200
Subject: perf sched: Tighten up the code

Various small cleanups - removal of debug printks and dead
functions, etc.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-sched.c | 236 +++++++++------------------------------------
 1 file changed, 47 insertions(+), 189 deletions(-)

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 6ec4f51d536b..de93a2604528 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -61,29 +61,6 @@ static u64			sample_type;
 
 typedef unsigned long long nsec_t;
 
-#define printk(x...)		do { printf(x); fflush(stdout); } while (0)
-
-nsec_t prev_printk;
-
-#define __dprintk(x,y...) do {						 \
-	nsec_t __now = get_nsecs(), __delta = __now - prev_printk;	 \
-									 \
-	prev_printk = __now;						 \
-									 \
-	printf("%.3f [%Ld] [%.3f]: " x, (double)__now/1e6, __now, (double)__delta/1e6, y);\
-} while (0)
-
-#if !DEBUG
-# define dprintk(x...)	do { } while (0)
-#else
-# define dprintk(x...)	__dprintk(x)
-#endif
-
-#define __DP()		__dprintk("parent: line %d\n", __LINE__)
-#define DP()		dprintk("parent: line %d\n", __LINE__)
-#define D()		dprintk("task %ld: line %d\n", this_task->nr, __LINE__)
-
-
 static nsec_t run_measurement_overhead;
 static nsec_t sleep_measurement_overhead;
 
@@ -129,7 +106,7 @@ static void calibrate_run_measurement_overhead(void)
 	}
 	run_measurement_overhead = min_delta;
 
-	printk("run measurement overhead: %Ld nsecs\n", min_delta);
+	printf("run measurement overhead: %Ld nsecs\n", min_delta);
 }
 
 static void calibrate_sleep_measurement_overhead(void)
@@ -147,7 +124,7 @@ static void calibrate_sleep_measurement_overhead(void)
 	min_delta -= 10000;
 	sleep_measurement_overhead = min_delta;
 
-	printk("sleep measurement overhead: %Ld nsecs\n", min_delta);
+	printf("sleep measurement overhead: %Ld nsecs\n", min_delta);
 }
 
 #define COMM_LEN	20
@@ -293,7 +270,7 @@ add_sched_event_wakeup(struct task_desc *task, nsec_t timestamp,
 
 static void
 add_sched_event_sleep(struct task_desc *task, nsec_t timestamp,
-		      unsigned long uninterruptible __used)
+		      u64 task_state __used)
 {
 	struct sched_event *event = get_new_event(task, timestamp);
 
@@ -329,77 +306,13 @@ static struct task_desc *register_pid(unsigned long pid, const char *comm)
 	BUG_ON(!tasks);
 	tasks[task->nr] = task;
 
-	printk("registered task #%ld, PID %ld (%s)\n", nr_tasks, pid, comm);
+	if (verbose)
+		printf("registered task #%ld, PID %ld (%s)\n", nr_tasks, pid, comm);
 
 	return task;
 }
 
 
-static int first_trace_line = 1;
-
-static nsec_t first_timestamp;
-static nsec_t prev_timestamp;
-
-void parse_line(char *line);
-
-void parse_line(char *line)
-{
-	unsigned long param1 = 0, param2 = 0;
-	char comm[COMM_LEN], comm2[COMM_LEN];
-	unsigned long pid, pid2, timestamp0;
-	struct task_desc *task, *task2;
-	char func_str[SYM_LEN];
-	nsec_t timestamp;
-	int ret;
-
-	//"   <idle> 0     0D.s3    0us+: try_to_wake_up <events/0 9> (1 0)"
-	ret = sscanf(line, "%20s %5ld %*s %ldus%*c:"
-			   " %128s <%20s %ld> (%ld %ld)\n",
-		comm, &pid, &timestamp0,
-		func_str, comm2, &pid2, &param1, &param2);
-	dprintk("ret: %d\n", ret);
-	if (ret != 8)
-		return;
-
-	timestamp = timestamp0 * 1000LL;
-
-	if (first_trace_line) {
-		first_trace_line = 0;
-		first_timestamp = timestamp;
-	}
-
-	timestamp -= first_timestamp;
-	BUG_ON(timestamp < prev_timestamp);
-	prev_timestamp = timestamp;
-
-	dprintk("parsed: %s - %ld %Ld: %s - <%s %ld> (%ld %ld)\n",
-		comm,
-		pid,
-		timestamp,
-		func_str,
-		comm2,
-		pid2,
-		param1,
-		param2);
-
-	task = register_pid(pid, comm);
-	task2 = register_pid(pid2, comm2);
-
-	if (!strcmp(func_str, "update_curr")) {
-		dprintk("%Ld: task %ld runs for %ld nsecs\n",
-			timestamp, task->nr, param1);
-		add_sched_event_run(task, timestamp, param1);
-	} else if (!strcmp(func_str, "try_to_wake_up")) {
-		dprintk("%Ld: task %ld wakes up task %ld\n",
-			timestamp, task->nr, task2->nr);
-		add_sched_event_wakeup(task, timestamp, task2);
-	} else if (!strcmp(func_str, "deactivate_task")) {
-		dprintk("%Ld: task %ld goes to sleep (uninterruptible: %ld)\n",
-			timestamp, task->nr, param1);
-		add_sched_event_sleep(task, timestamp, param1);
-	}
-}
-
 static void print_task_traces(void)
 {
 	struct task_desc *task;
@@ -407,7 +320,7 @@ static void print_task_traces(void)
 
 	for (i = 0; i < nr_tasks; i++) {
 		task = tasks[i];
-		printk("task %6ld (%20s:%10ld), nr_events: %ld\n",
+		printf("task %6ld (%20s:%10ld), nr_events: %ld\n",
 			task->nr, task->comm, task->pid, task->nr_events);
 	}
 }
@@ -437,40 +350,16 @@ process_sched_event(struct task_desc *this_task __used, struct sched_event *even
 	now = get_nsecs();
 	delta = start_time + event->timestamp - now;
 
-	dprintk("task %ld, event #%ld, %Ld, delta: %.3f (%Ld)\n",
-		this_task->nr, event->nr, event->timestamp,
-		(double)delta/1e6, delta);
-
-	if (0 && delta > 0) {
-		dprintk("%.3f: task %ld FIX %.3f\n",
-			(double)event->timestamp/1e6,
-			this_task->nr,
-			(double)delta/1e6);
-		sleep_nsecs(start_time + event->timestamp - now);
-		nr_sleep_corrections++;
-	}
-
 	switch (event->type) {
 		case SCHED_EVENT_RUN:
-			dprintk("%.3f: task %ld RUN for %.3f\n",
-				(double)event->timestamp/1e6,
-				this_task->nr,
-				(double)event->duration/1e6);
 			burn_nsecs(event->duration);
 			break;
 		case SCHED_EVENT_SLEEP:
-			dprintk("%.3f: task %ld %s SLEEP\n",
-				(double)event->timestamp/1e6,
-				this_task->nr, event->wait_sem ? "" : "SKIP");
 			if (event->wait_sem)
 				ret = sem_wait(event->wait_sem);
 			BUG_ON(ret);
 			break;
 		case SCHED_EVENT_WAKEUP:
-			dprintk("%.3f: task %ld WAKEUP => task %ld\n",
-				(double)event->timestamp/1e6,
-				this_task->nr,
-				event->wakee->nr);
 			if (event->wait_sem)
 				ret = sem_post(event->wait_sem);
 			BUG_ON(ret);
@@ -511,14 +400,10 @@ static nsec_t get_cpu_usage_nsec_self(void)
 	BUG_ON(!file);
 
 	while ((chars = getline(&line, &len, file)) != -1) {
-		dprintk("got line with length %zu :\n", chars);
-		dprintk("%s", line);
 		ret = sscanf(line, "se.sum_exec_runtime : %ld.%06ld\n",
 			&msecs, &nsecs);
 		if (ret == 2) {
 			total = msecs*1e6 + nsecs;
-			dprintk("total: (%ld.%06ld) %Ld\n",
-				msecs, nsecs, total);
 			break;
 		}
 	}
@@ -536,19 +421,16 @@ static void *thread_func(void *ctx)
 	unsigned long i, ret;
 	char comm2[22];
 
-	dprintk("task %ld started up.\n", this_task->nr);
 	sprintf(comm2, ":%s", this_task->comm);
 	prctl(PR_SET_NAME, comm2);
 
 again:
 	ret = sem_post(&this_task->ready_for_work);
 	BUG_ON(ret);
-	D();
 	ret = pthread_mutex_lock(&start_work_mutex);
 	BUG_ON(ret);
 	ret = pthread_mutex_unlock(&start_work_mutex);
 	BUG_ON(ret);
-	D();
 
 	cpu_usage_0 = get_cpu_usage_nsec_self();
 
@@ -560,19 +442,13 @@ again:
 	cpu_usage_1 = get_cpu_usage_nsec_self();
 	this_task->cpu_usage = cpu_usage_1 - cpu_usage_0;
 
-	dprintk("task %ld cpu usage: %0.3f msecs\n",
-		this_task->nr, (double)this_task->cpu_usage / 1e6);
-
-	D();
 	ret = sem_post(&this_task->work_done_sem);
 	BUG_ON(ret);
-	D();
 
 	ret = pthread_mutex_lock(&work_done_wait_mutex);
 	BUG_ON(ret);
 	ret = pthread_mutex_unlock(&work_done_wait_mutex);
 	BUG_ON(ret);
-	D();
 
 	goto again;
 }
@@ -614,9 +490,7 @@ static void wait_for_tasks(void)
 	struct task_desc *task;
 	unsigned long i, ret;
 
-	DP();
 	start_time = get_nsecs();
-	DP();
 	cpu_usage = 0;
 	pthread_mutex_unlock(&work_done_wait_mutex);
 
@@ -633,24 +507,6 @@ static void wait_for_tasks(void)
 
 	pthread_mutex_unlock(&start_work_mutex);
 
-#if 0
-	for (i = 0; i < nr_tasks; i++) {
-		unsigned long missed;
-
-		task = tasks[i];
-		while (task->curr_event + 1 < task->nr_events) {
-			dprintk("parent waiting for %ld (%ld != %ld)\n",
-				i, task->curr_event, task->nr_events);
-			sleep_nsecs(100000000);
-		}
-		missed = task->nr_events - 1 - task->curr_event;
-		if (missed)
-			printk("task %ld missed events: %ld\n", i, missed);
-		ret = sem_post(&task->sleep_sem);
-		BUG_ON(ret);
-	}
-#endif
-	DP();
 	for (i = 0; i < nr_tasks; i++) {
 		task = tasks[i];
 		ret = sem_wait(&task->work_done_sem);
@@ -687,16 +543,16 @@ static void parse_trace(void)
 {
 	__cmd_sched();
 
-	printk("nr_run_events:        %ld\n", nr_run_events);
-	printk("nr_sleep_events:      %ld\n", nr_sleep_events);
-	printk("nr_wakeup_events:     %ld\n", nr_wakeup_events);
+	printf("nr_run_events:        %ld\n", nr_run_events);
+	printf("nr_sleep_events:      %ld\n", nr_sleep_events);
+	printf("nr_wakeup_events:     %ld\n", nr_wakeup_events);
 
 	if (targetless_wakeups)
-		printk("target-less wakeups:  %ld\n", targetless_wakeups);
+		printf("target-less wakeups:  %ld\n", targetless_wakeups);
 	if (multitarget_wakeups)
-		printk("multi-target wakeups: %ld\n", multitarget_wakeups);
+		printf("multi-target wakeups: %ld\n", multitarget_wakeups);
 	if (nr_run_events_optimized)
-		printk("run events optimized: %ld\n",
+		printf("run events optimized: %ld\n",
 			nr_run_events_optimized);
 }
 
@@ -728,17 +584,17 @@ static void run_one_test(void)
 		run_avg = delta;
 	run_avg = (run_avg*9 + delta)/10;
 
-	printk("#%-3ld: %0.3f, ",
+	printf("#%-3ld: %0.3f, ",
 		nr_runs, (double)delta/1000000.0);
 
 #if 0
-	printk("%0.2f +- %0.2f, ",
+	printf("%0.2f +- %0.2f, ",
 		(double)avg_delta/1e6, (double)std_dev/1e6);
 #endif
-	printk("ravg: %0.2f, ",
+	printf("ravg: %0.2f, ",
 		(double)run_avg/1e6);
 
-	printk("cpu: %0.2f / %0.2f",
+	printf("cpu: %0.2f / %0.2f",
 		(double)cpu_usage/1e6, (double)runavg_cpu_usage/1e6);
 
 #if 0
@@ -746,15 +602,15 @@ static void run_one_test(void)
 	 * rusage statistics done by the parent, these are less
 	 * accurate than the sum_exec_runtime based statistics:
 	 */
-	printk(" [%0.2f / %0.2f]",
+	printf(" [%0.2f / %0.2f]",
 		(double)parent_cpu_usage/1e6,
 		(double)runavg_parent_cpu_usage/1e6);
 #endif
 
-	printk("\n");
+	printf("\n");
 
 	if (nr_sleep_corrections)
-		printk(" (%ld sleep corrections)\n", nr_sleep_corrections);
+		printf(" (%ld sleep corrections)\n", nr_sleep_corrections);
 	nr_sleep_corrections = 0;
 }
 
@@ -766,13 +622,13 @@ static void test_calibrations(void)
 	burn_nsecs(1e6);
 	T1 = get_nsecs();
 
-	printk("the run test took %Ld nsecs\n", T1-T0);
+	printf("the run test took %Ld nsecs\n", T1-T0);
 
 	T0 = get_nsecs();
 	sleep_nsecs(1e6);
 	T1 = get_nsecs();
 
-	printk("the sleep test took %Ld nsecs\n", T1-T0);
+	printf("the sleep test took %Ld nsecs\n", T1-T0);
 }
 
 static int
@@ -820,12 +676,14 @@ process_sched_wakeup_event(struct trace_wakeup_event *wakeup_event, struct event
 {
 	struct task_desc *waker, *wakee;
 
-	printf("sched_wakeup event %p\n", event);
+	if (verbose) {
+		printf("sched_wakeup event %p\n", event);
 
-	printf(" ... pid %d woke up %s/%d\n",
-		wakeup_event->common_pid,
-		wakeup_event->comm,
-		wakeup_event->pid);
+		printf(" ... pid %d woke up %s/%d\n",
+			wakeup_event->common_pid,
+			wakeup_event->comm,
+			wakeup_event->pid);
+	}
 
 	waker = register_pid(wakeup_event->common_pid, "<unknown>");
 	wakee = register_pid(wakeup_event->pid, wakeup_event->comm);
@@ -863,7 +721,9 @@ process_sched_switch_event(struct trace_switch_event *switch_event, struct event
 	u64 timestamp0;
 	s64 delta;
 
-	printf("sched_switch event %p\n", event);
+	if (verbose)
+		printf("sched_switch event %p\n", event);
+
 	if (cpu >= MAX_CPUS || cpu < 0)
 		return;
 
@@ -876,10 +736,12 @@ process_sched_switch_event(struct trace_switch_event *switch_event, struct event
 	if (delta < 0)
 		die("hm, delta: %Ld < 0 ?\n", delta);
 
-	printf(" ... switch from %s/%d to %s/%d [ran %Ld nsecs]\n",
-		switch_event->prev_comm, switch_event->prev_pid,
-		switch_event->next_comm, switch_event->next_pid,
-		delta);
+	if (verbose) {
+		printf(" ... switch from %s/%d to %s/%d [ran %Ld nsecs]\n",
+			switch_event->prev_comm, switch_event->prev_pid,
+			switch_event->next_comm, switch_event->next_pid,
+			delta);
+	}
 
 	prev = register_pid(switch_event->prev_pid, switch_event->prev_comm);
 	next = register_pid(switch_event->next_pid, switch_event->next_comm);
@@ -887,6 +749,7 @@ process_sched_switch_event(struct trace_switch_event *switch_event, struct event
 	cpu_last_switched[cpu] = timestamp;
 
 	add_sched_event_run(prev, timestamp, delta);
+	add_sched_event_sleep(prev, timestamp, switch_event->prev_state);
 }
 
 struct trace_fork_event {
@@ -908,9 +771,11 @@ static void
 process_sched_fork_event(struct trace_fork_event *fork_event, struct event *event,
 		  int cpu __used, u64 timestamp __used, struct thread *thread __used)
 {
-	printf("sched_fork event %p\n", event);
-	printf("... parent: %s/%d\n", fork_event->parent_comm, fork_event->parent_pid);
-	printf("...  child: %s/%d\n", fork_event->child_comm, fork_event->child_pid);
+	if (verbose) {
+		printf("sched_fork event %p\n", event);
+		printf("... parent: %s/%d\n", fork_event->parent_comm, fork_event->parent_pid);
+		printf("...  child: %s/%d\n", fork_event->child_comm, fork_event->child_pid);
+	}
 	register_pid(fork_event->parent_pid, fork_event->parent_comm);
 	register_pid(fork_event->child_pid, fork_event->child_comm);
 }
@@ -918,11 +783,12 @@ process_sched_fork_event(struct trace_fork_event *fork_event, struct event *even
 static void process_sched_exit_event(struct event *event,
 		  int cpu __used, u64 timestamp __used, struct thread *thread __used)
 {
-	printf("sched_exit event %p\n", event);
+	if (verbose)
+		printf("sched_exit event %p\n", event);
 }
 
 static void
-process_raw_event(event_t *raw_event, void *more_data,
+process_raw_event(event_t *raw_event __used, void *more_data,
 		  int cpu, u64 timestamp, struct thread *thread)
 {
 	struct {
@@ -935,14 +801,6 @@ process_raw_event(event_t *raw_event, void *more_data,
 	type = trace_parse_common_type(raw->data);
 	event = trace_find_event(type);
 
-	/*
-	 * FIXME: better resolve from pid from the struct trace_entry
-	 * field, although it should be the same than this perf
-	 * event pid
-	 */
-	printf("id %d, type: %d, event: %s\n",
-		raw_event->header.type, type, event->name);
-
 	if (!strcmp(event->name, "sched_switch"))
 		process_sched_switch_event(more_data, event, cpu, timestamp, thread);
 	if (!strcmp(event->name, "sched_wakeup"))
@@ -1197,7 +1055,7 @@ int cmd_sched(int argc, const char **argv, const char *prefix __used)
 	add_cross_task_wakeups();
 
 	create_tasks();
-	printk("------------------------------------------------------------\n");
+	printf("------------------------------------------------------------\n");
 	for (i = 0; i < nr_iterations; i++)
 		run_one_test();
 
-- 
cgit v1.2.3


From bcd3279f465cdcc1e0454b5f605f021c4ff4dbb5 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Fri, 11 Sep 2009 23:19:45 +0200
Subject: perf tools: Allow the specification of all tracepoints at once

Currently, when one wants to activate every tracepoint
counters of a subsystem from perf record, the current sequence
is needed:

  perf record -e subsys:ev1 -e subsys:ev2 -e subsys:ev3

This may annoy the most patient of us.

Now we can just do:

  perf record -e subsys:*

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/util/parse-events.c | 204 +++++++++++++++++++++++++++++++----------
 1 file changed, 154 insertions(+), 50 deletions(-)

diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index a587d41ae3c9..d06c66cd358b 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -18,6 +18,12 @@ struct event_symbol {
 	const char	*alias;
 };
 
+enum event_result {
+	EVT_FAILED,
+	EVT_HANDLED,
+	EVT_HANDLED_ALL
+};
+
 char debugfs_path[MAXPATHLEN];
 
 #define CHW(x) .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_##x
@@ -344,7 +350,7 @@ static int parse_aliases(const char **str, const char *names[][MAX_ALIASES], int
 	return -1;
 }
 
-static int
+static enum event_result
 parse_generic_hw_event(const char **str, struct perf_counter_attr *attr)
 {
 	const char *s = *str;
@@ -356,7 +362,7 @@ parse_generic_hw_event(const char **str, struct perf_counter_attr *attr)
 	 * then bail out:
 	 */
 	if (cache_type == -1)
-		return 0;
+		return EVT_FAILED;
 
 	while ((cache_op == -1 || cache_result == -1) && *s == '-') {
 		++s;
@@ -402,27 +408,112 @@ parse_generic_hw_event(const char **str, struct perf_counter_attr *attr)
 	attr->type = PERF_TYPE_HW_CACHE;
 
 	*str = s;
-	return 1;
+	return EVT_HANDLED;
+}
+
+static enum event_result
+parse_single_tracepoint_event(char *sys_name,
+			      const char *evt_name,
+			      unsigned int evt_length,
+			      char *flags,
+			      struct perf_counter_attr *attr,
+			      const char **strp)
+{
+	char evt_path[MAXPATHLEN];
+	char id_buf[4];
+	u64 id;
+	int fd;
+
+	if (flags) {
+		if (!strncmp(flags, "record", strlen(flags)))
+			attr->sample_type |= PERF_SAMPLE_RAW;
+	}
+
+	snprintf(evt_path, MAXPATHLEN, "%s/%s/%s/id", debugfs_path,
+		 sys_name, evt_name);
+
+	fd = open(evt_path, O_RDONLY);
+	if (fd < 0)
+		return EVT_FAILED;
+
+	if (read(fd, id_buf, sizeof(id_buf)) < 0) {
+		close(fd);
+		return EVT_FAILED;
+	}
+
+	close(fd);
+	id = atoll(id_buf);
+	attr->config = id;
+	attr->type = PERF_TYPE_TRACEPOINT;
+	*strp = evt_name + evt_length;
+
+	return EVT_HANDLED;
 }
 
-static int parse_tracepoint_event(const char **strp,
+/* sys + ':' + event + ':' + flags*/
+#define MAX_EVOPT_LEN	(MAX_EVENT_LENGTH * 2 + 2 + 128)
+static enum event_result
+parse_subsystem_tracepoint_event(char *sys_name, char *flags)
+{
+	char evt_path[MAXPATHLEN];
+	struct dirent *evt_ent;
+	DIR *evt_dir;
+
+	snprintf(evt_path, MAXPATHLEN, "%s/%s", debugfs_path, sys_name);
+	evt_dir = opendir(evt_path);
+
+	if (!evt_dir) {
+		perror("Can't open event dir");
+		return EVT_FAILED;
+	}
+
+	while ((evt_ent = readdir(evt_dir))) {
+		char event_opt[MAX_EVOPT_LEN + 1];
+		int len;
+		unsigned int rem = MAX_EVOPT_LEN;
+
+		if (!strcmp(evt_ent->d_name, ".")
+		    || !strcmp(evt_ent->d_name, "..")
+		    || !strcmp(evt_ent->d_name, "enable")
+		    || !strcmp(evt_ent->d_name, "filter"))
+			continue;
+
+		len = snprintf(event_opt, MAX_EVOPT_LEN, "%s:%s", sys_name,
+			       evt_ent->d_name);
+		if (len < 0)
+			return EVT_FAILED;
+
+		rem -= len;
+		if (flags) {
+			if (rem < strlen(flags) + 1)
+				return EVT_FAILED;
+
+			strcat(event_opt, ":");
+			strcat(event_opt, flags);
+		}
+
+		if (parse_events(NULL, event_opt, 0))
+			return EVT_FAILED;
+	}
+
+	return EVT_HANDLED_ALL;
+}
+
+
+static enum event_result parse_tracepoint_event(const char **strp,
 				    struct perf_counter_attr *attr)
 {
 	const char *evt_name;
 	char *flags;
 	char sys_name[MAX_EVENT_LENGTH];
-	char id_buf[4];
-	int fd;
 	unsigned int sys_length, evt_length;
-	u64 id;
-	char evt_path[MAXPATHLEN];
 
 	if (valid_debugfs_mount(debugfs_path))
 		return 0;
 
 	evt_name = strchr(*strp, ':');
 	if (!evt_name)
-		return 0;
+		return EVT_FAILED;
 
 	sys_length = evt_name - *strp;
 	if (sys_length >= MAX_EVENT_LENGTH)
@@ -436,30 +527,19 @@ static int parse_tracepoint_event(const char **strp,
 	if (flags) {
 		*flags = '\0';
 		flags++;
-		if (!strncmp(flags, "record", strlen(flags)))
-			attr->sample_type |= PERF_SAMPLE_RAW;
 	}
 
 	evt_length = strlen(evt_name);
 	if (evt_length >= MAX_EVENT_LENGTH)
-		return 0;
+		return EVT_FAILED;
 
-	snprintf(evt_path, MAXPATHLEN, "%s/%s/%s/id", debugfs_path,
-		 sys_name, evt_name);
-	fd = open(evt_path, O_RDONLY);
-	if (fd < 0)
-		return 0;
-
-	if (read(fd, id_buf, sizeof(id_buf)) < 0) {
-		close(fd);
-		return 0;
-	}
-	close(fd);
-	id = atoll(id_buf);
-	attr->config = id;
-	attr->type = PERF_TYPE_TRACEPOINT;
-	*strp = evt_name + evt_length;
-	return 1;
+	if (!strcmp(evt_name, "*")) {
+		*strp = evt_name + evt_length;
+		return parse_subsystem_tracepoint_event(sys_name, flags);
+	} else
+		return parse_single_tracepoint_event(sys_name, evt_name,
+						     evt_length, flags,
+						     attr, strp);
 }
 
 static int check_events(const char *str, unsigned int i)
@@ -477,7 +557,7 @@ static int check_events(const char *str, unsigned int i)
 	return 0;
 }
 
-static int
+static enum event_result
 parse_symbolic_event(const char **strp, struct perf_counter_attr *attr)
 {
 	const char *str = *strp;
@@ -490,31 +570,32 @@ parse_symbolic_event(const char **strp, struct perf_counter_attr *attr)
 			attr->type = event_symbols[i].type;
 			attr->config = event_symbols[i].config;
 			*strp = str + n;
-			return 1;
+			return EVT_HANDLED;
 		}
 	}
-	return 0;
+	return EVT_FAILED;
 }
 
-static int parse_raw_event(const char **strp, struct perf_counter_attr *attr)
+static enum event_result
+parse_raw_event(const char **strp, struct perf_counter_attr *attr)
 {
 	const char *str = *strp;
 	u64 config;
 	int n;
 
 	if (*str != 'r')
-		return 0;
+		return EVT_FAILED;
 	n = hex2u64(str + 1, &config);
 	if (n > 0) {
 		*strp = str + n + 1;
 		attr->type = PERF_TYPE_RAW;
 		attr->config = config;
-		return 1;
+		return EVT_HANDLED;
 	}
-	return 0;
+	return EVT_FAILED;
 }
 
-static int
+static enum event_result
 parse_numeric_event(const char **strp, struct perf_counter_attr *attr)
 {
 	const char *str = *strp;
@@ -530,13 +611,13 @@ parse_numeric_event(const char **strp, struct perf_counter_attr *attr)
 			attr->type = type;
 			attr->config = config;
 			*strp = endp;
-			return 1;
+			return EVT_HANDLED;
 		}
 	}
-	return 0;
+	return EVT_FAILED;
 }
 
-static int
+static enum event_result
 parse_event_modifier(const char **strp, struct perf_counter_attr *attr)
 {
 	const char *str = *strp;
@@ -569,37 +650,60 @@ parse_event_modifier(const char **strp, struct perf_counter_attr *attr)
  * Each event can have multiple symbolic names.
  * Symbolic names are (almost) exactly matched.
  */
-static int parse_event_symbols(const char **str, struct perf_counter_attr *attr)
+static enum event_result
+parse_event_symbols(const char **str, struct perf_counter_attr *attr)
 {
-	if (!(parse_tracepoint_event(str, attr) ||
-	      parse_raw_event(str, attr) ||
-	      parse_numeric_event(str, attr) ||
-	      parse_symbolic_event(str, attr) ||
-	      parse_generic_hw_event(str, attr)))
-		return 0;
+	enum event_result ret;
+
+	ret = parse_tracepoint_event(str, attr);
+	if (ret != EVT_FAILED)
+		goto modifier;
+
+	ret = parse_raw_event(str, attr);
+	if (ret != EVT_FAILED)
+		goto modifier;
 
+	ret = parse_numeric_event(str, attr);
+	if (ret != EVT_FAILED)
+		goto modifier;
+
+	ret = parse_symbolic_event(str, attr);
+	if (ret != EVT_FAILED)
+		goto modifier;
+
+	ret = parse_generic_hw_event(str, attr);
+	if (ret != EVT_FAILED)
+		goto modifier;
+
+	return EVT_FAILED;
+
+modifier:
 	parse_event_modifier(str, attr);
 
-	return 1;
+	return ret;
 }
 
 int parse_events(const struct option *opt __used, const char *str, int unset __used)
 {
 	struct perf_counter_attr attr;
+	enum event_result ret;
 
 	for (;;) {
 		if (nr_counters == MAX_COUNTERS)
 			return -1;
 
 		memset(&attr, 0, sizeof(attr));
-		if (!parse_event_symbols(&str, &attr))
+		ret = parse_event_symbols(&str, &attr);
+		if (ret == EVT_FAILED)
 			return -1;
 
 		if (!(*str == 0 || *str == ',' || isspace(*str)))
 			return -1;
 
-		attrs[nr_counters] = attr;
-		nr_counters++;
+		if (ret != EVT_HANDLED_ALL) {
+			attrs[nr_counters] = attr;
+			nr_counters++;
+		}
 
 		if (*str == 0)
 			break;
-- 
cgit v1.2.3


From 46538818023e8ea94f656acfa1e38297e2df20e2 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 12 Sep 2009 02:43:45 +0200
Subject: perf sched: Fix bad event alignment

perf sched raises the following error when it meets a sched
switch event:

perf: builtin-sched.c:286: register_pid: Assertion `!(pid >= 65536)' failed.
Abandon

Currently in x86-64, the sched switch events have a hole in the
middle of the structure:

	u16 common_type;
	u8 common_flags;
	u8 common_preempt_count;
	u32 common_pid;
	u32 common_tgid;

	char prev_comm[16];
	u32 prev_pid;
	u32 prev_prio;
			<--- there
	u64 prev_state;
	char next_comm[16];
	u32 next_pid;
	u32 next_prio;

Gcc inserts a 4 bytes hole there for prev_state to be u64
aligned. And the events are exported to userspace with this
hole.

But in userspace, from perf sched, we fetch it using a
structure that has a new field in the beginning: u32 size. This
is because our trace is exported with its size as a field. But
now that we have this new field, the hole in the middle
disappears because it makes prev_state becoming well aligned.

And since we are using a pointer to the raw trace using this
struct, instead of reading prev_state, we are reading the hole.

We could fix it by keeping the size seperate from the struct
but actually there a lot of other potential problems: some
fields may be saved as long in a 64 bits system and later read
as long in a 32 bits system. Also this direct cast doesn't care
about the endianness differences between the host traced
machine and the machine in which we do the post processing.

So instead of using such dangerous direct casts, fetch the
values using the trace parsing API that already takes care of
all these problems.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-sched.c          | 101 +++++++++++++++++++++++++++---------
 tools/perf/util/trace-event-parse.c |  23 ++++++++
 tools/perf/util/trace-event.h       |   3 ++
 3 files changed, 102 insertions(+), 25 deletions(-)

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index de93a2604528..0215936696ed 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -653,6 +653,30 @@ process_comm_event(event_t *event, unsigned long offset, unsigned long head)
 	return 0;
 }
 
+
+struct raw_event_sample {
+	u32 size;
+	char data[0];
+};
+
+#define FILL_FIELD(ptr, field, event, data)	\
+	ptr.field = (typeof(ptr.field)) raw_field_value(event, #field, data)
+
+#define FILL_ARRAY(ptr, array, event, data)			\
+do {								\
+	void *__array = raw_field_ptr(event, #array, data);	\
+	memcpy(ptr.array, __array, sizeof(ptr.array));	\
+} while(0)
+
+#define FILL_COMMON_FIELDS(ptr, event, data)			\
+do {								\
+	FILL_FIELD(ptr, common_type, event, data);		\
+	FILL_FIELD(ptr, common_flags, event, data);		\
+	FILL_FIELD(ptr, common_preempt_count, event, data);	\
+	FILL_FIELD(ptr, common_pid, event, data);		\
+	FILL_FIELD(ptr, common_tgid, event, data);		\
+} while (0)
+
 struct trace_wakeup_event {
 	u32 size;
 
@@ -671,22 +695,32 @@ struct trace_wakeup_event {
 };
 
 static void
-process_sched_wakeup_event(struct trace_wakeup_event *wakeup_event, struct event *event,
+process_sched_wakeup_event(struct raw_event_sample *raw, struct event *event,
 		  int cpu __used, u64 timestamp __used, struct thread *thread __used)
 {
 	struct task_desc *waker, *wakee;
+	struct trace_wakeup_event wakeup_event;
+
+	FILL_COMMON_FIELDS(wakeup_event, event, raw->data);
+
+	FILL_ARRAY(wakeup_event, comm, event, raw->data);
+	FILL_FIELD(wakeup_event, pid, event, raw->data);
+	FILL_FIELD(wakeup_event, prio, event, raw->data);
+	FILL_FIELD(wakeup_event, success, event, raw->data);
+	FILL_FIELD(wakeup_event, cpu, event, raw->data);
+
 
 	if (verbose) {
 		printf("sched_wakeup event %p\n", event);
 
 		printf(" ... pid %d woke up %s/%d\n",
-			wakeup_event->common_pid,
-			wakeup_event->comm,
-			wakeup_event->pid);
+			wakeup_event.common_pid,
+			wakeup_event.comm,
+			wakeup_event.pid);
 	}
 
-	waker = register_pid(wakeup_event->common_pid, "<unknown>");
-	wakee = register_pid(wakeup_event->pid, wakeup_event->comm);
+	waker = register_pid(wakeup_event.common_pid, "<unknown>");
+	wakee = register_pid(wakeup_event.pid, wakeup_event.comm);
 
 	add_sched_event_wakeup(waker, timestamp, wakee);
 }
@@ -714,13 +748,24 @@ struct trace_switch_event {
 unsigned long cpu_last_switched[MAX_CPUS];
 
 static void
-process_sched_switch_event(struct trace_switch_event *switch_event, struct event *event,
+process_sched_switch_event(struct raw_event_sample *raw, struct event *event,
 		  int cpu __used, u64 timestamp __used, struct thread *thread __used)
 {
+	struct trace_switch_event switch_event;
 	struct task_desc *prev, *next;
 	u64 timestamp0;
 	s64 delta;
 
+	FILL_COMMON_FIELDS(switch_event, event, raw->data);
+
+	FILL_ARRAY(switch_event, prev_comm, event, raw->data);
+	FILL_FIELD(switch_event, prev_pid, event, raw->data);
+	FILL_FIELD(switch_event, prev_prio, event, raw->data);
+	FILL_FIELD(switch_event, prev_state, event, raw->data);
+	FILL_ARRAY(switch_event, next_comm, event, raw->data);
+	FILL_FIELD(switch_event, next_pid, event, raw->data);
+	FILL_FIELD(switch_event, next_prio, event, raw->data);
+
 	if (verbose)
 		printf("sched_switch event %p\n", event);
 
@@ -738,18 +783,18 @@ process_sched_switch_event(struct trace_switch_event *switch_event, struct event
 
 	if (verbose) {
 		printf(" ... switch from %s/%d to %s/%d [ran %Ld nsecs]\n",
-			switch_event->prev_comm, switch_event->prev_pid,
-			switch_event->next_comm, switch_event->next_pid,
+			switch_event.prev_comm, switch_event.prev_pid,
+			switch_event.next_comm, switch_event.next_pid,
 			delta);
 	}
 
-	prev = register_pid(switch_event->prev_pid, switch_event->prev_comm);
-	next = register_pid(switch_event->next_pid, switch_event->next_comm);
+	prev = register_pid(switch_event.prev_pid, switch_event.prev_comm);
+	next = register_pid(switch_event.next_pid, switch_event.next_comm);
 
 	cpu_last_switched[cpu] = timestamp;
 
 	add_sched_event_run(prev, timestamp, delta);
-	add_sched_event_sleep(prev, timestamp, switch_event->prev_state);
+	add_sched_event_sleep(prev, timestamp, switch_event.prev_state);
 }
 
 struct trace_fork_event {
@@ -768,16 +813,25 @@ struct trace_fork_event {
 };
 
 static void
-process_sched_fork_event(struct trace_fork_event *fork_event, struct event *event,
+process_sched_fork_event(struct raw_event_sample *raw, struct event *event,
 		  int cpu __used, u64 timestamp __used, struct thread *thread __used)
 {
+	struct trace_fork_event fork_event;
+
+	FILL_COMMON_FIELDS(fork_event, event, raw->data);
+
+	FILL_ARRAY(fork_event, parent_comm, event, raw->data);
+	FILL_FIELD(fork_event, parent_pid, event, raw->data);
+	FILL_ARRAY(fork_event, child_comm, event, raw->data);
+	FILL_FIELD(fork_event, child_pid, event, raw->data);
+
 	if (verbose) {
 		printf("sched_fork event %p\n", event);
-		printf("... parent: %s/%d\n", fork_event->parent_comm, fork_event->parent_pid);
-		printf("...  child: %s/%d\n", fork_event->child_comm, fork_event->child_pid);
+		printf("... parent: %s/%d\n", fork_event.parent_comm, fork_event.parent_pid);
+		printf("...  child: %s/%d\n", fork_event.child_comm, fork_event.child_pid);
 	}
-	register_pid(fork_event->parent_pid, fork_event->parent_comm);
-	register_pid(fork_event->child_pid, fork_event->child_comm);
+	register_pid(fork_event.parent_pid, fork_event.parent_comm);
+	register_pid(fork_event.child_pid, fork_event.child_comm);
 }
 
 static void process_sched_exit_event(struct event *event,
@@ -791,10 +845,7 @@ static void
 process_raw_event(event_t *raw_event __used, void *more_data,
 		  int cpu, u64 timestamp, struct thread *thread)
 {
-	struct {
-		u32 size;
-		char data[0];
-	} *raw = more_data;
+	struct raw_event_sample *raw = more_data;
 	struct event *event;
 	int type;
 
@@ -802,13 +853,13 @@ process_raw_event(event_t *raw_event __used, void *more_data,
 	event = trace_find_event(type);
 
 	if (!strcmp(event->name, "sched_switch"))
-		process_sched_switch_event(more_data, event, cpu, timestamp, thread);
+		process_sched_switch_event(raw, event, cpu, timestamp, thread);
 	if (!strcmp(event->name, "sched_wakeup"))
-		process_sched_wakeup_event(more_data, event, cpu, timestamp, thread);
+		process_sched_wakeup_event(raw, event, cpu, timestamp, thread);
 	if (!strcmp(event->name, "sched_wakeup_new"))
-		process_sched_wakeup_event(more_data, event, cpu, timestamp, thread);
+		process_sched_wakeup_event(raw, event, cpu, timestamp, thread);
 	if (!strcmp(event->name, "sched_process_fork"))
-		process_sched_fork_event(more_data, event, cpu, timestamp, thread);
+		process_sched_fork_event(raw, event, cpu, timestamp, thread);
 	if (!strcmp(event->name, "sched_process_exit"))
 		process_sched_exit_event(event, cpu, timestamp, thread);
 }
diff --git a/tools/perf/util/trace-event-parse.c b/tools/perf/util/trace-event-parse.c
index 16cf2d51c4e1..64d6e302751a 100644
--- a/tools/perf/util/trace-event-parse.c
+++ b/tools/perf/util/trace-event-parse.c
@@ -1776,6 +1776,29 @@ static unsigned long long read_size(void *ptr, int size)
 	}
 }
 
+unsigned long long
+raw_field_value(struct event *event, const char *name, void *data)
+{
+	struct format_field *field;
+
+	field = find_any_field(event, name);
+	if (!field)
+		return 0ULL;
+
+	return read_size(data + field->offset, field->size);
+}
+
+void *raw_field_ptr(struct event *event, const char *name, void *data)
+{
+	struct format_field *field;
+
+	field = find_any_field(event, name);
+	if (!field)
+		return NULL;
+
+	return data + field->offset;
+}
+
 static int get_common_info(const char *type, int *offset, int *size)
 {
 	struct event *event;
diff --git a/tools/perf/util/trace-event.h b/tools/perf/util/trace-event.h
index bc81612fd244..d35ebf1e29ff 100644
--- a/tools/perf/util/trace-event.h
+++ b/tools/perf/util/trace-event.h
@@ -236,6 +236,9 @@ extern int header_page_data_size;
 int parse_header_page(char *buf, unsigned long size);
 int trace_parse_common_type(void *data);
 struct event *trace_find_event(int id);
+unsigned long long
+raw_field_value(struct event *event, const char *name, void *data);
+void *raw_field_ptr(struct event *event, const char *name, void *data);
 
 void read_tracing_data(struct perf_counter_attr *pattrs, int nb_counters);
 
-- 
cgit v1.2.3


From 419ab0d6a959f41ec7fde807fe311aaafb05c3be Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 12 Sep 2009 03:59:01 +0200
Subject: perf sched: Make it easier to plug in new sub profilers

Create a sched event structure of handlers in which various
sched events reader can plug their own callbacks.

This makes easier the addition of new perf sched sub commands.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-sched.c | 243 ++++++++++++++++++++++++++++++---------------
 1 file changed, 165 insertions(+), 78 deletions(-)

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 0215936696ed..756fe62eb046 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -14,6 +14,9 @@
 #include "util/trace-event.h"
 #include <sys/types.h>
 
+
+#define MAX_CPUS 4096
+
 static char			const *input_name = "perf.data";
 static int			input;
 static unsigned long		page_size;
@@ -27,6 +30,8 @@ static struct thread		*last_match;
 static struct perf_header	*header;
 static u64			sample_type;
 
+static int			replay_mode;
+
 
 /*
  * Scheduler benchmarks
@@ -677,6 +682,27 @@ do {								\
 	FILL_FIELD(ptr, common_tgid, event, data);		\
 } while (0)
 
+
+
+struct trace_switch_event {
+	u32 size;
+
+	u16 common_type;
+	u8 common_flags;
+	u8 common_preempt_count;
+	u32 common_pid;
+	u32 common_tgid;
+
+	char prev_comm[16];
+	u32 prev_pid;
+	u32 prev_prio;
+	u64 prev_state;
+	char next_comm[16];
+	u32 next_pid;
+	u32 next_prio;
+};
+
+
 struct trace_wakeup_event {
 	u32 size;
 
@@ -694,78 +720,79 @@ struct trace_wakeup_event {
 	u32 cpu;
 };
 
-static void
-process_sched_wakeup_event(struct raw_event_sample *raw, struct event *event,
-		  int cpu __used, u64 timestamp __used, struct thread *thread __used)
-{
-	struct task_desc *waker, *wakee;
-	struct trace_wakeup_event wakeup_event;
+struct trace_fork_event {
+	u32 size;
 
-	FILL_COMMON_FIELDS(wakeup_event, event, raw->data);
+	u16 common_type;
+	u8 common_flags;
+	u8 common_preempt_count;
+	u32 common_pid;
+	u32 common_tgid;
+
+	char parent_comm[16];
+	u32 parent_pid;
+	char child_comm[16];
+	u32 child_pid;
+};
+
+struct trace_sched_handler {
+	void (*switch_event)(struct trace_switch_event *,
+			     struct event *,
+			     int cpu,
+			     u64 timestamp,
+			     struct thread *thread);
+
+	void (*wakeup_event)(struct trace_wakeup_event *,
+			     struct event *,
+			     int cpu,
+			     u64 timestamp,
+			     struct thread *thread);
+
+	void (*fork_event)(struct trace_fork_event *,
+			   struct event *,
+			   int cpu,
+			   u64 timestamp,
+			   struct thread *thread);
+};
 
-	FILL_ARRAY(wakeup_event, comm, event, raw->data);
-	FILL_FIELD(wakeup_event, pid, event, raw->data);
-	FILL_FIELD(wakeup_event, prio, event, raw->data);
-	FILL_FIELD(wakeup_event, success, event, raw->data);
-	FILL_FIELD(wakeup_event, cpu, event, raw->data);
 
+static void
+replay_wakeup_event(struct trace_wakeup_event *wakeup_event,
+		    struct event *event,
+		    int cpu __used,
+		    u64 timestamp __used,
+		    struct thread *thread __used)
+{
+	struct task_desc *waker, *wakee;
 
 	if (verbose) {
 		printf("sched_wakeup event %p\n", event);
 
 		printf(" ... pid %d woke up %s/%d\n",
-			wakeup_event.common_pid,
-			wakeup_event.comm,
-			wakeup_event.pid);
+			wakeup_event->common_pid,
+			wakeup_event->comm,
+			wakeup_event->pid);
 	}
 
-	waker = register_pid(wakeup_event.common_pid, "<unknown>");
-	wakee = register_pid(wakeup_event.pid, wakeup_event.comm);
+	waker = register_pid(wakeup_event->common_pid, "<unknown>");
+	wakee = register_pid(wakeup_event->pid, wakeup_event->comm);
 
 	add_sched_event_wakeup(waker, timestamp, wakee);
 }
 
-struct trace_switch_event {
-	u32 size;
-
-	u16 common_type;
-	u8 common_flags;
-	u8 common_preempt_count;
-	u32 common_pid;
-	u32 common_tgid;
-
-	char prev_comm[16];
-	u32 prev_pid;
-	u32 prev_prio;
-	u64 prev_state;
-	char next_comm[16];
-	u32 next_pid;
-	u32 next_prio;
-};
-
-#define MAX_CPUS 4096
-
-unsigned long cpu_last_switched[MAX_CPUS];
+static unsigned long cpu_last_switched[MAX_CPUS];
 
 static void
-process_sched_switch_event(struct raw_event_sample *raw, struct event *event,
-		  int cpu __used, u64 timestamp __used, struct thread *thread __used)
+replay_switch_event(struct trace_switch_event *switch_event,
+		    struct event *event,
+		    int cpu,
+		    u64 timestamp,
+		    struct thread *thread __used)
 {
-	struct trace_switch_event switch_event;
 	struct task_desc *prev, *next;
 	u64 timestamp0;
 	s64 delta;
 
-	FILL_COMMON_FIELDS(switch_event, event, raw->data);
-
-	FILL_ARRAY(switch_event, prev_comm, event, raw->data);
-	FILL_FIELD(switch_event, prev_pid, event, raw->data);
-	FILL_FIELD(switch_event, prev_prio, event, raw->data);
-	FILL_FIELD(switch_event, prev_state, event, raw->data);
-	FILL_ARRAY(switch_event, next_comm, event, raw->data);
-	FILL_FIELD(switch_event, next_pid, event, raw->data);
-	FILL_FIELD(switch_event, next_prio, event, raw->data);
-
 	if (verbose)
 		printf("sched_switch event %p\n", event);
 
@@ -783,38 +810,94 @@ process_sched_switch_event(struct raw_event_sample *raw, struct event *event,
 
 	if (verbose) {
 		printf(" ... switch from %s/%d to %s/%d [ran %Ld nsecs]\n",
-			switch_event.prev_comm, switch_event.prev_pid,
-			switch_event.next_comm, switch_event.next_pid,
+			switch_event->prev_comm, switch_event->prev_pid,
+			switch_event->next_comm, switch_event->next_pid,
 			delta);
 	}
 
-	prev = register_pid(switch_event.prev_pid, switch_event.prev_comm);
-	next = register_pid(switch_event.next_pid, switch_event.next_comm);
+	prev = register_pid(switch_event->prev_pid, switch_event->prev_comm);
+	next = register_pid(switch_event->next_pid, switch_event->next_comm);
 
 	cpu_last_switched[cpu] = timestamp;
 
 	add_sched_event_run(prev, timestamp, delta);
-	add_sched_event_sleep(prev, timestamp, switch_event.prev_state);
+	add_sched_event_sleep(prev, timestamp, switch_event->prev_state);
 }
 
-struct trace_fork_event {
-	u32 size;
 
-	u16 common_type;
-	u8 common_flags;
-	u8 common_preempt_count;
-	u32 common_pid;
-	u32 common_tgid;
+static void
+replay_fork_event(struct trace_fork_event *fork_event,
+		  struct event *event,
+		  int cpu __used,
+		  u64 timestamp __used,
+		  struct thread *thread __used)
+{
+	if (verbose) {
+		printf("sched_fork event %p\n", event);
+		printf("... parent: %s/%d\n", fork_event->parent_comm, fork_event->parent_pid);
+		printf("...  child: %s/%d\n", fork_event->child_comm, fork_event->child_pid);
+	}
+	register_pid(fork_event->parent_pid, fork_event->parent_comm);
+	register_pid(fork_event->child_pid, fork_event->child_comm);
+}
 
-	char parent_comm[16];
-	u32 parent_pid;
-	char child_comm[16];
-	u32 child_pid;
+static struct trace_sched_handler replay_ops  = {
+	.wakeup_event = replay_wakeup_event,
+	.switch_event = replay_switch_event,
+	.fork_event = replay_fork_event,
 };
 
+
+static struct trace_sched_handler *trace_handler;
+
 static void
-process_sched_fork_event(struct raw_event_sample *raw, struct event *event,
-		  int cpu __used, u64 timestamp __used, struct thread *thread __used)
+process_sched_wakeup_event(struct raw_event_sample *raw,
+			   struct event *event,
+			   int cpu __used,
+			   u64 timestamp __used,
+			   struct thread *thread __used)
+{
+	struct trace_wakeup_event wakeup_event;
+
+	FILL_COMMON_FIELDS(wakeup_event, event, raw->data);
+
+	FILL_ARRAY(wakeup_event, comm, event, raw->data);
+	FILL_FIELD(wakeup_event, pid, event, raw->data);
+	FILL_FIELD(wakeup_event, prio, event, raw->data);
+	FILL_FIELD(wakeup_event, success, event, raw->data);
+	FILL_FIELD(wakeup_event, cpu, event, raw->data);
+
+	trace_handler->wakeup_event(&wakeup_event, event, cpu, timestamp, thread);
+}
+
+static void
+process_sched_switch_event(struct raw_event_sample *raw,
+			   struct event *event,
+			   int cpu __used,
+			   u64 timestamp __used,
+			   struct thread *thread __used)
+{
+	struct trace_switch_event switch_event;
+
+	FILL_COMMON_FIELDS(switch_event, event, raw->data);
+
+	FILL_ARRAY(switch_event, prev_comm, event, raw->data);
+	FILL_FIELD(switch_event, prev_pid, event, raw->data);
+	FILL_FIELD(switch_event, prev_prio, event, raw->data);
+	FILL_FIELD(switch_event, prev_state, event, raw->data);
+	FILL_ARRAY(switch_event, next_comm, event, raw->data);
+	FILL_FIELD(switch_event, next_pid, event, raw->data);
+	FILL_FIELD(switch_event, next_prio, event, raw->data);
+
+	trace_handler->switch_event(&switch_event, event, cpu, timestamp, thread);
+}
+
+static void
+process_sched_fork_event(struct raw_event_sample *raw,
+			 struct event *event,
+			 int cpu __used,
+			 u64 timestamp __used,
+			 struct thread *thread __used)
 {
 	struct trace_fork_event fork_event;
 
@@ -825,17 +908,14 @@ process_sched_fork_event(struct raw_event_sample *raw, struct event *event,
 	FILL_ARRAY(fork_event, child_comm, event, raw->data);
 	FILL_FIELD(fork_event, child_pid, event, raw->data);
 
-	if (verbose) {
-		printf("sched_fork event %p\n", event);
-		printf("... parent: %s/%d\n", fork_event.parent_comm, fork_event.parent_pid);
-		printf("...  child: %s/%d\n", fork_event.child_comm, fork_event.child_pid);
-	}
-	register_pid(fork_event.parent_pid, fork_event.parent_comm);
-	register_pid(fork_event.child_pid, fork_event.child_comm);
+	trace_handler->fork_event(&fork_event, event, cpu, timestamp, thread);
 }
 
-static void process_sched_exit_event(struct event *event,
-		  int cpu __used, u64 timestamp __used, struct thread *thread __used)
+static void
+process_sched_exit_event(struct event *event,
+			 int cpu __used,
+			 u64 timestamp __used,
+			 struct thread *thread __used)
 {
 	if (verbose)
 		printf("sched_exit event %p\n", event);
@@ -1072,6 +1152,8 @@ static const char * const annotate_usage[] = {
 static const struct option options[] = {
 	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
 		    "dump raw trace in ASCII"),
+	OPT_BOOLEAN('r', "replay", &replay_mode,
+		    "replay sched behaviour from traces"),
 	OPT_BOOLEAN('v', "verbose", &verbose,
 		    "be more verbose (show symbol address, etc)"),
 	OPT_END()
@@ -1096,6 +1178,11 @@ int cmd_sched(int argc, const char **argv, const char *prefix __used)
 
 //	setup_pager();
 
+	if (replay_mode)
+		trace_handler = &replay_ops;
+	else /* We may need a default subcommand */
+		die("Please select a sub command (-r)\n");
+
 	calibrate_run_measurement_overhead();
 	calibrate_sleep_measurement_overhead();
 
-- 
cgit v1.2.3


From cdce9d738b91e1ec686e4ef6ca46d46e2891e208 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 12 Sep 2009 08:06:14 +0200
Subject: perf sched: Add sched latency profiling

Add the -l --latency option that reports statistics about the
scheduler latencies.

For now, the latencies are measured in the following sequence
scope:

- task A is sleeping (D or S state)
- task B wakes up A
         ^
         |
         |

   latency timeframe

         |
         |
         v
- task A is scheduled in

Start by recording every scheduler events:

	perf record -e sched:*

and then fetch the results:

	perf sched -l

 Tasks                     count          total              avg            max

migration/0                  2             39849            19924           28826
ksoftirqd/0                  7            756383           108054          373014
migration/1                  5             45391             9078           10452
ksoftirqd/1                  2            399055           199527          359130
events/0                     8           4780110           597513         4500250
events/1                     9           6353057           705895         2986012
kblockd/0                   42          37805097           900121         5077684

The snapshot are in nanoseconds.

- Count: number of snapshots taken for the given task
- Total: total latencies in nanosec
- Avg  : average of latency between wake up and sched in
- Max  : max snapshot latency

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-sched.c | 296 +++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 285 insertions(+), 11 deletions(-)

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 756fe62eb046..4f9e943181a3 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -31,6 +31,7 @@ static struct perf_header	*header;
 static u64			sample_type;
 
 static int			replay_mode;
+static int			lat_mode;
 
 
 /*
@@ -847,6 +848,269 @@ static struct trace_sched_handler replay_ops  = {
 	.fork_event = replay_fork_event,
 };
 
+#define TASK_STATE_TO_CHAR_STR "RSDTtZX"
+
+enum thread_state {
+	THREAD_SLEEPING,
+	THREAD_WAKED_UP,
+	THREAD_SCHED_IN,
+	THREAD_IGNORE
+};
+
+struct lat_snapshot {
+	struct list_head	list;
+	enum thread_state	state;
+	u64			wake_up_time;
+	u64			sched_in_time;
+};
+
+struct thread_latency {
+	struct list_head	snapshot_list;
+	struct thread		*thread;
+	struct rb_node		node;
+};
+
+static struct rb_root lat_snapshot_root;
+
+static struct thread_latency *
+thread_latency_search(struct rb_root *root, struct thread *thread)
+{
+	struct rb_node *node = root->rb_node;
+
+	while (node) {
+		struct thread_latency *lat;
+
+		lat = container_of(node, struct thread_latency, node);
+		if (thread->pid < lat->thread->pid)
+			node = node->rb_left;
+		else if (thread->pid > lat->thread->pid)
+			node = node->rb_right;
+		else {
+			return lat;
+		}
+	}
+	return NULL;
+}
+
+static void
+__thread_latency_insert(struct rb_root *root, struct thread_latency *data)
+{
+	struct rb_node **new = &(root->rb_node), *parent = NULL;
+
+	while (*new) {
+		struct thread_latency *this;
+
+		this = container_of(*new, struct thread_latency, node);
+		parent = *new;
+		if (data->thread->pid < this->thread->pid)
+			new = &((*new)->rb_left);
+		else if (data->thread->pid > this->thread->pid)
+			new = &((*new)->rb_right);
+		else
+			die("Double thread insertion\n");
+	}
+
+	rb_link_node(&data->node, parent, new);
+	rb_insert_color(&data->node, root);
+}
+
+static void thread_latency_insert(struct thread *thread)
+{
+	struct thread_latency *lat;
+	lat = calloc(sizeof(*lat), 1);
+	if (!lat)
+		die("No memory");
+
+	lat->thread = thread;
+	INIT_LIST_HEAD(&lat->snapshot_list);
+	__thread_latency_insert(&lat_snapshot_root, lat);
+}
+
+static void
+latency_fork_event(struct trace_fork_event *fork_event __used,
+		   struct event *event __used,
+		   int cpu __used,
+		   u64 timestamp __used,
+		   struct thread *thread __used)
+{
+	/* should insert the newcomer */
+}
+
+static char sched_out_state(struct trace_switch_event *switch_event)
+{
+	const char *str = TASK_STATE_TO_CHAR_STR;
+
+	return str[switch_event->prev_state];
+}
+
+static void
+lat_sched_out(struct thread_latency *lat,
+	     struct trace_switch_event *switch_event)
+{
+	struct lat_snapshot *snapshot;
+
+	if (sched_out_state(switch_event) == 'R')
+		return;
+
+	snapshot = calloc(sizeof(*snapshot), 1);
+	if (!snapshot)
+		die("Non memory");
+
+	list_add_tail(&snapshot->list, &lat->snapshot_list);
+}
+
+static void
+lat_sched_in(struct thread_latency *lat, u64 timestamp)
+{
+	struct lat_snapshot *snapshot;
+
+	if (list_empty(&lat->snapshot_list))
+		return;
+
+	snapshot = list_entry(lat->snapshot_list.prev, struct lat_snapshot,
+			      list);
+
+	if (snapshot->state != THREAD_WAKED_UP)
+		return;
+
+	if (timestamp < snapshot->wake_up_time) {
+		snapshot->state = THREAD_IGNORE;
+		return;
+	}
+
+	snapshot->state = THREAD_SCHED_IN;
+	snapshot->sched_in_time = timestamp;
+}
+
+
+static void
+latency_switch_event(struct trace_switch_event *switch_event,
+		     struct event *event __used,
+		     int cpu __used,
+		     u64 timestamp,
+		     struct thread *thread __used)
+{
+	struct thread_latency *out_lat, *in_lat;
+	struct thread *sched_out, *sched_in;
+
+	sched_out = threads__findnew(switch_event->prev_pid, &threads, &last_match);
+	sched_in = threads__findnew(switch_event->next_pid, &threads, &last_match);
+
+	in_lat = thread_latency_search(&lat_snapshot_root, sched_in);
+	if (!in_lat) {
+		thread_latency_insert(sched_in);
+		in_lat = thread_latency_search(&lat_snapshot_root, sched_in);
+		if (!in_lat)
+			die("Internal latency tree error");
+	}
+
+	out_lat = thread_latency_search(&lat_snapshot_root, sched_out);
+	if (!out_lat) {
+		thread_latency_insert(sched_out);
+		out_lat = thread_latency_search(&lat_snapshot_root, sched_out);
+		if (!out_lat)
+			die("Internal latency tree error");
+	}
+
+	lat_sched_in(in_lat, timestamp);
+	lat_sched_out(out_lat, switch_event);
+}
+
+static void
+latency_wakeup_event(struct trace_wakeup_event *wakeup_event,
+		     struct event *event __used,
+		     int cpu __used,
+		     u64 timestamp,
+		     struct thread *thread __used)
+{
+	struct thread_latency *lat;
+	struct lat_snapshot *snapshot;
+	struct thread *wakee;
+
+	/* Note for later, it may be interesting to observe the failing cases */
+	if (!wakeup_event->success)
+		return;
+
+	wakee = threads__findnew(wakeup_event->pid, &threads, &last_match);
+	lat = thread_latency_search(&lat_snapshot_root, wakee);
+	if (!lat) {
+		thread_latency_insert(wakee);
+		return;
+	}
+
+	if (list_empty(&lat->snapshot_list))
+		return;
+
+	snapshot = list_entry(lat->snapshot_list.prev, struct lat_snapshot,
+			      list);
+
+	if (snapshot->state != THREAD_SLEEPING)
+		return;
+
+	snapshot->state = THREAD_WAKED_UP;
+	snapshot->wake_up_time = timestamp;
+}
+
+static struct trace_sched_handler lat_ops  = {
+	.wakeup_event = latency_wakeup_event,
+	.switch_event = latency_switch_event,
+	.fork_event = latency_fork_event,
+};
+
+static void output_lat_thread(struct thread_latency *lat)
+{
+	struct lat_snapshot *shot;
+	int count = 0;
+	int i;
+	int ret;
+	u64 max = 0, avg;
+	u64 total = 0, delta;
+
+	list_for_each_entry(shot, &lat->snapshot_list, list) {
+		if (shot->state != THREAD_SCHED_IN)
+			continue;
+
+		count++;
+
+		delta = shot->sched_in_time - shot->wake_up_time;
+		if (delta > max)
+			max = delta;
+		total += delta;
+	}
+
+	if (!count)
+		return;
+
+	ret = printf("%s", lat->thread->comm);
+
+	for (i = 0; i < 25 - ret; i++)
+		printf(" ");
+
+	avg = total / count;
+
+	printf("%5d        %10llu       %10llu      %10llu\n", count, total, avg, max);
+}
+
+static void output_lat_results(void)
+{
+	struct rb_node *next;
+
+	printf(" Tasks");
+	printf("                     count");
+	printf("          total");
+	printf("              avg");
+	printf("            max\n\n");
+
+	next = rb_first(&lat_snapshot_root);
+
+	while (next) {
+		struct thread_latency *lat;
+
+		lat = rb_entry(next, struct thread_latency, node);
+		output_lat_thread(lat);
+		next = rb_next(next);
+	}
+}
 
 static struct trace_sched_handler *trace_handler;
 
@@ -1154,6 +1418,8 @@ static const struct option options[] = {
 		    "dump raw trace in ASCII"),
 	OPT_BOOLEAN('r', "replay", &replay_mode,
 		    "replay sched behaviour from traces"),
+	OPT_BOOLEAN('l', "latency", &lat_mode,
+		    "measure various latencies"),
 	OPT_BOOLEAN('v', "verbose", &verbose,
 		    "be more verbose (show symbol address, etc)"),
 	OPT_END()
@@ -1180,22 +1446,30 @@ int cmd_sched(int argc, const char **argv, const char *prefix __used)
 
 	if (replay_mode)
 		trace_handler = &replay_ops;
-	else /* We may need a default subcommand */
+	else if (lat_mode)
+		trace_handler = &lat_ops;
+	else /* We may need a default subcommand (perf trace?) */
 		die("Please select a sub command (-r)\n");
 
-	calibrate_run_measurement_overhead();
-	calibrate_sleep_measurement_overhead();
+	if (replay_mode) {
+		calibrate_run_measurement_overhead();
+		calibrate_sleep_measurement_overhead();
 
-	test_calibrations();
+		test_calibrations();
 
-	parse_trace();
-	print_task_traces();
-	add_cross_task_wakeups();
+		parse_trace();
+		print_task_traces();
+		add_cross_task_wakeups();
 
-	create_tasks();
-	printf("------------------------------------------------------------\n");
-	for (i = 0; i < nr_iterations; i++)
-		run_one_test();
+		create_tasks();
+		printf("------------------------------------------------------------\n");
+		for (i = 0; i < nr_iterations; i++)
+			run_one_test();
+	} else if (lat_mode) {
+		setup_pager();
+		__cmd_sched();
+		output_lat_results();
+	}
 
 	return 0;
 }
-- 
cgit v1.2.3


From 46f392c97f9fd772426ed3361c5179a0d44b8c3f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 12 Sep 2009 10:08:34 +0200
Subject: perf sched: Clean up latency and replay sub-commands

- Separate the latency and the replay commands more cleanly

 - Use consistent naming

 - Display help page on 'perf sched' outlining comments,
   instead of aborting

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-sched.c | 99 +++++++++++++++++++++++-----------------------
 1 file changed, 49 insertions(+), 50 deletions(-)

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 4f9e943181a3..84699cf036ab 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -543,24 +543,7 @@ static void wait_for_tasks(void)
 	}
 }
 
-static int __cmd_sched(void);
-
-static void parse_trace(void)
-{
-	__cmd_sched();
-
-	printf("nr_run_events:        %ld\n", nr_run_events);
-	printf("nr_sleep_events:      %ld\n", nr_sleep_events);
-	printf("nr_wakeup_events:     %ld\n", nr_wakeup_events);
-
-	if (targetless_wakeups)
-		printf("target-less wakeups:  %ld\n", targetless_wakeups);
-	if (multitarget_wakeups)
-		printf("multi-target wakeups: %ld\n", multitarget_wakeups);
-	if (nr_run_events_optimized)
-		printf("run events optimized: %ld\n",
-			nr_run_events_optimized);
-}
+static int read_events(void);
 
 static unsigned long nr_runs;
 static nsec_t sum_runtime;
@@ -637,6 +620,38 @@ static void test_calibrations(void)
 	printf("the sleep test took %Ld nsecs\n", T1-T0);
 }
 
+static void __cmd_replay(void)
+{
+	long nr_iterations = 10, i;
+
+	calibrate_run_measurement_overhead();
+	calibrate_sleep_measurement_overhead();
+
+	test_calibrations();
+
+	read_events();
+
+	printf("nr_run_events:        %ld\n", nr_run_events);
+	printf("nr_sleep_events:      %ld\n", nr_sleep_events);
+	printf("nr_wakeup_events:     %ld\n", nr_wakeup_events);
+
+	if (targetless_wakeups)
+		printf("target-less wakeups:  %ld\n", targetless_wakeups);
+	if (multitarget_wakeups)
+		printf("multi-target wakeups: %ld\n", multitarget_wakeups);
+	if (nr_run_events_optimized)
+		printf("run events optimized: %ld\n",
+			nr_run_events_optimized);
+
+	print_task_traces();
+	add_cross_task_wakeups();
+
+	create_tasks();
+	printf("------------------------------------------------------------\n");
+	for (i = 0; i < nr_iterations; i++)
+		run_one_test();
+}
+
 static int
 process_comm_event(event_t *event, unsigned long offset, unsigned long head)
 {
@@ -1091,10 +1106,13 @@ static void output_lat_thread(struct thread_latency *lat)
 	printf("%5d        %10llu       %10llu      %10llu\n", count, total, avg, max);
 }
 
-static void output_lat_results(void)
+static void __cmd_lat(void)
 {
 	struct rb_node *next;
 
+	setup_pager();
+	read_events();
+
 	printf(" Tasks");
 	printf("                     count");
 	printf("          total");
@@ -1312,7 +1330,7 @@ process_event(event_t *event, unsigned long offset, unsigned long head)
 	return 0;
 }
 
-static int __cmd_sched(void)
+static int read_events(void)
 {
 	int ret, rc = EXIT_FAILURE;
 	unsigned long offset = 0;
@@ -1408,8 +1426,8 @@ more:
 	return rc;
 }
 
-static const char * const annotate_usage[] = {
-	"perf trace [<options>] <command>",
+static const char * const sched_usage[] = {
+	"perf sched [<options>] <command>",
 	NULL
 };
 
@@ -1427,49 +1445,30 @@ static const struct option options[] = {
 
 int cmd_sched(int argc, const char **argv, const char *prefix __used)
 {
-	long nr_iterations = 10, i;
-
 	symbol__init();
 	page_size = getpagesize();
 
-	argc = parse_options(argc, argv, options, annotate_usage, 0);
+	argc = parse_options(argc, argv, options, sched_usage, 0);
 	if (argc) {
 		/*
 		 * Special case: if there's an argument left then assume tha
 		 * it's a symbol filter:
 		 */
 		if (argc > 1)
-			usage_with_options(annotate_usage, options);
+			usage_with_options(sched_usage, options);
 	}
 
-//	setup_pager();
-
 	if (replay_mode)
 		trace_handler = &replay_ops;
 	else if (lat_mode)
 		trace_handler = &lat_ops;
-	else /* We may need a default subcommand (perf trace?) */
-		die("Please select a sub command (-r)\n");
-
-	if (replay_mode) {
-		calibrate_run_measurement_overhead();
-		calibrate_sleep_measurement_overhead();
-
-		test_calibrations();
-
-		parse_trace();
-		print_task_traces();
-		add_cross_task_wakeups();
-
-		create_tasks();
-		printf("------------------------------------------------------------\n");
-		for (i = 0; i < nr_iterations; i++)
-			run_one_test();
-	} else if (lat_mode) {
-		setup_pager();
-		__cmd_sched();
-		output_lat_results();
-	}
+	else
+		usage_with_options(sched_usage, options);
+
+	if (replay_mode)
+		__cmd_replay();
+	else if (lat_mode)
+		__cmd_lat();
 
 	return 0;
 }
-- 
cgit v1.2.3


From d9340c1db3f52460a8335eeb127a2728c5bba6ce Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 12 Sep 2009 10:08:34 +0200
Subject: perf sched: Display time in milliseconds, reorganize output

After:

-----------------------------------------------------------------------------------
 Task              |  runtime ms | switches | average delay ms | maximum delay ms |
-----------------------------------------------------------------------------------
 migration/0       |    0.000 ms |        1 | avg:    0.047 ms | max:    0.047 ms |
 ksoftirqd/0       |    0.000 ms |        1 | avg:    0.039 ms | max:    0.039 ms |
 migration/1       |    0.000 ms |        3 | avg:    0.013 ms | max:    0.016 ms |
 migration/3       |    0.000 ms |        2 | avg:    0.003 ms | max:    0.004 ms |
 migration/4       |    0.000 ms |        1 | avg:    0.022 ms | max:    0.022 ms |
 distccd           |    0.000 ms |        1 | avg:    0.004 ms | max:    0.004 ms |
 distccd           |    0.000 ms |        1 | avg:    0.014 ms | max:    0.014 ms |
 distccd           |    0.000 ms |        2 | avg:    0.000 ms | max:    0.000 ms |
 distccd           |    0.000 ms |        2 | avg:    0.012 ms | max:    0.019 ms |
 distccd           |    0.000 ms |        1 | avg:    0.002 ms | max:    0.002 ms |
 as                |    0.000 ms |        2 | avg:    0.019 ms | max:    0.019 ms |
 as                |    0.000 ms |        3 | avg:    0.015 ms | max:    0.017 ms |
 as                |    0.000 ms |        1 | avg:    0.009 ms | max:    0.009 ms |
 perf              |    0.000 ms |        1 | avg:    0.001 ms | max:    0.001 ms |
 gcc               |    0.000 ms |        1 | avg:    0.021 ms | max:    0.021 ms |
 run-mozilla.sh    |    0.000 ms |        2 | avg:    0.010 ms | max:    0.017 ms |
 mozilla-plugin-   |    0.000 ms |        1 | avg:    0.006 ms | max:    0.006 ms |
 gcc               |    0.000 ms |        2 | avg:    0.013 ms | max:    0.013 ms |
-----------------------------------------------------------------------------------

(The runtime ms column is not filled in yet.)

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-sched.c         | 17 +++++++++--------
 tools/perf/util/trace-event-read.c |  6 ++++--
 2 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 84699cf036ab..a084c284e198 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -1096,14 +1096,15 @@ static void output_lat_thread(struct thread_latency *lat)
 	if (!count)
 		return;
 
-	ret = printf("%s", lat->thread->comm);
+	ret = printf(" %s ", lat->thread->comm);
 
-	for (i = 0; i < 25 - ret; i++)
+	for (i = 0; i < 19 - ret; i++)
 		printf(" ");
 
 	avg = total / count;
 
-	printf("%5d        %10llu       %10llu      %10llu\n", count, total, avg, max);
+	printf("|%9.3f ms |%9d | avg:%9.3f ms | max:%9.3f ms |\n",
+		0.0, count, (double)avg/1e9, (double)max/1e9);
 }
 
 static void __cmd_lat(void)
@@ -1113,11 +1114,9 @@ static void __cmd_lat(void)
 	setup_pager();
 	read_events();
 
-	printf(" Tasks");
-	printf("                     count");
-	printf("          total");
-	printf("              avg");
-	printf("            max\n\n");
+	printf("-----------------------------------------------------------------------------------\n");
+	printf(" Task              |  runtime ms | switches | average delay ms | maximum delay ms |\n");
+	printf("-----------------------------------------------------------------------------------\n");
 
 	next = rb_first(&lat_snapshot_root);
 
@@ -1128,6 +1127,8 @@ static void __cmd_lat(void)
 		output_lat_thread(lat);
 		next = rb_next(next);
 	}
+
+	printf("-----------------------------------------------------------------------------------\n");
 }
 
 static struct trace_sched_handler *trace_handler;
diff --git a/tools/perf/util/trace-event-read.c b/tools/perf/util/trace-event-read.c
index a1217a10632f..1b5c847d2c22 100644
--- a/tools/perf/util/trace-event-read.c
+++ b/tools/perf/util/trace-event-read.c
@@ -458,12 +458,13 @@ struct record *trace_read_data(int cpu)
 	return data;
 }
 
-void trace_report (void)
+void trace_report(void)
 {
 	const char *input_file = "trace.info";
 	char buf[BUFSIZ];
 	char test[] = { 23, 8, 68 };
 	char *version;
+	int show_version = 0;
 	int show_funcs = 0;
 	int show_printk = 0;
 
@@ -480,7 +481,8 @@ void trace_report (void)
 		die("not a trace file (missing tracing)");
 
 	version = read_string();
-	printf("version = %s\n", version);
+	if (show_version)
+		printf("version = %s\n", version);
 	free(version);
 
 	read_or_die(buf, 1);
-- 
cgit v1.2.3


From ea92ed5a8f4e6c638efe7de2efe8a875d580ad3f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 12 Sep 2009 10:08:34 +0200
Subject: perf sched: Add runtime stats

Extend the latency tracking structure with scheduling atom
runtime info - and sum it up during per task display.

(Also clean up a few details.)

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-sched.c | 58 ++++++++++++++++++++++++++++++----------------
 1 file changed, 38 insertions(+), 20 deletions(-)

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index a084c284e198..c382f530d4c6 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -243,8 +243,8 @@ add_sched_event_run(struct task_desc *task, nsec_t timestamp, u64 duration)
 	nr_run_events++;
 }
 
-static unsigned long targetless_wakeups;
-static unsigned long multitarget_wakeups;
+static unsigned long		targetless_wakeups;
+static unsigned long		multitarget_wakeups;
 
 static void
 add_sched_event_wakeup(struct task_desc *task, nsec_t timestamp,
@@ -485,10 +485,10 @@ static void create_tasks(void)
 	}
 }
 
-static nsec_t cpu_usage;
-static nsec_t runavg_cpu_usage;
-static nsec_t parent_cpu_usage;
-static nsec_t runavg_parent_cpu_usage;
+static nsec_t			cpu_usage;
+static nsec_t			runavg_cpu_usage;
+static nsec_t			parent_cpu_usage;
+static nsec_t			runavg_parent_cpu_usage;
 
 static void wait_for_tasks(void)
 {
@@ -858,9 +858,9 @@ replay_fork_event(struct trace_fork_event *fork_event,
 }
 
 static struct trace_sched_handler replay_ops  = {
-	.wakeup_event = replay_wakeup_event,
-	.switch_event = replay_switch_event,
-	.fork_event = replay_fork_event,
+	.wakeup_event		= replay_wakeup_event,
+	.switch_event		= replay_switch_event,
+	.fork_event		= replay_fork_event,
 };
 
 #define TASK_STATE_TO_CHAR_STR "RSDTtZX"
@@ -877,6 +877,7 @@ struct lat_snapshot {
 	enum thread_state	state;
 	u64			wake_up_time;
 	u64			sched_in_time;
+	u64			runtime;
 };
 
 struct thread_latency {
@@ -951,6 +952,7 @@ latency_fork_event(struct trace_fork_event *fork_event __used,
 	/* should insert the newcomer */
 }
 
+__used
 static char sched_out_state(struct trace_switch_event *switch_event)
 {
 	const char *str = TASK_STATE_TO_CHAR_STR;
@@ -960,17 +962,15 @@ static char sched_out_state(struct trace_switch_event *switch_event)
 
 static void
 lat_sched_out(struct thread_latency *lat,
-	     struct trace_switch_event *switch_event)
+	     struct trace_switch_event *switch_event __used, u64 delta)
 {
 	struct lat_snapshot *snapshot;
 
-	if (sched_out_state(switch_event) == 'R')
-		return;
-
 	snapshot = calloc(sizeof(*snapshot), 1);
 	if (!snapshot)
 		die("Non memory");
 
+	snapshot->runtime = delta;
 	list_add_tail(&snapshot->list, &lat->snapshot_list);
 }
 
@@ -997,16 +997,31 @@ lat_sched_in(struct thread_latency *lat, u64 timestamp)
 	snapshot->sched_in_time = timestamp;
 }
 
-
 static void
 latency_switch_event(struct trace_switch_event *switch_event,
 		     struct event *event __used,
-		     int cpu __used,
+		     int cpu,
 		     u64 timestamp,
 		     struct thread *thread __used)
 {
 	struct thread_latency *out_lat, *in_lat;
 	struct thread *sched_out, *sched_in;
+	u64 timestamp0;
+	s64 delta;
+
+	if (cpu >= MAX_CPUS || cpu < 0)
+		return;
+
+	timestamp0 = cpu_last_switched[cpu];
+	cpu_last_switched[cpu] = timestamp;
+	if (timestamp0)
+		delta = timestamp - timestamp0;
+	else
+		delta = 0;
+
+	if (delta < 0)
+		die("hm, delta: %Ld < 0 ?\n", delta);
+
 
 	sched_out = threads__findnew(switch_event->prev_pid, &threads, &last_match);
 	sched_in = threads__findnew(switch_event->next_pid, &threads, &last_match);
@@ -1028,7 +1043,7 @@ latency_switch_event(struct trace_switch_event *switch_event,
 	}
 
 	lat_sched_in(in_lat, timestamp);
-	lat_sched_out(out_lat, switch_event);
+	lat_sched_out(out_lat, switch_event, delta);
 }
 
 static void
@@ -1067,9 +1082,9 @@ latency_wakeup_event(struct trace_wakeup_event *wakeup_event,
 }
 
 static struct trace_sched_handler lat_ops  = {
-	.wakeup_event = latency_wakeup_event,
-	.switch_event = latency_switch_event,
-	.fork_event = latency_fork_event,
+	.wakeup_event		= latency_wakeup_event,
+	.switch_event		= latency_switch_event,
+	.fork_event		= latency_fork_event,
 };
 
 static void output_lat_thread(struct thread_latency *lat)
@@ -1080,8 +1095,11 @@ static void output_lat_thread(struct thread_latency *lat)
 	int ret;
 	u64 max = 0, avg;
 	u64 total = 0, delta;
+	u64 total_runtime = 0;
 
 	list_for_each_entry(shot, &lat->snapshot_list, list) {
+		total_runtime += shot->runtime;
+
 		if (shot->state != THREAD_SCHED_IN)
 			continue;
 
@@ -1104,7 +1122,7 @@ static void output_lat_thread(struct thread_latency *lat)
 	avg = total / count;
 
 	printf("|%9.3f ms |%9d | avg:%9.3f ms | max:%9.3f ms |\n",
-		0.0, count, (double)avg/1e9, (double)max/1e9);
+		(double)total_runtime/1e9, count, (double)avg/1e9, (double)max/1e9);
 }
 
 static void __cmd_lat(void)
-- 
cgit v1.2.3


From 3e304147cdb404ce6d1dd0e50cb19f52142bb363 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sat, 12 Sep 2009 10:08:34 +0200
Subject: perf sched: Output runtime and context switch totals

After:

-----------------------------------------------------------------------------------
 Task              |  Runtime ms | Switches | Average delay ms | Maximum delay ms |
-----------------------------------------------------------------------------------
 make              |    0.678 ms |       13 | avg:    0.018 ms | max:    0.050 ms |
 gcc               |    0.014 ms |        2 | avg:    0.320 ms | max:    0.627 ms |
 gcc               |    0.000 ms |        2 | avg:    0.185 ms | max:    0.369 ms |
...
-----------------------------------------------------------------------------------
 TOTAL:            |   21.316 ms |       63 |
---------------------------------------------

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-sched.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index c382f530d4c6..727cc5b852c2 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -1087,6 +1087,9 @@ static struct trace_sched_handler lat_ops  = {
 	.fork_event		= latency_fork_event,
 };
 
+static u64 all_runtime;
+static u64 all_count;
+
 static void output_lat_thread(struct thread_latency *lat)
 {
 	struct lat_snapshot *shot;
@@ -1111,6 +1114,9 @@ static void output_lat_thread(struct thread_latency *lat)
 		total += delta;
 	}
 
+	all_runtime += total_runtime;
+	all_count += count;
+
 	if (!count)
 		return;
 
@@ -1133,7 +1139,7 @@ static void __cmd_lat(void)
 	read_events();
 
 	printf("-----------------------------------------------------------------------------------\n");
-	printf(" Task              |  runtime ms | switches | average delay ms | maximum delay ms |\n");
+	printf(" Task              |  Runtime ms | Switches | Average delay ms | Maximum delay ms |\n");
 	printf("-----------------------------------------------------------------------------------\n");
 
 	next = rb_first(&lat_snapshot_root);
@@ -1147,6 +1153,9 @@ static void __cmd_lat(void)
 	}
 
 	printf("-----------------------------------------------------------------------------------\n");
+	printf(" TOTAL:            |%9.3f ms |%9Ld |\n",
+		(double)all_runtime/1e9, all_count);
+	printf("---------------------------------------------\n");
 }
 
 static struct trace_sched_handler *trace_handler;
-- 
cgit v1.2.3


From 175622053069afbd366ba3c6030b5af82f378d40 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sat, 12 Sep 2009 23:11:32 +0200
Subject: perf sched: Rename struct lat_snapshot to struct work atoms

To measures the latencies, we capture the sched atoms data into
a specific structure named struct lat_snapshot.

As this structure can be used for other purposes of scheduler
profiling and mirrors what happens in a thread work atom, lets
rename it to struct work_atom and propagate this renaming in
other functions and structures names to keep it coherent.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-sched.c | 112 ++++++++++++++++++++++-----------------------
 1 file changed, 56 insertions(+), 56 deletions(-)

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 727cc5b852c2..7e57a986c056 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -872,7 +872,7 @@ enum thread_state {
 	THREAD_IGNORE
 };
 
-struct lat_snapshot {
+struct work_atom {
 	struct list_head	list;
 	enum thread_state	state;
 	u64			wake_up_time;
@@ -880,7 +880,7 @@ struct lat_snapshot {
 	u64			runtime;
 };
 
-struct thread_latency {
+struct task_atoms {
 	struct list_head	snapshot_list;
 	struct thread		*thread;
 	struct rb_node		node;
@@ -888,35 +888,35 @@ struct thread_latency {
 
 static struct rb_root lat_snapshot_root;
 
-static struct thread_latency *
-thread_latency_search(struct rb_root *root, struct thread *thread)
+static struct task_atoms *
+thread_atom_list_search(struct rb_root *root, struct thread *thread)
 {
 	struct rb_node *node = root->rb_node;
 
 	while (node) {
-		struct thread_latency *lat;
+		struct task_atoms *atoms;
 
-		lat = container_of(node, struct thread_latency, node);
-		if (thread->pid < lat->thread->pid)
+		atoms = container_of(node, struct task_atoms, node);
+		if (thread->pid < atoms->thread->pid)
 			node = node->rb_left;
-		else if (thread->pid > lat->thread->pid)
+		else if (thread->pid > atoms->thread->pid)
 			node = node->rb_right;
 		else {
-			return lat;
+			return atoms;
 		}
 	}
 	return NULL;
 }
 
 static void
-__thread_latency_insert(struct rb_root *root, struct thread_latency *data)
+__thread_latency_insert(struct rb_root *root, struct task_atoms *data)
 {
 	struct rb_node **new = &(root->rb_node), *parent = NULL;
 
 	while (*new) {
-		struct thread_latency *this;
+		struct task_atoms *this;
 
-		this = container_of(*new, struct thread_latency, node);
+		this = container_of(*new, struct task_atoms, node);
 		parent = *new;
 		if (data->thread->pid < this->thread->pid)
 			new = &((*new)->rb_left);
@@ -930,16 +930,16 @@ __thread_latency_insert(struct rb_root *root, struct thread_latency *data)
 	rb_insert_color(&data->node, root);
 }
 
-static void thread_latency_insert(struct thread *thread)
+static void thread_atom_list_insert(struct thread *thread)
 {
-	struct thread_latency *lat;
-	lat = calloc(sizeof(*lat), 1);
-	if (!lat)
+	struct task_atoms *atoms;
+	atoms = calloc(sizeof(*atoms), 1);
+	if (!atoms)
 		die("No memory");
 
-	lat->thread = thread;
-	INIT_LIST_HEAD(&lat->snapshot_list);
-	__thread_latency_insert(&lat_snapshot_root, lat);
+	atoms->thread = thread;
+	INIT_LIST_HEAD(&atoms->snapshot_list);
+	__thread_latency_insert(&lat_snapshot_root, atoms);
 }
 
 static void
@@ -961,28 +961,28 @@ static char sched_out_state(struct trace_switch_event *switch_event)
 }
 
 static void
-lat_sched_out(struct thread_latency *lat,
+lat_sched_out(struct task_atoms *atoms,
 	     struct trace_switch_event *switch_event __used, u64 delta)
 {
-	struct lat_snapshot *snapshot;
+	struct work_atom *snapshot;
 
 	snapshot = calloc(sizeof(*snapshot), 1);
 	if (!snapshot)
 		die("Non memory");
 
 	snapshot->runtime = delta;
-	list_add_tail(&snapshot->list, &lat->snapshot_list);
+	list_add_tail(&snapshot->list, &atoms->snapshot_list);
 }
 
 static void
-lat_sched_in(struct thread_latency *lat, u64 timestamp)
+lat_sched_in(struct task_atoms *atoms, u64 timestamp)
 {
-	struct lat_snapshot *snapshot;
+	struct work_atom *snapshot;
 
-	if (list_empty(&lat->snapshot_list))
+	if (list_empty(&atoms->snapshot_list))
 		return;
 
-	snapshot = list_entry(lat->snapshot_list.prev, struct lat_snapshot,
+	snapshot = list_entry(atoms->snapshot_list.prev, struct work_atom,
 			      list);
 
 	if (snapshot->state != THREAD_WAKED_UP)
@@ -1004,7 +1004,7 @@ latency_switch_event(struct trace_switch_event *switch_event,
 		     u64 timestamp,
 		     struct thread *thread __used)
 {
-	struct thread_latency *out_lat, *in_lat;
+	struct task_atoms *out_atoms, *in_atoms;
 	struct thread *sched_out, *sched_in;
 	u64 timestamp0;
 	s64 delta;
@@ -1026,24 +1026,24 @@ latency_switch_event(struct trace_switch_event *switch_event,
 	sched_out = threads__findnew(switch_event->prev_pid, &threads, &last_match);
 	sched_in = threads__findnew(switch_event->next_pid, &threads, &last_match);
 
-	in_lat = thread_latency_search(&lat_snapshot_root, sched_in);
-	if (!in_lat) {
-		thread_latency_insert(sched_in);
-		in_lat = thread_latency_search(&lat_snapshot_root, sched_in);
-		if (!in_lat)
+	in_atoms = thread_atom_list_search(&lat_snapshot_root, sched_in);
+	if (!in_atoms) {
+		thread_atom_list_insert(sched_in);
+		in_atoms = thread_atom_list_search(&lat_snapshot_root, sched_in);
+		if (!in_atoms)
 			die("Internal latency tree error");
 	}
 
-	out_lat = thread_latency_search(&lat_snapshot_root, sched_out);
-	if (!out_lat) {
-		thread_latency_insert(sched_out);
-		out_lat = thread_latency_search(&lat_snapshot_root, sched_out);
-		if (!out_lat)
+	out_atoms = thread_atom_list_search(&lat_snapshot_root, sched_out);
+	if (!out_atoms) {
+		thread_atom_list_insert(sched_out);
+		out_atoms = thread_atom_list_search(&lat_snapshot_root, sched_out);
+		if (!out_atoms)
 			die("Internal latency tree error");
 	}
 
-	lat_sched_in(in_lat, timestamp);
-	lat_sched_out(out_lat, switch_event, delta);
+	lat_sched_in(in_atoms, timestamp);
+	lat_sched_out(out_atoms, switch_event, delta);
 }
 
 static void
@@ -1053,8 +1053,8 @@ latency_wakeup_event(struct trace_wakeup_event *wakeup_event,
 		     u64 timestamp,
 		     struct thread *thread __used)
 {
-	struct thread_latency *lat;
-	struct lat_snapshot *snapshot;
+	struct task_atoms *atoms;
+	struct work_atom *snapshot;
 	struct thread *wakee;
 
 	/* Note for later, it may be interesting to observe the failing cases */
@@ -1062,16 +1062,16 @@ latency_wakeup_event(struct trace_wakeup_event *wakeup_event,
 		return;
 
 	wakee = threads__findnew(wakeup_event->pid, &threads, &last_match);
-	lat = thread_latency_search(&lat_snapshot_root, wakee);
-	if (!lat) {
-		thread_latency_insert(wakee);
+	atoms = thread_atom_list_search(&lat_snapshot_root, wakee);
+	if (!atoms) {
+		thread_atom_list_insert(wakee);
 		return;
 	}
 
-	if (list_empty(&lat->snapshot_list))
+	if (list_empty(&atoms->snapshot_list))
 		return;
 
-	snapshot = list_entry(lat->snapshot_list.prev, struct lat_snapshot,
+	snapshot = list_entry(atoms->snapshot_list.prev, struct work_atom,
 			      list);
 
 	if (snapshot->state != THREAD_SLEEPING)
@@ -1090,9 +1090,9 @@ static struct trace_sched_handler lat_ops  = {
 static u64 all_runtime;
 static u64 all_count;
 
-static void output_lat_thread(struct thread_latency *lat)
+static void output_lat_thread(struct task_atoms *atom_list)
 {
-	struct lat_snapshot *shot;
+	struct work_atom *atom;
 	int count = 0;
 	int i;
 	int ret;
@@ -1100,15 +1100,15 @@ static void output_lat_thread(struct thread_latency *lat)
 	u64 total = 0, delta;
 	u64 total_runtime = 0;
 
-	list_for_each_entry(shot, &lat->snapshot_list, list) {
-		total_runtime += shot->runtime;
+	list_for_each_entry(atom, &atom_list->snapshot_list, list) {
+		total_runtime += atom->runtime;
 
-		if (shot->state != THREAD_SCHED_IN)
+		if (atom->state != THREAD_SCHED_IN)
 			continue;
 
 		count++;
 
-		delta = shot->sched_in_time - shot->wake_up_time;
+		delta = atom->sched_in_time - atom->wake_up_time;
 		if (delta > max)
 			max = delta;
 		total += delta;
@@ -1120,7 +1120,7 @@ static void output_lat_thread(struct thread_latency *lat)
 	if (!count)
 		return;
 
-	ret = printf(" %s ", lat->thread->comm);
+	ret = printf(" %s ", atom_list->thread->comm);
 
 	for (i = 0; i < 19 - ret; i++)
 		printf(" ");
@@ -1145,10 +1145,10 @@ static void __cmd_lat(void)
 	next = rb_first(&lat_snapshot_root);
 
 	while (next) {
-		struct thread_latency *lat;
+		struct task_atoms *atom_list;
 
-		lat = rb_entry(next, struct thread_latency, node);
-		output_lat_thread(lat);
+		atom_list = rb_entry(next, struct task_atoms, node);
+		output_lat_thread(atom_list);
 		next = rb_next(next);
 	}
 
-- 
cgit v1.2.3


From c6ced61112f1e6139914149fab65695801a74f0f Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 13 Sep 2009 00:46:19 +0200
Subject: perf sched: Add involuntarily sleeping task in work atoms

Currently in perf sched, we are measuring the scheduler wakeup
latencies.

Now we also want measure the time a task wait to be scheduled
after it gets preempted.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-sched.c | 19 +++++++++++++------
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 7e57a986c056..61a80e8c9d0d 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -866,8 +866,8 @@ static struct trace_sched_handler replay_ops  = {
 #define TASK_STATE_TO_CHAR_STR "RSDTtZX"
 
 enum thread_state {
-	THREAD_SLEEPING,
-	THREAD_WAKED_UP,
+	THREAD_SLEEPING = 0,
+	THREAD_WAIT_CPU,
 	THREAD_SCHED_IN,
 	THREAD_IGNORE
 };
@@ -962,7 +962,9 @@ static char sched_out_state(struct trace_switch_event *switch_event)
 
 static void
 lat_sched_out(struct task_atoms *atoms,
-	     struct trace_switch_event *switch_event __used, u64 delta)
+	      struct trace_switch_event *switch_event __used,
+	      u64 delta,
+	      u64 timestamp)
 {
 	struct work_atom *snapshot;
 
@@ -970,6 +972,11 @@ lat_sched_out(struct task_atoms *atoms,
 	if (!snapshot)
 		die("Non memory");
 
+	if (sched_out_state(switch_event) == 'R') {
+		snapshot->state = THREAD_WAIT_CPU;
+		snapshot->wake_up_time = timestamp;
+	}
+
 	snapshot->runtime = delta;
 	list_add_tail(&snapshot->list, &atoms->snapshot_list);
 }
@@ -985,7 +992,7 @@ lat_sched_in(struct task_atoms *atoms, u64 timestamp)
 	snapshot = list_entry(atoms->snapshot_list.prev, struct work_atom,
 			      list);
 
-	if (snapshot->state != THREAD_WAKED_UP)
+	if (snapshot->state != THREAD_WAIT_CPU)
 		return;
 
 	if (timestamp < snapshot->wake_up_time) {
@@ -1043,7 +1050,7 @@ latency_switch_event(struct trace_switch_event *switch_event,
 	}
 
 	lat_sched_in(in_atoms, timestamp);
-	lat_sched_out(out_atoms, switch_event, delta);
+	lat_sched_out(out_atoms, switch_event, delta, timestamp);
 }
 
 static void
@@ -1077,7 +1084,7 @@ latency_wakeup_event(struct trace_wakeup_event *wakeup_event,
 	if (snapshot->state != THREAD_SLEEPING)
 		return;
 
-	snapshot->state = THREAD_WAKED_UP;
+	snapshot->state = THREAD_WAIT_CPU;
 	snapshot->wake_up_time = timestamp;
 }
 
-- 
cgit v1.2.3


From 66685678a03d0d8e8ba015472f280fb4f12f84c1 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 13 Sep 2009 01:56:25 +0200
Subject: perf sched: Export the total, max latency and total runtime to thread
 atoms list

Add a field in the thread atom list that keeps track of the
total and max latencies and also the total runtime. This makes
a faster output and also prepares for sorting.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-sched.c | 48 +++++++++++++++++++++-------------------------
 1 file changed, 22 insertions(+), 26 deletions(-)

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 61a80e8c9d0d..435702702789 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -884,6 +884,10 @@ struct task_atoms {
 	struct list_head	snapshot_list;
 	struct thread		*thread;
 	struct rb_node		node;
+	u64			max_lat;
+	u64			total_lat;
+	u64			nb_atoms;
+	u64			total_runtime;
 };
 
 static struct rb_root lat_snapshot_root;
@@ -985,6 +989,7 @@ static void
 lat_sched_in(struct task_atoms *atoms, u64 timestamp)
 {
 	struct work_atom *snapshot;
+	u64 delta;
 
 	if (list_empty(&atoms->snapshot_list))
 		return;
@@ -1002,6 +1007,13 @@ lat_sched_in(struct task_atoms *atoms, u64 timestamp)
 
 	snapshot->state = THREAD_SCHED_IN;
 	snapshot->sched_in_time = timestamp;
+
+	delta = snapshot->sched_in_time - snapshot->wake_up_time;
+	atoms->total_lat += delta;
+	if (delta > atoms->max_lat)
+		atoms->max_lat = delta;
+	atoms->nb_atoms++;
+	atoms->total_runtime += snapshot->runtime;
 }
 
 static void
@@ -1099,43 +1111,27 @@ static u64 all_count;
 
 static void output_lat_thread(struct task_atoms *atom_list)
 {
-	struct work_atom *atom;
-	int count = 0;
 	int i;
 	int ret;
-	u64 max = 0, avg;
-	u64 total = 0, delta;
-	u64 total_runtime = 0;
-
-	list_for_each_entry(atom, &atom_list->snapshot_list, list) {
-		total_runtime += atom->runtime;
-
-		if (atom->state != THREAD_SCHED_IN)
-			continue;
-
-		count++;
+	u64 avg;
 
-		delta = atom->sched_in_time - atom->wake_up_time;
-		if (delta > max)
-			max = delta;
-		total += delta;
-	}
-
-	all_runtime += total_runtime;
-	all_count += count;
-
-	if (!count)
+	if (!atom_list->nb_atoms)
 		return;
 
+	all_runtime += atom_list->total_runtime;
+	all_count += atom_list->nb_atoms;
+
 	ret = printf(" %s ", atom_list->thread->comm);
 
 	for (i = 0; i < 19 - ret; i++)
 		printf(" ");
 
-	avg = total / count;
+	avg = atom_list->total_lat / atom_list->nb_atoms;
 
-	printf("|%9.3f ms |%9d | avg:%9.3f ms | max:%9.3f ms |\n",
-		(double)total_runtime/1e9, count, (double)avg/1e9, (double)max/1e9);
+	printf("|%9.3f ms |%9llu | avg:%9.3f ms | max:%9.3f ms |\n",
+	      (double)atom_list->total_runtime / 1e9,
+		 atom_list->nb_atoms, (double)avg / 1e9,
+		 (double)atom_list->max_lat / 1e9);
 }
 
 static void __cmd_lat(void)
-- 
cgit v1.2.3


From 7362262687b21b0d04927a7615c162a3d064849e Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 13 Sep 2009 01:59:05 +0200
Subject: perf sched: Fix nsec to msec conversion

We are dividing a time in ns by 1e9. This is a nsec to sec
conversion. What we want is msecs. Fix it by dividing by 1e6.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-sched.c | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 435702702789..67a0ba88aecf 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -1129,9 +1129,9 @@ static void output_lat_thread(struct task_atoms *atom_list)
 	avg = atom_list->total_lat / atom_list->nb_atoms;
 
 	printf("|%9.3f ms |%9llu | avg:%9.3f ms | max:%9.3f ms |\n",
-	      (double)atom_list->total_runtime / 1e9,
-		 atom_list->nb_atoms, (double)avg / 1e9,
-		 (double)atom_list->max_lat / 1e9);
+	      (double)atom_list->total_runtime / 1e6,
+		 atom_list->nb_atoms, (double)avg / 1e6,
+		 (double)atom_list->max_lat / 1e6);
 }
 
 static void __cmd_lat(void)
@@ -1157,7 +1157,7 @@ static void __cmd_lat(void)
 
 	printf("-----------------------------------------------------------------------------------\n");
 	printf(" TOTAL:            |%9.3f ms |%9Ld |\n",
-		(double)all_runtime/1e9, all_count);
+		(double)all_runtime/1e6, all_count);
 	printf("---------------------------------------------\n");
 }
 
-- 
cgit v1.2.3


From daa1d7a5eafc0a3a91a9add6a9a9f1bcaed63108 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Sun, 13 Sep 2009 03:36:29 +0200
Subject: perf sched: Implement multidimensional sorting

Implement multidimensional sorting on perf sched so that
you can sort either by number of switches, latency average,
latency maximum, runtime.

perf sched -l -s avg,max  (this is the default)

-----------------------------------------------------------------------------------
 Task              |  Runtime ms | Switches | Average delay ms | Maximum delay ms |
-----------------------------------------------------------------------------------
 gnome-power-man   |    0.113 ms |        1 | avg: 4998.531 ms | max: 4998.531 ms |
 xfdesktop         |    1.190 ms |        7 | avg:  136.475 ms | max:  940.933 ms |
 xfce-mcs-manage   |    2.194 ms |       22 | avg:   38.534 ms | max:  735.174 ms |
 notification-da   |    2.749 ms |       31 | avg:   27.436 ms | max:  731.791 ms |
 xfce4-session     |    3.343 ms |       28 | avg:   26.796 ms | max:  734.891 ms |
 xfwm4             |    3.159 ms |       22 | avg:   12.406 ms | max:  241.333 ms |
 xchat             |   42.789 ms |      214 | avg:   11.886 ms | max:  100.349 ms |
 xfce4-terminal    |    5.386 ms |       22 | avg:   11.414 ms | max:  241.611 ms |
 firefox           |  151.992 ms |      123 | avg:    9.543 ms | max:  153.717 ms |
 xfce4-panel       |   24.324 ms |       47 | avg:    8.189 ms | max:  242.352 ms |
 :5090             |    6.932 ms |      111 | avg:    8.131 ms | max:  102.665 ms |
 events/0          |    0.758 ms |       12 | avg:    1.964 ms | max:   21.879 ms |
 Xorg              |  280.558 ms |      340 | avg:    1.864 ms | max:   99.526 ms |
 geany             |   63.391 ms |      295 | avg:    1.099 ms | max:    9.334 ms |
 reiserfs/0        |    0.039 ms |        2 | avg:    0.854 ms | max:    1.487 ms |
 kondemand/0       |    8.251 ms |      245 | avg:    0.691 ms | max:   34.372 ms |

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-sched.c | 206 ++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 196 insertions(+), 10 deletions(-)

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 67a0ba88aecf..10fcd49e298b 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -33,6 +33,9 @@ static u64			sample_type;
 static int			replay_mode;
 static int			lat_mode;
 
+static char			default_sort_order[] = "avg, max, switch, runtime";
+static char			*sort_order = default_sort_order;
+
 
 /*
  * Scheduler benchmarks
@@ -890,7 +893,17 @@ struct task_atoms {
 	u64			total_runtime;
 };
 
-static struct rb_root lat_snapshot_root;
+typedef int (*sort_thread_lat)(struct task_atoms *, struct task_atoms *);
+
+struct sort_dimension {
+	const char 		*name;
+	sort_thread_lat		cmp;
+	struct list_head 	list;
+};
+
+static LIST_HEAD(cmp_pid);
+
+static struct rb_root lat_snapshot_root, sorted_lat_snapshot_root;
 
 static struct task_atoms *
 thread_atom_list_search(struct rb_root *root, struct thread *thread)
@@ -901,9 +914,9 @@ thread_atom_list_search(struct rb_root *root, struct thread *thread)
 		struct task_atoms *atoms;
 
 		atoms = container_of(node, struct task_atoms, node);
-		if (thread->pid < atoms->thread->pid)
+		if (thread->pid > atoms->thread->pid)
 			node = node->rb_left;
-		else if (thread->pid > atoms->thread->pid)
+		else if (thread->pid < atoms->thread->pid)
 			node = node->rb_right;
 		else {
 			return atoms;
@@ -912,22 +925,41 @@ thread_atom_list_search(struct rb_root *root, struct thread *thread)
 	return NULL;
 }
 
+static int
+thread_lat_cmp(struct list_head *list, struct task_atoms *l,
+	       struct task_atoms *r)
+{
+	struct sort_dimension *sort;
+	int ret = 0;
+
+	list_for_each_entry(sort, list, list) {
+		ret = sort->cmp(l, r);
+		if (ret)
+			return ret;
+	}
+
+	return ret;
+}
+
 static void
-__thread_latency_insert(struct rb_root *root, struct task_atoms *data)
+__thread_latency_insert(struct rb_root *root, struct task_atoms *data,
+			 struct list_head *sort_list)
 {
 	struct rb_node **new = &(root->rb_node), *parent = NULL;
 
 	while (*new) {
 		struct task_atoms *this;
+		int cmp;
 
 		this = container_of(*new, struct task_atoms, node);
 		parent = *new;
-		if (data->thread->pid < this->thread->pid)
+
+		cmp = thread_lat_cmp(sort_list, data, this);
+
+		if (cmp > 0)
 			new = &((*new)->rb_left);
-		else if (data->thread->pid > this->thread->pid)
-			new = &((*new)->rb_right);
 		else
-			die("Double thread insertion\n");
+			new = &((*new)->rb_right);
 	}
 
 	rb_link_node(&data->node, parent, new);
@@ -943,7 +975,7 @@ static void thread_atom_list_insert(struct thread *thread)
 
 	atoms->thread = thread;
 	INIT_LIST_HEAD(&atoms->snapshot_list);
-	__thread_latency_insert(&lat_snapshot_root, atoms);
+	__thread_latency_insert(&lat_snapshot_root, atoms, &cmp_pid);
 }
 
 static void
@@ -1134,18 +1166,151 @@ static void output_lat_thread(struct task_atoms *atom_list)
 		 (double)atom_list->max_lat / 1e6);
 }
 
+static int pid_cmp(struct task_atoms *l, struct task_atoms *r)
+{
+
+	if (l->thread->pid < r->thread->pid)
+		return -1;
+	if (l->thread->pid > r->thread->pid)
+		return 1;
+
+	return 0;
+}
+
+static struct sort_dimension pid_sort_dimension = {
+	.name = "pid",
+	.cmp = pid_cmp,
+};
+
+static int avg_cmp(struct task_atoms *l, struct task_atoms *r)
+{
+	u64 avgl, avgr;
+
+	if (!l->nb_atoms)
+		return -1;
+
+	if (!r->nb_atoms)
+		return 1;
+
+	avgl = l->total_lat / l->nb_atoms;
+	avgr = r->total_lat / r->nb_atoms;
+
+	if (avgl < avgr)
+		return -1;
+	if (avgl > avgr)
+		return 1;
+
+	return 0;
+}
+
+static struct sort_dimension avg_sort_dimension = {
+	.name 	= "avg",
+	.cmp	= avg_cmp,
+};
+
+static int max_cmp(struct task_atoms *l, struct task_atoms *r)
+{
+	if (l->max_lat < r->max_lat)
+		return -1;
+	if (l->max_lat > r->max_lat)
+		return 1;
+
+	return 0;
+}
+
+static struct sort_dimension max_sort_dimension = {
+	.name 	= "max",
+	.cmp	= max_cmp,
+};
+
+static int switch_cmp(struct task_atoms *l, struct task_atoms *r)
+{
+	if (l->nb_atoms < r->nb_atoms)
+		return -1;
+	if (l->nb_atoms > r->nb_atoms)
+		return 1;
+
+	return 0;
+}
+
+static struct sort_dimension switch_sort_dimension = {
+	.name 	= "switch",
+	.cmp	= switch_cmp,
+};
+
+static int runtime_cmp(struct task_atoms *l, struct task_atoms *r)
+{
+	if (l->total_runtime < r->total_runtime)
+		return -1;
+	if (l->total_runtime > r->total_runtime)
+		return 1;
+
+	return 0;
+}
+
+static struct sort_dimension runtime_sort_dimension = {
+	.name 	= "runtime",
+	.cmp	= runtime_cmp,
+};
+
+static struct sort_dimension *available_sorts[] = {
+	&pid_sort_dimension,
+	&avg_sort_dimension,
+	&max_sort_dimension,
+	&switch_sort_dimension,
+	&runtime_sort_dimension,
+};
+
+#define NB_AVAILABLE_SORTS	(int)(sizeof(available_sorts) / sizeof(struct sort_dimension *))
+
+static LIST_HEAD(sort_list);
+
+static int sort_dimension__add(char *tok, struct list_head *list)
+{
+	int i;
+
+	for (i = 0; i < NB_AVAILABLE_SORTS; i++) {
+		if (!strcmp(available_sorts[i]->name, tok)) {
+			list_add_tail(&available_sorts[i]->list, list);
+
+			return 0;
+		}
+	}
+
+	return -1;
+}
+
+static void setup_sorting(void);
+
+static void sort_lat(void)
+{
+	struct rb_node *node;
+
+	for (;;) {
+		struct task_atoms *data;
+		node = rb_first(&lat_snapshot_root);
+		if (!node)
+			break;
+
+		rb_erase(node, &lat_snapshot_root);
+		data = rb_entry(node, struct task_atoms, node);
+		__thread_latency_insert(&sorted_lat_snapshot_root, data, &sort_list);
+	}
+}
+
 static void __cmd_lat(void)
 {
 	struct rb_node *next;
 
 	setup_pager();
 	read_events();
+	sort_lat();
 
 	printf("-----------------------------------------------------------------------------------\n");
 	printf(" Task              |  Runtime ms | Switches | Average delay ms | Maximum delay ms |\n");
 	printf("-----------------------------------------------------------------------------------\n");
 
-	next = rb_first(&lat_snapshot_root);
+	next = rb_first(&sorted_lat_snapshot_root);
 
 	while (next) {
 		struct task_atoms *atom_list;
@@ -1469,11 +1634,30 @@ static const struct option options[] = {
 		    "replay sched behaviour from traces"),
 	OPT_BOOLEAN('l', "latency", &lat_mode,
 		    "measure various latencies"),
+	OPT_STRING('s', "sort", &sort_order, "key[,key2...]",
+		   "sort by key(s): runtime, switch, avg, max"),
 	OPT_BOOLEAN('v', "verbose", &verbose,
 		    "be more verbose (show symbol address, etc)"),
 	OPT_END()
 };
 
+static void setup_sorting(void)
+{
+	char *tmp, *tok, *str = strdup(sort_order);
+
+	for (tok = strtok_r(str, ", ", &tmp);
+			tok; tok = strtok_r(NULL, ", ", &tmp)) {
+		if (sort_dimension__add(tok, &sort_list) < 0) {
+			error("Unknown --sort key: `%s'", tok);
+			usage_with_options(sched_usage, options);
+		}
+	}
+
+	free(str);
+
+	sort_dimension__add((char *)"pid", &cmp_pid);
+}
+
 int cmd_sched(int argc, const char **argv, const char *prefix __used)
 {
 	symbol__init();
@@ -1496,6 +1680,8 @@ int cmd_sched(int argc, const char **argv, const char *prefix __used)
 	else
 		usage_with_options(sched_usage, options);
 
+	setup_sorting();
+
 	if (replay_mode)
 		__cmd_replay();
 	else if (lat_mode)
-- 
cgit v1.2.3


From f2858d8ad9858e63c87257553c5721cba5db95ae Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 11 Sep 2009 12:12:54 +0200
Subject: perf sched: Add 'perf sched latency' and 'perf sched replay'

Separate the option parsing cleanly and add two variants:

 - 'perf sched latency' (can be abbreviated via 'perf sched lat')
 - 'perf sched replay'  (can be abbreviated via 'perf sched rep')

Also add a repeat count option to replay and add a separation
set of options for replay.

Do the sorting setup only in the latency sub-command.

Display separate help screens for 'perf sched' and
'perf sched replay -h' - i.e. further separation of the
sub-commands.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-sched.c | 90 +++++++++++++++++++++++++++++-----------------
 1 file changed, 58 insertions(+), 32 deletions(-)

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 10fcd49e298b..e01cc63b98cc 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -30,9 +30,6 @@ static struct thread		*last_match;
 static struct perf_header	*header;
 static u64			sample_type;
 
-static int			replay_mode;
-static int			lat_mode;
-
 static char			default_sort_order[] = "avg, max, switch, runtime";
 static char			*sort_order = default_sort_order;
 
@@ -623,9 +620,11 @@ static void test_calibrations(void)
 	printf("the sleep test took %Ld nsecs\n", T1-T0);
 }
 
+static unsigned long replay_repeat = 10;
+
 static void __cmd_replay(void)
 {
-	long nr_iterations = 10, i;
+	unsigned long i;
 
 	calibrate_run_measurement_overhead();
 	calibrate_sleep_measurement_overhead();
@@ -651,7 +650,7 @@ static void __cmd_replay(void)
 
 	create_tasks();
 	printf("------------------------------------------------------------\n");
-	for (i = 0; i < nr_iterations; i++)
+	for (i = 0; i < replay_repeat; i++)
 		run_one_test();
 }
 
@@ -1623,21 +1622,45 @@ more:
 }
 
 static const char * const sched_usage[] = {
-	"perf sched [<options>] <command>",
+	"perf sched [<options>] {record|latency|replay}",
 	NULL
 };
 
-static const struct option options[] = {
+static const struct option sched_options[] = {
+	OPT_BOOLEAN('v', "verbose", &verbose,
+		    "be more verbose (show symbol address, etc)"),
 	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
 		    "dump raw trace in ASCII"),
-	OPT_BOOLEAN('r', "replay", &replay_mode,
-		    "replay sched behaviour from traces"),
-	OPT_BOOLEAN('l', "latency", &lat_mode,
-		    "measure various latencies"),
+	OPT_END()
+};
+
+static const char * const latency_usage[] = {
+	"perf sched latency [<options>]",
+	NULL
+};
+
+static const struct option latency_options[] = {
 	OPT_STRING('s', "sort", &sort_order, "key[,key2...]",
 		   "sort by key(s): runtime, switch, avg, max"),
 	OPT_BOOLEAN('v', "verbose", &verbose,
 		    "be more verbose (show symbol address, etc)"),
+	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
+		    "dump raw trace in ASCII"),
+	OPT_END()
+};
+
+static const char * const replay_usage[] = {
+	"perf sched replay [<options>]",
+	NULL
+};
+
+static const struct option replay_options[] = {
+	OPT_INTEGER('r', "repeat", &replay_repeat,
+		    "repeat the workload replay N times (-1: infinite)"),
+	OPT_BOOLEAN('v', "verbose", &verbose,
+		    "be more verbose (show symbol address, etc)"),
+	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
+		    "dump raw trace in ASCII"),
 	OPT_END()
 };
 
@@ -1649,7 +1672,7 @@ static void setup_sorting(void)
 			tok; tok = strtok_r(NULL, ", ", &tmp)) {
 		if (sort_dimension__add(tok, &sort_list) < 0) {
 			error("Unknown --sort key: `%s'", tok);
-			usage_with_options(sched_usage, options);
+			usage_with_options(latency_usage, latency_options);
 		}
 	}
 
@@ -1663,29 +1686,32 @@ int cmd_sched(int argc, const char **argv, const char *prefix __used)
 	symbol__init();
 	page_size = getpagesize();
 
-	argc = parse_options(argc, argv, options, sched_usage, 0);
-	if (argc) {
-		/*
-		 * Special case: if there's an argument left then assume tha
-		 * it's a symbol filter:
-		 */
-		if (argc > 1)
-			usage_with_options(sched_usage, options);
-	}
+	argc = parse_options(argc, argv, sched_options, sched_usage,
+			     PARSE_OPT_STOP_AT_NON_OPTION);
+	if (!argc)
+		usage_with_options(sched_usage, sched_options);
 
-	if (replay_mode)
-		trace_handler = &replay_ops;
-	else if (lat_mode)
+	if (!strncmp(argv[0], "lat", 3)) {
 		trace_handler = &lat_ops;
-	else
-		usage_with_options(sched_usage, options);
-
-	setup_sorting();
-
-	if (replay_mode)
-		__cmd_replay();
-	else if (lat_mode)
+		if (argc > 1) {
+			argc = parse_options(argc, argv, latency_options, latency_usage, 0);
+			if (argc)
+				usage_with_options(latency_usage, latency_options);
+			setup_sorting();
+		}
 		__cmd_lat();
+	} else if (!strncmp(argv[0], "rep", 3)) {
+		trace_handler = &replay_ops;
+		if (argc) {
+			argc = parse_options(argc, argv, replay_options, replay_usage, 0);
+			if (argc)
+				usage_with_options(replay_usage, replay_options);
+		}
+		__cmd_replay();
+	} else {
+		usage_with_options(sched_usage, sched_options);
+	}
+
 
 	return 0;
 }
-- 
cgit v1.2.3


From b1ffe8f3e0c96f5527a89e24410d6b0e59b3554a Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 11 Sep 2009 12:12:54 +0200
Subject: perf sched: Finish latency => atom rename and misc cleanups

- Rename 'latency' field/variable names to the better 'atom' ones

 - Reduce the number of #include lines and consolidate them

 - Gather file scope variables at the top of the file

 - Remove unused bits

No change in functionality.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-sched.c | 399 +++++++++++++++++++++------------------------
 1 file changed, 184 insertions(+), 215 deletions(-)

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index e01cc63b98cc..cc2dbd5b50eb 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -1,4 +1,5 @@
 #include "builtin.h"
+#include "perf.h"
 
 #include "util/util.h"
 #include "util/cache.h"
@@ -7,15 +8,16 @@
 #include "util/header.h"
 
 #include "util/parse-options.h"
+#include "util/trace-event.h"
 
-#include "perf.h"
 #include "util/debug.h"
 
-#include "util/trace-event.h"
 #include <sys/types.h>
+#include <sys/prctl.h>
 
-
-#define MAX_CPUS 4096
+#include <semaphore.h>
+#include <pthread.h>
+#include <math.h>
 
 static char			const *input_name = "perf.data";
 static int			input;
@@ -33,44 +35,126 @@ static u64			sample_type;
 static char			default_sort_order[] = "avg, max, switch, runtime";
 static char			*sort_order = default_sort_order;
 
+#define PR_SET_NAME		15               /* Set process name */
+#define MAX_CPUS		4096
 
-/*
- * Scheduler benchmarks
- */
-#include <sys/resource.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/time.h>
-#include <sys/prctl.h>
+#define BUG_ON(x)		assert(!(x))
 
-#include <linux/unistd.h>
+static u64			run_measurement_overhead;
+static u64			sleep_measurement_overhead;
 
-#include <semaphore.h>
-#include <pthread.h>
-#include <signal.h>
-#include <values.h>
-#include <string.h>
-#include <unistd.h>
-#include <stdlib.h>
-#include <assert.h>
-#include <fcntl.h>
-#include <time.h>
-#include <math.h>
+#define COMM_LEN		20
+#define SYM_LEN			129
 
-#include <stdio.h>
+#define MAX_PID			65536
 
-#define PR_SET_NAME	15               /* Set process name */
+static unsigned long		nr_tasks;
 
-#define BUG_ON(x)	assert(!(x))
+struct sched_event;
 
-#define DEBUG		0
+struct task_desc {
+	unsigned long		nr;
+	unsigned long		pid;
+	char			comm[COMM_LEN];
 
-typedef unsigned long long nsec_t;
+	unsigned long		nr_events;
+	unsigned long		curr_event;
+	struct sched_event	**events;
+
+	pthread_t		thread;
+	sem_t			sleep_sem;
 
-static nsec_t run_measurement_overhead;
-static nsec_t sleep_measurement_overhead;
+	sem_t			ready_for_work;
+	sem_t			work_done_sem;
+
+	u64			cpu_usage;
+};
+
+enum sched_event_type {
+	SCHED_EVENT_RUN,
+	SCHED_EVENT_SLEEP,
+	SCHED_EVENT_WAKEUP,
+};
+
+struct sched_event {
+	enum sched_event_type	type;
+	u64			timestamp;
+	u64			duration;
+	unsigned long		nr;
+	int			specific_wait;
+	sem_t			*wait_sem;
+	struct task_desc	*wakee;
+};
+
+static struct task_desc		*pid_to_task[MAX_PID];
+
+static struct task_desc		**tasks;
+
+static pthread_mutex_t		start_work_mutex = PTHREAD_MUTEX_INITIALIZER;
+static u64			start_time;
+
+static pthread_mutex_t		work_done_wait_mutex = PTHREAD_MUTEX_INITIALIZER;
 
-static nsec_t get_nsecs(void)
+static unsigned long		nr_run_events;
+static unsigned long		nr_sleep_events;
+static unsigned long		nr_wakeup_events;
+
+static unsigned long		nr_sleep_corrections;
+static unsigned long		nr_run_events_optimized;
+
+static unsigned long		targetless_wakeups;
+static unsigned long		multitarget_wakeups;
+
+static u64			cpu_usage;
+static u64			runavg_cpu_usage;
+static u64			parent_cpu_usage;
+static u64			runavg_parent_cpu_usage;
+
+static unsigned long		nr_runs;
+static u64			sum_runtime;
+static u64			sum_fluct;
+static u64			run_avg;
+
+static unsigned long		replay_repeat = 10;
+
+#define TASK_STATE_TO_CHAR_STR "RSDTtZX"
+
+enum thread_state {
+	THREAD_SLEEPING = 0,
+	THREAD_WAIT_CPU,
+	THREAD_SCHED_IN,
+	THREAD_IGNORE
+};
+
+struct work_atom {
+	struct list_head	list;
+	enum thread_state	state;
+	u64			wake_up_time;
+	u64			sched_in_time;
+	u64			runtime;
+};
+
+struct task_atoms {
+	struct list_head	atom_list;
+	struct thread		*thread;
+	struct rb_node		node;
+	u64			max_lat;
+	u64			total_lat;
+	u64			nb_atoms;
+	u64			total_runtime;
+};
+
+typedef int (*sort_thread_lat)(struct task_atoms *, struct task_atoms *);
+
+static struct rb_root		atom_root, sorted_atom_root;
+
+static u64			all_runtime;
+static u64			all_count;
+
+static int read_events(void);
+
+
+static u64 get_nsecs(void)
 {
 	struct timespec ts;
 
@@ -79,16 +163,16 @@ static nsec_t get_nsecs(void)
 	return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
 }
 
-static void burn_nsecs(nsec_t nsecs)
+static void burn_nsecs(u64 nsecs)
 {
-	nsec_t T0 = get_nsecs(), T1;
+	u64 T0 = get_nsecs(), T1;
 
 	do {
 		T1 = get_nsecs();
 	} while (T1 + run_measurement_overhead < T0 + nsecs);
 }
 
-static void sleep_nsecs(nsec_t nsecs)
+static void sleep_nsecs(u64 nsecs)
 {
 	struct timespec ts;
 
@@ -100,7 +184,7 @@ static void sleep_nsecs(nsec_t nsecs)
 
 static void calibrate_run_measurement_overhead(void)
 {
-	nsec_t T0, T1, delta, min_delta = 1000000000ULL;
+	u64 T0, T1, delta, min_delta = 1000000000ULL;
 	int i;
 
 	for (i = 0; i < 10; i++) {
@@ -117,7 +201,7 @@ static void calibrate_run_measurement_overhead(void)
 
 static void calibrate_sleep_measurement_overhead(void)
 {
-	nsec_t T0, T1, delta, min_delta = 1000000000ULL;
+	u64 T0, T1, delta, min_delta = 1000000000ULL;
 	int i;
 
 	for (i = 0; i < 10; i++) {
@@ -133,67 +217,8 @@ static void calibrate_sleep_measurement_overhead(void)
 	printf("sleep measurement overhead: %Ld nsecs\n", min_delta);
 }
 
-#define COMM_LEN	20
-#define SYM_LEN		129
-
-#define MAX_PID		65536
-
-static unsigned long nr_tasks;
-
-struct sched_event;
-
-struct task_desc {
-	unsigned long		nr;
-	unsigned long		pid;
-	char			comm[COMM_LEN];
-
-	unsigned long		nr_events;
-	unsigned long		curr_event;
-	struct sched_event	**events;
-
-	pthread_t		thread;
-	sem_t			sleep_sem;
-
-	sem_t			ready_for_work;
-	sem_t			work_done_sem;
-
-	nsec_t			cpu_usage;
-};
-
-enum sched_event_type {
-	SCHED_EVENT_RUN,
-	SCHED_EVENT_SLEEP,
-	SCHED_EVENT_WAKEUP,
-};
-
-struct sched_event {
-	enum sched_event_type	type;
-	nsec_t			timestamp;
-	nsec_t			duration;
-	unsigned long		nr;
-	int			specific_wait;
-	sem_t			*wait_sem;
-	struct task_desc	*wakee;
-};
-
-static struct task_desc		*pid_to_task[MAX_PID];
-
-static struct task_desc		**tasks;
-
-static pthread_mutex_t		start_work_mutex = PTHREAD_MUTEX_INITIALIZER;
-static nsec_t			start_time;
-
-static pthread_mutex_t		work_done_wait_mutex = PTHREAD_MUTEX_INITIALIZER;
-
-static unsigned long		nr_run_events;
-static unsigned long		nr_sleep_events;
-static unsigned long		nr_wakeup_events;
-
-static unsigned long		nr_sleep_corrections;
-static unsigned long		nr_run_events_optimized;
-
 static struct sched_event *
-get_new_event(struct task_desc *task, nsec_t timestamp)
+get_new_event(struct task_desc *task, u64 timestamp)
 {
 	struct sched_event *event = calloc(1, sizeof(*event));
 	unsigned long idx = task->nr_events;
@@ -221,7 +246,7 @@ static struct sched_event *last_event(struct task_desc *task)
 }
 
 static void
-add_sched_event_run(struct task_desc *task, nsec_t timestamp, u64 duration)
+add_sched_event_run(struct task_desc *task, u64 timestamp, u64 duration)
 {
 	struct sched_event *event, *curr_event = last_event(task);
 
@@ -243,11 +268,8 @@ add_sched_event_run(struct task_desc *task, nsec_t timestamp, u64 duration)
 	nr_run_events++;
 }
 
-static unsigned long		targetless_wakeups;
-static unsigned long		multitarget_wakeups;
-
 static void
-add_sched_event_wakeup(struct task_desc *task, nsec_t timestamp,
+add_sched_event_wakeup(struct task_desc *task, u64 timestamp,
 		       struct task_desc *wakee)
 {
 	struct sched_event *event, *wakee_event;
@@ -275,7 +297,7 @@ add_sched_event_wakeup(struct task_desc *task, nsec_t timestamp,
 }
 
 static void
-add_sched_event_sleep(struct task_desc *task, nsec_t timestamp,
+add_sched_event_sleep(struct task_desc *task, u64 timestamp,
 		      u64 task_state __used)
 {
 	struct sched_event *event = get_new_event(task, timestamp);
@@ -350,7 +372,7 @@ static void
 process_sched_event(struct task_desc *this_task __used, struct sched_event *event)
 {
 	int ret = 0;
-	nsec_t now;
+	u64 now;
 	long long delta;
 
 	now = get_nsecs();
@@ -375,10 +397,10 @@ process_sched_event(struct task_desc *this_task __used, struct sched_event *even
 	}
 }
 
-static nsec_t get_cpu_usage_nsec_parent(void)
+static u64 get_cpu_usage_nsec_parent(void)
 {
 	struct rusage ru;
-	nsec_t sum;
+	u64 sum;
 	int err;
 
 	err = getrusage(RUSAGE_SELF, &ru);
@@ -390,12 +412,12 @@ static nsec_t get_cpu_usage_nsec_parent(void)
 	return sum;
 }
 
-static nsec_t get_cpu_usage_nsec_self(void)
+static u64 get_cpu_usage_nsec_self(void)
 {
 	char filename [] = "/proc/1234567890/sched";
 	unsigned long msecs, nsecs;
 	char *line = NULL;
-	nsec_t total = 0;
+	u64 total = 0;
 	size_t len = 0;
 	ssize_t chars;
 	FILE *file;
@@ -423,7 +445,7 @@ static nsec_t get_cpu_usage_nsec_self(void)
 static void *thread_func(void *ctx)
 {
 	struct task_desc *this_task = ctx;
-	nsec_t cpu_usage_0, cpu_usage_1;
+	u64 cpu_usage_0, cpu_usage_1;
 	unsigned long i, ret;
 	char comm2[22];
 
@@ -485,14 +507,9 @@ static void create_tasks(void)
 	}
 }
 
-static nsec_t			cpu_usage;
-static nsec_t			runavg_cpu_usage;
-static nsec_t			parent_cpu_usage;
-static nsec_t			runavg_parent_cpu_usage;
-
 static void wait_for_tasks(void)
 {
-	nsec_t cpu_usage_0, cpu_usage_1;
+	u64 cpu_usage_0, cpu_usage_1;
 	struct task_desc *task;
 	unsigned long i, ret;
 
@@ -543,16 +560,9 @@ static void wait_for_tasks(void)
 	}
 }
 
-static int read_events(void);
-
-static unsigned long nr_runs;
-static nsec_t sum_runtime;
-static nsec_t sum_fluct;
-static nsec_t run_avg;
-
 static void run_one_test(void)
 {
-	nsec_t T0, T1, delta, avg_delta, fluct, std_dev;
+	u64 T0, T1, delta, avg_delta, fluct, std_dev;
 
 	T0 = get_nsecs();
 	wait_for_tasks();
@@ -576,10 +586,6 @@ static void run_one_test(void)
 	printf("#%-3ld: %0.3f, ",
 		nr_runs, (double)delta/1000000.0);
 
-#if 0
-	printf("%0.2f +- %0.2f, ",
-		(double)avg_delta/1e6, (double)std_dev/1e6);
-#endif
 	printf("ravg: %0.2f, ",
 		(double)run_avg/1e6);
 
@@ -605,7 +611,7 @@ static void run_one_test(void)
 
 static void test_calibrations(void)
 {
-	nsec_t T0, T1;
+	u64 T0, T1;
 
 	T0 = get_nsecs();
 	burn_nsecs(1e6);
@@ -620,8 +626,6 @@ static void test_calibrations(void)
 	printf("the sleep test took %Ld nsecs\n", T1-T0);
 }
 
-static unsigned long replay_repeat = 10;
-
 static void __cmd_replay(void)
 {
 	unsigned long i;
@@ -865,47 +869,8 @@ static struct trace_sched_handler replay_ops  = {
 	.fork_event		= replay_fork_event,
 };
 
-#define TASK_STATE_TO_CHAR_STR "RSDTtZX"
-
-enum thread_state {
-	THREAD_SLEEPING = 0,
-	THREAD_WAIT_CPU,
-	THREAD_SCHED_IN,
-	THREAD_IGNORE
-};
-
-struct work_atom {
-	struct list_head	list;
-	enum thread_state	state;
-	u64			wake_up_time;
-	u64			sched_in_time;
-	u64			runtime;
-};
-
-struct task_atoms {
-	struct list_head	snapshot_list;
-	struct thread		*thread;
-	struct rb_node		node;
-	u64			max_lat;
-	u64			total_lat;
-	u64			nb_atoms;
-	u64			total_runtime;
-};
-
-typedef int (*sort_thread_lat)(struct task_atoms *, struct task_atoms *);
-
-struct sort_dimension {
-	const char 		*name;
-	sort_thread_lat		cmp;
-	struct list_head 	list;
-};
-
-static LIST_HEAD(cmp_pid);
-
-static struct rb_root lat_snapshot_root, sorted_lat_snapshot_root;
-
 static struct task_atoms *
-thread_atom_list_search(struct rb_root *root, struct thread *thread)
+thread_atoms_search(struct rb_root *root, struct thread *thread)
 {
 	struct rb_node *node = root->rb_node;
 
@@ -924,6 +889,14 @@ thread_atom_list_search(struct rb_root *root, struct thread *thread)
 	return NULL;
 }
 
+struct sort_dimension {
+	const char		*name;
+	sort_thread_lat		cmp;
+	struct list_head	list;
+};
+
+static LIST_HEAD(cmp_pid);
+
 static int
 thread_lat_cmp(struct list_head *list, struct task_atoms *l,
 	       struct task_atoms *r)
@@ -965,16 +938,17 @@ __thread_latency_insert(struct rb_root *root, struct task_atoms *data,
 	rb_insert_color(&data->node, root);
 }
 
-static void thread_atom_list_insert(struct thread *thread)
+static void thread_atoms_insert(struct thread *thread)
 {
 	struct task_atoms *atoms;
+
 	atoms = calloc(sizeof(*atoms), 1);
 	if (!atoms)
 		die("No memory");
 
 	atoms->thread = thread;
-	INIT_LIST_HEAD(&atoms->snapshot_list);
-	__thread_latency_insert(&lat_snapshot_root, atoms, &cmp_pid);
+	INIT_LIST_HEAD(&atoms->atom_list);
+	__thread_latency_insert(&atom_root, atoms, &cmp_pid);
 }
 
 static void
@@ -1001,50 +975,49 @@ lat_sched_out(struct task_atoms *atoms,
 	      u64 delta,
 	      u64 timestamp)
 {
-	struct work_atom *snapshot;
+	struct work_atom *atom;
 
-	snapshot = calloc(sizeof(*snapshot), 1);
-	if (!snapshot)
+	atom = calloc(sizeof(*atom), 1);
+	if (!atom)
 		die("Non memory");
 
 	if (sched_out_state(switch_event) == 'R') {
-		snapshot->state = THREAD_WAIT_CPU;
-		snapshot->wake_up_time = timestamp;
+		atom->state = THREAD_WAIT_CPU;
+		atom->wake_up_time = timestamp;
 	}
 
-	snapshot->runtime = delta;
-	list_add_tail(&snapshot->list, &atoms->snapshot_list);
+	atom->runtime = delta;
+	list_add_tail(&atom->list, &atoms->atom_list);
 }
 
 static void
 lat_sched_in(struct task_atoms *atoms, u64 timestamp)
 {
-	struct work_atom *snapshot;
+	struct work_atom *atom;
 	u64 delta;
 
-	if (list_empty(&atoms->snapshot_list))
+	if (list_empty(&atoms->atom_list))
 		return;
 
-	snapshot = list_entry(atoms->snapshot_list.prev, struct work_atom,
-			      list);
+	atom = list_entry(atoms->atom_list.prev, struct work_atom, list);
 
-	if (snapshot->state != THREAD_WAIT_CPU)
+	if (atom->state != THREAD_WAIT_CPU)
 		return;
 
-	if (timestamp < snapshot->wake_up_time) {
-		snapshot->state = THREAD_IGNORE;
+	if (timestamp < atom->wake_up_time) {
+		atom->state = THREAD_IGNORE;
 		return;
 	}
 
-	snapshot->state = THREAD_SCHED_IN;
-	snapshot->sched_in_time = timestamp;
+	atom->state = THREAD_SCHED_IN;
+	atom->sched_in_time = timestamp;
 
-	delta = snapshot->sched_in_time - snapshot->wake_up_time;
+	delta = atom->sched_in_time - atom->wake_up_time;
 	atoms->total_lat += delta;
 	if (delta > atoms->max_lat)
 		atoms->max_lat = delta;
 	atoms->nb_atoms++;
-	atoms->total_runtime += snapshot->runtime;
+	atoms->total_runtime += atom->runtime;
 }
 
 static void
@@ -1076,20 +1049,20 @@ latency_switch_event(struct trace_switch_event *switch_event,
 	sched_out = threads__findnew(switch_event->prev_pid, &threads, &last_match);
 	sched_in = threads__findnew(switch_event->next_pid, &threads, &last_match);
 
-	in_atoms = thread_atom_list_search(&lat_snapshot_root, sched_in);
+	in_atoms = thread_atoms_search(&atom_root, sched_in);
 	if (!in_atoms) {
-		thread_atom_list_insert(sched_in);
-		in_atoms = thread_atom_list_search(&lat_snapshot_root, sched_in);
+		thread_atoms_insert(sched_in);
+		in_atoms = thread_atoms_search(&atom_root, sched_in);
 		if (!in_atoms)
-			die("Internal latency tree error");
+			die("in-atom: Internal tree error");
 	}
 
-	out_atoms = thread_atom_list_search(&lat_snapshot_root, sched_out);
+	out_atoms = thread_atoms_search(&atom_root, sched_out);
 	if (!out_atoms) {
-		thread_atom_list_insert(sched_out);
-		out_atoms = thread_atom_list_search(&lat_snapshot_root, sched_out);
+		thread_atoms_insert(sched_out);
+		out_atoms = thread_atoms_search(&atom_root, sched_out);
 		if (!out_atoms)
-			die("Internal latency tree error");
+			die("out-atom: Internal tree error");
 	}
 
 	lat_sched_in(in_atoms, timestamp);
@@ -1104,7 +1077,7 @@ latency_wakeup_event(struct trace_wakeup_event *wakeup_event,
 		     struct thread *thread __used)
 {
 	struct task_atoms *atoms;
-	struct work_atom *snapshot;
+	struct work_atom *atom;
 	struct thread *wakee;
 
 	/* Note for later, it may be interesting to observe the failing cases */
@@ -1112,23 +1085,22 @@ latency_wakeup_event(struct trace_wakeup_event *wakeup_event,
 		return;
 
 	wakee = threads__findnew(wakeup_event->pid, &threads, &last_match);
-	atoms = thread_atom_list_search(&lat_snapshot_root, wakee);
+	atoms = thread_atoms_search(&atom_root, wakee);
 	if (!atoms) {
-		thread_atom_list_insert(wakee);
+		thread_atoms_insert(wakee);
 		return;
 	}
 
-	if (list_empty(&atoms->snapshot_list))
+	if (list_empty(&atoms->atom_list))
 		return;
 
-	snapshot = list_entry(atoms->snapshot_list.prev, struct work_atom,
-			      list);
+	atom = list_entry(atoms->atom_list.prev, struct work_atom, list);
 
-	if (snapshot->state != THREAD_SLEEPING)
+	if (atom->state != THREAD_SLEEPING)
 		return;
 
-	snapshot->state = THREAD_WAIT_CPU;
-	snapshot->wake_up_time = timestamp;
+	atom->state = THREAD_WAIT_CPU;
+	atom->wake_up_time = timestamp;
 }
 
 static struct trace_sched_handler lat_ops  = {
@@ -1137,9 +1109,6 @@ static struct trace_sched_handler lat_ops  = {
 	.fork_event		= latency_fork_event,
 };
 
-static u64 all_runtime;
-static u64 all_count;
-
 static void output_lat_thread(struct task_atoms *atom_list)
 {
 	int i;
@@ -1287,13 +1256,13 @@ static void sort_lat(void)
 
 	for (;;) {
 		struct task_atoms *data;
-		node = rb_first(&lat_snapshot_root);
+		node = rb_first(&atom_root);
 		if (!node)
 			break;
 
-		rb_erase(node, &lat_snapshot_root);
+		rb_erase(node, &atom_root);
 		data = rb_entry(node, struct task_atoms, node);
-		__thread_latency_insert(&sorted_lat_snapshot_root, data, &sort_list);
+		__thread_latency_insert(&sorted_atom_root, data, &sort_list);
 	}
 }
 
@@ -1309,7 +1278,7 @@ static void __cmd_lat(void)
 	printf(" Task              |  Runtime ms | Switches | Average delay ms | Maximum delay ms |\n");
 	printf("-----------------------------------------------------------------------------------\n");
 
-	next = rb_first(&sorted_lat_snapshot_root);
+	next = rb_first(&sorted_atom_root);
 
 	while (next) {
 		struct task_atoms *atom_list;
-- 
cgit v1.2.3


From b5fae128e41021889777f8ead810cbd2a8b249fc Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Fri, 11 Sep 2009 12:12:54 +0200
Subject: perf sched: Clean up PID sorting logic

Use a sort list for thread atoms insertion as well - instead of
hardcoded for PID.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-sched.c | 88 +++++++++++++++++++++++++---------------------
 tools/perf/util/thread.h   |  8 ++---
 2 files changed, 51 insertions(+), 45 deletions(-)

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index cc2dbd5b50eb..b72544f2b964 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -144,7 +144,7 @@ struct task_atoms {
 	u64			total_runtime;
 };
 
-typedef int (*sort_thread_lat)(struct task_atoms *, struct task_atoms *);
+typedef int (*sort_fn_t)(struct task_atoms *, struct task_atoms *);
 
 static struct rb_root		atom_root, sorted_atom_root;
 
@@ -869,41 +869,22 @@ static struct trace_sched_handler replay_ops  = {
 	.fork_event		= replay_fork_event,
 };
 
-static struct task_atoms *
-thread_atoms_search(struct rb_root *root, struct thread *thread)
-{
-	struct rb_node *node = root->rb_node;
-
-	while (node) {
-		struct task_atoms *atoms;
-
-		atoms = container_of(node, struct task_atoms, node);
-		if (thread->pid > atoms->thread->pid)
-			node = node->rb_left;
-		else if (thread->pid < atoms->thread->pid)
-			node = node->rb_right;
-		else {
-			return atoms;
-		}
-	}
-	return NULL;
-}
-
 struct sort_dimension {
 	const char		*name;
-	sort_thread_lat		cmp;
+	sort_fn_t		cmp;
 	struct list_head	list;
 };
 
 static LIST_HEAD(cmp_pid);
 
 static int
-thread_lat_cmp(struct list_head *list, struct task_atoms *l,
-	       struct task_atoms *r)
+thread_lat_cmp(struct list_head *list, struct task_atoms *l, struct task_atoms *r)
 {
 	struct sort_dimension *sort;
 	int ret = 0;
 
+	BUG_ON(list_empty(list));
+
 	list_for_each_entry(sort, list, list) {
 		ret = sort->cmp(l, r);
 		if (ret)
@@ -913,6 +894,32 @@ thread_lat_cmp(struct list_head *list, struct task_atoms *l,
 	return ret;
 }
 
+static struct task_atoms *
+thread_atoms_search(struct rb_root *root, struct thread *thread,
+			 struct list_head *sort_list)
+{
+	struct rb_node *node = root->rb_node;
+	struct task_atoms key = { .thread = thread };
+
+	while (node) {
+		struct task_atoms *atoms;
+		int cmp;
+
+		atoms = container_of(node, struct task_atoms, node);
+
+		cmp = thread_lat_cmp(sort_list, &key, atoms);
+		if (cmp > 0)
+			node = node->rb_left;
+		else if (cmp < 0)
+			node = node->rb_right;
+		else {
+			BUG_ON(thread != atoms->thread);
+			return atoms;
+		}
+	}
+	return NULL;
+}
+
 static void
 __thread_latency_insert(struct rb_root *root, struct task_atoms *data,
 			 struct list_head *sort_list)
@@ -1049,18 +1056,18 @@ latency_switch_event(struct trace_switch_event *switch_event,
 	sched_out = threads__findnew(switch_event->prev_pid, &threads, &last_match);
 	sched_in = threads__findnew(switch_event->next_pid, &threads, &last_match);
 
-	in_atoms = thread_atoms_search(&atom_root, sched_in);
+	in_atoms = thread_atoms_search(&atom_root, sched_in, &cmp_pid);
 	if (!in_atoms) {
 		thread_atoms_insert(sched_in);
-		in_atoms = thread_atoms_search(&atom_root, sched_in);
+		in_atoms = thread_atoms_search(&atom_root, sched_in, &cmp_pid);
 		if (!in_atoms)
 			die("in-atom: Internal tree error");
 	}
 
-	out_atoms = thread_atoms_search(&atom_root, sched_out);
+	out_atoms = thread_atoms_search(&atom_root, sched_out, &cmp_pid);
 	if (!out_atoms) {
 		thread_atoms_insert(sched_out);
-		out_atoms = thread_atoms_search(&atom_root, sched_out);
+		out_atoms = thread_atoms_search(&atom_root, sched_out, &cmp_pid);
 		if (!out_atoms)
 			die("out-atom: Internal tree error");
 	}
@@ -1085,7 +1092,7 @@ latency_wakeup_event(struct trace_wakeup_event *wakeup_event,
 		return;
 
 	wakee = threads__findnew(wakeup_event->pid, &threads, &last_match);
-	atoms = thread_atoms_search(&atom_root, wakee);
+	atoms = thread_atoms_search(&atom_root, wakee, &cmp_pid);
 	if (!atoms) {
 		thread_atoms_insert(wakee);
 		return;
@@ -1136,7 +1143,6 @@ static void output_lat_thread(struct task_atoms *atom_list)
 
 static int pid_cmp(struct task_atoms *l, struct task_atoms *r)
 {
-
 	if (l->thread->pid < r->thread->pid)
 		return -1;
 	if (l->thread->pid > r->thread->pid)
@@ -1146,8 +1152,8 @@ static int pid_cmp(struct task_atoms *l, struct task_atoms *r)
 }
 
 static struct sort_dimension pid_sort_dimension = {
-	.name = "pid",
-	.cmp = pid_cmp,
+	.name			= "pid",
+	.cmp			= pid_cmp,
 };
 
 static int avg_cmp(struct task_atoms *l, struct task_atoms *r)
@@ -1172,8 +1178,8 @@ static int avg_cmp(struct task_atoms *l, struct task_atoms *r)
 }
 
 static struct sort_dimension avg_sort_dimension = {
-	.name 	= "avg",
-	.cmp	= avg_cmp,
+	.name			= "avg",
+	.cmp			= avg_cmp,
 };
 
 static int max_cmp(struct task_atoms *l, struct task_atoms *r)
@@ -1187,8 +1193,8 @@ static int max_cmp(struct task_atoms *l, struct task_atoms *r)
 }
 
 static struct sort_dimension max_sort_dimension = {
-	.name 	= "max",
-	.cmp	= max_cmp,
+	.name			= "max",
+	.cmp			= max_cmp,
 };
 
 static int switch_cmp(struct task_atoms *l, struct task_atoms *r)
@@ -1202,8 +1208,8 @@ static int switch_cmp(struct task_atoms *l, struct task_atoms *r)
 }
 
 static struct sort_dimension switch_sort_dimension = {
-	.name 	= "switch",
-	.cmp	= switch_cmp,
+	.name			= "switch",
+	.cmp			= switch_cmp,
 };
 
 static int runtime_cmp(struct task_atoms *l, struct task_atoms *r)
@@ -1217,8 +1223,8 @@ static int runtime_cmp(struct task_atoms *l, struct task_atoms *r)
 }
 
 static struct sort_dimension runtime_sort_dimension = {
-	.name 	= "runtime",
-	.cmp	= runtime_cmp,
+	.name			= "runtime",
+	.cmp			= runtime_cmp,
 };
 
 static struct sort_dimension *available_sorts[] = {
@@ -1666,8 +1672,8 @@ int cmd_sched(int argc, const char **argv, const char *prefix __used)
 			argc = parse_options(argc, argv, latency_options, latency_usage, 0);
 			if (argc)
 				usage_with_options(latency_usage, latency_options);
-			setup_sorting();
 		}
+		setup_sorting();
 		__cmd_lat();
 	} else if (!strncmp(argv[0], "rep", 3)) {
 		trace_handler = &replay_ops;
diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h
index 634f2809a342..665d1f3dc977 100644
--- a/tools/perf/util/thread.h
+++ b/tools/perf/util/thread.h
@@ -4,10 +4,10 @@
 #include "symbol.h"
 
 struct thread {
-	struct rb_node	 rb_node;
-	struct list_head maps;
-	pid_t		 pid;
-	char		 *comm;
+	struct rb_node		rb_node;
+	struct list_head	maps;
+	pid_t			pid;
+	char			*comm;
 };
 
 int thread__set_comm(struct thread *self, const char *comm);
-- 
cgit v1.2.3


From 1fc35b29b4098aa3bf9fc9acb4c1615d0b5dd95d Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 13 Sep 2009 09:44:29 +0200
Subject: perf sched: Implement the 'perf sched record' subcommand

Implement the 'perf sched record' subcommand that adds a
default list of events, turns on raw sampling and system-wide
tracing and passes off the rest of the command to perf record.

This is more convenient than having to specify the events all
the time.

Before:

 $ perf record -a -R -e sched:sched_switch:r -e sched:sched_stat_wait:r -e sched:sched_stat_sleep:r -e sched:sched_stat_iowait:r -e sched:sched_process_exit:r -e sched:sched_process_fork:r -e sched:sched_wakeup:r -e sched:sched_migrate_task:r -c 1 sleep 1

After:

 $ perf sched record -f sleep 1

Also fix an assumption in the event string parser that assumed
that strings passed in can be modified. (In this case they wont
be as they come from a readonly constant section.)

Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-sched.c     | 39 +++++++++++++++++++++++++++++++++++++--
 tools/perf/util/parse-events.c |  3 ++-
 2 files changed, 39 insertions(+), 3 deletions(-)

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index b72544f2b964..ede40c1429a8 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -1656,6 +1656,40 @@ static void setup_sorting(void)
 	sort_dimension__add((char *)"pid", &cmp_pid);
 }
 
+static const char *record_args[] = {
+	"record",
+	"-a",
+	"-R",
+	"-c", "1",
+	"-e", "sched:sched_switch:r",
+	"-e", "sched:sched_stat_wait:r",
+	"-e", "sched:sched_stat_sleep:r",
+	"-e", "sched:sched_stat_iowait:r",
+	"-e", "sched:sched_process_exit:r",
+	"-e", "sched:sched_process_fork:r",
+	"-e", "sched:sched_wakeup:r",
+	"-e", "sched:sched_migrate_task:r",
+};
+
+static int __cmd_record(int argc, const char **argv)
+{
+	unsigned int rec_argc, i, j;
+	const char **rec_argv;
+
+	rec_argc = ARRAY_SIZE(record_args) + argc - 1;
+	rec_argv = calloc(rec_argc + 1, sizeof(char *));
+
+	for (i = 0; i < ARRAY_SIZE(record_args); i++)
+		rec_argv[i] = strdup(record_args[i]);
+
+	for (j = 1; j < (unsigned int)argc; j++, i++)
+		rec_argv[i] = argv[j];
+
+	BUG_ON(i != rec_argc);
+
+	return cmd_record(i, rec_argv, NULL);
+}
+
 int cmd_sched(int argc, const char **argv, const char *prefix __used)
 {
 	symbol__init();
@@ -1666,7 +1700,9 @@ int cmd_sched(int argc, const char **argv, const char *prefix __used)
 	if (!argc)
 		usage_with_options(sched_usage, sched_options);
 
-	if (!strncmp(argv[0], "lat", 3)) {
+	if (!strncmp(argv[0], "rec", 3)) {
+		return __cmd_record(argc, argv);
+	} else if (!strncmp(argv[0], "lat", 3)) {
 		trace_handler = &lat_ops;
 		if (argc > 1) {
 			argc = parse_options(argc, argv, latency_options, latency_usage, 0);
@@ -1687,6 +1723,5 @@ int cmd_sched(int argc, const char **argv, const char *prefix __used)
 		usage_with_options(sched_usage, sched_options);
 	}
 
-
 	return 0;
 }
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index d06c66cd358b..034245e46817 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -525,7 +525,8 @@ static enum event_result parse_tracepoint_event(const char **strp,
 
 	flags = strchr(evt_name, ':');
 	if (flags) {
-		*flags = '\0';
+		/* split it out: */
+		evt_name = strndup(evt_name, flags - evt_name);
 		flags++;
 	}
 
-- 
cgit v1.2.3


From 459ec28ab404d7afcd512ce9b855959ad301605a Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 13 Sep 2009 17:33:44 +0200
Subject: perf_counter: Allow mmap if paranoid checks are turned off

Before:

  $ perf sched record -f sleep 1
  Error: failed to mmap with 1 (Operation not permitted)

After:

  $ perf sched record -f sleep 1
  [ perf record: Captured and wrote 0.095 MB perf.data (~4161 samples) ]

Note, this is only allowed if perfcounter_paranoid is set to
the most permissive (non-default) value of -1.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index e0d91fdf0c3c..667ab25ad3d5 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2315,7 +2315,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	lock_limit >>= PAGE_SHIFT;
 	locked = vma->vm_mm->locked_vm + extra;
 
-	if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) {
+	if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
+		!capable(CAP_IPC_LOCK)) {
 		ret = -EPERM;
 		goto unlock;
 	}
-- 
cgit v1.2.3


From c13f0d3c8165e9592102687fa999da0a0d9c3724 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 13 Sep 2009 16:51:04 +0200
Subject: perf sched: Add 'perf sched trace', improve documentation

Alias 'perf sched trace' to 'perf trace', for workflow completeness.

Add a bit of documentation for perf sched.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/Documentation/perf-sched.txt | 22 +++++++++++++++++++---
 tools/perf/builtin-sched.c              |  7 ++++++-
 2 files changed, 25 insertions(+), 4 deletions(-)

diff --git a/tools/perf/Documentation/perf-sched.txt b/tools/perf/Documentation/perf-sched.txt
index 056320eecb3a..1ce79198997b 100644
--- a/tools/perf/Documentation/perf-sched.txt
+++ b/tools/perf/Documentation/perf-sched.txt
@@ -3,16 +3,32 @@ perf-sched(1)
 
 NAME
 ----
-perf-sched - Read perf.data (created by perf record) and display sched output
+perf-sched - Tool to trace/measure scheduler properties (latencies)
 
 SYNOPSIS
 --------
 [verse]
-'perf sched' [-i <file> | --input=file] symbol_name
+'perf sched' {record|latency|replay|trace}
 
 DESCRIPTION
 -----------
-This command reads the input file and displays the latencies recorded.
+There's four variants of perf sched:
+
+  'perf sched record <command>' to record the scheduling events
+  of an arbitrary workload.
+
+  'perf sched latency' to report the per task scheduling latencies
+  and other scheduling properties of the workload.
+
+  'perf sched trace' to see a detailed trace of the workload that
+  was recorded.
+
+  'perf sched replay' to simulate the workload that was recorded
+  via perf sched record. (this is done by starting up mockup threads
+  that mimic the workload based on the events in the trace. These
+  threads can then replay the timings (CPU runtime and sleep patterns)
+  of the workload as it occured when it was recorded - and can repeat
+  it a number of times, measuring its performance.)
 
 OPTIONS
 -------
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index ede40c1429a8..8db0fd222f80 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -1597,7 +1597,7 @@ more:
 }
 
 static const char * const sched_usage[] = {
-	"perf sched [<options>] {record|latency|replay}",
+	"perf sched [<options>] {record|latency|replay|trace}",
 	NULL
 };
 
@@ -1719,6 +1719,11 @@ int cmd_sched(int argc, const char **argv, const char *prefix __used)
 				usage_with_options(replay_usage, replay_options);
 		}
 		__cmd_replay();
+	} else if (!strcmp(argv[0], "trace")) {
+		/*
+		 * Aliased to 'perf trace' for now:
+		 */
+		return cmd_trace(argc, argv, prefix);
 	} else {
 		usage_with_options(sched_usage, sched_options);
 	}
-- 
cgit v1.2.3


From f977bb4937857994312fff4f9c2cad336a36a932 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 13 Sep 2009 18:15:54 +0200
Subject: perf_counter, sched: Add sched_stat_runtime tracepoint

This allows more precise tracking of how the scheduler accounts
(and acts upon) a task having spent N nanoseconds of CPU time.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/trace/events/sched.h | 33 +++++++++++++++++++++++++++++++++
 kernel/sched_fair.c          |  1 +
 2 files changed, 34 insertions(+)

diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h
index b48f1ad7c946..4069c43f4187 100644
--- a/include/trace/events/sched.h
+++ b/include/trace/events/sched.h
@@ -379,6 +379,39 @@ TRACE_EVENT(sched_stat_wait,
 			(unsigned long long)__entry->delay)
 );
 
+/*
+ * Tracepoint for accounting runtime (time the task is executing
+ * on a CPU).
+ */
+TRACE_EVENT(sched_stat_runtime,
+
+	TP_PROTO(struct task_struct *tsk, u64 runtime, u64 vruntime),
+
+	TP_ARGS(tsk, runtime, vruntime),
+
+	TP_STRUCT__entry(
+		__array( char,	comm,	TASK_COMM_LEN	)
+		__field( pid_t,	pid			)
+		__field( u64,	runtime			)
+		__field( u64,	vruntime			)
+	),
+
+	TP_fast_assign(
+		memcpy(__entry->comm, tsk->comm, TASK_COMM_LEN);
+		__entry->pid		= tsk->pid;
+		__entry->runtime	= runtime;
+		__entry->vruntime	= vruntime;
+	)
+	TP_perf_assign(
+		__perf_count(runtime);
+	),
+
+	TP_printk("task: %s:%d runtime: %Lu [ns], vruntime: %Lu [ns]",
+			__entry->comm, __entry->pid,
+			(unsigned long long)__entry->runtime,
+			(unsigned long long)__entry->vruntime)
+);
+
 /*
  * Tracepoint for accounting sleep time (time the task is not runnable,
  * including iowait, see below).
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index aa7f84121016..a097e909e80f 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -513,6 +513,7 @@ static void update_curr(struct cfs_rq *cfs_rq)
 	if (entity_is_task(curr)) {
 		struct task_struct *curtask = task_of(curr);
 
+		trace_sched_stat_runtime(curtask, delta_exec, curr->vruntime);
 		cpuacct_charge(curtask, delta_exec);
 		account_group_exec_runtime(curtask, delta_exec);
 	}
-- 
cgit v1.2.3


From d13025222cdb0043e2239b3b819389358bb31bc0 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Mon, 14 Sep 2009 08:57:15 +0200
Subject: perf tools: Add an option to multiplex counters in a single channel

Add an option to multiplex counters output in the channel of
the group leader, ie: the first counter opened:

	-M --multiplex

The effect is better serialized samples. This is especially
useful for tracepoint samples that need to be well serialized
for their post-processing.

Also make use of this option in 'perf sched'.

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-record.c | 6 ++++++
 tools/perf/builtin-sched.c  | 2 ++
 2 files changed, 8 insertions(+)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 99a12fe86e9f..79f99dba5be0 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -48,6 +48,7 @@ static int			call_graph			= 0;
 static int			inherit_stat			= 0;
 static int			no_samples			= 0;
 static int			sample_address			= 0;
+static int			multiplex			= 0;
 
 static long			samples;
 static struct timeval		last_read;
@@ -485,6 +486,9 @@ try_again:
 		exit(-1);
 	}
 
+	if (multiplex && fd[nr_cpu][counter] != group_fd)
+		ioctl(fd[nr_cpu][counter], PERF_COUNTER_IOC_SET_OUTPUT, group_fd);
+
 	ioctl(fd[nr_cpu][counter], PERF_COUNTER_IOC_ENABLE);
 }
 
@@ -681,6 +685,8 @@ static const struct option options[] = {
 		    "Sample addresses"),
 	OPT_BOOLEAN('n', "no-samples", &no_samples,
 		    "don't sample"),
+	OPT_BOOLEAN('M', "multiplex", &multiplex,
+		    "multiplex counter output in a single channel"),
 	OPT_END()
 };
 
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 8db0fd222f80..686af633b35b 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -1660,6 +1660,8 @@ static const char *record_args[] = {
 	"record",
 	"-a",
 	"-R",
+	"-M",
+	"-g",
 	"-c", "1",
 	"-e", "sched:sched_switch:r",
 	"-e", "sched:sched_stat_wait:r",
-- 
cgit v1.2.3


From aa1ab9d26ae9fe2566a9036e3cb83e7d555b3987 Mon Sep 17 00:00:00 2001
From: Frederic Weisbecker <fweisbec@gmail.com>
Date: Mon, 14 Sep 2009 03:01:12 +0200
Subject: perf tools: Fix processing of randomly serialized sched traces

Currently it's possible to meet such too high latency results
with 'perf sched latency'.

 -----------------------------------------------------------------------------------
 Task              |  Runtime ms | Switches | Average delay ms | Maximum delay ms |
 -----------------------------------------------------------------------------------
 xfce4-panel       |    0.222 ms |        2 | avg: 4718.345 ms | max: 9436.493 ms |
 scsi_eh_3         |    3.962 ms |       36 | avg:   55.957 ms | max: 1977.829 ms |

The origin is on traces that are sometimes badly serialized across cpus.
For example the raw traces that raised such results for xfce4-panel:

(1)          [init]-0     [000]  1494.663899990: sched_switch: task swapper:0 [140] (R) ==> xfce4-panel:4569 [120]
(2)     xfce4-panel-4569  [000]  1494.663928373: sched_switch: task xfce4-panel:4569 [120] (S) ==> swapper:0 [140]
(3)            Xorg-4276  [001]  1494.663860125: sched_wakeup: task xfce4-panel:4569 [120] success=1 [000]
(4)            Xorg-4276  [001]  1504.098252756: sched_wakeup: task xfce4-panel:4569 [120] success=1 [000]
(5)            perf-5219  [000]  1504.100353302: sched_switch: task perf:5219 [120] (S) ==> xfce4-panel:4569 [120]

The traces are processed in the order they arrive. Then in (2),
xfce4-panel sleeps, it is first waken up in (3) and eventually
scheduled in (5).

The latency reported is then 1504 - 1495 = 9 secs, as reported by perf
sched. But this is wrong, we are confident in the fact the traces are
nicely serialized while we should actually more trust the timestamps.

If we reorder by timestamps we get:

(1)            Xorg-4276  [001]  1494.663860125: sched_wakeup: task xfce4-panel:4569 [120] success=1 [000]
(2)          [init]-0     [000]  1494.663899990: sched_switch: task swapper:0 [140] (R) ==> xfce4-panel:4569 [120]
(3)     xfce4-panel-4569  [000]  1494.663928373: sched_switch: task xfce4-panel:4569 [120] (S) ==> swapper:0 [140]
(4)            Xorg-4276  [001]  1504.098252756: sched_wakeup: task xfce4-panel:4569 [120] success=1 [000]
(5)            perf-5219  [000]  1504.100353302: sched_switch: task perf:5219 [120] (S) ==> xfce4-panel:4569 [120]

Now the trace make more sense, xfce4-panel is sleeping. Then it is
woken up in (1), scheduled in (2)
It goes to sleep in (3), woken up in (4) and scheduled in (5).

Now, latency captured between (1) and (2) is of 39 us.
And between (4) and (5) it is 2.1 ms.

Such pattern of bad serializing is the origin of the high latencies
reported by perf sched.

Basically, we need to check whether wake up time is higher than
schedule out time. If it's not the case, we need to tag the current
work atom as invalid.

Beside that, we may need to work later on a better ordering of the
traces given by the kernel.

After this patch:

xfce4-session     |    0.221 ms |        1 | avg:    0.538 ms | max:    0.538 ms |

Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-sched.c | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 686af633b35b..3e003237c42f 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -129,6 +129,7 @@ enum thread_state {
 struct work_atom {
 	struct list_head	list;
 	enum thread_state	state;
+	u64			sched_out_time;
 	u64			wake_up_time;
 	u64			sched_in_time;
 	u64			runtime;
@@ -988,9 +989,11 @@ lat_sched_out(struct task_atoms *atoms,
 	if (!atom)
 		die("Non memory");
 
+	atom->sched_out_time = timestamp;
+
 	if (sched_out_state(switch_event) == 'R') {
 		atom->state = THREAD_WAIT_CPU;
-		atom->wake_up_time = timestamp;
+		atom->wake_up_time = atom->sched_out_time;
 	}
 
 	atom->runtime = delta;
@@ -1106,6 +1109,9 @@ latency_wakeup_event(struct trace_wakeup_event *wakeup_event,
 	if (atom->state != THREAD_SLEEPING)
 		return;
 
+	if (atom->sched_out_time > timestamp)
+		return;
+
 	atom->state = THREAD_WAIT_CPU;
 	atom->wake_up_time = timestamp;
 }
-- 
cgit v1.2.3


From ea57c4f5203d82c7844c54cdef54e972cf4e9d1f Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Sun, 13 Sep 2009 18:15:54 +0200
Subject: perf tools: Implement counter output multiplexing

Finish the -M/--multiplex option implementation:

 - separate it out from group_fd

 - correctly set it via the ioctl and dont mmap counters that
   are multiplexed

 - modify the perf record event loop to deal with buffer-less
   counters.

 - remove the -g option from perf sched record

 - account for unordered events in perf sched latency

 - (add -f to perf sched record to ease measurements)

 - skip idle threads (pid==0) in latency output

The result is better latency output by 'perf sched latency':

 -----------------------------------------------------------------------------------
  Task              |  Runtime ms | Switches | Average delay ms | Maximum delay ms |
 -----------------------------------------------------------------------------------
  ksoftirqd/8       |    0.071 ms |        2 | avg:    0.458 ms | max:    0.913 ms |
  at-spi-registry   |    0.609 ms |       19 | avg:    0.013 ms | max:    0.023 ms |
  perf              |    3.316 ms |       16 | avg:    0.013 ms | max:    0.054 ms |
  Xorg              |    0.392 ms |       19 | avg:    0.011 ms | max:    0.018 ms |
  sleep             |    0.537 ms |        2 | avg:    0.009 ms | max:    0.009 ms |
 -----------------------------------------------------------------------------------
  TOTAL:            |    4.925 ms |       58 |
 ---------------------------------------------

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-record.c         | 43 ++++++++++++++++++++++---------------
 tools/perf/builtin-sched.c          | 25 ++++++++++++++++++---
 tools/perf/util/trace-event-parse.c |  6 ++++--
 3 files changed, 52 insertions(+), 22 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 79f99dba5be0..5f3127e7a615 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -49,6 +49,7 @@ static int			inherit_stat			= 0;
 static int			no_samples			= 0;
 static int			sample_address			= 0;
 static int			multiplex			= 0;
+static int			multiplex_fd			= -1;
 
 static long			samples;
 static struct timeval		last_read;
@@ -471,23 +472,29 @@ try_again:
 	 */
 	if (group && group_fd == -1)
 		group_fd = fd[nr_cpu][counter];
+	if (multiplex && multiplex_fd == -1)
+		multiplex_fd = fd[nr_cpu][counter];
 
-	event_array[nr_poll].fd = fd[nr_cpu][counter];
-	event_array[nr_poll].events = POLLIN;
-	nr_poll++;
-
-	mmap_array[nr_cpu][counter].counter = counter;
-	mmap_array[nr_cpu][counter].prev = 0;
-	mmap_array[nr_cpu][counter].mask = mmap_pages*page_size - 1;
-	mmap_array[nr_cpu][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
-			PROT_READ|PROT_WRITE, MAP_SHARED, fd[nr_cpu][counter], 0);
-	if (mmap_array[nr_cpu][counter].base == MAP_FAILED) {
-		error("failed to mmap with %d (%s)\n", errno, strerror(errno));
-		exit(-1);
-	}
+	if (multiplex && fd[nr_cpu][counter] != multiplex_fd) {
+		int ret;
 
-	if (multiplex && fd[nr_cpu][counter] != group_fd)
-		ioctl(fd[nr_cpu][counter], PERF_COUNTER_IOC_SET_OUTPUT, group_fd);
+		ret = ioctl(fd[nr_cpu][counter], PERF_COUNTER_IOC_SET_OUTPUT, multiplex_fd);
+		assert(ret != -1);
+	} else {
+		event_array[nr_poll].fd = fd[nr_cpu][counter];
+		event_array[nr_poll].events = POLLIN;
+		nr_poll++;
+
+		mmap_array[nr_cpu][counter].counter = counter;
+		mmap_array[nr_cpu][counter].prev = 0;
+		mmap_array[nr_cpu][counter].mask = mmap_pages*page_size - 1;
+		mmap_array[nr_cpu][counter].base = mmap(NULL, (mmap_pages+1)*page_size,
+				PROT_READ|PROT_WRITE, MAP_SHARED, fd[nr_cpu][counter], 0);
+		if (mmap_array[nr_cpu][counter].base == MAP_FAILED) {
+			error("failed to mmap with %d (%s)\n", errno, strerror(errno));
+			exit(-1);
+		}
+	}
 
 	ioctl(fd[nr_cpu][counter], PERF_COUNTER_IOC_ENABLE);
 }
@@ -618,8 +625,10 @@ static int __cmd_record(int argc, const char **argv)
 		int hits = samples;
 
 		for (i = 0; i < nr_cpu; i++) {
-			for (counter = 0; counter < nr_counters; counter++)
-				mmap_read(&mmap_array[i][counter]);
+			for (counter = 0; counter < nr_counters; counter++) {
+				if (mmap_array[i][counter].base)
+					mmap_read(&mmap_array[i][counter]);
+			}
 		}
 
 		if (hits == samples) {
diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 3e003237c42f..2ce87ef5a3e6 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -116,6 +116,8 @@ static u64			sum_fluct;
 static u64			run_avg;
 
 static unsigned long		replay_repeat = 10;
+static unsigned long		nr_timestamps;
+static unsigned long		unordered_timestamps;
 
 #define TASK_STATE_TO_CHAR_STR "RSDTtZX"
 
@@ -1109,8 +1111,11 @@ latency_wakeup_event(struct trace_wakeup_event *wakeup_event,
 	if (atom->state != THREAD_SLEEPING)
 		return;
 
-	if (atom->sched_out_time > timestamp)
+	nr_timestamps++;
+	if (atom->sched_out_time > timestamp) {
+		unordered_timestamps++;
 		return;
+	}
 
 	atom->state = THREAD_WAIT_CPU;
 	atom->wake_up_time = timestamp;
@@ -1130,6 +1135,11 @@ static void output_lat_thread(struct task_atoms *atom_list)
 
 	if (!atom_list->nb_atoms)
 		return;
+	/*
+	 * Ignore idle threads:
+	 */
+	if (!atom_list->thread->pid)
+		return;
 
 	all_runtime += atom_list->total_runtime;
 	all_count += atom_list->nb_atoms;
@@ -1301,8 +1311,16 @@ static void __cmd_lat(void)
 	}
 
 	printf("-----------------------------------------------------------------------------------\n");
-	printf(" TOTAL:            |%9.3f ms |%9Ld |\n",
+	printf(" TOTAL:            |%9.3f ms |%9Ld |",
 		(double)all_runtime/1e6, all_count);
+
+	if (unordered_timestamps && nr_timestamps) {
+		printf(" INFO: %.2f%% unordered events.\n",
+			(double)unordered_timestamps/(double)nr_timestamps*100.0);
+	} else {
+		printf("\n");
+	}
+
 	printf("---------------------------------------------\n");
 }
 
@@ -1667,12 +1685,13 @@ static const char *record_args[] = {
 	"-a",
 	"-R",
 	"-M",
-	"-g",
+	"-f",
 	"-c", "1",
 	"-e", "sched:sched_switch:r",
 	"-e", "sched:sched_stat_wait:r",
 	"-e", "sched:sched_stat_sleep:r",
 	"-e", "sched:sched_stat_iowait:r",
+	"-e", "sched:sched_stat_runtime:r",
 	"-e", "sched:sched_process_exit:r",
 	"-e", "sched:sched_process_fork:r",
 	"-e", "sched:sched_wakeup:r",
diff --git a/tools/perf/util/trace-event-parse.c b/tools/perf/util/trace-event-parse.c
index 64d6e302751a..f6a8437141c8 100644
--- a/tools/perf/util/trace-event-parse.c
+++ b/tools/perf/util/trace-event-parse.c
@@ -2722,8 +2722,10 @@ void print_event(int cpu, void *data, int size, unsigned long long nsecs,
 	type = trace_parse_common_type(data);
 
 	event = trace_find_event(type);
-	if (!event)
-		die("ug! no event found for type %d", type);
+	if (!event) {
+		printf("ug! no event found for type %d\n", type);
+		return;
+	}
 
 	pid = parse_common_pid(data);
 
-- 
cgit v1.2.3


From d11533893b31ab7806ff04bfa69ae646068610ce Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Mon, 14 Sep 2009 18:22:53 +0200
Subject: perf sched: Fix 'perf sched latency' output on 32-bit systems

Before:

  -----------------------------------------------------------------------------------
   Task              |  Runtime ms | Switches | Average delay ms | Maximum delay ms |
  -----------------------------------------------------------------------------------
   perf              |4853313.251 ms |       10 | avg:    0.046 ms | max:    0.337 ms |
   flush-8:0         |2426659.202 ms |        5 | avg:    0.015 ms | max:    0.016 ms |
   sleep             |485331.966 ms |        1 | avg:    0.012 ms | max:    0.012 ms |
   ksoftirqd/1       |485331.320 ms |        1 | avg:    0.005 ms | max:    0.005 ms |
  -----------------------------------------------------------------------------------
   TOTAL:            |8250635.739 ms |       17 |
  ---------------------------------------------

After:

  -----------------------------------------------------------------------------------
   Task              |  Runtime ms | Switches | Average delay ms | Maximum delay ms |
  -----------------------------------------------------------------------------------
   perf              |    0.206 ms |       10 | avg:    0.046 ms | max:    0.337 ms |
   flush-8:0         |    2.680 ms |        5 | avg:    0.015 ms | max:    0.016 ms |
   sleep             |    0.662 ms |        1 | avg:    0.012 ms | max:    0.012 ms |
   ksoftirqd/1       |    0.015 ms |        1 | avg:    0.005 ms | max:    0.005 ms |
  -----------------------------------------------------------------------------------
   TOTAL:            |    3.563 ms |       17 |
  ---------------------------------------------

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 2ce87ef5a3e6..f856a02cd4fc 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -805,7 +805,7 @@ replay_wakeup_event(struct trace_wakeup_event *wakeup_event,
 	add_sched_event_wakeup(waker, timestamp, wakee);
 }
 
-static unsigned long cpu_last_switched[MAX_CPUS];
+static u64 cpu_last_switched[MAX_CPUS];
 
 static void
 replay_switch_event(struct trace_switch_event *switch_event,
-- 
cgit v1.2.3


From 08f69e6c2e59b3d73343f8c9ecf758e0133dbc22 Mon Sep 17 00:00:00 2001
From: mingo <mingo@europe.(none)>
Date: Mon, 14 Sep 2009 18:30:44 +0200
Subject: perf sched: Print PIDs too

Often it's useful to know the PID of the task as well - print it
out too.

( While at it, reformat the output to be a bit more
  paste-into-commit-logs friendly. )

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-sched.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index f856a02cd4fc..93ef7b215aba 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -1144,9 +1144,9 @@ static void output_lat_thread(struct task_atoms *atom_list)
 	all_runtime += atom_list->total_runtime;
 	all_count += atom_list->nb_atoms;
 
-	ret = printf(" %s ", atom_list->thread->comm);
+	ret = printf("  %s-%d ", atom_list->thread->comm, atom_list->thread->pid);
 
-	for (i = 0; i < 19 - ret; i++)
+	for (i = 0; i < 24 - ret; i++)
 		printf(" ");
 
 	avg = atom_list->total_lat / atom_list->nb_atoms;
@@ -1296,9 +1296,9 @@ static void __cmd_lat(void)
 	read_events();
 	sort_lat();
 
-	printf("-----------------------------------------------------------------------------------\n");
-	printf(" Task              |  Runtime ms | Switches | Average delay ms | Maximum delay ms |\n");
-	printf("-----------------------------------------------------------------------------------\n");
+	printf("\n ---------------------------------------------------------------------------------------\n");
+	printf("  Task                  |  Runtime ms | Switches | Average delay ms | Maximum delay ms |\n");
+	printf(" ---------------------------------------------------------------------------------------\n");
 
 	next = rb_first(&sorted_atom_root);
 
@@ -1310,8 +1310,8 @@ static void __cmd_lat(void)
 		next = rb_next(next);
 	}
 
-	printf("-----------------------------------------------------------------------------------\n");
-	printf(" TOTAL:            |%9.3f ms |%9Ld |",
+	printf(" ---------------------------------------------------------------------------------------\n");
+	printf("  TOTAL:                |%9.3f ms |%9Ld |",
 		(double)all_runtime/1e6, all_count);
 
 	if (unordered_timestamps && nr_timestamps) {
@@ -1321,7 +1321,7 @@ static void __cmd_lat(void)
 		printf("\n");
 	}
 
-	printf("---------------------------------------------\n");
+	printf(" -------------------------------------------------\n\n");
 }
 
 static struct trace_sched_handler *trace_handler;
-- 
cgit v1.2.3


From 39aeb52f99f2380c1f16036deed2f7bb8b2e0559 Mon Sep 17 00:00:00 2001
From: mingo <mingo@europe.(none)>
Date: Mon, 14 Sep 2009 20:04:48 +0200
Subject: perf sched: Add support for sched:sched_stat_runtime events

This allows more precise 'perf sched latency' output:

 ---------------------------------------------------------------------------------------
  Task                  |  Runtime ms | Switches | Average delay ms | Maximum delay ms |
 ---------------------------------------------------------------------------------------
  ksoftirqd/0-4         |    0.010 ms |        2 | avg:    2.476 ms | max:    2.977 ms |
  perf-12328            |   15.844 ms |       66 | avg:    1.118 ms | max:    9.979 ms |
  bdi-default-235       |    0.009 ms |        1 | avg:    0.998 ms | max:    0.998 ms |
  events/1-8            |    0.020 ms |        2 | avg:    0.998 ms | max:    0.998 ms |
  events/0-7            |    0.018 ms |        2 | avg:    0.992 ms | max:    0.996 ms |
  sleep-12329           |    0.742 ms |        3 | avg:    0.906 ms | max:    2.289 ms |
  sshd-12122            |    0.163 ms |        2 | avg:    0.283 ms | max:    0.562 ms |
  loop-getpid-lon-12322 | 1023.636 ms |       69 | avg:    0.208 ms | max:    5.996 ms |
  loop-getpid-lon-12321 | 1038.638 ms |        5 | avg:    0.073 ms | max:    0.171 ms |
  migration/1-5         |    0.000 ms |        1 | avg:    0.006 ms | max:    0.006 ms |
 ---------------------------------------------------------------------------------------
  TOTAL:                | 2079.078 ms |      153 |
 -------------------------------------------------

Also, streamline the code a bit more, add asserts for various state
machine failures (they should be debugged if they occur) and fix
a few odd ends.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-sched.c | 264 +++++++++++++++++++++++++++++----------------
 1 file changed, 173 insertions(+), 91 deletions(-)

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 93ef7b215aba..adcb563ec4d2 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -50,7 +50,7 @@ static u64			sleep_measurement_overhead;
 
 static unsigned long		nr_tasks;
 
-struct sched_event;
+struct sched_atom;
 
 struct task_desc {
 	unsigned long		nr;
@@ -59,7 +59,7 @@ struct task_desc {
 
 	unsigned long		nr_events;
 	unsigned long		curr_event;
-	struct sched_event	**events;
+	struct sched_atom	**atoms;
 
 	pthread_t		thread;
 	sem_t			sleep_sem;
@@ -76,7 +76,7 @@ enum sched_event_type {
 	SCHED_EVENT_WAKEUP,
 };
 
-struct sched_event {
+struct sched_atom {
 	enum sched_event_type	type;
 	u64			timestamp;
 	u64			duration;
@@ -137,8 +137,8 @@ struct work_atom {
 	u64			runtime;
 };
 
-struct task_atoms {
-	struct list_head	atom_list;
+struct work_atoms {
+	struct list_head	work_list;
 	struct thread		*thread;
 	struct rb_node		node;
 	u64			max_lat;
@@ -147,7 +147,7 @@ struct task_atoms {
 	u64			total_runtime;
 };
 
-typedef int (*sort_fn_t)(struct task_atoms *, struct task_atoms *);
+typedef int (*sort_fn_t)(struct work_atoms *, struct work_atoms *);
 
 static struct rb_root		atom_root, sorted_atom_root;
 
@@ -220,10 +220,10 @@ static void calibrate_sleep_measurement_overhead(void)
 	printf("sleep measurement overhead: %Ld nsecs\n", min_delta);
 }
 
-static struct sched_event *
+static struct sched_atom *
 get_new_event(struct task_desc *task, u64 timestamp)
 {
-	struct sched_event *event = calloc(1, sizeof(*event));
+	struct sched_atom *event = calloc(1, sizeof(*event));
 	unsigned long idx = task->nr_events;
 	size_t size;
 
@@ -231,27 +231,27 @@ get_new_event(struct task_desc *task, u64 timestamp)
 	event->nr = idx;
 
 	task->nr_events++;
-	size = sizeof(struct sched_event *) * task->nr_events;
-	task->events = realloc(task->events, size);
-	BUG_ON(!task->events);
+	size = sizeof(struct sched_atom *) * task->nr_events;
+	task->atoms = realloc(task->atoms, size);
+	BUG_ON(!task->atoms);
 
-	task->events[idx] = event;
+	task->atoms[idx] = event;
 
 	return event;
 }
 
-static struct sched_event *last_event(struct task_desc *task)
+static struct sched_atom *last_event(struct task_desc *task)
 {
 	if (!task->nr_events)
 		return NULL;
 
-	return task->events[task->nr_events - 1];
+	return task->atoms[task->nr_events - 1];
 }
 
 static void
 add_sched_event_run(struct task_desc *task, u64 timestamp, u64 duration)
 {
-	struct sched_event *event, *curr_event = last_event(task);
+	struct sched_atom *event, *curr_event = last_event(task);
 
 	/*
 	 * optimize an existing RUN event by merging this one
@@ -275,7 +275,7 @@ static void
 add_sched_event_wakeup(struct task_desc *task, u64 timestamp,
 		       struct task_desc *wakee)
 {
-	struct sched_event *event, *wakee_event;
+	struct sched_atom *event, *wakee_event;
 
 	event = get_new_event(task, timestamp);
 	event->type = SCHED_EVENT_WAKEUP;
@@ -303,7 +303,7 @@ static void
 add_sched_event_sleep(struct task_desc *task, u64 timestamp,
 		      u64 task_state __used)
 {
-	struct sched_event *event = get_new_event(task, timestamp);
+	struct sched_atom *event = get_new_event(task, timestamp);
 
 	event->type = SCHED_EVENT_SLEEP;
 
@@ -372,27 +372,27 @@ static void add_cross_task_wakeups(void)
 }
 
 static void
-process_sched_event(struct task_desc *this_task __used, struct sched_event *event)
+process_sched_event(struct task_desc *this_task __used, struct sched_atom *atom)
 {
 	int ret = 0;
 	u64 now;
 	long long delta;
 
 	now = get_nsecs();
-	delta = start_time + event->timestamp - now;
+	delta = start_time + atom->timestamp - now;
 
-	switch (event->type) {
+	switch (atom->type) {
 		case SCHED_EVENT_RUN:
-			burn_nsecs(event->duration);
+			burn_nsecs(atom->duration);
 			break;
 		case SCHED_EVENT_SLEEP:
-			if (event->wait_sem)
-				ret = sem_wait(event->wait_sem);
+			if (atom->wait_sem)
+				ret = sem_wait(atom->wait_sem);
 			BUG_ON(ret);
 			break;
 		case SCHED_EVENT_WAKEUP:
-			if (event->wait_sem)
-				ret = sem_post(event->wait_sem);
+			if (atom->wait_sem)
+				ret = sem_post(atom->wait_sem);
 			BUG_ON(ret);
 			break;
 		default:
@@ -467,7 +467,7 @@ again:
 
 	for (i = 0; i < this_task->nr_events; i++) {
 		this_task->curr_event = i;
-		process_sched_event(this_task, this_task->events[i]);
+		process_sched_event(this_task, this_task->atoms[i]);
 	}
 
 	cpu_usage_1 = get_cpu_usage_nsec_self();
@@ -649,7 +649,7 @@ static void __cmd_replay(void)
 	if (multitarget_wakeups)
 		printf("multi-target wakeups: %ld\n", multitarget_wakeups);
 	if (nr_run_events_optimized)
-		printf("run events optimized: %ld\n",
+		printf("run atoms optimized: %ld\n",
 			nr_run_events_optimized);
 
 	print_task_traces();
@@ -727,6 +727,20 @@ struct trace_switch_event {
 	u32 next_prio;
 };
 
+struct trace_runtime_event {
+	u32 size;
+
+	u16 common_type;
+	u8 common_flags;
+	u8 common_preempt_count;
+	u32 common_pid;
+	u32 common_tgid;
+
+	char comm[16];
+	u32 pid;
+	u64 runtime;
+	u64 vruntime;
+};
 
 struct trace_wakeup_event {
 	u32 size;
@@ -767,6 +781,12 @@ struct trace_sched_handler {
 			     u64 timestamp,
 			     struct thread *thread);
 
+	void (*runtime_event)(struct trace_runtime_event *,
+			      struct event *,
+			      int cpu,
+			      u64 timestamp,
+			      struct thread *thread);
+
 	void (*wakeup_event)(struct trace_wakeup_event *,
 			     struct event *,
 			     int cpu,
@@ -881,7 +901,7 @@ struct sort_dimension {
 static LIST_HEAD(cmp_pid);
 
 static int
-thread_lat_cmp(struct list_head *list, struct task_atoms *l, struct task_atoms *r)
+thread_lat_cmp(struct list_head *list, struct work_atoms *l, struct work_atoms *r)
 {
 	struct sort_dimension *sort;
 	int ret = 0;
@@ -897,18 +917,18 @@ thread_lat_cmp(struct list_head *list, struct task_atoms *l, struct task_atoms *
 	return ret;
 }
 
-static struct task_atoms *
+static struct work_atoms *
 thread_atoms_search(struct rb_root *root, struct thread *thread,
 			 struct list_head *sort_list)
 {
 	struct rb_node *node = root->rb_node;
-	struct task_atoms key = { .thread = thread };
+	struct work_atoms key = { .thread = thread };
 
 	while (node) {
-		struct task_atoms *atoms;
+		struct work_atoms *atoms;
 		int cmp;
 
-		atoms = container_of(node, struct task_atoms, node);
+		atoms = container_of(node, struct work_atoms, node);
 
 		cmp = thread_lat_cmp(sort_list, &key, atoms);
 		if (cmp > 0)
@@ -924,16 +944,16 @@ thread_atoms_search(struct rb_root *root, struct thread *thread,
 }
 
 static void
-__thread_latency_insert(struct rb_root *root, struct task_atoms *data,
+__thread_latency_insert(struct rb_root *root, struct work_atoms *data,
 			 struct list_head *sort_list)
 {
 	struct rb_node **new = &(root->rb_node), *parent = NULL;
 
 	while (*new) {
-		struct task_atoms *this;
+		struct work_atoms *this;
 		int cmp;
 
-		this = container_of(*new, struct task_atoms, node);
+		this = container_of(*new, struct work_atoms, node);
 		parent = *new;
 
 		cmp = thread_lat_cmp(sort_list, data, this);
@@ -950,14 +970,14 @@ __thread_latency_insert(struct rb_root *root, struct task_atoms *data,
 
 static void thread_atoms_insert(struct thread *thread)
 {
-	struct task_atoms *atoms;
+	struct work_atoms *atoms;
 
 	atoms = calloc(sizeof(*atoms), 1);
 	if (!atoms)
 		die("No memory");
 
 	atoms->thread = thread;
-	INIT_LIST_HEAD(&atoms->atom_list);
+	INIT_LIST_HEAD(&atoms->work_list);
 	__thread_latency_insert(&atom_root, atoms, &cmp_pid);
 }
 
@@ -980,10 +1000,9 @@ static char sched_out_state(struct trace_switch_event *switch_event)
 }
 
 static void
-lat_sched_out(struct task_atoms *atoms,
-	      struct trace_switch_event *switch_event __used,
-	      u64 delta,
-	      u64 timestamp)
+add_sched_out_event(struct work_atoms *atoms,
+		    char run_state,
+		    u64 timestamp)
 {
 	struct work_atom *atom;
 
@@ -993,25 +1012,37 @@ lat_sched_out(struct task_atoms *atoms,
 
 	atom->sched_out_time = timestamp;
 
-	if (sched_out_state(switch_event) == 'R') {
+	if (run_state == 'R') {
 		atom->state = THREAD_WAIT_CPU;
 		atom->wake_up_time = atom->sched_out_time;
 	}
 
-	atom->runtime = delta;
-	list_add_tail(&atom->list, &atoms->atom_list);
+	list_add_tail(&atom->list, &atoms->work_list);
+}
+
+static void
+add_runtime_event(struct work_atoms *atoms, u64 delta, u64 timestamp __used)
+{
+	struct work_atom *atom;
+
+	BUG_ON(list_empty(&atoms->work_list));
+
+	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
+
+	atom->runtime += delta;
+	atoms->total_runtime += delta;
 }
 
 static void
-lat_sched_in(struct task_atoms *atoms, u64 timestamp)
+add_sched_in_event(struct work_atoms *atoms, u64 timestamp)
 {
 	struct work_atom *atom;
 	u64 delta;
 
-	if (list_empty(&atoms->atom_list))
+	if (list_empty(&atoms->work_list))
 		return;
 
-	atom = list_entry(atoms->atom_list.prev, struct work_atom, list);
+	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
 
 	if (atom->state != THREAD_WAIT_CPU)
 		return;
@@ -1029,7 +1060,6 @@ lat_sched_in(struct task_atoms *atoms, u64 timestamp)
 	if (delta > atoms->max_lat)
 		atoms->max_lat = delta;
 	atoms->nb_atoms++;
-	atoms->total_runtime += atom->runtime;
 }
 
 static void
@@ -1039,13 +1069,12 @@ latency_switch_event(struct trace_switch_event *switch_event,
 		     u64 timestamp,
 		     struct thread *thread __used)
 {
-	struct task_atoms *out_atoms, *in_atoms;
+	struct work_atoms *out_events, *in_events;
 	struct thread *sched_out, *sched_in;
 	u64 timestamp0;
 	s64 delta;
 
-	if (cpu >= MAX_CPUS || cpu < 0)
-		return;
+	BUG_ON(cpu >= MAX_CPUS || cpu < 0);
 
 	timestamp0 = cpu_last_switched[cpu];
 	cpu_last_switched[cpu] = timestamp;
@@ -1061,34 +1090,63 @@ latency_switch_event(struct trace_switch_event *switch_event,
 	sched_out = threads__findnew(switch_event->prev_pid, &threads, &last_match);
 	sched_in = threads__findnew(switch_event->next_pid, &threads, &last_match);
 
-	in_atoms = thread_atoms_search(&atom_root, sched_in, &cmp_pid);
-	if (!in_atoms) {
+	out_events = thread_atoms_search(&atom_root, sched_out, &cmp_pid);
+	if (!out_events) {
+		thread_atoms_insert(sched_out);
+		out_events = thread_atoms_search(&atom_root, sched_out, &cmp_pid);
+		if (!out_events)
+			die("out-event: Internal tree error");
+	}
+	add_sched_out_event(out_events, sched_out_state(switch_event), timestamp);
+
+	in_events = thread_atoms_search(&atom_root, sched_in, &cmp_pid);
+	if (!in_events) {
 		thread_atoms_insert(sched_in);
-		in_atoms = thread_atoms_search(&atom_root, sched_in, &cmp_pid);
-		if (!in_atoms)
-			die("in-atom: Internal tree error");
+		in_events = thread_atoms_search(&atom_root, sched_in, &cmp_pid);
+		if (!in_events)
+			die("in-event: Internal tree error");
+		/*
+		 * Take came in we have not heard about yet,
+		 * add in an initial atom in runnable state:
+		 */
+		add_sched_out_event(in_events, 'R', timestamp);
 	}
+	add_sched_in_event(in_events, timestamp);
+}
 
-	out_atoms = thread_atoms_search(&atom_root, sched_out, &cmp_pid);
-	if (!out_atoms) {
-		thread_atoms_insert(sched_out);
-		out_atoms = thread_atoms_search(&atom_root, sched_out, &cmp_pid);
-		if (!out_atoms)
-			die("out-atom: Internal tree error");
+static void
+latency_runtime_event(struct trace_runtime_event *runtime_event,
+		     struct event *event __used,
+		     int cpu,
+		     u64 timestamp,
+		     struct thread *this_thread __used)
+{
+	struct work_atoms *atoms;
+	struct thread *thread;
+
+	BUG_ON(cpu >= MAX_CPUS || cpu < 0);
+
+	thread = threads__findnew(runtime_event->pid, &threads, &last_match);
+	atoms = thread_atoms_search(&atom_root, thread, &cmp_pid);
+	if (!atoms) {
+		thread_atoms_insert(thread);
+		atoms = thread_atoms_search(&atom_root, thread, &cmp_pid);
+		if (!atoms)
+			die("in-event: Internal tree error");
+		add_sched_out_event(atoms, 'R', timestamp);
 	}
 
-	lat_sched_in(in_atoms, timestamp);
-	lat_sched_out(out_atoms, switch_event, delta, timestamp);
+	add_runtime_event(atoms, runtime_event->runtime, timestamp);
 }
 
 static void
 latency_wakeup_event(struct trace_wakeup_event *wakeup_event,
-		     struct event *event __used,
+		     struct event *__event __used,
 		     int cpu __used,
 		     u64 timestamp,
 		     struct thread *thread __used)
 {
-	struct task_atoms *atoms;
+	struct work_atoms *atoms;
 	struct work_atom *atom;
 	struct thread *wakee;
 
@@ -1100,16 +1158,20 @@ latency_wakeup_event(struct trace_wakeup_event *wakeup_event,
 	atoms = thread_atoms_search(&atom_root, wakee, &cmp_pid);
 	if (!atoms) {
 		thread_atoms_insert(wakee);
-		return;
+		atoms = thread_atoms_search(&atom_root, wakee, &cmp_pid);
+		if (!atoms)
+			die("wakeup-event: Internal tree error");
+		add_sched_out_event(atoms, 'S', timestamp);
 	}
 
-	if (list_empty(&atoms->atom_list))
-		return;
+	BUG_ON(list_empty(&atoms->work_list));
 
-	atom = list_entry(atoms->atom_list.prev, struct work_atom, list);
+	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
 
-	if (atom->state != THREAD_SLEEPING)
+	if (atom->state != THREAD_SLEEPING) {
+		printf("boo2\n");
 		return;
+	}
 
 	nr_timestamps++;
 	if (atom->sched_out_time > timestamp) {
@@ -1124,40 +1186,41 @@ latency_wakeup_event(struct trace_wakeup_event *wakeup_event,
 static struct trace_sched_handler lat_ops  = {
 	.wakeup_event		= latency_wakeup_event,
 	.switch_event		= latency_switch_event,
+	.runtime_event		= latency_runtime_event,
 	.fork_event		= latency_fork_event,
 };
 
-static void output_lat_thread(struct task_atoms *atom_list)
+static void output_lat_thread(struct work_atoms *work_list)
 {
 	int i;
 	int ret;
 	u64 avg;
 
-	if (!atom_list->nb_atoms)
+	if (!work_list->nb_atoms)
 		return;
 	/*
 	 * Ignore idle threads:
 	 */
-	if (!atom_list->thread->pid)
+	if (!work_list->thread->pid)
 		return;
 
-	all_runtime += atom_list->total_runtime;
-	all_count += atom_list->nb_atoms;
+	all_runtime += work_list->total_runtime;
+	all_count += work_list->nb_atoms;
 
-	ret = printf("  %s-%d ", atom_list->thread->comm, atom_list->thread->pid);
+	ret = printf("  %s-%d ", work_list->thread->comm, work_list->thread->pid);
 
 	for (i = 0; i < 24 - ret; i++)
 		printf(" ");
 
-	avg = atom_list->total_lat / atom_list->nb_atoms;
+	avg = work_list->total_lat / work_list->nb_atoms;
 
 	printf("|%9.3f ms |%9llu | avg:%9.3f ms | max:%9.3f ms |\n",
-	      (double)atom_list->total_runtime / 1e6,
-		 atom_list->nb_atoms, (double)avg / 1e6,
-		 (double)atom_list->max_lat / 1e6);
+	      (double)work_list->total_runtime / 1e6,
+		 work_list->nb_atoms, (double)avg / 1e6,
+		 (double)work_list->max_lat / 1e6);
 }
 
-static int pid_cmp(struct task_atoms *l, struct task_atoms *r)
+static int pid_cmp(struct work_atoms *l, struct work_atoms *r)
 {
 	if (l->thread->pid < r->thread->pid)
 		return -1;
@@ -1172,7 +1235,7 @@ static struct sort_dimension pid_sort_dimension = {
 	.cmp			= pid_cmp,
 };
 
-static int avg_cmp(struct task_atoms *l, struct task_atoms *r)
+static int avg_cmp(struct work_atoms *l, struct work_atoms *r)
 {
 	u64 avgl, avgr;
 
@@ -1198,7 +1261,7 @@ static struct sort_dimension avg_sort_dimension = {
 	.cmp			= avg_cmp,
 };
 
-static int max_cmp(struct task_atoms *l, struct task_atoms *r)
+static int max_cmp(struct work_atoms *l, struct work_atoms *r)
 {
 	if (l->max_lat < r->max_lat)
 		return -1;
@@ -1213,7 +1276,7 @@ static struct sort_dimension max_sort_dimension = {
 	.cmp			= max_cmp,
 };
 
-static int switch_cmp(struct task_atoms *l, struct task_atoms *r)
+static int switch_cmp(struct work_atoms *l, struct work_atoms *r)
 {
 	if (l->nb_atoms < r->nb_atoms)
 		return -1;
@@ -1228,7 +1291,7 @@ static struct sort_dimension switch_sort_dimension = {
 	.cmp			= switch_cmp,
 };
 
-static int runtime_cmp(struct task_atoms *l, struct task_atoms *r)
+static int runtime_cmp(struct work_atoms *l, struct work_atoms *r)
 {
 	if (l->total_runtime < r->total_runtime)
 		return -1;
@@ -1277,13 +1340,13 @@ static void sort_lat(void)
 	struct rb_node *node;
 
 	for (;;) {
-		struct task_atoms *data;
+		struct work_atoms *data;
 		node = rb_first(&atom_root);
 		if (!node)
 			break;
 
 		rb_erase(node, &atom_root);
-		data = rb_entry(node, struct task_atoms, node);
+		data = rb_entry(node, struct work_atoms, node);
 		__thread_latency_insert(&sorted_atom_root, data, &sort_list);
 	}
 }
@@ -1303,10 +1366,10 @@ static void __cmd_lat(void)
 	next = rb_first(&sorted_atom_root);
 
 	while (next) {
-		struct task_atoms *atom_list;
+		struct work_atoms *work_list;
 
-		atom_list = rb_entry(next, struct task_atoms, node);
-		output_lat_thread(atom_list);
+		work_list = rb_entry(next, struct work_atoms, node);
+		output_lat_thread(work_list);
 		next = rb_next(next);
 	}
 
@@ -1368,6 +1431,23 @@ process_sched_switch_event(struct raw_event_sample *raw,
 	trace_handler->switch_event(&switch_event, event, cpu, timestamp, thread);
 }
 
+static void
+process_sched_runtime_event(struct raw_event_sample *raw,
+			   struct event *event,
+			   int cpu __used,
+			   u64 timestamp __used,
+			   struct thread *thread __used)
+{
+	struct trace_runtime_event runtime_event;
+
+	FILL_ARRAY(runtime_event, comm, event, raw->data);
+	FILL_FIELD(runtime_event, pid, event, raw->data);
+	FILL_FIELD(runtime_event, runtime, event, raw->data);
+	FILL_FIELD(runtime_event, vruntime, event, raw->data);
+
+	trace_handler->runtime_event(&runtime_event, event, cpu, timestamp, thread);
+}
+
 static void
 process_sched_fork_event(struct raw_event_sample *raw,
 			 struct event *event,
@@ -1410,6 +1490,8 @@ process_raw_event(event_t *raw_event __used, void *more_data,
 
 	if (!strcmp(event->name, "sched_switch"))
 		process_sched_switch_event(raw, event, cpu, timestamp, thread);
+	if (!strcmp(event->name, "sched_stat_runtime"))
+		process_sched_runtime_event(raw, event, cpu, timestamp, thread);
 	if (!strcmp(event->name, "sched_wakeup"))
 		process_sched_wakeup_event(raw, event, cpu, timestamp, thread);
 	if (!strcmp(event->name, "sched_wakeup_new"))
-- 
cgit v1.2.3


From dc02bf7178c8e2cb3d442ae19027b736d51c7dd5 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 16 Sep 2009 13:45:00 +0200
Subject: perf sched: Account for lost events, increase default buffering

Output such lost event and state machine weirdness stats:

   TOTAL:                |  14974.910 ms |    46384 |
  ---------------------------------------------------
   INFO: 8.865% lost events (19132 out of 215819, in 8 chunks)
   INFO: 0.198% state machine bugs (49 out of 24708) (due to lost events?)

And increase buffering to -m 1024 (4 MB) by default. Since we
use output multiplexing that kind of space is needed.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-sched.c | 60 +++++++++++++++++++++++++++++++---------------
 tools/perf/util/event.h    |  2 +-
 2 files changed, 42 insertions(+), 20 deletions(-)

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index adcb563ec4d2..1f0f9be34faa 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -117,7 +117,11 @@ static u64			run_avg;
 
 static unsigned long		replay_repeat = 10;
 static unsigned long		nr_timestamps;
-static unsigned long		unordered_timestamps;
+static unsigned long		nr_unordered_timestamps;
+static unsigned long		nr_state_machine_bugs;
+static unsigned long		nr_events;
+static unsigned long		nr_lost_chunks;
+static unsigned long		nr_lost_events;
 
 #define TASK_STATE_TO_CHAR_STR "RSDTtZX"
 
@@ -668,14 +672,14 @@ process_comm_event(event_t *event, unsigned long offset, unsigned long head)
 
 	thread = threads__findnew(event->comm.pid, &threads, &last_match);
 
-	dump_printf("%p [%p]: PERF_EVENT_COMM: %s:%d\n",
+	dump_printf("%p [%p]: perf_event_comm: %s:%d\n",
 		(void *)(offset + head),
 		(void *)(long)(event->header.size),
 		event->comm.comm, event->comm.pid);
 
 	if (thread == NULL ||
 	    thread__set_comm(thread, event->comm.comm)) {
-		dump_printf("problem processing PERF_EVENT_COMM, skipping event.\n");
+		dump_printf("problem processing perf_event_comm, skipping event.\n");
 		return -1;
 	}
 	total_comm++;
@@ -1168,14 +1172,12 @@ latency_wakeup_event(struct trace_wakeup_event *wakeup_event,
 
 	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
 
-	if (atom->state != THREAD_SLEEPING) {
-		printf("boo2\n");
-		return;
-	}
+	if (atom->state != THREAD_SLEEPING)
+		nr_state_machine_bugs++;
 
 	nr_timestamps++;
 	if (atom->sched_out_time > timestamp) {
-		unordered_timestamps++;
+		nr_unordered_timestamps++;
 		return;
 	}
 
@@ -1214,7 +1216,7 @@ static void output_lat_thread(struct work_atoms *work_list)
 
 	avg = work_list->total_lat / work_list->nb_atoms;
 
-	printf("|%9.3f ms |%9llu | avg:%9.3f ms | max:%9.3f ms |\n",
+	printf("|%11.3f ms |%9llu | avg:%9.3f ms | max:%9.3f ms |\n",
 	      (double)work_list->total_runtime / 1e6,
 		 work_list->nb_atoms, (double)avg / 1e6,
 		 (double)work_list->max_lat / 1e6);
@@ -1359,9 +1361,9 @@ static void __cmd_lat(void)
 	read_events();
 	sort_lat();
 
-	printf("\n ---------------------------------------------------------------------------------------\n");
-	printf("  Task                  |  Runtime ms | Switches | Average delay ms | Maximum delay ms |\n");
-	printf(" ---------------------------------------------------------------------------------------\n");
+	printf("\n -----------------------------------------------------------------------------------------\n");
+	printf("  Task                  |   Runtime ms  | Switches | Average delay ms | Maximum delay ms |\n");
+	printf(" -----------------------------------------------------------------------------------------\n");
 
 	next = rb_first(&sorted_atom_root);
 
@@ -1373,18 +1375,32 @@ static void __cmd_lat(void)
 		next = rb_next(next);
 	}
 
-	printf(" ---------------------------------------------------------------------------------------\n");
-	printf("  TOTAL:                |%9.3f ms |%9Ld |",
+	printf(" -----------------------------------------------------------------------------------------\n");
+	printf("  TOTAL:                |%11.3f ms |%9Ld |\n",
 		(double)all_runtime/1e6, all_count);
 
-	if (unordered_timestamps && nr_timestamps) {
-		printf(" INFO: %.2f%% unordered events.\n",
-			(double)unordered_timestamps/(double)nr_timestamps*100.0);
+	printf(" ---------------------------------------------------\n");
+	if (nr_unordered_timestamps && nr_timestamps) {
+		printf("  INFO: %.3f%% unordered timestamps (%ld out of %ld)\n",
+			(double)nr_unordered_timestamps/(double)nr_timestamps*100.0,
+			nr_unordered_timestamps, nr_timestamps);
 	} else {
+	}
+	if (nr_lost_events && nr_events) {
+		printf("  INFO: %.3f%% lost events (%ld out of %ld, in %ld chunks)\n",
+			(double)nr_lost_events/(double)nr_events*100.0,
+			nr_lost_events, nr_events, nr_lost_chunks);
+	}
+	if (nr_state_machine_bugs && nr_timestamps) {
+		printf("  INFO: %.3f%% state machine bugs (%ld out of %ld)",
+			(double)nr_state_machine_bugs/(double)nr_timestamps*100.0,
+			nr_state_machine_bugs, nr_timestamps);
+		if (nr_lost_events)
+			printf(" (due to lost events?)");
 		printf("\n");
 	}
+	printf("\n");
 
-	printf(" -------------------------------------------------\n\n");
 }
 
 static struct trace_sched_handler *trace_handler;
@@ -1585,8 +1601,13 @@ process_event(event_t *event, unsigned long offset, unsigned long head)
 {
 	trace_event(event);
 
+	nr_events++;
 	switch (event->header.type) {
-	case PERF_EVENT_MMAP ... PERF_EVENT_LOST:
+	case PERF_EVENT_MMAP:
+		return 0;
+	case PERF_EVENT_LOST:
+		nr_lost_chunks++;
+		nr_lost_events += event->lost.lost;
 		return 0;
 
 	case PERF_EVENT_COMM:
@@ -1768,6 +1789,7 @@ static const char *record_args[] = {
 	"-R",
 	"-M",
 	"-f",
+	"-m", "1024",
 	"-c", "1",
 	"-e", "sched:sched_switch:r",
 	"-e", "sched:sched_stat_wait:r",
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index fa2d4e91d329..2495529cae7d 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -52,7 +52,7 @@ struct lost_event {
  */
 struct read_event {
 	struct perf_event_header header;
-	u32 pid,tid;
+	u32 pid, tid;
 	u64 value;
 	u64 time_enabled;
 	u64 time_running;
-- 
cgit v1.2.3


From c8a37751043427c6e4397a2cbfd617cb5f215c72 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 16 Sep 2009 14:07:00 +0200
Subject: perf sched: Sanity check context switch events

Use 'perf sched latency' to track the current task based on
context-switch events, and flag the cases where there's some
impossible transition: such as a PID being switched out that
was not switched in.

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-sched.c | 27 ++++++++++++++++++++++++++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 1f0f9be34faa..2d542368de3c 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -119,6 +119,7 @@ static unsigned long		replay_repeat = 10;
 static unsigned long		nr_timestamps;
 static unsigned long		nr_unordered_timestamps;
 static unsigned long		nr_state_machine_bugs;
+static unsigned long		nr_context_switch_bugs;
 static unsigned long		nr_events;
 static unsigned long		nr_lost_chunks;
 static unsigned long		nr_lost_events;
@@ -1399,6 +1400,14 @@ static void __cmd_lat(void)
 			printf(" (due to lost events?)");
 		printf("\n");
 	}
+	if (nr_context_switch_bugs && nr_timestamps) {
+		printf("  INFO: %.3f%% context switch bugs (%ld out of %ld)",
+			(double)nr_context_switch_bugs/(double)nr_timestamps*100.0,
+			nr_context_switch_bugs, nr_timestamps);
+		if (nr_lost_events)
+			printf(" (due to lost events?)");
+		printf("\n");
+	}
 	printf("\n");
 
 }
@@ -1425,10 +1434,16 @@ process_sched_wakeup_event(struct raw_event_sample *raw,
 	trace_handler->wakeup_event(&wakeup_event, event, cpu, timestamp, thread);
 }
 
+/*
+ * Track the current task - that way we can know whether there's any
+ * weird events, such as a task being switched away that is not current.
+ */
+static u32 curr_pid[MAX_CPUS] = { [0 ... MAX_CPUS-1] = -1 };
+
 static void
 process_sched_switch_event(struct raw_event_sample *raw,
 			   struct event *event,
-			   int cpu __used,
+			   int cpu,
 			   u64 timestamp __used,
 			   struct thread *thread __used)
 {
@@ -1444,6 +1459,16 @@ process_sched_switch_event(struct raw_event_sample *raw,
 	FILL_FIELD(switch_event, next_pid, event, raw->data);
 	FILL_FIELD(switch_event, next_prio, event, raw->data);
 
+	if (curr_pid[cpu] != (u32)-1) {
+		/*
+		 * Are we trying to switch away a PID that is
+		 * not current?
+		 */
+		if (curr_pid[cpu] != switch_event.prev_pid)
+			nr_context_switch_bugs++;
+	}
+	curr_pid[cpu] = switch_event.next_pid;
+
 	trace_handler->switch_event(&switch_event, event, cpu, timestamp, thread);
 }
 
-- 
cgit v1.2.3


From 80ed0987f363d7eb50193df3e6f6d71451f74bc3 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 16 Sep 2009 14:12:36 +0200
Subject: perf sched: Make idle thread and comm/pid names more consistent

Peter noticed that we have 3 ways of referring to the idle thread:

 [idle]:0
 swapper:0
 swapper-0

Standardize on 'swapper:0'.

Reported-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-sched.c | 4 ++--
 tools/perf/util/thread.c   | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 2d542368de3c..da8f67483ae7 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -1204,13 +1204,13 @@ static void output_lat_thread(struct work_atoms *work_list)
 	/*
 	 * Ignore idle threads:
 	 */
-	if (!work_list->thread->pid)
+	if (!strcmp(work_list->thread->comm, "swapper"))
 		return;
 
 	all_runtime += work_list->total_runtime;
 	all_count += work_list->nb_atoms;
 
-	ret = printf("  %s-%d ", work_list->thread->comm, work_list->thread->pid);
+	ret = printf("  %s:%d ", work_list->thread->comm, work_list->thread->pid);
 
 	for (i = 0; i < 24 - ret; i++)
 		printf(" ");
diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c
index 7635928ca278..12c4341078f9 100644
--- a/tools/perf/util/thread.c
+++ b/tools/perf/util/thread.c
@@ -85,7 +85,7 @@ register_idle_thread(struct rb_root *threads, struct thread **last_match)
 {
 	struct thread *thread = threads__findnew(0, threads, last_match);
 
-	if (!thread || thread__set_comm(thread, "[init]")) {
+	if (!thread || thread__set_comm(thread, "swapper")) {
 		fprintf(stderr, "problem inserting idle task.\n");
 		exit(-1);
 	}
-- 
cgit v1.2.3


From 0ec04e16d08b69d8da46abbcfa3e3f2cd9738852 Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Wed, 16 Sep 2009 17:40:48 +0200
Subject: perf sched: Add 'perf sched map' scheduling event map printout

This prints a textual context-switching outline of workload
captured via perf sched record.

For example, on a 16 CPU box it outputs:

   N1  O1  .   .   .   S1  .   .   .   B0  .  *I0  C1  .   M1  .    23002.773423 secs
   N1  O1  .  *Q0  .   S1  .   .   .   B0  .   I0  C1  .   M1  .    23002.773423 secs
   N1  O1  .   Q0  .   S1  .   .   .   B0  .  *R1  C1  .   M1  .    23002.773485 secs
   N1  O1  .   Q0  .   S1  .  *S0  .   B0  .   R1  C1  .   M1  .    23002.773478 secs
  *L0  O1  .   Q0  .   S1  .   S0  .   B0  .   R1  C1  .   M1  .    23002.773523 secs
   L0  O1  .  *.   .   S1  .   S0  .   B0  .   R1  C1  .   M1  .    23002.773531 secs
   L0  O1  .   .   .   S1  .   S0  .   B0  .   R1  C1 *T1  M1  .    23002.773547 secs T1 => irqbalance:2089
   L0  O1  .   .   .   S1  .   S0  .  *P0  .   R1  C1  T1  M1  .    23002.773549 secs
  *N1  O1  .   .   .   S1  .   S0  .   P0  .   R1  C1  T1  M1  .    23002.773566 secs
   N1  O1  .   .   .  *J0  .   S0  .   P0  .   R1  C1  T1  M1  .    23002.773571 secs
   N1  O1  .   .   .   J0  .   S0 *B0  P0  .   R1  C1  T1  M1  .    23002.773592 secs
   N1  O1  .   .   .   J0  .  *U0  B0  P0  .   R1  C1  T1  M1  .    23002.773582 secs
   N1  O1  .   .   .  *S1  .   U0  B0  P0  .   R1  C1  T1  M1  .    23002.773604 secs
   N1  O1  .   .   .   S1  .   U0  B0 *.   .   R1  C1  T1  M1  .    23002.773615 secs
   N1  O1  .   .   .   S1  .   U0  B0  .   .  *K0  C1  T1  M1  .    23002.773631 secs
   N1  O1  .  *M0  .   S1  .   U0  B0  .   .   K0  C1  T1  M1  .    23002.773624 secs
   N1  O1  .   M0  .   S1  .   U0 *.   .   .   K0  C1  T1  M1  .    23002.773644 secs
   N1  O1  .   M0  .   S1  .   U0  .   .   .  *R1  C1  T1  M1  .    23002.773662 secs
   N1  O1  .   M0  .   S1  .  *.   .   .   .   R1  C1  T1  M1  .    23002.773648 secs
   N1  O1  .  *.   .   S1  .   .   .   .   .   R1  C1  T1  M1  .    23002.773680 secs
   N1  O1  .   .   .  *L0  .   .   .   .   .   R1  C1  T1  M1  .    23002.773717 secs
  *N0  O1  .   .   .   L0  .   .   .   .   .   R1  C1  T1  M1  .    23002.773709 secs
  *N1  O1  .   .   .   L0  .   .   .   .   .   R1  C1  T1  M1  .    23002.773747 secs

Columns stand for individual CPUs, from CPU0 to CPU15, and the
two-letter shortcuts stand for tasks that are running on a CPU.

'*' denotes the CPU that had the event.

A dot signals an idle CPU.

New tasks are assigned new two-letter shortcuts - when they occur
first they are printed. In the above example 'T1' stood for irqbalance:

      T1 => irqbalance:2089

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-sched.c | 312 ++++++++++++++++++++++++++++++---------------
 tools/perf/util/thread.c   |   2 +-
 tools/perf/util/thread.h   |   1 +
 3 files changed, 214 insertions(+), 101 deletions(-)

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index da8f67483ae7..f67e351b050b 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -159,8 +159,6 @@ static struct rb_root		atom_root, sorted_atom_root;
 static u64			all_runtime;
 static u64			all_count;
 
-static int read_events(void);
-
 
 static u64 get_nsecs(void)
 {
@@ -634,38 +632,6 @@ static void test_calibrations(void)
 	printf("the sleep test took %Ld nsecs\n", T1-T0);
 }
 
-static void __cmd_replay(void)
-{
-	unsigned long i;
-
-	calibrate_run_measurement_overhead();
-	calibrate_sleep_measurement_overhead();
-
-	test_calibrations();
-
-	read_events();
-
-	printf("nr_run_events:        %ld\n", nr_run_events);
-	printf("nr_sleep_events:      %ld\n", nr_sleep_events);
-	printf("nr_wakeup_events:     %ld\n", nr_wakeup_events);
-
-	if (targetless_wakeups)
-		printf("target-less wakeups:  %ld\n", targetless_wakeups);
-	if (multitarget_wakeups)
-		printf("multi-target wakeups: %ld\n", multitarget_wakeups);
-	if (nr_run_events_optimized)
-		printf("run atoms optimized: %ld\n",
-			nr_run_events_optimized);
-
-	print_task_traces();
-	add_cross_task_wakeups();
-
-	create_tasks();
-	printf("------------------------------------------------------------\n");
-	for (i = 0; i < replay_repeat; i++)
-		run_one_test();
-}
-
 static int
 process_comm_event(event_t *event, unsigned long offset, unsigned long head)
 {
@@ -1354,64 +1320,6 @@ static void sort_lat(void)
 	}
 }
 
-static void __cmd_lat(void)
-{
-	struct rb_node *next;
-
-	setup_pager();
-	read_events();
-	sort_lat();
-
-	printf("\n -----------------------------------------------------------------------------------------\n");
-	printf("  Task                  |   Runtime ms  | Switches | Average delay ms | Maximum delay ms |\n");
-	printf(" -----------------------------------------------------------------------------------------\n");
-
-	next = rb_first(&sorted_atom_root);
-
-	while (next) {
-		struct work_atoms *work_list;
-
-		work_list = rb_entry(next, struct work_atoms, node);
-		output_lat_thread(work_list);
-		next = rb_next(next);
-	}
-
-	printf(" -----------------------------------------------------------------------------------------\n");
-	printf("  TOTAL:                |%11.3f ms |%9Ld |\n",
-		(double)all_runtime/1e6, all_count);
-
-	printf(" ---------------------------------------------------\n");
-	if (nr_unordered_timestamps && nr_timestamps) {
-		printf("  INFO: %.3f%% unordered timestamps (%ld out of %ld)\n",
-			(double)nr_unordered_timestamps/(double)nr_timestamps*100.0,
-			nr_unordered_timestamps, nr_timestamps);
-	} else {
-	}
-	if (nr_lost_events && nr_events) {
-		printf("  INFO: %.3f%% lost events (%ld out of %ld, in %ld chunks)\n",
-			(double)nr_lost_events/(double)nr_events*100.0,
-			nr_lost_events, nr_events, nr_lost_chunks);
-	}
-	if (nr_state_machine_bugs && nr_timestamps) {
-		printf("  INFO: %.3f%% state machine bugs (%ld out of %ld)",
-			(double)nr_state_machine_bugs/(double)nr_timestamps*100.0,
-			nr_state_machine_bugs, nr_timestamps);
-		if (nr_lost_events)
-			printf(" (due to lost events?)");
-		printf("\n");
-	}
-	if (nr_context_switch_bugs && nr_timestamps) {
-		printf("  INFO: %.3f%% context switch bugs (%ld out of %ld)",
-			(double)nr_context_switch_bugs/(double)nr_timestamps*100.0,
-			nr_context_switch_bugs, nr_timestamps);
-		if (nr_lost_events)
-			printf(" (due to lost events?)");
-		printf("\n");
-	}
-	printf("\n");
-
-}
-
 static struct trace_sched_handler *trace_handler;
 
 static void
@@ -1431,19 +1339,106 @@ process_sched_wakeup_event(struct raw_event_sample *raw,
 	FILL_FIELD(wakeup_event, success, event, raw->data);
 	FILL_FIELD(wakeup_event, cpu, event, raw->data);
 
-	trace_handler->wakeup_event(&wakeup_event, event, cpu, timestamp, thread);
+	if (trace_handler->wakeup_event)
+		trace_handler->wakeup_event(&wakeup_event, event, cpu, timestamp, thread);
 }
 
 /*
  * Track the current task - that way we can know whether there's any
  * weird events, such as a task being switched away that is not current.
  */
+static int max_cpu = 15;
+
 static u32 curr_pid[MAX_CPUS] = { [0 ... MAX_CPUS-1] = -1 };
 
+static struct thread *curr_thread[MAX_CPUS];
+
+static char next_shortname1 = 'A';
+static char next_shortname2 = '0';
+
+static void
+map_switch_event(struct trace_switch_event *switch_event,
+		 struct event *event __used,
+		 int this_cpu,
+		 u64 timestamp,
+		 struct thread *thread __used)
+{
+	struct thread *sched_out, *sched_in;
+	int new_shortname;
+	u64 timestamp0;
+	s64 delta;
+	int cpu;
+
+	BUG_ON(this_cpu >= MAX_CPUS || this_cpu < 0);
+
+	if (this_cpu > max_cpu)
+		max_cpu = this_cpu;
+
+	timestamp0 = cpu_last_switched[this_cpu];
+	cpu_last_switched[this_cpu] = timestamp;
+	if (timestamp0)
+		delta = timestamp - timestamp0;
+	else
+		delta = 0;
+
+	if (delta < 0)
+		die("hm, delta: %Ld < 0 ?\n", delta);
+
+
+	sched_out = threads__findnew(switch_event->prev_pid, &threads, &last_match);
+	sched_in = threads__findnew(switch_event->next_pid, &threads, &last_match);
+
+	curr_thread[this_cpu] = sched_in;
+
+	printf("  ");
+
+	new_shortname = 0;
+	if (!sched_in->shortname[0]) {
+		sched_in->shortname[0] = next_shortname1;
+		sched_in->shortname[1] = next_shortname2;
+
+		if (next_shortname1 < 'Z') {
+			next_shortname1++;
+		} else {
+			next_shortname1='A';
+			if (next_shortname2 < '9') {
+				next_shortname2++;
+			} else {
+				next_shortname2='0';
+			}
+		}
+		new_shortname = 1;
+	}
+
+	for (cpu = 0; cpu <= max_cpu; cpu++) {
+		if (cpu != this_cpu)
+			printf(" ");
+		else
+			printf("*");
+
+		if (curr_thread[cpu]) {
+			if (curr_thread[cpu]->pid)
+				printf("%2s ", curr_thread[cpu]->shortname);
+			else
+				printf(".  ");
+		} else
+			printf("   ");
+	}
+
+	printf("  %12.6f secs ", (double)timestamp/1e9);
+	if (new_shortname) {
+		printf("%s => %s:%d\n",
+			sched_in->shortname, sched_in->comm, sched_in->pid);
+	} else {
+		printf("\n");
+	}
+}
+
+
 static void
 process_sched_switch_event(struct raw_event_sample *raw,
 			   struct event *event,
-			   int cpu,
+			   int this_cpu,
 			   u64 timestamp __used,
 			   struct thread *thread __used)
 {
@@ -1459,17 +1454,18 @@ process_sched_switch_event(struct raw_event_sample *raw,
 	FILL_FIELD(switch_event, next_pid, event, raw->data);
 	FILL_FIELD(switch_event, next_prio, event, raw->data);
 
-	if (curr_pid[cpu] != (u32)-1) {
+	if (curr_pid[this_cpu] != (u32)-1) {
 		/*
 		 * Are we trying to switch away a PID that is
 		 * not current?
 		 */
-		if (curr_pid[cpu] != switch_event.prev_pid)
+		if (curr_pid[this_cpu] != switch_event.prev_pid)
 			nr_context_switch_bugs++;
 	}
-	curr_pid[cpu] = switch_event.next_pid;
+	if (trace_handler->switch_event)
+		trace_handler->switch_event(&switch_event, event, this_cpu, timestamp, thread);
 
-	trace_handler->switch_event(&switch_event, event, cpu, timestamp, thread);
+	curr_pid[this_cpu] = switch_event.next_pid;
 }
 
 static void
@@ -1486,7 +1482,8 @@ process_sched_runtime_event(struct raw_event_sample *raw,
 	FILL_FIELD(runtime_event, runtime, event, raw->data);
 	FILL_FIELD(runtime_event, vruntime, event, raw->data);
 
-	trace_handler->runtime_event(&runtime_event, event, cpu, timestamp, thread);
+	if (trace_handler->runtime_event)
+		trace_handler->runtime_event(&runtime_event, event, cpu, timestamp, thread);
 }
 
 static void
@@ -1505,7 +1502,8 @@ process_sched_fork_event(struct raw_event_sample *raw,
 	FILL_ARRAY(fork_event, child_comm, event, raw->data);
 	FILL_FIELD(fork_event, child_pid, event, raw->data);
 
-	trace_handler->fork_event(&fork_event, event, cpu, timestamp, thread);
+	if (trace_handler->fork_event)
+		trace_handler->fork_event(&fork_event, event, cpu, timestamp, thread);
 }
 
 static void
@@ -1748,6 +1746,116 @@ more:
 	return rc;
 }
 
+static void print_bad_events(void)
+{
+	if (nr_unordered_timestamps && nr_timestamps) {
+		printf("  INFO: %.3f%% unordered timestamps (%ld out of %ld)\n",
+			(double)nr_unordered_timestamps/(double)nr_timestamps*100.0,
+			nr_unordered_timestamps, nr_timestamps);
+	}
+	if (nr_lost_events && nr_events) {
+		printf("  INFO: %.3f%% lost events (%ld out of %ld, in %ld chunks)\n",
+			(double)nr_lost_events/(double)nr_events*100.0,
+			nr_lost_events, nr_events, nr_lost_chunks);
+	}
+	if (nr_state_machine_bugs && nr_timestamps) {
+		printf("  INFO: %.3f%% state machine bugs (%ld out of %ld)",
+			(double)nr_state_machine_bugs/(double)nr_timestamps*100.0,
+			nr_state_machine_bugs, nr_timestamps);
+		if (nr_lost_events)
+			printf(" (due to lost events?)");
+		printf("\n");
+	}
+	if (nr_context_switch_bugs && nr_timestamps) {
+		printf("  INFO: %.3f%% context switch bugs (%ld out of %ld)",
+			(double)nr_context_switch_bugs/(double)nr_timestamps*100.0,
+			nr_context_switch_bugs, nr_timestamps);
+		if (nr_lost_events)
+			printf(" (due to lost events?)");
+		printf("\n");
+	}
+}
+
+static void __cmd_lat(void)
+{
+	struct rb_node *next;
+
+	setup_pager();
+	read_events();
+	sort_lat();
+
+	printf("\n -----------------------------------------------------------------------------------------\n");
+	printf("  Task                  |   Runtime ms  | Switches | Average delay ms | Maximum delay ms |\n");
+	printf(" -----------------------------------------------------------------------------------------\n");
+
+	next = rb_first(&sorted_atom_root);
+
+	while (next) {
+		struct work_atoms *work_list;
+
+		work_list = rb_entry(next, struct work_atoms, node);
+		output_lat_thread(work_list);
+		next = rb_next(next);
+	}
+
+	printf(" -----------------------------------------------------------------------------------------\n");
+	printf("  TOTAL:                |%11.3f ms |%9Ld |\n",
+		(double)all_runtime/1e6, all_count);
+
+	printf(" ---------------------------------------------------\n");
+
+	print_bad_events();
+	printf("\n");
+
+}
+
+static struct trace_sched_handler map_ops  = {
+	.wakeup_event		= NULL,
+	.switch_event		= map_switch_event,
+	.runtime_event		= NULL,
+	.fork_event		= NULL,
+};
+
+static void __cmd_map(void)
+{
+	setup_pager();
+	read_events();
+	print_bad_events();
+}
+
+static void __cmd_replay(void)
+{
+	unsigned long i;
+
+	calibrate_run_measurement_overhead();
+	calibrate_sleep_measurement_overhead();
+
+	test_calibrations();
+
+	read_events();
+
+	printf("nr_run_events:        %ld\n", nr_run_events);
+	printf("nr_sleep_events:      %ld\n", nr_sleep_events);
+	printf("nr_wakeup_events:     %ld\n", nr_wakeup_events);
+
+	if (targetless_wakeups)
+		printf("target-less wakeups:  %ld\n", targetless_wakeups);
+	if (multitarget_wakeups)
+		printf("multi-target wakeups: %ld\n", multitarget_wakeups);
+	if (nr_run_events_optimized)
+		printf("run atoms optimized: %ld\n",
+			nr_run_events_optimized);
+
+	print_task_traces();
+	add_cross_task_wakeups();
+
+	create_tasks();
+	printf("------------------------------------------------------------\n");
+	for (i = 0; i < replay_repeat; i++)
+		run_one_test();
+}
+
+
 static const char * const sched_usage[] = {
 	"perf sched [<options>] {record|latency|replay|trace}",
 	NULL
@@ -1867,6 +1975,10 @@ int cmd_sched(int argc, const char **argv, const char *prefix __used)
 		}
 		setup_sorting();
 		__cmd_lat();
+	} else if (!strcmp(argv[0], "map")) {
+		trace_handler = &map_ops;
+		setup_sorting();
+		__cmd_map();
 	} else if (!strncmp(argv[0], "rep", 3)) {
 		trace_handler = &replay_ops;
 		if (argc) {
diff --git a/tools/perf/util/thread.c b/tools/perf/util/thread.c
index 12c4341078f9..45efb5db0d19 100644
--- a/tools/perf/util/thread.c
+++ b/tools/perf/util/thread.c
@@ -8,7 +8,7 @@
 
 static struct thread *thread__new(pid_t pid)
 {
-	struct thread *self = malloc(sizeof(*self));
+	struct thread *self = calloc(1, sizeof(*self));
 
 	if (self != NULL) {
 		self->pid = pid;
diff --git a/tools/perf/util/thread.h b/tools/perf/util/thread.h
index 665d1f3dc977..32aea3c1c2ad 100644
--- a/tools/perf/util/thread.h
+++ b/tools/perf/util/thread.h
@@ -7,6 +7,7 @@ struct thread {
 	struct rb_node		rb_node;
 	struct list_head	maps;
 	pid_t			pid;
+	char			shortname[3];
 	char			*comm;
 };
 
-- 
cgit v1.2.3


From 850bc73ffcc99cddfb52bc23217c60810c508853 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 17 Sep 2009 18:47:11 +0200
Subject: perf_counter: Do not throttle single swcounter events

We can have swcounter events that contribute more than a single
count per event, when used with a non-zero period, those can
generate multiple events, which is when we need throttling.

However, swcounter that contribute only a single count per event
can only come as fast as we can run code, hence don't throttle
them.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 17 +++++++++++++----
 1 file changed, 13 insertions(+), 4 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 667ab25ad3d5..fe0d1adde804 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -3494,14 +3494,15 @@ static void perf_log_throttle(struct perf_counter *counter, int enable)
  * Generic counter overflow handling, sampling.
  */
 
-int perf_counter_overflow(struct perf_counter *counter, int nmi,
-			  struct perf_sample_data *data)
+static int __perf_counter_overflow(struct perf_counter *counter, int nmi,
+				   int throttle, struct perf_sample_data *data)
 {
 	int events = atomic_read(&counter->event_limit);
-	int throttle = counter->pmu->unthrottle != NULL;
 	struct hw_perf_counter *hwc = &counter->hw;
 	int ret = 0;
 
+	throttle = (throttle && counter->pmu->unthrottle != NULL);
+
 	if (!throttle) {
 		hwc->interrupts++;
 	} else {
@@ -3554,6 +3555,12 @@ int perf_counter_overflow(struct perf_counter *counter, int nmi,
 	return ret;
 }
 
+int perf_counter_overflow(struct perf_counter *counter, int nmi,
+			  struct perf_sample_data *data)
+{
+	return __perf_counter_overflow(counter, nmi, 1, data);
+}
+
 /*
  * Generic software counter infrastructure
  */
@@ -3592,6 +3599,7 @@ static void perf_swcounter_overflow(struct perf_counter *counter,
 				    int nmi, struct perf_sample_data *data)
 {
 	struct hw_perf_counter *hwc = &counter->hw;
+	int throttle = 0;
 	u64 overflow;
 
 	data->period = counter->hw.last_period;
@@ -3601,13 +3609,14 @@ static void perf_swcounter_overflow(struct perf_counter *counter,
 		return;
 
 	for (; overflow; overflow--) {
-		if (perf_counter_overflow(counter, nmi, data)) {
+		if (__perf_counter_overflow(counter, nmi, throttle, data)) {
 			/*
 			 * We inhibit the overflow from happening when
 			 * hwc->interrupts == MAX_INTERRUPTS.
 			 */
 			break;
 		}
+		throttle = 0;
 	}
 }
 
-- 
cgit v1.2.3


From 2667de81f3256c944b06abdf2c56c2f192fcb724 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 17 Sep 2009 19:01:10 +0200
Subject: perf_counter: Allow for a wakeup watermark

Currently we wake the mmap() consumer once every PAGE_SIZE of data
and/or once event wakeup_events when specified.

For high speed sampling this results in too many wakeups wrt. the
buffer size, hence change this.

We move the default wakeup limit to 1/4-th the buffer size, and
provide for means to manually specify this limit.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h | 10 ++++++++--
 kernel/perf_counter.c        | 32 +++++++++++++++++++-------------
 2 files changed, 27 insertions(+), 15 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 972f90d7a32f..6c1ef72ea501 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -199,10 +199,14 @@ struct perf_counter_attr {
 				inherit_stat   :  1, /* per task counts       */
 				enable_on_exec :  1, /* next exec enables     */
 				task           :  1, /* trace fork/exit       */
+				watermark      :  1, /* wakeup_watermark      */
 
-				__reserved_1   : 50;
+				__reserved_1   : 49;
 
-	__u32			wakeup_events;	/* wakeup every n events */
+	union {
+		__u32		wakeup_events;	  /* wakeup every n events */
+		__u32		wakeup_watermark; /* bytes before wakeup   */
+	};
 	__u32			__reserved_2;
 
 	__u64			__reserved_3;
@@ -521,6 +525,8 @@ struct perf_mmap_data {
 	atomic_t			wakeup;		/* needs a wakeup    */
 	atomic_t			lost;		/* nr records lost   */
 
+	long				watermark;	/* wakeup watermark  */
+
 	struct perf_counter_mmap_page   *user_page;
 	void				*data_pages[0];
 };
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index fe0d1adde804..29b73b6e8146 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2176,6 +2176,13 @@ static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages)
 	data->nr_pages = nr_pages;
 	atomic_set(&data->lock, -1);
 
+	if (counter->attr.watermark) {
+		data->watermark = min_t(long, PAGE_SIZE * nr_pages,
+				      counter->attr.wakeup_watermark);
+	}
+	if (!data->watermark)
+		data->watermark = max(PAGE_SIZE, PAGE_SIZE * nr_pages / 4);
+
 	rcu_assign_pointer(counter->data, data);
 
 	return 0;
@@ -2517,23 +2524,15 @@ struct perf_output_handle {
 	unsigned long		flags;
 };
 
-static bool perf_output_space(struct perf_mmap_data *data,
-			      unsigned int offset, unsigned int head)
+static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
+			      unsigned long offset, unsigned long head)
 {
-	unsigned long tail;
 	unsigned long mask;
 
 	if (!data->writable)
 		return true;
 
 	mask = (data->nr_pages << PAGE_SHIFT) - 1;
-	/*
-	 * Userspace could choose to issue a mb() before updating the tail
-	 * pointer. So that all reads will be completed before the write is
-	 * issued.
-	 */
-	tail = ACCESS_ONCE(data->user_page->data_tail);
-	smp_rmb();
 
 	offset = (offset - tail) & mask;
 	head   = (head   - tail) & mask;
@@ -2679,7 +2678,7 @@ static int perf_output_begin(struct perf_output_handle *handle,
 {
 	struct perf_counter *output_counter;
 	struct perf_mmap_data *data;
-	unsigned int offset, head;
+	unsigned long tail, offset, head;
 	int have_lost;
 	struct {
 		struct perf_event_header header;
@@ -2717,16 +2716,23 @@ static int perf_output_begin(struct perf_output_handle *handle,
 	perf_output_lock(handle);
 
 	do {
+		/*
+		 * Userspace could choose to issue a mb() before updating the
+		 * tail pointer. So that all reads will be completed before the
+		 * write is issued.
+		 */
+		tail = ACCESS_ONCE(data->user_page->data_tail);
+		smp_rmb();
 		offset = head = atomic_long_read(&data->head);
 		head += size;
-		if (unlikely(!perf_output_space(data, offset, head)))
+		if (unlikely(!perf_output_space(data, tail, offset, head)))
 			goto fail;
 	} while (atomic_long_cmpxchg(&data->head, offset, head) != offset);
 
 	handle->offset	= offset;
 	handle->head	= head;
 
-	if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT))
+	if (head - tail > data->watermark)
 		atomic_set(&data->wakeup, 1);
 
 	if (have_lost) {
-- 
cgit v1.2.3


From 8b412664d0a487c2e16ac43f4fcede346df33254 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 17 Sep 2009 19:59:05 +0200
Subject: perf record: Disable profiling before draining the buffer

I noticed that perf-record continues profiling itself after the
child terminated and we're draining the buffer.

This can cause a _lot_ of overhead with --all recording - we keep
and keep recording, which produces new and new events.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-record.c | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 5f3127e7a615..2459e5a22ed8 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -524,6 +524,7 @@ static int __cmd_record(int argc, const char **argv)
 	pid_t pid = 0;
 	int flags;
 	int ret;
+	unsigned long waking = 0;
 
 	page_size = sysconf(_SC_PAGE_SIZE);
 	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
@@ -634,10 +635,20 @@ static int __cmd_record(int argc, const char **argv)
 		if (hits == samples) {
 			if (done)
 				break;
-			ret = poll(event_array, nr_poll, 100);
+			ret = poll(event_array, nr_poll, -1);
+			waking++;
+		}
+
+		if (done) {
+			for (i = 0; i < nr_cpu; i++) {
+				for (counter = 0; counter < nr_counters; counter++)
+					ioctl(fd[i][counter], PERF_COUNTER_IOC_DISABLE);
+			}
 		}
 	}
 
+	fprintf(stderr, "[ perf record: Woken up %ld times to write data ]\n", waking);
+
 	/*
 	 * Approximate RIP event size: 24 bytes.
 	 */
-- 
cgit v1.2.3


From 40749d0ff49f99c3661b336fe5e5625207bd925a Mon Sep 17 00:00:00 2001
From: Ingo Molnar <mingo@elte.hu>
Date: Thu, 17 Sep 2009 18:24:55 +0200
Subject: perf sched: Determine the number of CPUs automatically

For 'perf sched map' output, determine max_cpu automatically,
instead of the static default of 15.

[ v2: use sysconf() pointed out by Arjan van de Ven <arjan@infradead.org> ]

Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-sched.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index f67e351b050b..9e04827d16be 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -1347,7 +1347,7 @@ process_sched_wakeup_event(struct raw_event_sample *raw,
  * Track the current task - that way we can know whether there's any
  * weird events, such as a task being switched away that is not current.
  */
-static int max_cpu = 15;
+static int max_cpu;
 
 static u32 curr_pid[MAX_CPUS] = { [0 ... MAX_CPUS-1] = -1 };
 
@@ -1818,6 +1818,8 @@ static struct trace_sched_handler map_ops  = {
 
 static void __cmd_map(void)
 {
+	max_cpu = sysconf(_SC_NPROCESSORS_CONF);
+
 	setup_pager();
 	read_events();
 	print_bad_events();
-- 
cgit v1.2.3


From 6706ccf8e776e51e38ffa89fe7dd20e80eb1e860 Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 17 Sep 2009 16:34:23 +0800
Subject: perf tools: Fix memory leak in read_ftrace_printk()

get_tracing_file() should be paired with put_tracing_file().

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <4AB1F48F.4070807@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/util/trace-event-info.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/tools/perf/util/trace-event-info.c b/tools/perf/util/trace-event-info.c
index 6c9302a7274c..1fd824c1f1c4 100644
--- a/tools/perf/util/trace-event-info.c
+++ b/tools/perf/util/trace-event-info.c
@@ -458,7 +458,7 @@ static void read_proc_kallsyms(void)
 static void read_ftrace_printk(void)
 {
 	unsigned int size, check_size;
-	const char *path;
+	char *path;
 	struct stat st;
 	int ret;
 
@@ -468,14 +468,15 @@ static void read_ftrace_printk(void)
 		/* not found */
 		size = 0;
 		write_or_die(&size, 4);
-		return;
+		goto out;
 	}
 	size = get_size(path);
 	write_or_die(&size, 4);
 	check_size = copy_file(path);
 	if (size != check_size)
 		die("error in size of file '%s'", path);
-
+out:
+	put_tracing_file(path);
 }
 
 static struct tracepoint_path *
-- 
cgit v1.2.3


From 270bbbe80d82fad8b698d0b407eb3ad67cc3492b Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 17 Sep 2009 16:34:51 +0800
Subject: perf tools: Increase MAX_EVENT_LENGTH

The name length of some trace events is longer than 30, like
sys_enter_sched_get_priority_max and
ext4_mb_discard_preallocations.

Passing those events to perf-record will fail, try:

  # ./perf record -f -e syscalls:sys_enter_sched_get_priority_max -F 1 -a

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <4AB1F4AB.7050205@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/util/parse-events.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 034245e46817..910283cbbc8f 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -145,7 +145,7 @@ static int tp_event_has_id(struct dirent *sys_dir, struct dirent *evt_dir)
 	   (strcmp(evt_dirent.d_name, "..")) &&				       \
 	   (!tp_event_has_id(&sys_dirent, &evt_dirent)))
 
-#define MAX_EVENT_LENGTH 30
+#define MAX_EVENT_LENGTH 512
 
 int valid_debugfs_mount(const char *debugfs)
 {
-- 
cgit v1.2.3


From 1281a49b7aa865a1f0d60e2b28410e6234fc686b Mon Sep 17 00:00:00 2001
From: Li Zefan <lizf@cn.fujitsu.com>
Date: Thu, 17 Sep 2009 16:49:43 +0800
Subject: perf trace: Sample timestamp and cpu when using record flag

Sample timestamp and cpu just like the -R option.

Before:
            init-0     [-01] 1266874889.17179869184709551615: irq_handler_entry: irq=18 handler=eth0
            init-0     [-01] 1266874889.17179869184709551615: irq_handler_entry: irq=18 handler=eth0
            init-0     [-01] 1266874889.17179869184709551615: irq_handler_entry: irq=1 handler=i8042
            init-0     [-01] 1266874889.17179869184709551615: irq_handler_entry: irq=18 handler=eth0
            init-0     [-01] 1266874889.17179869184709551615: irq_handler_entry: irq=1 handler=i8042

After:
            init-0     [001]  7364.568965353: irq_handler_entry: irq=18 handler=eth0
            init-0     [001]  7365.530226877: irq_handler_entry: irq=1 handler=i8042
            init-0     [001]  7365.542831563: irq_handler_entry: irq=18 handler=eth0
            init-0     [001]  7365.644156299: irq_handler_entry: irq=18 handler=eth0
            init-0     [001]  7365.694556201: irq_handler_entry: irq=18 handler=eth0

Signed-off-by: Li Zefan <lizf@cn.fujitsu.com>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <4AB1F827.8040905@cn.fujitsu.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/util/parse-events.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 910283cbbc8f..a9bdcab8c070 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -425,8 +425,11 @@ parse_single_tracepoint_event(char *sys_name,
 	int fd;
 
 	if (flags) {
-		if (!strncmp(flags, "record", strlen(flags)))
+		if (!strncmp(flags, "record", strlen(flags))) {
 			attr->sample_type |= PERF_SAMPLE_RAW;
+			attr->sample_type |= PERF_SAMPLE_TIME;
+			attr->sample_type |= PERF_SAMPLE_CPU;
+		}
 	}
 
 	snprintf(evt_path, MAXPATHLEN, "%s/%s/%s/id", debugfs_path,
-- 
cgit v1.2.3


From 4b77a7297795229eca96c41e1709a3c87909fabe Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Fri, 18 Sep 2009 08:22:24 +0200
Subject: perf sched: Add --input=file option to builtin-sched.c

perf sched record passes unparsed args on to perf record, so
specifying an output file via perf sched record -o FILE (cmd) just
works.  Ergo, provide an option to specify input file as well.

Also add the missing 'map' command to help.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1253254944.20589.11.camel@marge.simson.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-sched.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c
index 9e04827d16be..275d79c6627a 100644
--- a/tools/perf/builtin-sched.c
+++ b/tools/perf/builtin-sched.c
@@ -1859,11 +1859,13 @@ static void __cmd_replay(void)
 
 
 static const char * const sched_usage[] = {
-	"perf sched [<options>] {record|latency|replay|trace}",
+	"perf sched [<options>] {record|latency|map|replay|trace}",
 	NULL
 };
 
 static const struct option sched_options[] = {
+	OPT_STRING('i', "input", &input_name, "file",
+		    "input file name"),
 	OPT_BOOLEAN('v', "verbose", &verbose,
 		    "be more verbose (show symbol address, etc)"),
 	OPT_BOOLEAN('D', "dump-raw-trace", &dump_trace,
-- 
cgit v1.2.3


From 5622f295b53fb60dbf9bed3e2c89d182490a8b7f Mon Sep 17 00:00:00 2001
From: Markus Metzger <markus.t.metzger@intel.com>
Date: Tue, 15 Sep 2009 13:00:23 +0200
Subject: x86, perf_counter, bts: Optimize BTS overflow handling

Draining the BTS buffer on a buffer overflow interrupt takes too
long resulting in a kernel lockup when tracing the kernel.

Restructure perf_counter sampling into sample creation and sample
output.

Prepare a single reference sample for BTS sampling and update the
from and to address fields when draining the BTS buffer. Drain the
entire BTS buffer between a single perf_output_begin() /
perf_output_end() pair.

Signed-off-by: Markus Metzger <markus.t.metzger@intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20090915130023.A16204@sedona.ch.intel.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c |  60 ++++---
 include/linux/perf_counter.h       |  68 +++++++-
 kernel/perf_counter.c              | 312 ++++++++++++++++++++-----------------
 3 files changed, 266 insertions(+), 174 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index f9cd0849bd42..6a0e71b38126 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -36,10 +36,10 @@ static u64 perf_counter_mask __read_mostly;
 #define BTS_RECORD_SIZE		24
 
 /* The size of a per-cpu BTS buffer in bytes: */
-#define BTS_BUFFER_SIZE		(BTS_RECORD_SIZE * 1024)
+#define BTS_BUFFER_SIZE		(BTS_RECORD_SIZE * 2048)
 
 /* The BTS overflow threshold in bytes from the end of the buffer: */
-#define BTS_OVFL_TH		(BTS_RECORD_SIZE * 64)
+#define BTS_OVFL_TH		(BTS_RECORD_SIZE * 128)
 
 
 /*
@@ -1488,8 +1488,7 @@ void perf_counter_print_debug(void)
 	local_irq_restore(flags);
 }
 
-static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc,
-				       struct perf_sample_data *data)
+static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc)
 {
 	struct debug_store *ds = cpuc->ds;
 	struct bts_record {
@@ -1498,8 +1497,11 @@ static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc,
 		u64	flags;
 	};
 	struct perf_counter *counter = cpuc->counters[X86_PMC_IDX_FIXED_BTS];
-	unsigned long orig_ip = data->regs->ip;
 	struct bts_record *at, *top;
+	struct perf_output_handle handle;
+	struct perf_event_header header;
+	struct perf_sample_data data;
+	struct pt_regs regs;
 
 	if (!counter)
 		return;
@@ -1510,19 +1512,38 @@ static void intel_pmu_drain_bts_buffer(struct cpu_hw_counters *cpuc,
 	at  = (struct bts_record *)(unsigned long)ds->bts_buffer_base;
 	top = (struct bts_record *)(unsigned long)ds->bts_index;
 
+	if (top <= at)
+		return;
+
 	ds->bts_index = ds->bts_buffer_base;
 
+
+	data.period	= counter->hw.last_period;
+	data.addr	= 0;
+	regs.ip		= 0;
+
+	/*
+	 * Prepare a generic sample, i.e. fill in the invariant fields.
+	 * We will overwrite the from and to address before we output
+	 * the sample.
+	 */
+	perf_prepare_sample(&header, &data, counter, &regs);
+
+	if (perf_output_begin(&handle, counter,
+			      header.size * (top - at), 1, 1))
+		return;
+
 	for (; at < top; at++) {
-		data->regs->ip	= at->from;
-		data->addr	= at->to;
+		data.ip		= at->from;
+		data.addr	= at->to;
 
-		perf_counter_output(counter, 1, data);
+		perf_output_sample(&handle, &header, &data, counter);
 	}
 
-	data->regs->ip	= orig_ip;
-	data->addr	= 0;
+	perf_output_end(&handle);
 
 	/* There's new data available. */
+	counter->hw.interrupts++;
 	counter->pending_kill = POLL_IN;
 }
 
@@ -1552,13 +1573,9 @@ static void x86_pmu_disable(struct perf_counter *counter)
 	x86_perf_counter_update(counter, hwc, idx);
 
 	/* Drain the remaining BTS records. */
-	if (unlikely(idx == X86_PMC_IDX_FIXED_BTS)) {
-		struct perf_sample_data data;
-		struct pt_regs regs;
+	if (unlikely(idx == X86_PMC_IDX_FIXED_BTS))
+		intel_pmu_drain_bts_buffer(cpuc);
 
-		data.regs = &regs;
-		intel_pmu_drain_bts_buffer(cpuc, &data);
-	}
 	cpuc->counters[idx] = NULL;
 	clear_bit(idx, cpuc->used_mask);
 
@@ -1619,7 +1636,6 @@ static int p6_pmu_handle_irq(struct pt_regs *regs)
 	int idx, handled = 0;
 	u64 val;
 
-	data.regs = regs;
 	data.addr = 0;
 
 	cpuc = &__get_cpu_var(cpu_hw_counters);
@@ -1644,7 +1660,7 @@ static int p6_pmu_handle_irq(struct pt_regs *regs)
 		if (!x86_perf_counter_set_period(counter, hwc, idx))
 			continue;
 
-		if (perf_counter_overflow(counter, 1, &data))
+		if (perf_counter_overflow(counter, 1, &data, regs))
 			p6_pmu_disable_counter(hwc, idx);
 	}
 
@@ -1665,13 +1681,12 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
 	int bit, loops;
 	u64 ack, status;
 
-	data.regs = regs;
 	data.addr = 0;
 
 	cpuc = &__get_cpu_var(cpu_hw_counters);
 
 	perf_disable();
-	intel_pmu_drain_bts_buffer(cpuc, &data);
+	intel_pmu_drain_bts_buffer(cpuc);
 	status = intel_pmu_get_status();
 	if (!status) {
 		perf_enable();
@@ -1702,7 +1717,7 @@ again:
 
 		data.period = counter->hw.last_period;
 
-		if (perf_counter_overflow(counter, 1, &data))
+		if (perf_counter_overflow(counter, 1, &data, regs))
 			intel_pmu_disable_counter(&counter->hw, bit);
 	}
 
@@ -1729,7 +1744,6 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
 	int idx, handled = 0;
 	u64 val;
 
-	data.regs = regs;
 	data.addr = 0;
 
 	cpuc = &__get_cpu_var(cpu_hw_counters);
@@ -1754,7 +1768,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
 		if (!x86_perf_counter_set_period(counter, hwc, idx))
 			continue;
 
-		if (perf_counter_overflow(counter, 1, &data))
+		if (perf_counter_overflow(counter, 1, &data, regs))
 			amd_pmu_disable_counter(hwc, idx);
 	}
 
diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 6c1ef72ea501..c7375f97aa19 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -691,6 +691,17 @@ struct perf_cpu_context {
 	int				recursion[4];
 };
 
+struct perf_output_handle {
+	struct perf_counter	*counter;
+	struct perf_mmap_data	*data;
+	unsigned long		head;
+	unsigned long		offset;
+	int			nmi;
+	int			sample;
+	int			locked;
+	unsigned long		flags;
+};
+
 #ifdef CONFIG_PERF_COUNTERS
 
 /*
@@ -722,16 +733,38 @@ extern int hw_perf_group_sched_in(struct perf_counter *group_leader,
 extern void perf_counter_update_userpage(struct perf_counter *counter);
 
 struct perf_sample_data {
-	struct pt_regs			*regs;
+	u64				type;
+
+	u64				ip;
+	struct {
+		u32	pid;
+		u32	tid;
+	}				tid_entry;
+	u64				time;
 	u64				addr;
+	u64				id;
+	u64				stream_id;
+	struct {
+		u32	cpu;
+		u32	reserved;
+	}				cpu_entry;
 	u64				period;
+	struct perf_callchain_entry	*callchain;
 	struct perf_raw_record		*raw;
 };
 
+extern void perf_output_sample(struct perf_output_handle *handle,
+			       struct perf_event_header *header,
+			       struct perf_sample_data *data,
+			       struct perf_counter *counter);
+extern void perf_prepare_sample(struct perf_event_header *header,
+				struct perf_sample_data *data,
+				struct perf_counter *counter,
+				struct pt_regs *regs);
+
 extern int perf_counter_overflow(struct perf_counter *counter, int nmi,
-				 struct perf_sample_data *data);
-extern void perf_counter_output(struct perf_counter *counter, int nmi,
-				struct perf_sample_data *data);
+				 struct perf_sample_data *data,
+				 struct pt_regs *regs);
 
 /*
  * Return 1 for a software counter, 0 for a hardware counter
@@ -781,6 +814,12 @@ extern void perf_tpcounter_event(int event_id, u64 addr, u64 count,
 #define perf_instruction_pointer(regs)	instruction_pointer(regs)
 #endif
 
+extern int perf_output_begin(struct perf_output_handle *handle,
+			     struct perf_counter *counter, unsigned int size,
+			     int nmi, int sample);
+extern void perf_output_end(struct perf_output_handle *handle);
+extern void perf_output_copy(struct perf_output_handle *handle,
+			     const void *buf, unsigned int len);
 #else
 static inline void
 perf_counter_task_sched_in(struct task_struct *task, int cpu)		{ }
@@ -807,7 +846,28 @@ static inline void perf_counter_mmap(struct vm_area_struct *vma)	{ }
 static inline void perf_counter_comm(struct task_struct *tsk)		{ }
 static inline void perf_counter_fork(struct task_struct *tsk)		{ }
 static inline void perf_counter_init(void)				{ }
+
+static inline int
+perf_output_begin(struct perf_output_handle *handle, struct perf_counter *c,
+		  unsigned int size, int nmi, int sample)		{ }
+static inline void perf_output_end(struct perf_output_handle *handle)	{ }
+static inline void
+perf_output_copy(struct perf_output_handle *handle,
+		 const void *buf, unsigned int len)			{ }
+static inline void
+perf_output_sample(struct perf_output_handle *handle,
+		   struct perf_event_header *header,
+		   struct perf_sample_data *data,
+		   struct perf_counter *counter)			{ }
+static inline void
+perf_prepare_sample(struct perf_event_header *header,
+		    struct perf_sample_data *data,
+		    struct perf_counter *counter,
+		    struct pt_regs *regs)				{ }
 #endif
 
+#define perf_output_put(handle, x) \
+	perf_output_copy((handle), &(x), sizeof(x))
+
 #endif /* __KERNEL__ */
 #endif /* _LINUX_PERF_COUNTER_H */
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 29b73b6e8146..215845243a69 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2512,18 +2512,6 @@ __weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs)
 /*
  * Output
  */
-
-struct perf_output_handle {
-	struct perf_counter	*counter;
-	struct perf_mmap_data	*data;
-	unsigned long		head;
-	unsigned long		offset;
-	int			nmi;
-	int			sample;
-	int			locked;
-	unsigned long		flags;
-};
-
 static bool perf_output_space(struct perf_mmap_data *data, unsigned long tail,
 			      unsigned long offset, unsigned long head)
 {
@@ -2633,8 +2621,8 @@ out:
 	local_irq_restore(handle->flags);
 }
 
-static void perf_output_copy(struct perf_output_handle *handle,
-			     const void *buf, unsigned int len)
+void perf_output_copy(struct perf_output_handle *handle,
+		      const void *buf, unsigned int len)
 {
 	unsigned int pages_mask;
 	unsigned int offset;
@@ -2669,12 +2657,9 @@ static void perf_output_copy(struct perf_output_handle *handle,
 	WARN_ON_ONCE(((long)(handle->head - handle->offset)) < 0);
 }
 
-#define perf_output_put(handle, x) \
-	perf_output_copy((handle), &(x), sizeof(x))
-
-static int perf_output_begin(struct perf_output_handle *handle,
-			     struct perf_counter *counter, unsigned int size,
-			     int nmi, int sample)
+int perf_output_begin(struct perf_output_handle *handle,
+		      struct perf_counter *counter, unsigned int size,
+		      int nmi, int sample)
 {
 	struct perf_counter *output_counter;
 	struct perf_mmap_data *data;
@@ -2756,7 +2741,7 @@ out:
 	return -ENOSPC;
 }
 
-static void perf_output_end(struct perf_output_handle *handle)
+void perf_output_end(struct perf_output_handle *handle)
 {
 	struct perf_counter *counter = handle->counter;
 	struct perf_mmap_data *data = handle->data;
@@ -2870,82 +2855,151 @@ static void perf_output_read(struct perf_output_handle *handle,
 		perf_output_read_one(handle, counter);
 }
 
-void perf_counter_output(struct perf_counter *counter, int nmi,
-				struct perf_sample_data *data)
+void perf_output_sample(struct perf_output_handle *handle,
+			struct perf_event_header *header,
+			struct perf_sample_data *data,
+			struct perf_counter *counter)
+{
+	u64 sample_type = data->type;
+
+	perf_output_put(handle, *header);
+
+	if (sample_type & PERF_SAMPLE_IP)
+		perf_output_put(handle, data->ip);
+
+	if (sample_type & PERF_SAMPLE_TID)
+		perf_output_put(handle, data->tid_entry);
+
+	if (sample_type & PERF_SAMPLE_TIME)
+		perf_output_put(handle, data->time);
+
+	if (sample_type & PERF_SAMPLE_ADDR)
+		perf_output_put(handle, data->addr);
+
+	if (sample_type & PERF_SAMPLE_ID)
+		perf_output_put(handle, data->id);
+
+	if (sample_type & PERF_SAMPLE_STREAM_ID)
+		perf_output_put(handle, data->stream_id);
+
+	if (sample_type & PERF_SAMPLE_CPU)
+		perf_output_put(handle, data->cpu_entry);
+
+	if (sample_type & PERF_SAMPLE_PERIOD)
+		perf_output_put(handle, data->period);
+
+	if (sample_type & PERF_SAMPLE_READ)
+		perf_output_read(handle, counter);
+
+	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
+		if (data->callchain) {
+			int size = 1;
+
+			if (data->callchain)
+				size += data->callchain->nr;
+
+			size *= sizeof(u64);
+
+			perf_output_copy(handle, data->callchain, size);
+		} else {
+			u64 nr = 0;
+			perf_output_put(handle, nr);
+		}
+	}
+
+	if (sample_type & PERF_SAMPLE_RAW) {
+		if (data->raw) {
+			perf_output_put(handle, data->raw->size);
+			perf_output_copy(handle, data->raw->data,
+					 data->raw->size);
+		} else {
+			struct {
+				u32	size;
+				u32	data;
+			} raw = {
+				.size = sizeof(u32),
+				.data = 0,
+			};
+			perf_output_put(handle, raw);
+		}
+	}
+}
+
+void perf_prepare_sample(struct perf_event_header *header,
+			 struct perf_sample_data *data,
+			 struct perf_counter *counter,
+			 struct pt_regs *regs)
 {
-	int ret;
 	u64 sample_type = counter->attr.sample_type;
-	struct perf_output_handle handle;
-	struct perf_event_header header;
-	u64 ip;
-	struct {
-		u32 pid, tid;
-	} tid_entry;
-	struct perf_callchain_entry *callchain = NULL;
-	int callchain_size = 0;
-	u64 time;
-	struct {
-		u32 cpu, reserved;
-	} cpu_entry;
 
-	header.type = PERF_EVENT_SAMPLE;
-	header.size = sizeof(header);
+	data->type = sample_type;
 
-	header.misc = 0;
-	header.misc |= perf_misc_flags(data->regs);
+	header->type = PERF_EVENT_SAMPLE;
+	header->size = sizeof(*header);
+
+	header->misc = 0;
+	header->misc |= perf_misc_flags(regs);
 
 	if (sample_type & PERF_SAMPLE_IP) {
-		ip = perf_instruction_pointer(data->regs);
-		header.size += sizeof(ip);
+		data->ip = perf_instruction_pointer(regs);
+
+		header->size += sizeof(data->ip);
 	}
 
 	if (sample_type & PERF_SAMPLE_TID) {
 		/* namespace issues */
-		tid_entry.pid = perf_counter_pid(counter, current);
-		tid_entry.tid = perf_counter_tid(counter, current);
+		data->tid_entry.pid = perf_counter_pid(counter, current);
+		data->tid_entry.tid = perf_counter_tid(counter, current);
 
-		header.size += sizeof(tid_entry);
+		header->size += sizeof(data->tid_entry);
 	}
 
 	if (sample_type & PERF_SAMPLE_TIME) {
 		/*
 		 * Maybe do better on x86 and provide cpu_clock_nmi()
 		 */
-		time = sched_clock();
+		data->time = sched_clock();
 
-		header.size += sizeof(u64);
+		header->size += sizeof(data->time);
 	}
 
 	if (sample_type & PERF_SAMPLE_ADDR)
-		header.size += sizeof(u64);
+		header->size += sizeof(data->addr);
 
-	if (sample_type & PERF_SAMPLE_ID)
-		header.size += sizeof(u64);
+	if (sample_type & PERF_SAMPLE_ID) {
+		data->id = primary_counter_id(counter);
 
-	if (sample_type & PERF_SAMPLE_STREAM_ID)
-		header.size += sizeof(u64);
+		header->size += sizeof(data->id);
+	}
+
+	if (sample_type & PERF_SAMPLE_STREAM_ID) {
+		data->stream_id = counter->id;
+
+		header->size += sizeof(data->stream_id);
+	}
 
 	if (sample_type & PERF_SAMPLE_CPU) {
-		header.size += sizeof(cpu_entry);
+		data->cpu_entry.cpu		= raw_smp_processor_id();
+		data->cpu_entry.reserved	= 0;
 
-		cpu_entry.cpu = raw_smp_processor_id();
-		cpu_entry.reserved = 0;
+		header->size += sizeof(data->cpu_entry);
 	}
 
 	if (sample_type & PERF_SAMPLE_PERIOD)
-		header.size += sizeof(u64);
+		header->size += sizeof(data->period);
 
 	if (sample_type & PERF_SAMPLE_READ)
-		header.size += perf_counter_read_size(counter);
+		header->size += perf_counter_read_size(counter);
 
 	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
-		callchain = perf_callchain(data->regs);
+		int size = 1;
 
-		if (callchain) {
-			callchain_size = (1 + callchain->nr) * sizeof(u64);
-			header.size += callchain_size;
-		} else
-			header.size += sizeof(u64);
+		data->callchain = perf_callchain(regs);
+
+		if (data->callchain)
+			size += data->callchain->nr;
+
+		header->size += size * sizeof(u64);
 	}
 
 	if (sample_type & PERF_SAMPLE_RAW) {
@@ -2957,69 +3011,23 @@ void perf_counter_output(struct perf_counter *counter, int nmi,
 			size += sizeof(u32);
 
 		WARN_ON_ONCE(size & (sizeof(u64)-1));
-		header.size += size;
-	}
-
-	ret = perf_output_begin(&handle, counter, header.size, nmi, 1);
-	if (ret)
-		return;
-
-	perf_output_put(&handle, header);
-
-	if (sample_type & PERF_SAMPLE_IP)
-		perf_output_put(&handle, ip);
-
-	if (sample_type & PERF_SAMPLE_TID)
-		perf_output_put(&handle, tid_entry);
-
-	if (sample_type & PERF_SAMPLE_TIME)
-		perf_output_put(&handle, time);
-
-	if (sample_type & PERF_SAMPLE_ADDR)
-		perf_output_put(&handle, data->addr);
-
-	if (sample_type & PERF_SAMPLE_ID) {
-		u64 id = primary_counter_id(counter);
-
-		perf_output_put(&handle, id);
+		header->size += size;
 	}
+}
 
-	if (sample_type & PERF_SAMPLE_STREAM_ID)
-		perf_output_put(&handle, counter->id);
-
-	if (sample_type & PERF_SAMPLE_CPU)
-		perf_output_put(&handle, cpu_entry);
-
-	if (sample_type & PERF_SAMPLE_PERIOD)
-		perf_output_put(&handle, data->period);
+static void perf_counter_output(struct perf_counter *counter, int nmi,
+				struct perf_sample_data *data,
+				struct pt_regs *regs)
+{
+	struct perf_output_handle handle;
+	struct perf_event_header header;
 
-	if (sample_type & PERF_SAMPLE_READ)
-		perf_output_read(&handle, counter);
+	perf_prepare_sample(&header, data, counter, regs);
 
-	if (sample_type & PERF_SAMPLE_CALLCHAIN) {
-		if (callchain)
-			perf_output_copy(&handle, callchain, callchain_size);
-		else {
-			u64 nr = 0;
-			perf_output_put(&handle, nr);
-		}
-	}
+	if (perf_output_begin(&handle, counter, header.size, nmi, 1))
+		return;
 
-	if (sample_type & PERF_SAMPLE_RAW) {
-		if (data->raw) {
-			perf_output_put(&handle, data->raw->size);
-			perf_output_copy(&handle, data->raw->data, data->raw->size);
-		} else {
-			struct {
-				u32	size;
-				u32	data;
-			} raw = {
-				.size = sizeof(u32),
-				.data = 0,
-			};
-			perf_output_put(&handle, raw);
-		}
-	}
+	perf_output_sample(&handle, &header, data, counter);
 
 	perf_output_end(&handle);
 }
@@ -3501,7 +3509,8 @@ static void perf_log_throttle(struct perf_counter *counter, int enable)
  */
 
 static int __perf_counter_overflow(struct perf_counter *counter, int nmi,
-				   int throttle, struct perf_sample_data *data)
+				   int throttle, struct perf_sample_data *data,
+				   struct pt_regs *regs)
 {
 	int events = atomic_read(&counter->event_limit);
 	struct hw_perf_counter *hwc = &counter->hw;
@@ -3557,14 +3566,15 @@ static int __perf_counter_overflow(struct perf_counter *counter, int nmi,
 			perf_counter_disable(counter);
 	}
 
-	perf_counter_output(counter, nmi, data);
+	perf_counter_output(counter, nmi, data, regs);
 	return ret;
 }
 
 int perf_counter_overflow(struct perf_counter *counter, int nmi,
-			  struct perf_sample_data *data)
+			  struct perf_sample_data *data,
+			  struct pt_regs *regs)
 {
-	return __perf_counter_overflow(counter, nmi, 1, data);
+	return __perf_counter_overflow(counter, nmi, 1, data, regs);
 }
 
 /*
@@ -3602,7 +3612,8 @@ again:
 }
 
 static void perf_swcounter_overflow(struct perf_counter *counter,
-				    int nmi, struct perf_sample_data *data)
+				    int nmi, struct perf_sample_data *data,
+				    struct pt_regs *regs)
 {
 	struct hw_perf_counter *hwc = &counter->hw;
 	int throttle = 0;
@@ -3615,7 +3626,8 @@ static void perf_swcounter_overflow(struct perf_counter *counter,
 		return;
 
 	for (; overflow; overflow--) {
-		if (__perf_counter_overflow(counter, nmi, throttle, data)) {
+		if (__perf_counter_overflow(counter, nmi, throttle,
+					    data, regs)) {
 			/*
 			 * We inhibit the overflow from happening when
 			 * hwc->interrupts == MAX_INTERRUPTS.
@@ -3634,7 +3646,8 @@ static void perf_swcounter_unthrottle(struct perf_counter *counter)
 }
 
 static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
-			       int nmi, struct perf_sample_data *data)
+			       int nmi, struct perf_sample_data *data,
+			       struct pt_regs *regs)
 {
 	struct hw_perf_counter *hwc = &counter->hw;
 
@@ -3643,11 +3656,11 @@ static void perf_swcounter_add(struct perf_counter *counter, u64 nr,
 	if (!hwc->sample_period)
 		return;
 
-	if (!data->regs)
+	if (!regs)
 		return;
 
 	if (!atomic64_add_negative(nr, &hwc->period_left))
-		perf_swcounter_overflow(counter, nmi, data);
+		perf_swcounter_overflow(counter, nmi, data, regs);
 }
 
 static int perf_swcounter_is_counting(struct perf_counter *counter)
@@ -3706,7 +3719,8 @@ static int perf_swcounter_match(struct perf_counter *counter,
 static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
 				     enum perf_type_id type,
 				     u32 event, u64 nr, int nmi,
-				     struct perf_sample_data *data)
+				     struct perf_sample_data *data,
+				     struct pt_regs *regs)
 {
 	struct perf_counter *counter;
 
@@ -3715,8 +3729,8 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx,
 
 	rcu_read_lock();
 	list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) {
-		if (perf_swcounter_match(counter, type, event, data->regs))
-			perf_swcounter_add(counter, nr, nmi, data);
+		if (perf_swcounter_match(counter, type, event, regs))
+			perf_swcounter_add(counter, nr, nmi, data, regs);
 	}
 	rcu_read_unlock();
 }
@@ -3737,7 +3751,8 @@ static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx)
 
 static void do_perf_swcounter_event(enum perf_type_id type, u32 event,
 				    u64 nr, int nmi,
-				    struct perf_sample_data *data)
+				    struct perf_sample_data *data,
+				    struct pt_regs *regs)
 {
 	struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context);
 	int *recursion = perf_swcounter_recursion_context(cpuctx);
@@ -3750,7 +3765,7 @@ static void do_perf_swcounter_event(enum perf_type_id type, u32 event,
 	barrier();
 
 	perf_swcounter_ctx_event(&cpuctx->ctx, type, event,
-				 nr, nmi, data);
+				 nr, nmi, data, regs);
 	rcu_read_lock();
 	/*
 	 * doesn't really matter which of the child contexts the
@@ -3758,7 +3773,7 @@ static void do_perf_swcounter_event(enum perf_type_id type, u32 event,
 	 */
 	ctx = rcu_dereference(current->perf_counter_ctxp);
 	if (ctx)
-		perf_swcounter_ctx_event(ctx, type, event, nr, nmi, data);
+		perf_swcounter_ctx_event(ctx, type, event, nr, nmi, data, regs);
 	rcu_read_unlock();
 
 	barrier();
@@ -3772,11 +3787,11 @@ void __perf_swcounter_event(u32 event, u64 nr, int nmi,
 			    struct pt_regs *regs, u64 addr)
 {
 	struct perf_sample_data data = {
-		.regs = regs,
 		.addr = addr,
 	};
 
-	do_perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, &data);
+	do_perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi,
+				&data, regs);
 }
 
 static void perf_swcounter_read(struct perf_counter *counter)
@@ -3813,6 +3828,7 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
 {
 	enum hrtimer_restart ret = HRTIMER_RESTART;
 	struct perf_sample_data data;
+	struct pt_regs *regs;
 	struct perf_counter *counter;
 	u64 period;
 
@@ -3820,17 +3836,17 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer)
 	counter->pmu->read(counter);
 
 	data.addr = 0;
-	data.regs = get_irq_regs();
+	regs = get_irq_regs();
 	/*
 	 * In case we exclude kernel IPs or are somehow not in interrupt
 	 * context, provide the next best thing, the user IP.
 	 */
-	if ((counter->attr.exclude_kernel || !data.regs) &&
+	if ((counter->attr.exclude_kernel || !regs) &&
 			!counter->attr.exclude_user)
-		data.regs = task_pt_regs(current);
+		regs = task_pt_regs(current);
 
-	if (data.regs) {
-		if (perf_counter_overflow(counter, 0, &data))
+	if (regs) {
+		if (perf_counter_overflow(counter, 0, &data, regs))
 			ret = HRTIMER_NORESTART;
 	}
 
@@ -3966,15 +3982,17 @@ void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record,
 	};
 
 	struct perf_sample_data data = {
-		.regs = get_irq_regs(),
 		.addr = addr,
 		.raw = &raw,
 	};
 
-	if (!data.regs)
-		data.regs = task_pt_regs(current);
+	struct pt_regs *regs = get_irq_regs();
+
+	if (!regs)
+		regs = task_pt_regs(current);
 
-	do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1, &data);
+	do_perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, count, 1,
+				&data, regs);
 }
 EXPORT_SYMBOL_GPL(perf_tpcounter_event);
 
-- 
cgit v1.2.3


From cf450a7355a116af793998c118a6bcf7f5a8367e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 18 Sep 2009 12:18:14 +0200
Subject: perf_counter: Fix up swcounter throttling

/me dons the brown paper bag.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 215845243a69..6944bd55ec4e 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -3634,7 +3634,7 @@ static void perf_swcounter_overflow(struct perf_counter *counter,
 			 */
 			break;
 		}
-		throttle = 0;
+		throttle = 1;
 	}
 }
 
-- 
cgit v1.2.3


From def0a9b2573e00ab0b486cb5382625203ab4c4a6 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 18 Sep 2009 20:14:01 +0200
Subject: sched_clock: Make it NMI safe

Arjan complained about the suckyness of TSC on modern machines, and
asked if we could do something about that for PERF_SAMPLE_TIME.

Make cpu_clock() NMI safe by removing the spinlock and using
cmpxchg. This also makes it smaller and more robust.

Affects architectures that use HAVE_UNSTABLE_SCHED_CLOCK, i.e. IA64
and x86.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c |   9 ++--
 kernel/sched_clock.c  | 122 ++++++++++++++++++++++----------------------------
 2 files changed, 56 insertions(+), 75 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 6944bd55ec4e..06d233a06da5 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2955,10 +2955,7 @@ void perf_prepare_sample(struct perf_event_header *header,
 	}
 
 	if (sample_type & PERF_SAMPLE_TIME) {
-		/*
-		 * Maybe do better on x86 and provide cpu_clock_nmi()
-		 */
-		data->time = sched_clock();
+		data->time = perf_clock();
 
 		header->size += sizeof(data->time);
 	}
@@ -3488,7 +3485,7 @@ static void perf_log_throttle(struct perf_counter *counter, int enable)
 			.misc = 0,
 			.size = sizeof(throttle_event),
 		},
-		.time		= sched_clock(),
+		.time		= perf_clock(),
 		.id		= primary_counter_id(counter),
 		.stream_id	= counter->id,
 	};
@@ -3540,7 +3537,7 @@ static int __perf_counter_overflow(struct perf_counter *counter, int nmi,
 	}
 
 	if (counter->attr.freq) {
-		u64 now = sched_clock();
+		u64 now = perf_clock();
 		s64 delta = now - hwc->freq_stamp;
 
 		hwc->freq_stamp = now;
diff --git a/kernel/sched_clock.c b/kernel/sched_clock.c
index e1d16c9a7680..ac2e1dc708bd 100644
--- a/kernel/sched_clock.c
+++ b/kernel/sched_clock.c
@@ -48,13 +48,6 @@ static __read_mostly int sched_clock_running;
 __read_mostly int sched_clock_stable;
 
 struct sched_clock_data {
-	/*
-	 * Raw spinlock - this is a special case: this might be called
-	 * from within instrumentation code so we dont want to do any
-	 * instrumentation ourselves.
-	 */
-	raw_spinlock_t		lock;
-
 	u64			tick_raw;
 	u64			tick_gtod;
 	u64			clock;
@@ -80,7 +73,6 @@ void sched_clock_init(void)
 	for_each_possible_cpu(cpu) {
 		struct sched_clock_data *scd = cpu_sdc(cpu);
 
-		scd->lock = (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED;
 		scd->tick_raw = 0;
 		scd->tick_gtod = ktime_now;
 		scd->clock = ktime_now;
@@ -109,14 +101,19 @@ static inline u64 wrap_max(u64 x, u64 y)
  *  - filter out backward motion
  *  - use the GTOD tick value to create a window to filter crazy TSC values
  */
-static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now)
+static u64 sched_clock_local(struct sched_clock_data *scd)
 {
-	s64 delta = now - scd->tick_raw;
-	u64 clock, min_clock, max_clock;
+	u64 now, clock, old_clock, min_clock, max_clock;
+	s64 delta;
 
+again:
+	now = sched_clock();
+	delta = now - scd->tick_raw;
 	if (unlikely(delta < 0))
 		delta = 0;
 
+	old_clock = scd->clock;
+
 	/*
 	 * scd->clock = clamp(scd->tick_gtod + delta,
 	 *		      max(scd->tick_gtod, scd->clock),
@@ -124,84 +121,73 @@ static u64 __update_sched_clock(struct sched_clock_data *scd, u64 now)
 	 */
 
 	clock = scd->tick_gtod + delta;
-	min_clock = wrap_max(scd->tick_gtod, scd->clock);
-	max_clock = wrap_max(scd->clock, scd->tick_gtod + TICK_NSEC);
+	min_clock = wrap_max(scd->tick_gtod, old_clock);
+	max_clock = wrap_max(old_clock, scd->tick_gtod + TICK_NSEC);
 
 	clock = wrap_max(clock, min_clock);
 	clock = wrap_min(clock, max_clock);
 
-	scd->clock = clock;
+	if (cmpxchg(&scd->clock, old_clock, clock) != old_clock)
+		goto again;
 
-	return scd->clock;
+	return clock;
 }
 
-static void lock_double_clock(struct sched_clock_data *data1,
-				struct sched_clock_data *data2)
+static u64 sched_clock_remote(struct sched_clock_data *scd)
 {
-	if (data1 < data2) {
-		__raw_spin_lock(&data1->lock);
-		__raw_spin_lock(&data2->lock);
+	struct sched_clock_data *my_scd = this_scd();
+	u64 this_clock, remote_clock;
+	u64 *ptr, old_val, val;
+
+	sched_clock_local(my_scd);
+again:
+	this_clock = my_scd->clock;
+	remote_clock = scd->clock;
+
+	/*
+	 * Use the opportunity that we have both locks
+	 * taken to couple the two clocks: we take the
+	 * larger time as the latest time for both
+	 * runqueues. (this creates monotonic movement)
+	 */
+	if (likely((s64)(remote_clock - this_clock) < 0)) {
+		ptr = &scd->clock;
+		old_val = remote_clock;
+		val = this_clock;
 	} else {
-		__raw_spin_lock(&data2->lock);
-		__raw_spin_lock(&data1->lock);
+		/*
+		 * Should be rare, but possible:
+		 */
+		ptr = &my_scd->clock;
+		old_val = this_clock;
+		val = remote_clock;
 	}
+
+	if (cmpxchg(ptr, old_val, val) != old_val)
+		goto again;
+
+	return val;
 }
 
 u64 sched_clock_cpu(int cpu)
 {
-	u64 now, clock, this_clock, remote_clock;
 	struct sched_clock_data *scd;
+	u64 clock;
+
+	WARN_ON_ONCE(!irqs_disabled());
 
 	if (sched_clock_stable)
 		return sched_clock();
 
-	scd = cpu_sdc(cpu);
-
-	/*
-	 * Normally this is not called in NMI context - but if it is,
-	 * trying to do any locking here is totally lethal.
-	 */
-	if (unlikely(in_nmi()))
-		return scd->clock;
-
 	if (unlikely(!sched_clock_running))
 		return 0ull;
 
-	WARN_ON_ONCE(!irqs_disabled());
-	now = sched_clock();
-
-	if (cpu != raw_smp_processor_id()) {
-		struct sched_clock_data *my_scd = this_scd();
-
-		lock_double_clock(scd, my_scd);
-
-		this_clock = __update_sched_clock(my_scd, now);
-		remote_clock = scd->clock;
-
-		/*
-		 * Use the opportunity that we have both locks
-		 * taken to couple the two clocks: we take the
-		 * larger time as the latest time for both
-		 * runqueues. (this creates monotonic movement)
-		 */
-		if (likely((s64)(remote_clock - this_clock) < 0)) {
-			clock = this_clock;
-			scd->clock = clock;
-		} else {
-			/*
-			 * Should be rare, but possible:
-			 */
-			clock = remote_clock;
-			my_scd->clock = remote_clock;
-		}
-
-		__raw_spin_unlock(&my_scd->lock);
-	} else {
-		__raw_spin_lock(&scd->lock);
-		clock = __update_sched_clock(scd, now);
-	}
+	scd = cpu_sdc(cpu);
 
-	__raw_spin_unlock(&scd->lock);
+	if (cpu != smp_processor_id())
+		clock = sched_clock_remote(scd);
+	else
+		clock = sched_clock_local(scd);
 
 	return clock;
 }
@@ -223,11 +209,9 @@ void sched_clock_tick(void)
 	now_gtod = ktime_to_ns(ktime_get());
 	now = sched_clock();
 
-	__raw_spin_lock(&scd->lock);
 	scd->tick_raw = now;
 	scd->tick_gtod = now_gtod;
-	__update_sched_clock(scd, now);
-	__raw_spin_unlock(&scd->lock);
+	sched_clock_local(scd);
 }
 
 /*
-- 
cgit v1.2.3


From 393b2ad8c757ba3ccd2a99ca5bbcd6db4d3fa53d Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sat, 12 Sep 2009 07:52:47 +0200
Subject: perf: Add a timestamp to fork events

perf timechart needs to know when a process forked, in order to be
able to visualize properly when tasks start.

This patch adds a time field to the event structure, and fills it
in appropriately.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20090912130341.51ad2de2@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/perf_counter.h |  2 ++
 kernel/perf_counter.c        | 11 +++++++++--
 tools/perf/util/event.h      |  1 +
 3 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index c7375f97aa19..bd341007c4fc 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -336,6 +336,7 @@ enum perf_event_type {
 	 *	struct perf_event_header	header;
 	 *	u32				pid, ppid;
 	 *	u32				tid, ptid;
+	 *	u64				time;
 	 * };
 	 */
 	PERF_EVENT_EXIT			= 4,
@@ -356,6 +357,7 @@ enum perf_event_type {
 	 *	struct perf_event_header	header;
 	 *	u32				pid, ppid;
 	 *	u32				tid, ptid;
+	 *	{ u64				time;     } && PERF_SAMPLE_TIME
 	 * };
 	 */
 	PERF_EVENT_FORK			= 7,
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index d013f4e89e9c..d5899b62b276 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -3083,6 +3083,7 @@ struct perf_task_event {
 		u32				ppid;
 		u32				tid;
 		u32				ptid;
+		u64				time;
 	} event;
 };
 
@@ -3090,9 +3091,12 @@ static void perf_counter_task_output(struct perf_counter *counter,
 				     struct perf_task_event *task_event)
 {
 	struct perf_output_handle handle;
-	int size = task_event->event.header.size;
+	int size;
 	struct task_struct *task = task_event->task;
-	int ret = perf_output_begin(&handle, counter, size, 0, 0);
+	int ret;
+
+	size  = task_event->event.header.size;
+	ret = perf_output_begin(&handle, counter, size, 0, 0);
 
 	if (ret)
 		return;
@@ -3103,7 +3107,10 @@ static void perf_counter_task_output(struct perf_counter *counter,
 	task_event->event.tid = perf_counter_tid(counter, task);
 	task_event->event.ptid = perf_counter_tid(counter, current);
 
+	task_event->event.time = perf_clock();
+
 	perf_output_put(&handle, task_event->event);
+
 	perf_output_end(&handle);
 }
 
diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index 2495529cae7d..28a579f8fa8e 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -39,6 +39,7 @@ struct fork_event {
 	struct perf_event_header header;
 	u32 pid, ppid;
 	u32 tid, ptid;
+	u64 time;
 };
 
 struct lost_event {
-- 
cgit v1.2.3


From 8755a8f27ae590dde225f9005ee18c6d9c4c6d78 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sat, 12 Sep 2009 07:52:51 +0200
Subject: perf: Store trace event name/id pairs in perf.data

The trace event name<->id mapping is dynamic for each kernel
compile. In order for perf.data to be useable outside the actual
system, we thus need to store a table of this mapping for later
use.

This patch adds this table to perf.data, and provides helper
functions for lookup up fields from this table.

To avoid mistakes, lookup-from-table is kept completely seprate
from lookup-from-local-debugfs.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20090912130405.6960d099@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/util/header.c       | 59 ++++++++++++++++++++++++++++++++++++++++++
 tools/perf/util/header.h       |  6 +++++
 tools/perf/util/parse-events.c | 25 ++++++++++++++++++
 3 files changed, 90 insertions(+)

diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index ec4d4c2f9522..ef9114583994 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -86,6 +86,44 @@ void perf_header__add_attr(struct perf_header *self,
 	self->attr[pos] = attr;
 }
 
+struct perf_trace_event_type {
+	u64	event_id;
+	char	name[64];
+};
+
+static int event_count;
+static struct perf_trace_event_type *events;
+
+void perf_header__push_event(u64 id, const char *name)
+{
+	if (strlen(name) > 64)
+		printf("Event %s will be truncated\n", name);
+
+	if (!events) {
+		events = malloc(sizeof(struct perf_trace_event_type));
+		if (!events)
+			die("nomem");
+	} else {
+		events = realloc(events, (event_count + 1) * sizeof(struct perf_trace_event_type));
+		if (!events)
+			die("nomem");
+	}
+	memset(&events[event_count], 0, sizeof(struct perf_trace_event_type));
+	events[event_count].event_id = id;
+	strncpy(events[event_count].name, name, 63);
+	event_count++;
+}
+
+char *perf_header__find_event(u64 id)
+{
+	int i;
+	for (i = 0 ; i < event_count; i++) {
+		if (events[i].event_id == id)
+			return events[i].name;
+	}
+	return NULL;
+}
+
 static const char *__perf_magic = "PERFFILE";
 
 #define PERF_MAGIC	(*(u64 *)__perf_magic)
@@ -106,6 +144,7 @@ struct perf_file_header {
 	u64				attr_size;
 	struct perf_file_section	attrs;
 	struct perf_file_section	data;
+	struct perf_file_section	event_types;
 };
 
 static void do_write(int fd, void *buf, size_t size)
@@ -154,6 +193,11 @@ void perf_header__write(struct perf_header *self, int fd)
 		do_write(fd, &f_attr, sizeof(f_attr));
 	}
 
+	self->event_offset = lseek(fd, 0, SEEK_CUR);
+	self->event_size = event_count * sizeof(struct perf_trace_event_type);
+	if (events)
+		do_write(fd, events, self->event_size);
+
 
 	self->data_offset = lseek(fd, 0, SEEK_CUR);
 
@@ -169,6 +213,10 @@ void perf_header__write(struct perf_header *self, int fd)
 			.offset = self->data_offset,
 			.size	= self->data_size,
 		},
+		.event_types = {
+			.offset = self->event_offset,
+			.size	= self->event_size,
+		},
 	};
 
 	lseek(fd, 0, SEEK_SET);
@@ -234,6 +282,17 @@ struct perf_header *perf_header__read(int fd)
 		lseek(fd, tmp, SEEK_SET);
 	}
 
+	if (f_header.event_types.size) {
+		lseek(fd, f_header.event_types.offset, SEEK_SET);
+		events = malloc(f_header.event_types.size);
+		if (!events)
+			die("nomem");
+		do_read(fd, events, f_header.event_types.size);
+		event_count =  f_header.event_types.size / sizeof(struct perf_trace_event_type);
+	}
+	self->event_offset = f_header.event_types.offset;
+	self->event_size   = f_header.event_types.size;
+
 	self->data_offset = f_header.data.offset;
 	self->data_size   = f_header.data.size;
 
diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h
index 5d0a72ecc919..7b0e84a87179 100644
--- a/tools/perf/util/header.h
+++ b/tools/perf/util/header.h
@@ -19,6 +19,8 @@ struct perf_header {
 	s64 attr_offset;
 	u64 data_offset;
 	u64 data_size;
+	u64 event_offset;
+	u64 event_size;
 };
 
 struct perf_header *perf_header__read(int fd);
@@ -27,6 +29,10 @@ void perf_header__write(struct perf_header *self, int fd);
 void perf_header__add_attr(struct perf_header *self,
 			   struct perf_header_attr *attr);
 
+void perf_header__push_event(u64 id, const char *name);
+char *perf_header__find_event(u64 id);
+
+
 struct perf_header_attr *
 perf_header_attr__new(struct perf_counter_attr *attr);
 void perf_header_attr__add_id(struct perf_header_attr *self, u64 id);
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index a9bdcab8c070..89172fd0038b 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -6,6 +6,7 @@
 #include "exec_cmd.h"
 #include "string.h"
 #include "cache.h"
+#include "header.h"
 
 int					nr_counters;
 
@@ -687,11 +688,35 @@ modifier:
 	return ret;
 }
 
+static void store_event_type(const char *orgname)
+{
+	char filename[PATH_MAX], *c;
+	FILE *file;
+	int id;
+
+	sprintf(filename, "/sys/kernel/debug/tracing/events/%s/id", orgname);
+	c = strchr(filename, ':');
+	if (c)
+		*c = '/';
+
+	file = fopen(filename, "r");
+	if (!file)
+		return;
+	if (fscanf(file, "%i", &id) < 1)
+		die("cannot store event ID");
+	fclose(file);
+	perf_header__push_event(id, orgname);
+}
+
+
 int parse_events(const struct option *opt __used, const char *str, int unset __used)
 {
 	struct perf_counter_attr attr;
 	enum event_result ret;
 
+	if (strchr(str, ':'))
+		store_event_type(str);
+
 	for (;;) {
 		if (nr_counters == MAX_COUNTERS)
 			return -1;
-- 
cgit v1.2.3


From a79d7db9fdb0037925ffa31f6ff9b189ff75d235 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sat, 12 Sep 2009 07:52:54 +0200
Subject: perf: Allow perf utilities to have "callback" options without
 arguments

timechart needs to add a "callback" type command line argument that
does not take arguments.

This patch adds the parse-options.h infrastructure to make this
possible.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20090912130440.548666c1@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/util/parse-options.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tools/perf/util/parse-options.h b/tools/perf/util/parse-options.h
index 8aa3464c7090..2ee248ff27e5 100644
--- a/tools/perf/util/parse-options.h
+++ b/tools/perf/util/parse-options.h
@@ -104,6 +104,8 @@ struct option {
 	{ .type = OPTION_CALLBACK, .short_name = (s), .long_name = (l), .value = (v), .argh = "time", .help = (h), .callback = parse_opt_approxidate_cb }
 #define OPT_CALLBACK(s, l, v, a, h, f) \
 	{ .type = OPTION_CALLBACK, .short_name = (s), .long_name = (l), .value = (v), (a), .help = (h), .callback = (f) }
+#define OPT_CALLBACK_NOOPT(s, l, v, a, h, f) \
+	{ .type = OPTION_CALLBACK, .short_name = (s), .long_name = (l), .value = (v), (a), .help = (h), .callback = (f), .flags = PARSE_OPT_NOARG }
 #define OPT_CALLBACK_DEFAULT(s, l, v, a, h, f, d) \
 	{ .type = OPTION_CALLBACK, .short_name = (s), .long_name = (l), .value = (v), (a), .help = (h), .callback = (f), .defval = (intptr_t)d, .flags = PARSE_OPT_LASTARG_DEFAULT }
 
-- 
cgit v1.2.3


From fd39e055c4fefa4553efc9030f9903afdc6ee323 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sat, 12 Sep 2009 07:53:00 +0200
Subject: perf: Add a sample_event type to the event_union

Add a sample_event type to the event_union so that raw samples can
be processed easily.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20090912130511.411434b5@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/util/event.h | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h
index 28a579f8fa8e..018d414a09d1 100644
--- a/tools/perf/util/event.h
+++ b/tools/perf/util/event.h
@@ -60,6 +60,12 @@ struct read_event {
 	u64 id;
 };
 
+struct sample_event{
+	struct perf_event_header        header;
+	u64 array[];
+};
+
+
 typedef union event_union {
 	struct perf_event_header	header;
 	struct ip_event			ip;
@@ -68,6 +74,7 @@ typedef union event_union {
 	struct fork_event		fork;
 	struct lost_event		lost;
 	struct read_event		read;
+	struct sample_event		sample;
 } event_t;
 
 struct map {
-- 
cgit v1.2.3


From 6161352142d5fed4cd753b32e5ccde66e705b14e Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Thu, 17 Sep 2009 16:11:28 +0200
Subject: tracing, perf: Convert the power tracer into an event tracer

This patch converts the existing power tracer into an event tracer,
so that power events (C states and frequency changes) can be
tracked via "perf".

This also removes the perl script that was used to demo the tracer;
its functionality is being replaced entirely with timechart.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20090912130542.6d314860@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 Documentation/trace/power.txt              |  17 ---
 arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c |   7 +-
 arch/x86/kernel/process.c                  |  28 ++--
 include/trace/events/power.h               |  81 +++++++++++
 kernel/trace/Makefile                      |   2 +-
 kernel/trace/power-traces.c                |  20 +++
 kernel/trace/trace.h                       |   3 -
 kernel/trace/trace_entries.h               |  17 ---
 kernel/trace/trace_power.c                 | 218 -----------------------------
 scripts/tracing/power.pl                   | 108 --------------
 10 files changed, 113 insertions(+), 388 deletions(-)
 delete mode 100644 Documentation/trace/power.txt
 create mode 100644 include/trace/events/power.h
 create mode 100644 kernel/trace/power-traces.c
 delete mode 100644 kernel/trace/trace_power.c
 delete mode 100644 scripts/tracing/power.pl

diff --git a/Documentation/trace/power.txt b/Documentation/trace/power.txt
deleted file mode 100644
index cd805e16dc27..000000000000
--- a/Documentation/trace/power.txt
+++ /dev/null
@@ -1,17 +0,0 @@
-The power tracer collects detailed information about C-state and P-state
-transitions, instead of just looking at the high-level "average"
-information.
-
-There is a helper script found in scrips/tracing/power.pl in the kernel
-sources which can be used to parse this information and create a
-Scalable Vector Graphics (SVG) picture from the trace data.
-
-To use this tracer:
-
-	echo 0 > /sys/kernel/debug/tracing/tracing_enabled
-	echo power > /sys/kernel/debug/tracing/current_tracer
-	echo 1 > /sys/kernel/debug/tracing/tracing_enabled
-	sleep 1
-	echo 0 > /sys/kernel/debug/tracing/tracing_enabled
-	cat /sys/kernel/debug/tracing/trace | \
-		perl scripts/tracing/power.pl > out.sv
diff --git a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
index 4109679863c1..479cc8c418c1 100644
--- a/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
+++ b/arch/x86/kernel/cpu/cpufreq/acpi-cpufreq.c
@@ -33,7 +33,7 @@
 #include <linux/cpufreq.h>
 #include <linux/compiler.h>
 #include <linux/dmi.h>
-#include <trace/power.h>
+#include <trace/events/power.h>
 
 #include <linux/acpi.h>
 #include <linux/io.h>
@@ -72,8 +72,6 @@ static DEFINE_PER_CPU(struct acpi_cpufreq_data *, drv_data);
 
 static DEFINE_PER_CPU(struct aperfmperf, old_perf);
 
-DEFINE_TRACE(power_mark);
-
 /* acpi_perf_data is a pointer to percpu data. */
 static struct acpi_processor_performance *acpi_perf_data;
 
@@ -332,7 +330,6 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
 	unsigned int next_perf_state = 0; /* Index into perf table */
 	unsigned int i;
 	int result = 0;
-	struct power_trace it;
 
 	dprintk("acpi_cpufreq_target %d (%d)\n", target_freq, policy->cpu);
 
@@ -364,7 +361,7 @@ static int acpi_cpufreq_target(struct cpufreq_policy *policy,
 		}
 	}
 
-	trace_power_mark(&it, POWER_PSTATE, next_perf_state);
+	trace_power_frequency(POWER_PSTATE, data->freq_table[next_state].frequency);
 
 	switch (data->cpu_feature) {
 	case SYSTEM_INTEL_MSR_CAPABLE:
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 071166a4ba83..7b60e3906889 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -9,7 +9,7 @@
 #include <linux/pm.h>
 #include <linux/clockchips.h>
 #include <linux/random.h>
-#include <trace/power.h>
+#include <trace/events/power.h>
 #include <asm/system.h>
 #include <asm/apic.h>
 #include <asm/syscalls.h>
@@ -25,9 +25,6 @@ EXPORT_SYMBOL(idle_nomwait);
 
 struct kmem_cache *task_xstate_cachep;
 
-DEFINE_TRACE(power_start);
-DEFINE_TRACE(power_end);
-
 int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 {
 	*dst = *src;
@@ -299,9 +296,7 @@ static inline int hlt_use_halt(void)
 void default_idle(void)
 {
 	if (hlt_use_halt()) {
-		struct power_trace it;
-
-		trace_power_start(&it, POWER_CSTATE, 1);
+		trace_power_start(POWER_CSTATE, 1);
 		current_thread_info()->status &= ~TS_POLLING;
 		/*
 		 * TS_POLLING-cleared state must be visible before we
@@ -314,7 +309,7 @@ void default_idle(void)
 		else
 			local_irq_enable();
 		current_thread_info()->status |= TS_POLLING;
-		trace_power_end(&it);
+		trace_power_end(0);
 	} else {
 		local_irq_enable();
 		/* loop is done by the caller */
@@ -372,9 +367,7 @@ EXPORT_SYMBOL_GPL(cpu_idle_wait);
  */
 void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
 {
-	struct power_trace it;
-
-	trace_power_start(&it, POWER_CSTATE, (ax>>4)+1);
+	trace_power_start(POWER_CSTATE, (ax>>4)+1);
 	if (!need_resched()) {
 		if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
 			clflush((void *)&current_thread_info()->flags);
@@ -384,15 +377,14 @@ void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
 		if (!need_resched())
 			__mwait(ax, cx);
 	}
-	trace_power_end(&it);
+	trace_power_end(0);
 }
 
 /* Default MONITOR/MWAIT with no hints, used for default C1 state */
 static void mwait_idle(void)
 {
-	struct power_trace it;
 	if (!need_resched()) {
-		trace_power_start(&it, POWER_CSTATE, 1);
+		trace_power_start(POWER_CSTATE, 1);
 		if (cpu_has(&current_cpu_data, X86_FEATURE_CLFLUSH_MONITOR))
 			clflush((void *)&current_thread_info()->flags);
 
@@ -402,7 +394,7 @@ static void mwait_idle(void)
 			__sti_mwait(0, 0);
 		else
 			local_irq_enable();
-		trace_power_end(&it);
+		trace_power_end(0);
 	} else
 		local_irq_enable();
 }
@@ -414,13 +406,11 @@ static void mwait_idle(void)
  */
 static void poll_idle(void)
 {
-	struct power_trace it;
-
-	trace_power_start(&it, POWER_CSTATE, 0);
+	trace_power_start(POWER_CSTATE, 0);
 	local_irq_enable();
 	while (!need_resched())
 		cpu_relax();
-	trace_power_end(&it);
+	trace_power_end(0);
 }
 
 /*
diff --git a/include/trace/events/power.h b/include/trace/events/power.h
new file mode 100644
index 000000000000..ea6d579261ad
--- /dev/null
+++ b/include/trace/events/power.h
@@ -0,0 +1,81 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM power
+
+#if !defined(_TRACE_POWER_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_POWER_H
+
+#include <linux/ktime.h>
+#include <linux/tracepoint.h>
+
+#ifndef _TRACE_POWER_ENUM_
+#define _TRACE_POWER_ENUM_
+enum {
+	POWER_NONE = 0,
+	POWER_CSTATE = 1,
+	POWER_PSTATE = 2,
+};
+#endif
+
+
+
+TRACE_EVENT(power_start,
+
+	TP_PROTO(unsigned int type, unsigned int state),
+
+	TP_ARGS(type, state),
+
+	TP_STRUCT__entry(
+		__field(	u64,		type		)
+		__field(	u64,		state		)
+	),
+
+	TP_fast_assign(
+		__entry->type = type;
+		__entry->state = state;
+	),
+
+	TP_printk("type=%lu state=%lu", (unsigned long)__entry->type, (unsigned long)__entry->state)
+);
+
+TRACE_EVENT(power_end,
+
+	TP_PROTO(int dummy),
+
+	TP_ARGS(dummy),
+
+	TP_STRUCT__entry(
+		__field(	u64,		dummy		)
+	),
+
+	TP_fast_assign(
+		__entry->dummy = 0xffff;
+	),
+
+	TP_printk("dummy=%lu", (unsigned long)__entry->dummy)
+
+);
+
+
+TRACE_EVENT(power_frequency,
+
+	TP_PROTO(unsigned int type, unsigned int state),
+
+	TP_ARGS(type, state),
+
+	TP_STRUCT__entry(
+		__field(	u64,		type		)
+		__field(	u64,		state		)
+	),
+
+	TP_fast_assign(
+		__entry->type = type;
+		__entry->state = state;
+	),
+
+	TP_printk("type=%lu state=%lu", (unsigned long)__entry->type, (unsigned long) __entry->state)
+);
+
+#endif /* _TRACE_POWER_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 844164dca90a..26f03ac07c2b 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -42,7 +42,6 @@ obj-$(CONFIG_BOOT_TRACER) += trace_boot.o
 obj-$(CONFIG_FUNCTION_GRAPH_TRACER) += trace_functions_graph.o
 obj-$(CONFIG_TRACE_BRANCH_PROFILING) += trace_branch.o
 obj-$(CONFIG_HW_BRANCH_TRACER) += trace_hw_branches.o
-obj-$(CONFIG_POWER_TRACER) += trace_power.o
 obj-$(CONFIG_KMEMTRACE) += kmemtrace.o
 obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o
 obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o
@@ -54,5 +53,6 @@ obj-$(CONFIG_EVENT_TRACING) += trace_export.o
 obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o
 obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o
 obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o
+obj-$(CONFIG_EVENT_TRACING) += power-traces.o
 
 libftrace-y := ftrace.o
diff --git a/kernel/trace/power-traces.c b/kernel/trace/power-traces.c
new file mode 100644
index 000000000000..e06c6e3d56a3
--- /dev/null
+++ b/kernel/trace/power-traces.c
@@ -0,0 +1,20 @@
+/*
+ * Power trace points
+ *
+ * Copyright (C) 2009 Arjan van de Ven <arjan@linux.intel.com>
+ */
+
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/power.h>
+
+EXPORT_TRACEPOINT_SYMBOL_GPL(power_start);
+EXPORT_TRACEPOINT_SYMBOL_GPL(power_end);
+EXPORT_TRACEPOINT_SYMBOL_GPL(power_frequency);
+
diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h
index 86bcff94791a..405cb850b75d 100644
--- a/kernel/trace/trace.h
+++ b/kernel/trace/trace.h
@@ -11,7 +11,6 @@
 #include <linux/ftrace.h>
 #include <trace/boot.h>
 #include <linux/kmemtrace.h>
-#include <trace/power.h>
 
 #include <linux/trace_seq.h>
 #include <linux/ftrace_event.h>
@@ -37,7 +36,6 @@ enum trace_type {
 	TRACE_HW_BRANCHES,
 	TRACE_KMEM_ALLOC,
 	TRACE_KMEM_FREE,
-	TRACE_POWER,
 	TRACE_BLK,
 
 	__TRACE_LAST_TYPE,
@@ -207,7 +205,6 @@ extern void __ftrace_bad_type(void);
 		IF_ASSIGN(var, ent, struct ftrace_graph_ret_entry,	\
 			  TRACE_GRAPH_RET);		\
 		IF_ASSIGN(var, ent, struct hw_branch_entry, TRACE_HW_BRANCHES);\
-		IF_ASSIGN(var, ent, struct trace_power, TRACE_POWER); \
 		IF_ASSIGN(var, ent, struct kmemtrace_alloc_entry,	\
 			  TRACE_KMEM_ALLOC);	\
 		IF_ASSIGN(var, ent, struct kmemtrace_free_entry,	\
diff --git a/kernel/trace/trace_entries.h b/kernel/trace/trace_entries.h
index a431748ddd6e..ead3d724599d 100644
--- a/kernel/trace/trace_entries.h
+++ b/kernel/trace/trace_entries.h
@@ -330,23 +330,6 @@ FTRACE_ENTRY(hw_branch, hw_branch_entry,
 	F_printk("from: %llx to: %llx", __entry->from, __entry->to)
 );
 
-FTRACE_ENTRY(power, trace_power,
-
-	TRACE_POWER,
-
-	F_STRUCT(
-		__field_struct(	struct power_trace,	state_data	)
-		__field_desc(	s64,	state_data,	stamp		)
-		__field_desc(	s64,	state_data,	end		)
-		__field_desc(	int,	state_data,	type		)
-		__field_desc(	int,	state_data,	state		)
-	),
-
-	F_printk("%llx->%llx type:%u state:%u",
-		 __entry->stamp, __entry->end,
-		 __entry->type, __entry->state)
-);
-
 FTRACE_ENTRY(kmem_alloc, kmemtrace_alloc_entry,
 
 	TRACE_KMEM_ALLOC,
diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c
deleted file mode 100644
index fe1a00f1445a..000000000000
--- a/kernel/trace/trace_power.c
+++ /dev/null
@@ -1,218 +0,0 @@
-/*
- * ring buffer based C-state tracer
- *
- * Arjan van de Ven <arjan@linux.intel.com>
- * Copyright (C) 2008 Intel Corporation
- *
- * Much is borrowed from trace_boot.c which is
- * Copyright (C) 2008 Frederic Weisbecker <fweisbec@gmail.com>
- *
- */
-
-#include <linux/init.h>
-#include <linux/debugfs.h>
-#include <trace/power.h>
-#include <linux/kallsyms.h>
-#include <linux/module.h>
-
-#include "trace.h"
-#include "trace_output.h"
-
-static struct trace_array *power_trace;
-static int __read_mostly trace_power_enabled;
-
-static void probe_power_start(struct power_trace *it, unsigned int type,
-				unsigned int level)
-{
-	if (!trace_power_enabled)
-		return;
-
-	memset(it, 0, sizeof(struct power_trace));
-	it->state = level;
-	it->type = type;
-	it->stamp = ktime_get();
-}
-
-
-static void probe_power_end(struct power_trace *it)
-{
-	struct ftrace_event_call *call = &event_power;
-	struct ring_buffer_event *event;
-	struct ring_buffer *buffer;
-	struct trace_power *entry;
-	struct trace_array_cpu *data;
-	struct trace_array *tr = power_trace;
-
-	if (!trace_power_enabled)
-		return;
-
-	buffer = tr->buffer;
-
-	preempt_disable();
-	it->end = ktime_get();
-	data = tr->data[smp_processor_id()];
-
-	event = trace_buffer_lock_reserve(buffer, TRACE_POWER,
-					  sizeof(*entry), 0, 0);
-	if (!event)
-		goto out;
-	entry	= ring_buffer_event_data(event);
-	entry->state_data = *it;
-	if (!filter_check_discard(call, entry, buffer, event))
-		trace_buffer_unlock_commit(buffer, event, 0, 0);
- out:
-	preempt_enable();
-}
-
-static void probe_power_mark(struct power_trace *it, unsigned int type,
-				unsigned int level)
-{
-	struct ftrace_event_call *call = &event_power;
-	struct ring_buffer_event *event;
-	struct ring_buffer *buffer;
-	struct trace_power *entry;
-	struct trace_array_cpu *data;
-	struct trace_array *tr = power_trace;
-
-	if (!trace_power_enabled)
-		return;
-
-	buffer = tr->buffer;
-
-	memset(it, 0, sizeof(struct power_trace));
-	it->state = level;
-	it->type = type;
-	it->stamp = ktime_get();
-	preempt_disable();
-	it->end = it->stamp;
-	data = tr->data[smp_processor_id()];
-
-	event = trace_buffer_lock_reserve(buffer, TRACE_POWER,
-					  sizeof(*entry), 0, 0);
-	if (!event)
-		goto out;
-	entry	= ring_buffer_event_data(event);
-	entry->state_data = *it;
-	if (!filter_check_discard(call, entry, buffer, event))
-		trace_buffer_unlock_commit(buffer, event, 0, 0);
- out:
-	preempt_enable();
-}
-
-static int tracing_power_register(void)
-{
-	int ret;
-
-	ret = register_trace_power_start(probe_power_start);
-	if (ret) {
-		pr_info("power trace: Couldn't activate tracepoint"
-			" probe to trace_power_start\n");
-		return ret;
-	}
-	ret = register_trace_power_end(probe_power_end);
-	if (ret) {
-		pr_info("power trace: Couldn't activate tracepoint"
-			" probe to trace_power_end\n");
-		goto fail_start;
-	}
-	ret = register_trace_power_mark(probe_power_mark);
-	if (ret) {
-		pr_info("power trace: Couldn't activate tracepoint"
-			" probe to trace_power_mark\n");
-		goto fail_end;
-	}
-	return ret;
-fail_end:
-	unregister_trace_power_end(probe_power_end);
-fail_start:
-	unregister_trace_power_start(probe_power_start);
-	return ret;
-}
-
-static void start_power_trace(struct trace_array *tr)
-{
-	trace_power_enabled = 1;
-}
-
-static void stop_power_trace(struct trace_array *tr)
-{
-	trace_power_enabled = 0;
-}
-
-static void power_trace_reset(struct trace_array *tr)
-{
-	trace_power_enabled = 0;
-	unregister_trace_power_start(probe_power_start);
-	unregister_trace_power_end(probe_power_end);
-	unregister_trace_power_mark(probe_power_mark);
-}
-
-
-static int power_trace_init(struct trace_array *tr)
-{
-	power_trace = tr;
-
-	trace_power_enabled = 1;
-	tracing_power_register();
-
-	tracing_reset_online_cpus(tr);
-	return 0;
-}
-
-static enum print_line_t power_print_line(struct trace_iterator *iter)
-{
-	int ret = 0;
-	struct trace_entry *entry = iter->ent;
-	struct trace_power *field ;
-	struct power_trace *it;
-	struct trace_seq *s = &iter->seq;
-	struct timespec stamp;
-	struct timespec duration;
-
-	trace_assign_type(field, entry);
-	it = &field->state_data;
-	stamp = ktime_to_timespec(it->stamp);
-	duration = ktime_to_timespec(ktime_sub(it->end, it->stamp));
-
-	if (entry->type == TRACE_POWER) {
-		if (it->type == POWER_CSTATE)
-			ret = trace_seq_printf(s, "[%5ld.%09ld] CSTATE: Going to C%i on cpu %i for %ld.%09ld\n",
-					  stamp.tv_sec,
-					  stamp.tv_nsec,
-					  it->state, iter->cpu,
-					  duration.tv_sec,
-					  duration.tv_nsec);
-		if (it->type == POWER_PSTATE)
-			ret = trace_seq_printf(s, "[%5ld.%09ld] PSTATE: Going to P%i on cpu %i\n",
-					  stamp.tv_sec,
-					  stamp.tv_nsec,
-					  it->state, iter->cpu);
-		if (!ret)
-			return TRACE_TYPE_PARTIAL_LINE;
-		return TRACE_TYPE_HANDLED;
-	}
-	return TRACE_TYPE_UNHANDLED;
-}
-
-static void power_print_header(struct seq_file *s)
-{
-	seq_puts(s, "#   TIMESTAMP      STATE  EVENT\n");
-	seq_puts(s, "#       |            |      |\n");
-}
-
-static struct tracer power_tracer __read_mostly =
-{
-	.name		= "power",
-	.init		= power_trace_init,
-	.start		= start_power_trace,
-	.stop		= stop_power_trace,
-	.reset		= power_trace_reset,
-	.print_line	= power_print_line,
-	.print_header	= power_print_header,
-};
-
-static int init_power_trace(void)
-{
-	return register_tracer(&power_tracer);
-}
-device_initcall(init_power_trace);
diff --git a/scripts/tracing/power.pl b/scripts/tracing/power.pl
deleted file mode 100644
index 4f729b3501e0..000000000000
--- a/scripts/tracing/power.pl
+++ /dev/null
@@ -1,108 +0,0 @@
-#!/usr/bin/perl
-
-# Copyright 2008, Intel Corporation
-#
-# This file is part of the Linux kernel
-#
-# This program file is free software; you can redistribute it and/or modify it
-# under the terms of the GNU General Public License as published by the
-# Free Software Foundation; version 2 of the License.
-#
-# This program is distributed in the hope that it will be useful, but WITHOUT
-# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
-# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
-# for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program in a file named COPYING; if not, write to the
-# Free Software Foundation, Inc.,
-# 51 Franklin Street, Fifth Floor,
-# Boston, MA 02110-1301 USA
-#
-# Authors:
-# 	Arjan van de Ven <arjan@linux.intel.com>
-
-
-#
-# This script turns a cstate ftrace output into a SVG graphic that shows
-# historic C-state information
-#
-#
-# 	cat /sys/kernel/debug/tracing/trace | perl power.pl > out.svg
-#
-
-my @styles;
-my $base = 0;
-
-my @pstate_last;
-my @pstate_level;
-
-$styles[0] = "fill:rgb(0,0,255);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)";
-$styles[1] = "fill:rgb(0,255,0);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)";
-$styles[2] = "fill:rgb(255,0,20);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)";
-$styles[3] = "fill:rgb(255,255,20);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)";
-$styles[4] = "fill:rgb(255,0,255);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)";
-$styles[5] = "fill:rgb(0,255,255);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)";
-$styles[6] = "fill:rgb(0,128,255);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)";
-$styles[7] = "fill:rgb(0,255,128);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)";
-$styles[8] = "fill:rgb(0,25,20);fill-opacity:0.5;stroke-width:1;stroke:rgb(0,0,0)";
-
-
-print "<?xml version=\"1.0\" standalone=\"no\"?> \n";
-print "<svg width=\"10000\" height=\"100%\" version=\"1.1\" xmlns=\"http://www.w3.org/2000/svg\">\n";
-
-my $scale = 30000.0;
-while (<>) {
-	my $line = $_;
-	if ($line =~ /([0-9\.]+)\] CSTATE: Going to C([0-9]) on cpu ([0-9]+) for ([0-9\.]+)/) {
-		if ($base == 0) {
-			$base = $1;
-		}
-		my $time = $1 - $base;
-		$time = $time * $scale;
-		my $C = $2;
-		my $cpu = $3;
-		my $y = 400 * $cpu;
-		my $duration = $4 * $scale;
-		my $msec = int($4 * 100000)/100.0;
-		my $height = $C * 20;
-		$style = $styles[$C];
-
-		$y = $y + 140 - $height;
-
-		$x2 = $time + 4;
-		$y2 = $y + 4;
-
-
-		print "<rect x=\"$time\" width=\"$duration\" y=\"$y\" height=\"$height\" style=\"$style\"/>\n";
-		print "<text transform=\"translate($x2,$y2) rotate(90)\">C$C $msec</text>\n";
-	}
-	if ($line =~ /([0-9\.]+)\] PSTATE: Going to P([0-9]) on cpu ([0-9]+)/) {
-		my $time = $1 - $base;
-		my $state = $2;
-		my $cpu = $3;
-
-		if (defined($pstate_last[$cpu])) {
-			my $from = $pstate_last[$cpu];
-			my $oldstate = $pstate_state[$cpu];
-			my $duration = ($time-$from) * $scale;
-
-			$from = $from * $scale;
-			my $to = $from + $duration;
-			my $height = 140 - ($oldstate * (140/8));
-
-			my $y = 400 * $cpu + 200 + $height;
-			my $y2 = $y+4;
-			my $style = $styles[8];
-
-			print "<rect x=\"$from\" y=\"$y\" width=\"$duration\" height=\"5\" style=\"$style\"/>\n";
-			print "<text transform=\"translate($from,$y2)\">P$oldstate (cpu $cpu)</text>\n";
-		};
-
-		$pstate_last[$cpu] = $time;
-		$pstate_state[$cpu] = $state;
-	}
-}
-
-
-print "</svg>\n";
-- 
cgit v1.2.3


From f48d55ce7871824eae3065f4d81956d7113eff19 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sat, 12 Sep 2009 12:52:11 +0200
Subject: perf: Add a SVG helper library file

The timechart tool writes out SVG format output; this patch adds a
set of helper functions to abstract dealing with SVG from the core
timechart code.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20090912130613.677f0516@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/Makefile         |   1 +
 tools/perf/util/svghelper.c | 382 ++++++++++++++++++++++++++++++++++++++++++++
 tools/perf/util/svghelper.h |  25 +++
 3 files changed, 408 insertions(+)
 create mode 100644 tools/perf/util/svghelper.c
 create mode 100644 tools/perf/util/svghelper.h

diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index 2cb8cc3f6772..0388e36587a8 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -373,6 +373,7 @@ LIB_OBJS += util/thread.o
 LIB_OBJS += util/trace-event-parse.o
 LIB_OBJS += util/trace-event-read.o
 LIB_OBJS += util/trace-event-info.o
+LIB_OBJS += util/svghelper.o
 
 BUILTIN_OBJS += builtin-annotate.o
 BUILTIN_OBJS += builtin-help.o
diff --git a/tools/perf/util/svghelper.c b/tools/perf/util/svghelper.c
new file mode 100644
index 000000000000..c7a29afe2d72
--- /dev/null
+++ b/tools/perf/util/svghelper.c
@@ -0,0 +1,382 @@
+/*
+ * svghelper.c - helper functions for outputting svg
+ *
+ * (C) Copyright 2009 Intel Corporation
+ *
+ * Authors:
+ *     Arjan van de Ven <arjan@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <string.h>
+
+#include "svghelper.h"
+
+static u64 first_time, last_time;
+static u64 turbo_frequency, max_freq;
+
+
+#define SLOT_MULT 30.0
+#define SLOT_HEIGHT 25.0
+#define WIDTH 1000.0
+
+static u64 total_height;
+static FILE *svgfile;
+
+static double cpu2slot(int cpu)
+{
+	return 2 * cpu + 1;
+}
+
+static double cpu2y(int cpu)
+{
+	return cpu2slot(cpu) * SLOT_MULT;
+}
+
+static double time2pixels(u64 time)
+{
+	double X;
+
+	X = WIDTH * (time - first_time) / (last_time - first_time);
+	return X;
+}
+
+void open_svg(const char *filename, int cpus, int rows)
+{
+
+	svgfile = fopen(filename, "w");
+	if (!svgfile) {
+		fprintf(stderr, "Cannot open %s for output\n", filename);
+		return;
+	}
+	total_height = (1 + rows + cpu2slot(cpus)) * SLOT_MULT;
+	fprintf(svgfile, "<?xml version=\"1.0\" standalone=\"no\"?> \n");
+	fprintf(svgfile, "<svg width=\"%4.1f\" height=\"%llu\" version=\"1.1\" xmlns=\"http://www.w3.org/2000/svg\">\n", WIDTH, total_height);
+
+	fprintf(svgfile, "<defs>\n  <style type=\"text/css\">\n    <![CDATA[\n");
+
+	fprintf(svgfile, "      rect          { stroke-width: 1; }\n");
+	fprintf(svgfile, "      rect.process  { fill:rgb(180,180,180); fill-opacity:0.9; stroke-width:1;   stroke:rgb(  0,  0,  0); } \n");
+	fprintf(svgfile, "      rect.process2 { fill:rgb(180,180,180); fill-opacity:0.9; stroke-width:0;   stroke:rgb(  0,  0,  0); } \n");
+	fprintf(svgfile, "      rect.sample   { fill:rgb(  0,  0,255); fill-opacity:0.8; stroke-width:0;   stroke:rgb(  0,  0,  0); } \n");
+	fprintf(svgfile, "      rect.blocked  { fill:rgb(255,  0,  0); fill-opacity:0.5; stroke-width:0;   stroke:rgb(  0,  0,  0); } \n");
+	fprintf(svgfile, "      rect.waiting  { fill:rgb(255,255,  0); fill-opacity:0.3; stroke-width:0;   stroke:rgb(  0,  0,  0); } \n");
+	fprintf(svgfile, "      rect.cpu      { fill:rgb(192,192,192); fill-opacity:0.2; stroke-width:0.5; stroke:rgb(128,128,128); } \n");
+	fprintf(svgfile, "      rect.pstate   { fill:rgb(128,128,128); fill-opacity:0.8; stroke-width:0; } \n");
+	fprintf(svgfile, "      rect.c1       { fill:rgb(255,214,214); fill-opacity:0.5; stroke-width:0; } \n");
+	fprintf(svgfile, "      rect.c2       { fill:rgb(255,172,172); fill-opacity:0.5; stroke-width:0; } \n");
+	fprintf(svgfile, "      rect.c3       { fill:rgb(255,130,130); fill-opacity:0.5; stroke-width:0; } \n");
+	fprintf(svgfile, "      rect.c4       { fill:rgb(255, 88, 88); fill-opacity:0.5; stroke-width:0; } \n");
+	fprintf(svgfile, "      rect.c5       { fill:rgb(255, 44, 44); fill-opacity:0.5; stroke-width:0; } \n");
+	fprintf(svgfile, "      rect.c6       { fill:rgb(255,  0,  0); fill-opacity:0.5; stroke-width:0; } \n");
+	fprintf(svgfile, "      line.pstate   { stroke:rgb(255,255,  0); stroke-opacity:0.8; stroke-width:2; } \n");
+
+	fprintf(svgfile, "    ]]>\n   </style>\n</defs>\n");
+}
+
+void svg_box(int Yslot, u64 start, u64 end, const char *type)
+{
+	if (!svgfile)
+		return;
+
+	fprintf(svgfile, "<rect x=\"%4.8f\" width=\"%4.8f\" y=\"%4.1f\" height=\"%4.1f\" class=\"%s\"/>\n",
+		time2pixels(start), time2pixels(end)-time2pixels(start), Yslot * SLOT_MULT, SLOT_HEIGHT, type);
+}
+
+void svg_sample(int Yslot, int cpu, u64 start, u64 end, const char *type)
+{
+	double text_size;
+	if (!svgfile)
+		return;
+
+	fprintf(svgfile, "<rect x=\"%4.8f\" width=\"%4.8f\" y=\"%4.1f\" height=\"%4.1f\" class=\"%s\"/>\n",
+		time2pixels(start), time2pixels(end)-time2pixels(start), Yslot * SLOT_MULT, SLOT_HEIGHT, type);
+
+	text_size = (time2pixels(end)-time2pixels(start));
+	if (cpu > 9)
+		text_size = text_size/2;
+	if (text_size > 1.25)
+		text_size = 1.25;
+	if (text_size > 0.0001)
+		fprintf(svgfile, "<text transform=\"translate(%1.6f,%1.6f)\" font-size=\"%1.6fpt\">%i</text>\n",
+			time2pixels(start), Yslot *  SLOT_MULT + SLOT_HEIGHT - 1, text_size,  cpu + 1);
+
+}
+
+static char *cpu_model(void)
+{
+	static char cpu_m[255];
+	char buf[256];
+	FILE *file;
+
+	cpu_m[0] = 0;
+	/* CPU type */
+	file = fopen("/proc/cpuinfo", "r");
+	if (file) {
+		while (fgets(buf, 255, file)) {
+			if (strstr(buf, "model name")) {
+				strncpy(cpu_m, &buf[13], 255);
+				break;
+			}
+		}
+		fclose(file);
+	}
+	return cpu_m;
+}
+
+void svg_cpu_box(int cpu, u64 __max_freq, u64 __turbo_freq)
+{
+	char cpu_string[80];
+	if (!svgfile)
+		return;
+
+	max_freq = __max_freq;
+	turbo_frequency = __turbo_freq;
+
+	fprintf(svgfile, "<rect x=\"%4.8f\" width=\"%4.8f\" y=\"%4.1f\" height=\"%4.1f\" class=\"cpu\"/>\n",
+		time2pixels(first_time),
+		time2pixels(last_time)-time2pixels(first_time),
+		cpu2y(cpu), SLOT_MULT+SLOT_HEIGHT);
+
+	sprintf(cpu_string, "CPU %i", (int)cpu+1);
+	fprintf(svgfile, "<text transform=\"translate(%4.1f,%4.1f)\">%s</text>\n",
+		10+time2pixels(first_time), cpu2y(cpu) + SLOT_HEIGHT/2, cpu_string);
+
+	fprintf(svgfile, "<text transform=\"translate(%4.1f,%4.1f)\" font-size=\"1.25pt\">%s</text>\n",
+		10+time2pixels(first_time), cpu2y(cpu) + SLOT_MULT + SLOT_HEIGHT - 4, cpu_model());
+}
+
+void svg_process(int cpu, u64 start, u64 end, const char *type, const char *name)
+{
+	double width;
+
+	if (!svgfile)
+		return;
+
+	fprintf(svgfile, "<rect x=\"%4.8f\" width=\"%4.8f\" y=\"%4.1f\" height=\"%4.1f\" class=\"%s\"/>\n",
+		time2pixels(start), time2pixels(end)-time2pixels(start), cpu2y(cpu), SLOT_MULT+SLOT_HEIGHT, type);
+	width = time2pixels(end)-time2pixels(start);
+	if (width > 6)
+		width = 6;
+
+	if (width > 0.001)
+		fprintf(svgfile, "<text  transform=\"translate(%4.5f,%4.5f) rotate(90)\" font-size=\"%3.4fpt\">%s</text>\n",
+			time2pixels(start), cpu2y(cpu), width, name);
+}
+
+void svg_cstate(int cpu, u64 start, u64 end, int type)
+{
+	double width;
+	char style[128];
+
+	if (!svgfile)
+		return;
+
+
+	if (type > 6)
+		type = 6;
+	sprintf(style, "c%i", type);
+
+	fprintf(svgfile, "<rect class=\"%s\" x=\"%4.8f\" width=\"%4.8f\" y=\"%4.1f\" height=\"%4.1f\"/>\n",
+		style,
+		time2pixels(start), time2pixels(end)-time2pixels(start),
+		cpu2y(cpu), SLOT_MULT+SLOT_HEIGHT);
+
+	width = time2pixels(end)-time2pixels(start);
+	if (width > 6)
+		width = 6;
+
+	if (width > 0.05)
+		fprintf(svgfile, "<text  transform=\"translate(%4.5f,%4.5f) rotate(90)\" font-size=\"%3.4fpt\">C%i</text>\n",
+			time2pixels(start), cpu2y(cpu), width, type);
+}
+
+static char *HzToHuman(unsigned long hz)
+{
+	static char buffer[1024];
+	unsigned long long Hz;
+
+	memset(buffer, 0, 1024);
+
+	Hz = hz;
+
+	/* default: just put the Number in */
+	sprintf(buffer, "%9lli", Hz);
+
+	if (Hz > 1000)
+		sprintf(buffer, " %6lli Mhz", (Hz+500)/1000);
+
+	if (Hz > 1500000)
+		sprintf(buffer, " %6.2f Ghz", (Hz+5000.0)/1000000);
+
+	if (Hz == turbo_frequency)
+		sprintf(buffer, "Turbo");
+
+	return buffer;
+}
+
+void svg_pstate(int cpu, u64 start, u64 end, u64 freq)
+{
+	double height = 0;
+
+	if (!svgfile)
+		return;
+
+	if (max_freq)
+		height = freq * 1.0 / max_freq * (SLOT_HEIGHT + SLOT_MULT);
+	height = 1 + cpu2y(cpu) + SLOT_MULT + SLOT_HEIGHT - height;
+	fprintf(svgfile, "<line x1=\"%4.8f\" x2=\"%4.8f\" y1=\"%4.1f\" y2=\"%4.1f\" class=\"pstate\"/>\n",
+		time2pixels(start), time2pixels(end), height, height);
+	fprintf(svgfile, "<text transform=\"translate(%4.1f,%4.1f)\" font-size=\"0.25pt\">%s</text>\n",
+		time2pixels(start), height+0.9, HzToHuman(freq));
+
+}
+
+
+void svg_partial_wakeline(u64 start, int row1, int row2)
+{
+	double height;
+
+	if (!svgfile)
+		return;
+
+
+	if (row1 < row2) {
+		if (row1)
+			fprintf(svgfile, "<line x1=\"%4.8f\" y1=\"%4.2f\" x2=\"%4.8f\" y2=\"%4.2f\" style=\"stroke:rgb(32,255,32);stroke-width:0.009\"/>\n",
+				time2pixels(start), row1 * SLOT_MULT + SLOT_HEIGHT,  time2pixels(start), row1 * SLOT_MULT + SLOT_HEIGHT + SLOT_MULT/32);
+
+		if (row2)
+			fprintf(svgfile, "<line x1=\"%4.8f\" y1=\"%4.2f\" x2=\"%4.8f\" y2=\"%4.2f\" style=\"stroke:rgb(32,255,32);stroke-width:0.009\"/>\n",
+				time2pixels(start), row2 * SLOT_MULT - SLOT_MULT/32,  time2pixels(start), row2 * SLOT_MULT);
+	} else {
+		if (row2)
+			fprintf(svgfile, "<line x1=\"%4.8f\" y1=\"%4.2f\" x2=\"%4.8f\" y2=\"%4.2f\" style=\"stroke:rgb(32,255,32);stroke-width:0.009\"/>\n",
+				time2pixels(start), row2 * SLOT_MULT + SLOT_HEIGHT,  time2pixels(start), row2 * SLOT_MULT + SLOT_HEIGHT + SLOT_MULT/32);
+
+		if (row1)
+			fprintf(svgfile, "<line x1=\"%4.8f\" y1=\"%4.2f\" x2=\"%4.8f\" y2=\"%4.2f\" style=\"stroke:rgb(32,255,32);stroke-width:0.009\"/>\n",
+				time2pixels(start), row1 * SLOT_MULT - SLOT_MULT/32,  time2pixels(start), row1 * SLOT_MULT);
+	}
+	height = row1 * SLOT_MULT;
+	if (row2 > row1)
+		height += SLOT_HEIGHT;
+	if (row1)
+		fprintf(svgfile, "<circle  cx=\"%4.8f\" cy=\"%4.2f\" r = \"0.01\"  style=\"fill:rgb(32,255,32)\"/>\n",
+			time2pixels(start), height);
+}
+
+void svg_wakeline(u64 start, int row1, int row2)
+{
+	double height;
+
+	if (!svgfile)
+		return;
+
+
+	if (row1 < row2)
+		fprintf(svgfile, "<line x1=\"%4.8f\" y1=\"%4.2f\" x2=\"%4.8f\" y2=\"%4.2f\" style=\"stroke:rgb(32,255,32);stroke-width:0.009\"/>\n",
+			time2pixels(start), row1 * SLOT_MULT + SLOT_HEIGHT,  time2pixels(start), row2 * SLOT_MULT);
+	else
+		fprintf(svgfile, "<line x1=\"%4.8f\" y1=\"%4.2f\" x2=\"%4.8f\" y2=\"%4.2f\" style=\"stroke:rgb(32,255,32);stroke-width:0.009\"/>\n",
+			time2pixels(start), row2 * SLOT_MULT + SLOT_HEIGHT,  time2pixels(start), row1 * SLOT_MULT);
+
+	height = row1 * SLOT_MULT;
+	if (row2 > row1)
+		height += SLOT_HEIGHT;
+	fprintf(svgfile, "<circle  cx=\"%4.8f\" cy=\"%4.2f\" r = \"0.01\"  style=\"fill:rgb(32,255,32)\"/>\n",
+			time2pixels(start), height);
+}
+
+void svg_interrupt(u64 start, int row)
+{
+	if (!svgfile)
+		return;
+
+	fprintf(svgfile, "<circle  cx=\"%4.8f\" cy=\"%4.2f\" r = \"0.01\"  style=\"fill:rgb(255,128,128)\"/>\n",
+			time2pixels(start), row * SLOT_MULT);
+	fprintf(svgfile, "<circle  cx=\"%4.8f\" cy=\"%4.2f\" r = \"0.01\"  style=\"fill:rgb(255,128,128)\"/>\n",
+			time2pixels(start), row * SLOT_MULT + SLOT_HEIGHT);
+}
+
+void svg_text(int Yslot, u64 start, const char *text)
+{
+	if (!svgfile)
+		return;
+
+	fprintf(svgfile, "<text transform=\"translate(%4.1f,%4.1f)\">%s</text>\n",
+		time2pixels(start), Yslot * SLOT_MULT+SLOT_HEIGHT/2, text);
+}
+
+static void svg_legenda_box(int X, const char *text, const char *style)
+{
+	double boxsize;
+	boxsize = SLOT_HEIGHT / 2;
+
+	fprintf(svgfile, "<rect x=\"%i\" width=\"%4.8f\" y=\"0\" height=\"%4.1f\" class=\"%s\"/>\n",
+		X, boxsize, boxsize, style);
+	fprintf(svgfile, "<text transform=\"translate(%4.1f, %4.1f)\" font-size=\"%4.4fpt\">%s</text>\n",
+		X + boxsize + 5, boxsize, 0.8 * boxsize, text);
+}
+
+void svg_legenda(void)
+{
+	if (!svgfile)
+		return;
+
+	svg_legenda_box(0,	"Running", "sample");
+	svg_legenda_box(100,	"Idle","rect.c1");
+	svg_legenda_box(200,	"Deeper Idle", "rect.c3");
+	svg_legenda_box(350,	"Deepest Idle", "rect.c6");
+	svg_legenda_box(550,	"Sleeping", "process2");
+	svg_legenda_box(650,	"Waiting for cpu", "waiting");
+	svg_legenda_box(800,	"Blocked on IO", "blocked");
+}
+
+void svg_time_grid(u64 start, u64 end)
+{
+	u64 i;
+
+	first_time = start;
+	last_time = end;
+
+	first_time = first_time / 100000000 * 100000000;
+
+	if (!svgfile)
+		return;
+
+	i = first_time;
+	while (i < last_time) {
+		int color = 220;
+		double thickness = 0.075;
+		if ((i % 100000000) == 0) {
+			thickness = 0.5;
+			color = 192;
+		}
+		if ((i % 1000000000) == 0) {
+			thickness = 2.0;
+			color = 128;
+		}
+
+		fprintf(svgfile, "<line x1=\"%4.8f\" y1=\"%4.2f\" x2=\"%4.8f\" y2=\"%llu\" style=\"stroke:rgb(%i,%i,%i);stroke-width:%1.3f\"/>\n",
+			time2pixels(i), SLOT_MULT/2, time2pixels(i), total_height, color, color, color, thickness);
+
+		i += 10000000;
+	}
+}
+
+void svg_close(void)
+{
+	if (svgfile) {
+		fprintf(svgfile, "</svg>\n");
+		fclose(svgfile);
+		svgfile = NULL;
+	}
+}
diff --git a/tools/perf/util/svghelper.h b/tools/perf/util/svghelper.h
new file mode 100644
index 000000000000..ad79b5dc53de
--- /dev/null
+++ b/tools/perf/util/svghelper.h
@@ -0,0 +1,25 @@
+#ifndef _INCLUDE_GUARD_SVG_HELPER_
+#define _INCLUDE_GUARD_SVG_HELPER_
+
+#include "types.h"
+
+extern void open_svg(const char *filename, int cpus, int rows);
+extern void svg_box(int Yslot, u64 start, u64 end, const char *type);
+extern void svg_sample(int Yslot, int cpu, u64 start, u64 end, const char *type);
+extern void svg_cpu_box(int cpu, u64 max_frequency, u64 turbo_frequency);
+
+
+extern void svg_process(int cpu, u64 start, u64 end, const char *type, const char *name);
+extern void svg_cstate(int cpu, u64 start, u64 end, int type);
+extern void svg_pstate(int cpu, u64 start, u64 end, u64 freq);
+
+
+extern void svg_time_grid(u64 start, u64 end);
+extern void svg_legenda(void);
+extern void svg_wakeline(u64 start, int row1, int row2);
+extern void svg_partial_wakeline(u64 start, int row1, int row2);
+extern void svg_interrupt(u64 start, int row);
+extern void svg_text(int Yslot, u64 start, const char *text);
+extern void svg_close(void);
+
+#endif
-- 
cgit v1.2.3


From 10274989fd595db455874fc2c83272fb33f6b27b Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Sat, 12 Sep 2009 07:53:05 +0200
Subject: perf: Add the timechart tool

timechart is a tool to visualize what is going on in the system.

The user makes a trace of what is going on with

 > perf record --timechart /usr/bin/some_command

and then can turn the output of this into an svg file

 > perf timechart

which then can be viewed with any SVG view; inkscape works well
enough for me.

The idea behind timechart is to create a "infinitely zoomable"
picture; something that has high level information on a 1:1 zoom
level, but which exposes more details every time you zoom into a
specific area.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <20090912130713.6a77bbc0@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/Makefile            |    7 +
 tools/perf/builtin-timechart.c | 1120 ++++++++++++++++++++++++++++++++++++++++
 tools/perf/builtin.h           |    1 +
 tools/perf/perf.c              |    1 +
 4 files changed, 1129 insertions(+)
 create mode 100644 tools/perf/builtin-timechart.c

diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index 0388e36587a8..0aba8b6e9c54 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -382,6 +382,7 @@ BUILTIN_OBJS += builtin-list.o
 BUILTIN_OBJS += builtin-record.o
 BUILTIN_OBJS += builtin-report.o
 BUILTIN_OBJS += builtin-stat.o
+BUILTIN_OBJS += builtin-timechart.o
 BUILTIN_OBJS += builtin-top.o
 BUILTIN_OBJS += builtin-trace.o
 
@@ -712,6 +713,12 @@ builtin-help.o: builtin-help.c common-cmds.h PERF-CFLAGS
 		'-DPERF_MAN_PATH="$(mandir_SQ)"' \
 		'-DPERF_INFO_PATH="$(infodir_SQ)"' $<
 
+builtin-timechart.o: builtin-timechart.c common-cmds.h PERF-CFLAGS
+	$(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) \
+		'-DPERF_HTML_PATH="$(htmldir_SQ)"' \
+		'-DPERF_MAN_PATH="$(mandir_SQ)"' \
+		'-DPERF_INFO_PATH="$(infodir_SQ)"' $<
+
 $(BUILT_INS): perf$X
 	$(QUIET_BUILT_IN)$(RM) $@ && \
 	ln perf$X $@ 2>/dev/null || \
diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c
new file mode 100644
index 000000000000..00fac1b362fd
--- /dev/null
+++ b/tools/perf/builtin-timechart.c
@@ -0,0 +1,1120 @@
+/*
+ * builtin-timechart.c - make an svg timechart of system activity
+ *
+ * (C) Copyright 2009 Intel Corporation
+ *
+ * Authors:
+ *     Arjan van de Ven <arjan@linux.intel.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; version 2
+ * of the License.
+ */
+
+#include "builtin.h"
+
+#include "util/util.h"
+
+#include "util/color.h"
+#include <linux/list.h>
+#include "util/cache.h"
+#include <linux/rbtree.h>
+#include "util/symbol.h"
+#include "util/string.h"
+#include "util/callchain.h"
+#include "util/strlist.h"
+
+#include "perf.h"
+#include "util/header.h"
+#include "util/parse-options.h"
+#include "util/parse-events.h"
+#include "util/svghelper.h"
+
+static char		const *input_name = "perf.data";
+static char		const *output_name = "output.svg";
+
+
+static unsigned long	page_size;
+static unsigned long	mmap_window = 32;
+static u64		sample_type;
+
+static unsigned int	numcpus;
+static u64		min_freq;	/* Lowest CPU frequency seen */
+static u64		max_freq;	/* Highest CPU frequency seen */
+static u64		turbo_frequency;
+
+static u64		first_time, last_time;
+
+
+static struct perf_header	*header;
+
+struct per_pid;
+struct per_pidcomm;
+
+struct cpu_sample;
+struct power_event;
+struct wake_event;
+
+struct sample_wrapper;
+
+/*
+ * Datastructure layout:
+ * We keep an list of "pid"s, matching the kernels notion of a task struct.
+ * Each "pid" entry, has a list of "comm"s.
+ *	this is because we want to track different programs different, while
+ *	exec will reuse the original pid (by design).
+ * Each comm has a list of samples that will be used to draw
+ * final graph.
+ */
+
+struct per_pid {
+	struct per_pid *next;
+
+	int		pid;
+	int		ppid;
+
+	u64		start_time;
+	u64		end_time;
+	u64		total_time;
+	int		display;
+
+	struct per_pidcomm *all;
+	struct per_pidcomm *current;
+
+	int painted;
+};
+
+
+struct per_pidcomm {
+	struct per_pidcomm *next;
+
+	u64		start_time;
+	u64		end_time;
+	u64		total_time;
+
+	int		Y;
+	int		display;
+
+	long		state;
+	u64		state_since;
+
+	char		*comm;
+
+	struct cpu_sample *samples;
+};
+
+struct sample_wrapper {
+	struct sample_wrapper *next;
+
+	u64		timestamp;
+	unsigned char	data[0];
+};
+
+#define TYPE_NONE	0
+#define TYPE_RUNNING	1
+#define TYPE_WAITING	2
+#define TYPE_BLOCKED	3
+
+struct cpu_sample {
+	struct cpu_sample *next;
+
+	u64 start_time;
+	u64 end_time;
+	int type;
+	int cpu;
+};
+
+static struct per_pid *all_data;
+
+#define CSTATE 1
+#define PSTATE 2
+
+struct power_event {
+	struct power_event *next;
+	int type;
+	int state;
+	u64 start_time;
+	u64 end_time;
+	int cpu;
+};
+
+struct wake_event {
+	struct wake_event *next;
+	int waker;
+	int wakee;
+	u64 time;
+};
+
+static struct power_event    *power_events;
+static struct wake_event     *wake_events;
+
+struct sample_wrapper *all_samples;
+
+static struct per_pid *find_create_pid(int pid)
+{
+	struct per_pid *cursor = all_data;
+
+	while (cursor) {
+		if (cursor->pid == pid)
+			return cursor;
+		cursor = cursor->next;
+	}
+	cursor = malloc(sizeof(struct per_pid));
+	assert(cursor != NULL);
+	memset(cursor, 0, sizeof(struct per_pid));
+	cursor->pid = pid;
+	cursor->next = all_data;
+	all_data = cursor;
+	return cursor;
+}
+
+static void pid_set_comm(int pid, char *comm)
+{
+	struct per_pid *p;
+	struct per_pidcomm *c;
+	p = find_create_pid(pid);
+	c = p->all;
+	while (c) {
+		if (c->comm && strcmp(c->comm, comm) == 0) {
+			p->current = c;
+			return;
+		}
+		if (!c->comm) {
+			c->comm = strdup(comm);
+			p->current = c;
+			return;
+		}
+		c = c->next;
+	}
+	c = malloc(sizeof(struct per_pidcomm));
+	assert(c != NULL);
+	memset(c, 0, sizeof(struct per_pidcomm));
+	c->comm = strdup(comm);
+	p->current = c;
+	c->next = p->all;
+	p->all = c;
+}
+
+static void pid_fork(int pid, int ppid, u64 timestamp)
+{
+	struct per_pid *p, *pp;
+	p = find_create_pid(pid);
+	pp = find_create_pid(ppid);
+	p->ppid = ppid;
+	if (pp->current && pp->current->comm && !p->current)
+		pid_set_comm(pid, pp->current->comm);
+
+	p->start_time = timestamp;
+	if (p->current) {
+		p->current->start_time = timestamp;
+		p->current->state_since = timestamp;
+	}
+}
+
+static void pid_exit(int pid, u64 timestamp)
+{
+	struct per_pid *p;
+	p = find_create_pid(pid);
+	p->end_time = timestamp;
+	if (p->current)
+		p->current->end_time = timestamp;
+}
+
+static void
+pid_put_sample(int pid, int type, unsigned int cpu, u64 start, u64 end)
+{
+	struct per_pid *p;
+	struct per_pidcomm *c;
+	struct cpu_sample *sample;
+
+	p = find_create_pid(pid);
+	c = p->current;
+	if (!c) {
+		c = malloc(sizeof(struct per_pidcomm));
+		assert(c != NULL);
+		memset(c, 0, sizeof(struct per_pidcomm));
+		p->current = c;
+		c->next = p->all;
+		p->all = c;
+	}
+
+	sample = malloc(sizeof(struct cpu_sample));
+	assert(sample != NULL);
+	memset(sample, 0, sizeof(struct cpu_sample));
+	sample->start_time = start;
+	sample->end_time = end;
+	sample->type = type;
+	sample->next = c->samples;
+	sample->cpu = cpu;
+	c->samples = sample;
+
+	if (sample->type == TYPE_RUNNING && end > start && start > 0) {
+		c->total_time += (end-start);
+		p->total_time += (end-start);
+	}
+
+	if (c->start_time == 0 || c->start_time > start)
+		c->start_time = start;
+	if (p->start_time == 0 || p->start_time > start)
+		p->start_time = start;
+
+	if (cpu > numcpus)
+		numcpus = cpu;
+}
+
+#define MAX_CPUS 4096
+
+static u64 cpus_cstate_start_times[MAX_CPUS];
+static int cpus_cstate_state[MAX_CPUS];
+static u64 cpus_pstate_start_times[MAX_CPUS];
+static u64 cpus_pstate_state[MAX_CPUS];
+
+static int
+process_comm_event(event_t *event)
+{
+	pid_set_comm(event->comm.pid, event->comm.comm);
+	return 0;
+}
+static int
+process_fork_event(event_t *event)
+{
+	pid_fork(event->fork.pid, event->fork.ppid, event->fork.time);
+	return 0;
+}
+
+static int
+process_exit_event(event_t *event)
+{
+	pid_exit(event->fork.pid, event->fork.time);
+	return 0;
+}
+
+struct trace_entry {
+	u32			size;
+	unsigned short		type;
+	unsigned char		flags;
+	unsigned char		preempt_count;
+	int			pid;
+	int			tgid;
+};
+
+struct power_entry {
+	struct trace_entry te;
+	s64	type;
+	s64	value;
+};
+
+#define TASK_COMM_LEN 16
+struct wakeup_entry {
+	struct trace_entry te;
+	char comm[TASK_COMM_LEN];
+	int   pid;
+	int   prio;
+	int   success;
+};
+
+/*
+ * trace_flag_type is an enumeration that holds different
+ * states when a trace occurs. These are:
+ *  IRQS_OFF            - interrupts were disabled
+ *  IRQS_NOSUPPORT      - arch does not support irqs_disabled_flags
+ *  NEED_RESCED         - reschedule is requested
+ *  HARDIRQ             - inside an interrupt handler
+ *  SOFTIRQ             - inside a softirq handler
+ */
+enum trace_flag_type {
+	TRACE_FLAG_IRQS_OFF		= 0x01,
+	TRACE_FLAG_IRQS_NOSUPPORT	= 0x02,
+	TRACE_FLAG_NEED_RESCHED		= 0x04,
+	TRACE_FLAG_HARDIRQ		= 0x08,
+	TRACE_FLAG_SOFTIRQ		= 0x10,
+};
+
+
+
+struct sched_switch {
+	struct trace_entry te;
+	char prev_comm[TASK_COMM_LEN];
+	int  prev_pid;
+	int  prev_prio;
+	long prev_state; /* Arjan weeps. */
+	char next_comm[TASK_COMM_LEN];
+	int  next_pid;
+	int  next_prio;
+};
+
+static void c_state_start(int cpu, u64 timestamp, int state)
+{
+	cpus_cstate_start_times[cpu] = timestamp;
+	cpus_cstate_state[cpu] = state;
+}
+
+static void c_state_end(int cpu, u64 timestamp)
+{
+	struct power_event *pwr;
+	pwr = malloc(sizeof(struct power_event));
+	if (!pwr)
+		return;
+	memset(pwr, 0, sizeof(struct power_event));
+
+	pwr->state = cpus_cstate_state[cpu];
+	pwr->start_time = cpus_cstate_start_times[cpu];
+	pwr->end_time = timestamp;
+	pwr->cpu = cpu;
+	pwr->type = CSTATE;
+	pwr->next = power_events;
+
+	power_events = pwr;
+}
+
+static void p_state_change(int cpu, u64 timestamp, u64 new_freq)
+{
+	struct power_event *pwr;
+	pwr = malloc(sizeof(struct power_event));
+
+	if (new_freq > 8000000) /* detect invalid data */
+		return;
+
+	if (!pwr)
+		return;
+	memset(pwr, 0, sizeof(struct power_event));
+
+	pwr->state = cpus_pstate_state[cpu];
+	pwr->start_time = cpus_pstate_start_times[cpu];
+	pwr->end_time = timestamp;
+	pwr->cpu = cpu;
+	pwr->type = PSTATE;
+	pwr->next = power_events;
+
+	if (!pwr->start_time)
+		pwr->start_time = first_time;
+
+	power_events = pwr;
+
+	cpus_pstate_state[cpu] = new_freq;
+	cpus_pstate_start_times[cpu] = timestamp;
+
+	if ((u64)new_freq > max_freq)
+		max_freq = new_freq;
+
+	if (new_freq < min_freq || min_freq == 0)
+		min_freq = new_freq;
+
+	if (new_freq == max_freq - 1000)
+			turbo_frequency = max_freq;
+}
+
+static void
+sched_wakeup(int cpu, u64 timestamp, int pid, struct trace_entry *te)
+{
+	struct wake_event *we;
+	struct per_pid *p;
+	struct wakeup_entry *wake = (void *)te;
+
+	we = malloc(sizeof(struct wake_event));
+	if (!we)
+		return;
+
+	memset(we, 0, sizeof(struct wake_event));
+	we->time = timestamp;
+	we->waker = pid;
+
+	if ((te->flags & TRACE_FLAG_HARDIRQ) || (te->flags & TRACE_FLAG_SOFTIRQ))
+		we->waker = -1;
+
+	we->wakee = wake->pid;
+	we->next = wake_events;
+	wake_events = we;
+	p = find_create_pid(we->wakee);
+
+	if (p && p->current && p->current->state == TYPE_NONE) {
+		p->current->state_since = timestamp;
+		p->current->state = TYPE_WAITING;
+	}
+	if (p && p->current && p->current->state == TYPE_BLOCKED) {
+		pid_put_sample(p->pid, p->current->state, cpu, p->current->state_since, timestamp);
+		p->current->state_since = timestamp;
+		p->current->state = TYPE_WAITING;
+	}
+}
+
+static void sched_switch(int cpu, u64 timestamp, struct trace_entry *te)
+{
+	struct per_pid *p = NULL, *prev_p;
+	struct sched_switch *sw = (void *)te;
+
+
+	prev_p = find_create_pid(sw->prev_pid);
+
+	p = find_create_pid(sw->next_pid);
+
+	if (prev_p->current && prev_p->current->state != TYPE_NONE)
+		pid_put_sample(sw->prev_pid, TYPE_RUNNING, cpu, prev_p->current->state_since, timestamp);
+	if (p && p->current) {
+		if (p->current->state != TYPE_NONE)
+			pid_put_sample(sw->next_pid, p->current->state, cpu, p->current->state_since, timestamp);
+
+			p->current->state_since = timestamp;
+			p->current->state = TYPE_RUNNING;
+	}
+
+	if (prev_p->current) {
+		prev_p->current->state = TYPE_NONE;
+		prev_p->current->state_since = timestamp;
+		if (sw->prev_state & 2)
+			prev_p->current->state = TYPE_BLOCKED;
+		if (sw->prev_state == 0)
+			prev_p->current->state = TYPE_WAITING;
+	}
+}
+
+
+static int
+process_sample_event(event_t *event)
+{
+	int cursor = 0;
+	u64 addr = 0;
+	u64 stamp = 0;
+	u32 cpu = 0;
+	u32 pid = 0;
+	struct trace_entry *te;
+
+	if (sample_type & PERF_SAMPLE_IP)
+		cursor++;
+
+	if (sample_type & PERF_SAMPLE_TID) {
+		pid = event->sample.array[cursor]>>32;
+		cursor++;
+	}
+	if (sample_type & PERF_SAMPLE_TIME) {
+		stamp = event->sample.array[cursor++];
+
+		if (!first_time || first_time > stamp)
+			first_time = stamp;
+		if (last_time < stamp)
+			last_time = stamp;
+
+	}
+	if (sample_type & PERF_SAMPLE_ADDR)
+		addr = event->sample.array[cursor++];
+	if (sample_type & PERF_SAMPLE_ID)
+		cursor++;
+	if (sample_type & PERF_SAMPLE_STREAM_ID)
+		cursor++;
+	if (sample_type & PERF_SAMPLE_CPU)
+		cpu = event->sample.array[cursor++] & 0xFFFFFFFF;
+	if (sample_type & PERF_SAMPLE_PERIOD)
+		cursor++;
+
+	te = (void *)&event->sample.array[cursor];
+
+	if (sample_type & PERF_SAMPLE_RAW && te->size > 0) {
+		char *event_str;
+		struct power_entry *pe;
+
+		pe = (void *)te;
+
+		event_str = perf_header__find_event(te->type);
+
+		if (!event_str)
+			return 0;
+
+		if (strcmp(event_str, "power:power_start") == 0)
+			c_state_start(cpu, stamp, pe->value);
+
+		if (strcmp(event_str, "power:power_end") == 0)
+			c_state_end(cpu, stamp);
+
+		if (strcmp(event_str, "power:power_frequency") == 0)
+			p_state_change(cpu, stamp, pe->value);
+
+		if (strcmp(event_str, "sched:sched_wakeup") == 0)
+			sched_wakeup(cpu, stamp, pid, te);
+
+		if (strcmp(event_str, "sched:sched_switch") == 0)
+			sched_switch(cpu, stamp, te);
+	}
+	return 0;
+}
+
+/*
+ * After the last sample we need to wrap up the current C/P state
+ * and close out each CPU for these.
+ */
+static void end_sample_processing(void)
+{
+	u64 cpu;
+	struct power_event *pwr;
+
+	for (cpu = 0; cpu < numcpus; cpu++) {
+		pwr = malloc(sizeof(struct power_event));
+		if (!pwr)
+			return;
+		memset(pwr, 0, sizeof(struct power_event));
+
+		/* C state */
+#if 0
+		pwr->state = cpus_cstate_state[cpu];
+		pwr->start_time = cpus_cstate_start_times[cpu];
+		pwr->end_time = last_time;
+		pwr->cpu = cpu;
+		pwr->type = CSTATE;
+		pwr->next = power_events;
+
+		power_events = pwr;
+#endif
+		/* P state */
+
+		pwr = malloc(sizeof(struct power_event));
+		if (!pwr)
+			return;
+		memset(pwr, 0, sizeof(struct power_event));
+
+		pwr->state = cpus_pstate_state[cpu];
+		pwr->start_time = cpus_pstate_start_times[cpu];
+		pwr->end_time = last_time;
+		pwr->cpu = cpu;
+		pwr->type = PSTATE;
+		pwr->next = power_events;
+
+		if (!pwr->start_time)
+			pwr->start_time = first_time;
+		if (!pwr->state)
+			pwr->state = min_freq;
+		power_events = pwr;
+	}
+}
+
+static u64 sample_time(event_t *event)
+{
+	int cursor;
+
+	cursor = 0;
+	if (sample_type & PERF_SAMPLE_IP)
+		cursor++;
+	if (sample_type & PERF_SAMPLE_TID)
+		cursor++;
+	if (sample_type & PERF_SAMPLE_TIME)
+		return event->sample.array[cursor];
+	return 0;
+}
+
+
+/*
+ * We first queue all events, sorted backwards by insertion.
+ * The order will get flipped later.
+ */
+static int
+queue_sample_event(event_t *event)
+{
+	struct sample_wrapper *copy, *prev;
+	int size;
+
+	size = event->sample.header.size + sizeof(struct sample_wrapper) + 8;
+
+	copy = malloc(size);
+	if (!copy)
+		return 1;
+
+	memset(copy, 0, size);
+
+	copy->next = NULL;
+	copy->timestamp = sample_time(event);
+
+	memcpy(&copy->data, event, event->sample.header.size);
+
+	/* insert in the right place in the list */
+
+	if (!all_samples) {
+		/* first sample ever */
+		all_samples = copy;
+		return 0;
+	}
+
+	if (all_samples->timestamp < copy->timestamp) {
+		/* insert at the head of the list */
+		copy->next = all_samples;
+		all_samples = copy;
+		return 0;
+	}
+
+	prev = all_samples;
+	while (prev->next) {
+		if (prev->next->timestamp < copy->timestamp) {
+			copy->next = prev->next;
+			prev->next = copy;
+			return 0;
+		}
+		prev = prev->next;
+	}
+	/* insert at the end of the list */
+	prev->next = copy;
+
+	return 0;
+}
+
+static void sort_queued_samples(void)
+{
+	struct sample_wrapper *cursor, *next;
+
+	cursor = all_samples;
+	all_samples = NULL;
+
+	while (cursor) {
+		next = cursor->next;
+		cursor->next = all_samples;
+		all_samples = cursor;
+		cursor = next;
+	}
+}
+
+/*
+ * Sort the pid datastructure
+ */
+static void sort_pids(void)
+{
+	struct per_pid *new_list, *p, *cursor, *prev;
+	/* sort by ppid first, then by pid, lowest to highest */
+
+	new_list = NULL;
+
+	while (all_data) {
+		p = all_data;
+		all_data = p->next;
+		p->next = NULL;
+
+		if (new_list == NULL) {
+			new_list = p;
+			p->next = NULL;
+			continue;
+		}
+		prev = NULL;
+		cursor = new_list;
+		while (cursor) {
+			if (cursor->ppid > p->ppid ||
+				(cursor->ppid == p->ppid && cursor->pid > p->pid)) {
+				/* must insert before */
+				if (prev) {
+					p->next = prev->next;
+					prev->next = p;
+					cursor = NULL;
+					continue;
+				} else {
+					p->next = new_list;
+					new_list = p;
+					cursor = NULL;
+					continue;
+				}
+			}
+
+			prev = cursor;
+			cursor = cursor->next;
+			if (!cursor)
+				prev->next = p;
+		}
+	}
+	all_data = new_list;
+}
+
+
+static void draw_c_p_states(void)
+{
+	struct power_event *pwr;
+	pwr = power_events;
+
+	/*
+	 * two pass drawing so that the P state bars are on top of the C state blocks
+	 */
+	while (pwr) {
+		if (pwr->type == CSTATE)
+			svg_cstate(pwr->cpu, pwr->start_time, pwr->end_time, pwr->state);
+		pwr = pwr->next;
+	}
+
+	pwr = power_events;
+	while (pwr) {
+		if (pwr->type == PSTATE) {
+			if (!pwr->state)
+				pwr->state = min_freq;
+			svg_pstate(pwr->cpu, pwr->start_time, pwr->end_time, pwr->state);
+		}
+		pwr = pwr->next;
+	}
+}
+
+static void draw_wakeups(void)
+{
+	struct wake_event *we;
+	struct per_pid *p;
+	struct per_pidcomm *c;
+
+	we = wake_events;
+	while (we) {
+		int from = 0, to = 0;
+
+		/* locate the column of the waker and wakee */
+		p = all_data;
+		while (p) {
+			if (p->pid == we->waker || p->pid == we->wakee) {
+				c = p->all;
+				while (c) {
+					if (c->Y && c->start_time <= we->time && c->end_time >= we->time) {
+						if (p->pid == we->waker)
+							from = c->Y;
+						if (p->pid == we->wakee)
+							to = c->Y;
+					}
+					c = c->next;
+				}
+			}
+			p = p->next;
+		}
+
+		if (we->waker == -1)
+			svg_interrupt(we->time, to);
+		else if (from && to && abs(from - to) == 1)
+			svg_wakeline(we->time, from, to);
+		else
+			svg_partial_wakeline(we->time, from, to);
+		we = we->next;
+	}
+}
+
+static void draw_cpu_usage(void)
+{
+	struct per_pid *p;
+	struct per_pidcomm *c;
+	struct cpu_sample *sample;
+	p = all_data;
+	while (p) {
+		c = p->all;
+		while (c) {
+			sample = c->samples;
+			while (sample) {
+				if (sample->type == TYPE_RUNNING)
+					svg_process(sample->cpu, sample->start_time, sample->end_time, "sample", c->comm);
+
+				sample = sample->next;
+			}
+			c = c->next;
+		}
+		p = p->next;
+	}
+}
+
+static void draw_process_bars(void)
+{
+	struct per_pid *p;
+	struct per_pidcomm *c;
+	struct cpu_sample *sample;
+	int Y = 0;
+
+	Y = 2 * numcpus + 2;
+
+	p = all_data;
+	while (p) {
+		c = p->all;
+		while (c) {
+			if (!c->display) {
+				c->Y = 0;
+				c = c->next;
+				continue;
+			}
+
+			svg_box(Y, p->start_time, p->end_time, "process");
+			sample = c->samples;
+			while (sample) {
+				if (sample->type == TYPE_RUNNING)
+					svg_sample(Y, sample->cpu, sample->start_time, sample->end_time, "sample");
+				if (sample->type == TYPE_BLOCKED)
+					svg_box(Y, sample->start_time, sample->end_time, "blocked");
+				if (sample->type == TYPE_WAITING)
+					svg_box(Y, sample->start_time, sample->end_time, "waiting");
+				sample = sample->next;
+			}
+
+			if (c->comm) {
+				char comm[256];
+				if (c->total_time > 5000000000) /* 5 seconds */
+					sprintf(comm, "%s:%i (%2.2fs)", c->comm, p->pid, c->total_time / 1000000000.0);
+				else
+					sprintf(comm, "%s:%i (%3.1fms)", c->comm, p->pid, c->total_time / 1000000.0);
+
+				svg_text(Y, c->start_time, comm);
+			}
+			c->Y = Y;
+			Y++;
+			c = c->next;
+		}
+		p = p->next;
+	}
+}
+
+static int determine_display_tasks(u64 threshold)
+{
+	struct per_pid *p;
+	struct per_pidcomm *c;
+	int count = 0;
+
+	p = all_data;
+	while (p) {
+		p->display = 0;
+		if (p->start_time == 1)
+			p->start_time = first_time;
+
+		/* no exit marker, task kept running to the end */
+		if (p->end_time == 0)
+			p->end_time = last_time;
+		if (p->total_time >= threshold)
+			p->display = 1;
+
+		c = p->all;
+
+		while (c) {
+			c->display = 0;
+
+			if (c->start_time == 1)
+				c->start_time = first_time;
+
+			if (c->total_time >= threshold) {
+				c->display = 1;
+				count++;
+			}
+
+			if (c->end_time == 0)
+				c->end_time = last_time;
+
+			c = c->next;
+		}
+		p = p->next;
+	}
+	return count;
+}
+
+
+
+#define TIME_THRESH 10000000
+
+static void write_svg_file(const char *filename)
+{
+	u64 i;
+	int count;
+
+	numcpus++;
+
+
+	count = determine_display_tasks(TIME_THRESH);
+
+	/* We'd like to show at least 15 tasks; be less picky if we have fewer */
+	if (count < 15)
+		count = determine_display_tasks(TIME_THRESH / 10);
+
+	open_svg(filename, numcpus, count);
+
+	svg_time_grid(first_time, last_time);
+	svg_legenda();
+
+	for (i = 0; i < numcpus; i++)
+		svg_cpu_box(i, max_freq, turbo_frequency);
+
+	draw_cpu_usage();
+	draw_process_bars();
+	draw_c_p_states();
+	draw_wakeups();
+
+	svg_close();
+}
+
+static int
+process_event(event_t *event)
+{
+
+	switch (event->header.type) {
+
+	case PERF_EVENT_COMM:
+		return process_comm_event(event);
+	case PERF_EVENT_FORK:
+		return process_fork_event(event);
+	case PERF_EVENT_EXIT:
+		return process_exit_event(event);
+	case PERF_EVENT_SAMPLE:
+		return queue_sample_event(event);
+
+	/*
+	 * We dont process them right now but they are fine:
+	 */
+	case PERF_EVENT_MMAP:
+	case PERF_EVENT_THROTTLE:
+	case PERF_EVENT_UNTHROTTLE:
+		return 0;
+
+	default:
+		return -1;
+	}
+
+	return 0;
+}
+
+static void process_samples(void)
+{
+	struct sample_wrapper *cursor;
+	event_t *event;
+
+	sort_queued_samples();
+
+	cursor = all_samples;
+	while (cursor) {
+		event = (void *)&cursor->data;
+		cursor = cursor->next;
+		process_sample_event(event);
+	}
+}
+
+
+static int __cmd_timechart(void)
+{
+	int ret, rc = EXIT_FAILURE;
+	unsigned long offset = 0;
+	unsigned long head, shift;
+	struct stat statbuf;
+	event_t *event;
+	uint32_t size;
+	char *buf;
+	int input;
+
+	input = open(input_name, O_RDONLY);
+	if (input < 0) {
+		fprintf(stderr, " failed to open file: %s", input_name);
+		if (!strcmp(input_name, "perf.data"))
+			fprintf(stderr, "  (try 'perf record' first)");
+		fprintf(stderr, "\n");
+		exit(-1);
+	}
+
+	ret = fstat(input, &statbuf);
+	if (ret < 0) {
+		perror("failed to stat file");
+		exit(-1);
+	}
+
+	if (!statbuf.st_size) {
+		fprintf(stderr, "zero-sized file, nothing to do!\n");
+		exit(0);
+	}
+
+	header = perf_header__read(input);
+	head = header->data_offset;
+
+	sample_type = perf_header__sample_type(header);
+
+	shift = page_size * (head / page_size);
+	offset += shift;
+	head -= shift;
+
+remap:
+	buf = (char *)mmap(NULL, page_size * mmap_window, PROT_READ,
+			   MAP_SHARED, input, offset);
+	if (buf == MAP_FAILED) {
+		perror("failed to mmap file");
+		exit(-1);
+	}
+
+more:
+	event = (event_t *)(buf + head);
+
+	size = event->header.size;
+	if (!size)
+		size = 8;
+
+	if (head + event->header.size >= page_size * mmap_window) {
+		int ret2;
+
+		shift = page_size * (head / page_size);
+
+		ret2 = munmap(buf, page_size * mmap_window);
+		assert(ret2 == 0);
+
+		offset += shift;
+		head -= shift;
+		goto remap;
+	}
+
+	size = event->header.size;
+
+	if (!size || process_event(event) < 0) {
+
+		printf("%p [%p]: skipping unknown header type: %d\n",
+			(void *)(offset + head),
+			(void *)(long)(event->header.size),
+			event->header.type);
+
+		/*
+		 * assume we lost track of the stream, check alignment, and
+		 * increment a single u64 in the hope to catch on again 'soon'.
+		 */
+
+		if (unlikely(head & 7))
+			head &= ~7ULL;
+
+		size = 8;
+	}
+
+	head += size;
+
+	if (offset + head >= header->data_offset + header->data_size)
+		goto done;
+
+	if (offset + head < (unsigned long)statbuf.st_size)
+		goto more;
+
+done:
+	rc = EXIT_SUCCESS;
+	close(input);
+
+
+	process_samples();
+
+	end_sample_processing();
+
+	sort_pids();
+
+	write_svg_file(output_name);
+
+	printf("Written %2.1f seconds of trace to %s.\n", (last_time - first_time) / 1000000000.0, output_name);
+
+	return rc;
+}
+
+static const char * const report_usage[] = {
+	"perf report [<options>] <command>",
+	NULL
+};
+
+static const struct option options[] = {
+	OPT_STRING('i', "input", &input_name, "file",
+		    "input file name"),
+	OPT_STRING('o', "output", &output_name, "file",
+		    "output file name"),
+	OPT_END()
+};
+
+
+int cmd_timechart(int argc, const char **argv, const char *prefix __used)
+{
+	symbol__init();
+
+	page_size = getpagesize();
+
+	argc = parse_options(argc, argv, options, report_usage, 0);
+
+	/*
+	 * Any (unrecognized) arguments left?
+	 */
+	if (argc)
+		usage_with_options(report_usage, options);
+
+	setup_pager();
+
+	return __cmd_timechart();
+}
diff --git a/tools/perf/builtin.h b/tools/perf/builtin.h
index b09cadbd76b1..e11d8d231c3b 100644
--- a/tools/perf/builtin.h
+++ b/tools/perf/builtin.h
@@ -21,6 +21,7 @@ extern int cmd_list(int argc, const char **argv, const char *prefix);
 extern int cmd_record(int argc, const char **argv, const char *prefix);
 extern int cmd_report(int argc, const char **argv, const char *prefix);
 extern int cmd_stat(int argc, const char **argv, const char *prefix);
+extern int cmd_timechart(int argc, const char **argv, const char *prefix);
 extern int cmd_top(int argc, const char **argv, const char *prefix);
 extern int cmd_trace(int argc, const char **argv, const char *prefix);
 extern int cmd_version(int argc, const char **argv, const char *prefix);
diff --git a/tools/perf/perf.c b/tools/perf/perf.c
index c972d1c35489..19fc7feb9d59 100644
--- a/tools/perf/perf.c
+++ b/tools/perf/perf.c
@@ -289,6 +289,7 @@ static void handle_internal_command(int argc, const char **argv)
 		{ "record", cmd_record, 0 },
 		{ "report", cmd_report, 0 },
 		{ "stat", cmd_stat, 0 },
+		{ "timechart", cmd_timechart, 0 },
 		{ "top", cmd_top, 0 },
 		{ "annotate", cmd_annotate, 0 },
 		{ "version", cmd_version, 0 },
-- 
cgit v1.2.3


From 3c09eebd61eaacca866cd60b50416f18640bc731 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@infradead.org>
Date: Sat, 19 Sep 2009 13:34:42 +0200
Subject: perf timechart: Add "perf timechart record"

Add a command line option to record a trace, similar to "perf sched record".

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: fweisbec@gmail.com
Cc: peterz@infradead.org
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <20090919133442.0dc2c7f5@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-timechart.c | 47 +++++++++++++++++++++++++++++++++++-------
 1 file changed, 39 insertions(+), 8 deletions(-)

diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c
index 00fac1b362fd..58d737ec8f5e 100644
--- a/tools/perf/builtin-timechart.c
+++ b/tools/perf/builtin-timechart.c
@@ -1086,11 +1086,42 @@ done:
 	return rc;
 }
 
-static const char * const report_usage[] = {
-	"perf report [<options>] <command>",
+static const char * const timechart_usage[] = {
+	"perf timechart [<options>] {record}",
 	NULL
 };
 
+static const char *record_args[] = {
+	"record",
+	"-a",
+	"-R",
+	"-M",
+	"-f",
+	"-c", "1",
+	"-e", "power:power_start",
+	"-e", "power:power_end",
+	"-e", "power:power_frequency",
+	"-e", "sched:sched_wakeup",
+	"-e", "sched:sched_switch",
+};
+
+static int __cmd_record(int argc, const char **argv)
+{
+	unsigned int rec_argc, i, j;
+	const char **rec_argv;
+
+	rec_argc = ARRAY_SIZE(record_args) + argc - 1;
+	rec_argv = calloc(rec_argc + 1, sizeof(char *));
+
+	for (i = 0; i < ARRAY_SIZE(record_args); i++)
+		rec_argv[i] = strdup(record_args[i]);
+
+	for (j = 1; j < (unsigned int)argc; j++, i++)
+		rec_argv[i] = argv[j];
+
+	return cmd_record(i, rec_argv, NULL);
+}
+
 static const struct option options[] = {
 	OPT_STRING('i', "input", &input_name, "file",
 		    "input file name"),
@@ -1106,13 +1137,13 @@ int cmd_timechart(int argc, const char **argv, const char *prefix __used)
 
 	page_size = getpagesize();
 
-	argc = parse_options(argc, argv, options, report_usage, 0);
+	argc = parse_options(argc, argv, options, timechart_usage,
+			PARSE_OPT_STOP_AT_NON_OPTION);
 
-	/*
-	 * Any (unrecognized) arguments left?
-	 */
-	if (argc)
-		usage_with_options(report_usage, options);
+	if (argc && !strncmp(argv[0], "rec", 3))
+		return __cmd_record(argc, argv);
+	else if (argc)
+		usage_with_options(timechart_usage, options);
 
 	setup_pager();
 
-- 
cgit v1.2.3


From 964a0b3d2b1b1cac1d01e29b635831b3d92a3fdd Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@infradead.org>
Date: Sat, 19 Sep 2009 13:35:07 +0200
Subject: perf utils: Be consistent about minimum text size in the svghelper

Be more consistent in the svghelper about the minimum text size
by having a global #define for this.

There needs to be a minimum text size in order to keep the size
of the SVG file within the reach of what current SVG viewers can
cope with.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: fweisbec@gmail.com
Cc: peterz@infradead.org
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arjan van de Ven <arjan@infradead.org>
LKML-Reference: <20090919133507.7374ef8b@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/util/svghelper.c | 24 +++++++++++++-----------
 1 file changed, 13 insertions(+), 11 deletions(-)

diff --git a/tools/perf/util/svghelper.c b/tools/perf/util/svghelper.c
index c7a29afe2d72..b0fcecdf378d 100644
--- a/tools/perf/util/svghelper.c
+++ b/tools/perf/util/svghelper.c
@@ -27,6 +27,8 @@ static u64 turbo_frequency, max_freq;
 #define SLOT_HEIGHT 25.0
 #define WIDTH 1000.0
 
+#define MIN_TEXT_SIZE 0.001
+
 static u64 total_height;
 static FILE *svgfile;
 
@@ -104,8 +106,8 @@ void svg_sample(int Yslot, int cpu, u64 start, u64 end, const char *type)
 		text_size = text_size/2;
 	if (text_size > 1.25)
 		text_size = 1.25;
-	if (text_size > 0.0001)
-		fprintf(svgfile, "<text transform=\"translate(%1.6f,%1.6f)\" font-size=\"%1.6fpt\">%i</text>\n",
+	if (text_size > MIN_TEXT_SIZE)
+		fprintf(svgfile, "<text transform=\"translate(%1.8f,%1.8f)\" font-size=\"%1.6fpt\">%i</text>\n",
 			time2pixels(start), Yslot *  SLOT_MULT + SLOT_HEIGHT - 1, text_size,  cpu + 1);
 
 }
@@ -146,10 +148,10 @@ void svg_cpu_box(int cpu, u64 __max_freq, u64 __turbo_freq)
 		cpu2y(cpu), SLOT_MULT+SLOT_HEIGHT);
 
 	sprintf(cpu_string, "CPU %i", (int)cpu+1);
-	fprintf(svgfile, "<text transform=\"translate(%4.1f,%4.1f)\">%s</text>\n",
+	fprintf(svgfile, "<text transform=\"translate(%4.8f,%4.8f)\">%s</text>\n",
 		10+time2pixels(first_time), cpu2y(cpu) + SLOT_HEIGHT/2, cpu_string);
 
-	fprintf(svgfile, "<text transform=\"translate(%4.1f,%4.1f)\" font-size=\"1.25pt\">%s</text>\n",
+	fprintf(svgfile, "<text transform=\"translate(%4.8f,%4.8f)\" font-size=\"1.25pt\">%s</text>\n",
 		10+time2pixels(first_time), cpu2y(cpu) + SLOT_MULT + SLOT_HEIGHT - 4, cpu_model());
 }
 
@@ -166,8 +168,8 @@ void svg_process(int cpu, u64 start, u64 end, const char *type, const char *name
 	if (width > 6)
 		width = 6;
 
-	if (width > 0.001)
-		fprintf(svgfile, "<text  transform=\"translate(%4.5f,%4.5f) rotate(90)\" font-size=\"%3.4fpt\">%s</text>\n",
+	if (width > MIN_TEXT_SIZE)
+		fprintf(svgfile, "<text  transform=\"translate(%4.8f,%4.8f) rotate(90)\" font-size=\"%3.4fpt\">%s</text>\n",
 			time2pixels(start), cpu2y(cpu), width, name);
 }
 
@@ -193,8 +195,8 @@ void svg_cstate(int cpu, u64 start, u64 end, int type)
 	if (width > 6)
 		width = 6;
 
-	if (width > 0.05)
-		fprintf(svgfile, "<text  transform=\"translate(%4.5f,%4.5f) rotate(90)\" font-size=\"%3.4fpt\">C%i</text>\n",
+	if (width > MIN_TEXT_SIZE)
+		fprintf(svgfile, "<text  transform=\"translate(%4.8f,%4.8f) rotate(90)\" font-size=\"%3.4fpt\">C%i</text>\n",
 			time2pixels(start), cpu2y(cpu), width, type);
 }
 
@@ -234,7 +236,7 @@ void svg_pstate(int cpu, u64 start, u64 end, u64 freq)
 	height = 1 + cpu2y(cpu) + SLOT_MULT + SLOT_HEIGHT - height;
 	fprintf(svgfile, "<line x1=\"%4.8f\" x2=\"%4.8f\" y1=\"%4.1f\" y2=\"%4.1f\" class=\"pstate\"/>\n",
 		time2pixels(start), time2pixels(end), height, height);
-	fprintf(svgfile, "<text transform=\"translate(%4.1f,%4.1f)\" font-size=\"0.25pt\">%s</text>\n",
+	fprintf(svgfile, "<text transform=\"translate(%4.8f,%4.8f)\" font-size=\"0.25pt\">%s</text>\n",
 		time2pixels(start), height+0.9, HzToHuman(freq));
 
 }
@@ -311,7 +313,7 @@ void svg_text(int Yslot, u64 start, const char *text)
 	if (!svgfile)
 		return;
 
-	fprintf(svgfile, "<text transform=\"translate(%4.1f,%4.1f)\">%s</text>\n",
+	fprintf(svgfile, "<text transform=\"translate(%4.8f,%4.8f)\">%s</text>\n",
 		time2pixels(start), Yslot * SLOT_MULT+SLOT_HEIGHT/2, text);
 }
 
@@ -322,7 +324,7 @@ static void svg_legenda_box(int X, const char *text, const char *style)
 
 	fprintf(svgfile, "<rect x=\"%i\" width=\"%4.8f\" y=\"0\" height=\"%4.1f\" class=\"%s\"/>\n",
 		X, boxsize, boxsize, style);
-	fprintf(svgfile, "<text transform=\"translate(%4.1f, %4.1f)\" font-size=\"%4.4fpt\">%s</text>\n",
+	fprintf(svgfile, "<text transform=\"translate(%4.8f, %4.8f)\" font-size=\"%4.4fpt\">%s</text>\n",
 		X + boxsize + 5, boxsize, 0.8 * boxsize, text);
 }
 
-- 
cgit v1.2.3


From 288f023e708efd89d77ce9acf977a33a623ae83d Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@infradead.org>
Date: Sat, 19 Sep 2009 13:35:33 +0200
Subject: tracing, x86, cpuidle: Move the end point of a C state in the power
 tracer

The "end of a C state" trace point currently happens before
the code runs that corrects the TSC for having stopped during idle.

The result of this is that the timestamp of the end-of-C-state event
is garbage on cpus where the TSC stops during idle.

This patch moves the end point of the C state to after the timekeeping
engine of the kernel has been corrected.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Len Brown <len.brown@intel.com>
Cc: fweisbec@gmail.com
Cc: peterz@infradead.org
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <20090919133533.139c2a46@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/process.c | 3 ---
 drivers/cpuidle/cpuidle.c | 2 ++
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 7b60e3906889..847ab4160315 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -309,7 +309,6 @@ void default_idle(void)
 		else
 			local_irq_enable();
 		current_thread_info()->status |= TS_POLLING;
-		trace_power_end(0);
 	} else {
 		local_irq_enable();
 		/* loop is done by the caller */
@@ -377,7 +376,6 @@ void mwait_idle_with_hints(unsigned long ax, unsigned long cx)
 		if (!need_resched())
 			__mwait(ax, cx);
 	}
-	trace_power_end(0);
 }
 
 /* Default MONITOR/MWAIT with no hints, used for default C1 state */
@@ -394,7 +392,6 @@ static void mwait_idle(void)
 			__sti_mwait(0, 0);
 		else
 			local_irq_enable();
-		trace_power_end(0);
 	} else
 		local_irq_enable();
 }
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 8504a2108557..ad41f19b8e3f 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -17,6 +17,7 @@
 #include <linux/cpuidle.h>
 #include <linux/ktime.h>
 #include <linux/hrtimer.h>
+#include <trace/events/power.h>
 
 #include "cpuidle.h"
 
@@ -91,6 +92,7 @@ static void cpuidle_idle_call(void)
 	/* give the governor an opportunity to reflect on the outcome */
 	if (cpuidle_curr_governor->reflect)
 		cpuidle_curr_governor->reflect(dev);
+	trace_power_end(0);
 }
 
 /**
-- 
cgit v1.2.3


From 151750cec5db3c7ea45255d2901e581e2162317a Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@infradead.org>
Date: Sat, 19 Sep 2009 13:36:04 +0200
Subject: perf: Add timechart help text and add timechart to "perf help"

As suggested by Ingo, add a timechart man page help text, as well
as add it to the "perf help" overview.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: fweisbec@gmail.com
Cc: peterz@infradead.org
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <20090919133604.3767fa35@infradead.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/Documentation/perf-timechart.txt | 35 +++++++++++++++++++++++++++++
 tools/perf/command-list.txt                 |  1 +
 2 files changed, 36 insertions(+)
 create mode 100644 tools/perf/Documentation/perf-timechart.txt

diff --git a/tools/perf/Documentation/perf-timechart.txt b/tools/perf/Documentation/perf-timechart.txt
new file mode 100644
index 000000000000..61e0104c6270
--- /dev/null
+++ b/tools/perf/Documentation/perf-timechart.txt
@@ -0,0 +1,35 @@
+perf-timechart(1)
+=================
+
+NAME
+----
+perf-timechart - Tool to visualize total system behavior during a workload
+
+SYNOPSIS
+--------
+[verse]
+'perf timechart' {record}
+
+DESCRIPTION
+-----------
+There are two variants of perf timechart:
+
+  'perf timechart record <command>' to record the system level events
+  of an arbitrary workload.
+
+  'perf timechart' to turn a trace into a Scalable Vector Graphics file,
+  that can be viewed with popular SVG viewers such as 'Inkscape'.
+
+OPTIONS
+-------
+-o::
+--output=::
+        Select the output file (default: output.svg)
+-i::
+--input=::
+        Select the input file (default: perf.data)
+
+
+SEE ALSO
+--------
+linkperf:perf-record[1]
diff --git a/tools/perf/command-list.txt b/tools/perf/command-list.txt
index 3133c74729dd..00326e230d87 100644
--- a/tools/perf/command-list.txt
+++ b/tools/perf/command-list.txt
@@ -8,5 +8,6 @@ perf-sched			mainporcelain common
 perf-record			mainporcelain common
 perf-report			mainporcelain common
 perf-stat			mainporcelain common
+perf-timechart			mainporcelain common
 perf-top			mainporcelain common
 perf-trace			mainporcelain common
-- 
cgit v1.2.3


From ec60a3fe478c0fc6d109eb5840b435ecee4d132b Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@infradead.org>
Date: Sat, 19 Sep 2009 13:36:30 +0200
Subject: perf utils: Use a define for the maximum length of a trace event

As per Ingo's review: use a #define rather than an open coded constant
for the maximum length of a trace event for storing in the perf.data file.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: fweisbec@gmail.com
Cc: peterz@infradead.org
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <20090919133630.10533d3e@infradead.org>
[ add a few comments to nearby functions ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/util/header.c | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c
index ef9114583994..bb4fca3efcc3 100644
--- a/tools/perf/util/header.c
+++ b/tools/perf/util/header.c
@@ -7,9 +7,8 @@
 #include "header.h"
 
 /*
- *
+ * Create new perf.data header attribute:
  */
-
 struct perf_header_attr *perf_header_attr__new(struct perf_counter_attr *attr)
 {
 	struct perf_header_attr *self = malloc(sizeof(*self));
@@ -43,9 +42,8 @@ void perf_header_attr__add_id(struct perf_header_attr *self, u64 id)
 }
 
 /*
- *
+ * Create new perf.data header:
  */
-
 struct perf_header *perf_header__new(void)
 {
 	struct perf_header *self = malloc(sizeof(*self));
@@ -86,9 +84,11 @@ void perf_header__add_attr(struct perf_header *self,
 	self->attr[pos] = attr;
 }
 
+#define MAX_EVENT_NAME 64
+
 struct perf_trace_event_type {
 	u64	event_id;
-	char	name[64];
+	char	name[MAX_EVENT_NAME];
 };
 
 static int event_count;
@@ -96,7 +96,7 @@ static struct perf_trace_event_type *events;
 
 void perf_header__push_event(u64 id, const char *name)
 {
-	if (strlen(name) > 64)
+	if (strlen(name) > MAX_EVENT_NAME)
 		printf("Event %s will be truncated\n", name);
 
 	if (!events) {
@@ -110,7 +110,7 @@ void perf_header__push_event(u64 id, const char *name)
 	}
 	memset(&events[event_count], 0, sizeof(struct perf_trace_event_type));
 	events[event_count].event_id = id;
-	strncpy(events[event_count].name, name, 63);
+	strncpy(events[event_count].name, name, MAX_EVENT_NAME - 1);
 	event_count++;
 }
 
-- 
cgit v1.2.3


From cdf8073d6b2c6c5a3cd6ce0e6c1297157f7f99ba Mon Sep 17 00:00:00 2001
From: Ian Schram <ischram@telenet.be>
Date: Fri, 18 Sep 2009 21:26:26 +0200
Subject: perf_counter: Fix perf_copy_attr() pointer arithmetic

There is still some weird code in per_copy_attr(). Which supposedly
checks that all bytes trailing a struct are zero.

It doesn't seem to get pointer arithmetic right. Since it
increments an iterating pointer by sizeof(unsigned long) rather
than 1.

Signed-off-by: Ian Schram <ischram@telenet.be>
[ v2: clean up the messy PTR_ALIGN logic as well. ]
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: <stable@kernel.org> # for v2.6.31.x
LKML-Reference: <4AB3DEE2.3030600@telenet.be>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index d5899b62b276..cc768ab81ac8 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -4208,8 +4208,8 @@ done:
 static int perf_copy_attr(struct perf_counter_attr __user *uattr,
 			  struct perf_counter_attr *attr)
 {
-	int ret;
 	u32 size;
+	int ret;
 
 	if (!access_ok(VERIFY_WRITE, uattr, PERF_ATTR_SIZE_VER0))
 		return -EFAULT;
@@ -4234,19 +4234,19 @@ static int perf_copy_attr(struct perf_counter_attr __user *uattr,
 
 	/*
 	 * If we're handed a bigger struct than we know of,
-	 * ensure all the unknown bits are 0.
+	 * ensure all the unknown bits are 0 - i.e. new
+	 * user-space does not rely on any kernel feature
+	 * extensions we dont know about yet.
 	 */
 	if (size > sizeof(*attr)) {
-		unsigned long val;
-		unsigned long __user *addr;
-		unsigned long __user *end;
+		unsigned char __user *addr;
+		unsigned char __user *end;
+		unsigned char val;
 
-		addr = PTR_ALIGN((void __user *)uattr + sizeof(*attr),
-				sizeof(unsigned long));
-		end  = PTR_ALIGN((void __user *)uattr + size,
-				sizeof(unsigned long));
+		addr = (void __user *)uattr + sizeof(*attr);
+		end  = (void __user *)uattr + size;
 
-		for (; addr < end; addr += sizeof(unsigned long)) {
+		for (; addr < end; addr++) {
 			ret = get_user(val, addr);
 			if (ret)
 				return ret;
-- 
cgit v1.2.3