From e8fcaa5c54e3b0371230e5d43a6f650c667da9c5 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 7 Aug 2013 22:28:01 +0200 Subject: nohz: Convert a few places to use local per cpu accesses A few functions use remote per CPU access APIs when they deal with local values. Just do the right conversion to improve performance, code readability and debug checks. While at it, lets extend some of these function names with *_this_cpu() suffix in order to display their purpose more clearly. Signed-off-by: Frederic Weisbecker Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Oleg Nesterov Cc: Steven Rostedt --- kernel/softirq.c | 4 +--- kernel/time/tick-broadcast.c | 6 +++--- kernel/time/tick-internal.h | 4 ++-- kernel/time/tick-sched.c | 39 ++++++++++++++++----------------------- 4 files changed, 22 insertions(+), 31 deletions(-) (limited to 'kernel') diff --git a/kernel/softirq.c b/kernel/softirq.c index 11025ccc06dd..11348de09400 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -311,8 +311,6 @@ asmlinkage void do_softirq(void) */ void irq_enter(void) { - int cpu = smp_processor_id(); - rcu_irq_enter(); if (is_idle_task(current) && !in_interrupt()) { /* @@ -320,7 +318,7 @@ void irq_enter(void) * here, as softirq will be serviced on return from interrupt. */ local_bh_disable(); - tick_check_idle(cpu); + tick_check_idle(); _local_bh_enable(); } diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 9532690daaa9..43780ab5e279 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -538,10 +538,10 @@ int tick_resume_broadcast_oneshot(struct clock_event_device *bc) * Called from irq_enter() when idle was interrupted to reenable the * per cpu device. */ -void tick_check_oneshot_broadcast(int cpu) +void tick_check_oneshot_broadcast_this_cpu(void) { - if (cpumask_test_cpu(cpu, tick_broadcast_oneshot_mask)) { - struct tick_device *td = &per_cpu(tick_cpu_device, cpu); + if (cpumask_test_cpu(smp_processor_id(), tick_broadcast_oneshot_mask)) { + struct tick_device *td = &__get_cpu_var(tick_cpu_device); /* * We might be in the middle of switching over from diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index 18e71f7fbc2a..e2bced59b6dd 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -51,7 +51,7 @@ extern void tick_broadcast_switch_to_oneshot(void); extern void tick_shutdown_broadcast_oneshot(unsigned int *cpup); extern int tick_resume_broadcast_oneshot(struct clock_event_device *bc); extern int tick_broadcast_oneshot_active(void); -extern void tick_check_oneshot_broadcast(int cpu); +extern void tick_check_oneshot_broadcast_this_cpu(void); bool tick_broadcast_oneshot_available(void); # else /* BROADCAST */ static inline void tick_broadcast_setup_oneshot(struct clock_event_device *bc) @@ -62,7 +62,7 @@ static inline void tick_broadcast_oneshot_control(unsigned long reason) { } static inline void tick_broadcast_switch_to_oneshot(void) { } static inline void tick_shutdown_broadcast_oneshot(unsigned int *cpup) { } static inline int tick_broadcast_oneshot_active(void) { return 0; } -static inline void tick_check_oneshot_broadcast(int cpu) { } +static inline void tick_check_oneshot_broadcast_this_cpu(void) { } static inline bool tick_broadcast_oneshot_available(void) { return true; } # endif /* !BROADCAST */ diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 3612fc77f834..2afd43fca93b 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -391,11 +391,9 @@ 
__setup("nohz=", setup_tick_nohz); */ static void tick_nohz_update_jiffies(ktime_t now) { - int cpu = smp_processor_id(); - struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); unsigned long flags; - ts->idle_waketime = now; + __this_cpu_write(tick_cpu_sched.idle_waketime, now); local_irq_save(flags); tick_do_update_jiffies64(now); @@ -426,17 +424,15 @@ update_ts_time_stats(int cpu, struct tick_sched *ts, ktime_t now, u64 *last_upda } -static void tick_nohz_stop_idle(int cpu, ktime_t now) +static void tick_nohz_stop_idle(struct tick_sched *ts, ktime_t now) { - struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); - - update_ts_time_stats(cpu, ts, now, NULL); + update_ts_time_stats(smp_processor_id(), ts, now, NULL); ts->idle_active = 0; sched_clock_idle_wakeup_event(0); } -static ktime_t tick_nohz_start_idle(int cpu, struct tick_sched *ts) +static ktime_t tick_nohz_start_idle(struct tick_sched *ts) { ktime_t now = ktime_get(); @@ -752,7 +748,7 @@ static void __tick_nohz_idle_enter(struct tick_sched *ts) ktime_t now, expires; int cpu = smp_processor_id(); - now = tick_nohz_start_idle(cpu, ts); + now = tick_nohz_start_idle(ts); if (can_stop_idle_tick(cpu, ts)) { int was_stopped = ts->tick_stopped; @@ -914,8 +910,7 @@ static void tick_nohz_account_idle_ticks(struct tick_sched *ts) */ void tick_nohz_idle_exit(void) { - int cpu = smp_processor_id(); - struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); ktime_t now; local_irq_disable(); @@ -928,7 +923,7 @@ void tick_nohz_idle_exit(void) now = ktime_get(); if (ts->idle_active) - tick_nohz_stop_idle(cpu, now); + tick_nohz_stop_idle(ts, now); if (ts->tick_stopped) { tick_nohz_restart_sched_tick(ts, now); @@ -1012,12 +1007,10 @@ static void tick_nohz_switch_to_nohz(void) * timer and do not touch the other magic bits which need to be done * when idle is left. */ -static void tick_nohz_kick_tick(int cpu, ktime_t now) +static void tick_nohz_kick_tick(struct tick_sched *ts, ktime_t now) { #if 0 /* Switch back to 2.6.27 behaviour */ - - struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); ktime_t delta; /* @@ -1032,36 +1025,36 @@ static void tick_nohz_kick_tick(int cpu, ktime_t now) #endif } -static inline void tick_check_nohz(int cpu) +static inline void tick_check_nohz_this_cpu(void) { - struct tick_sched *ts = &per_cpu(tick_cpu_sched, cpu); + struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched); ktime_t now; if (!ts->idle_active && !ts->tick_stopped) return; now = ktime_get(); if (ts->idle_active) - tick_nohz_stop_idle(cpu, now); + tick_nohz_stop_idle(ts, now); if (ts->tick_stopped) { tick_nohz_update_jiffies(now); - tick_nohz_kick_tick(cpu, now); + tick_nohz_kick_tick(ts, now); } } #else static inline void tick_nohz_switch_to_nohz(void) { } -static inline void tick_check_nohz(int cpu) { } +static inline void tick_check_nohz_this_cpu(void) { } #endif /* CONFIG_NO_HZ_COMMON */ /* * Called from irq_enter to notify about the possible interruption of idle() */ -void tick_check_idle(int cpu) +void tick_check_idle(void) { - tick_check_oneshot_broadcast(cpu); - tick_check_nohz(cpu); + tick_check_oneshot_broadcast_this_cpu(); + tick_check_nohz_this_cpu(); } /* -- cgit v1.2.3 From 58135f574f1b791c926622387780ed3d090116d6 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 6 Nov 2013 14:45:57 +0100 Subject: context_tracking: Wrap static key check into more intuitive function name Use a function with a meaningful name to check the global context tracking state. 
static_key_false() is a bit confusing for reviewers. Signed-off-by: Frederic Weisbecker Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Oleg Nesterov Cc: Steven Rostedt --- include/linux/context_tracking.h | 10 +++++----- include/linux/context_tracking_state.h | 4 ++++ include/linux/tick.h | 2 +- include/linux/vtime.h | 2 +- kernel/context_tracking.c | 8 ++++---- 5 files changed, 15 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/include/linux/context_tracking.h b/include/linux/context_tracking.h index 158158704c30..37b81bd51ec0 100644 --- a/include/linux/context_tracking.h +++ b/include/linux/context_tracking.h @@ -17,13 +17,13 @@ extern void __context_tracking_task_switch(struct task_struct *prev, static inline void user_enter(void) { - if (static_key_false(&context_tracking_enabled)) + if (context_tracking_is_enabled()) context_tracking_user_enter(); } static inline void user_exit(void) { - if (static_key_false(&context_tracking_enabled)) + if (context_tracking_is_enabled()) context_tracking_user_exit(); } @@ -31,7 +31,7 @@ static inline enum ctx_state exception_enter(void) { enum ctx_state prev_ctx; - if (!static_key_false(&context_tracking_enabled)) + if (!context_tracking_is_enabled()) return 0; prev_ctx = this_cpu_read(context_tracking.state); @@ -42,7 +42,7 @@ static inline enum ctx_state exception_enter(void) static inline void exception_exit(enum ctx_state prev_ctx) { - if (static_key_false(&context_tracking_enabled)) { + if (context_tracking_is_enabled()) { if (prev_ctx == IN_USER) context_tracking_user_enter(); } @@ -51,7 +51,7 @@ static inline void exception_exit(enum ctx_state prev_ctx) static inline void context_tracking_task_switch(struct task_struct *prev, struct task_struct *next) { - if (static_key_false(&context_tracking_enabled)) + if (context_tracking_is_enabled()) __context_tracking_task_switch(prev, next); } #else diff --git a/include/linux/context_tracking_state.h b/include/linux/context_tracking_state.h index 0f1979d0674f..0db535b79be7 100644 --- a/include/linux/context_tracking_state.h +++ b/include/linux/context_tracking_state.h @@ -22,6 +22,10 @@ struct context_tracking { extern struct static_key context_tracking_enabled; DECLARE_PER_CPU(struct context_tracking, context_tracking); +static inline bool context_tracking_is_enabled(void) +{ + return static_key_false(&context_tracking_enabled); +} static inline bool context_tracking_in_user(void) { return __this_cpu_read(context_tracking.state) == IN_USER; diff --git a/include/linux/tick.h b/include/linux/tick.h index a004f66a6cf0..0175d8663b6c 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -165,7 +165,7 @@ extern cpumask_var_t tick_nohz_full_mask; static inline bool tick_nohz_full_enabled(void) { - if (!static_key_false(&context_tracking_enabled)) + if (!context_tracking_is_enabled()) return false; return tick_nohz_full_running; diff --git a/include/linux/vtime.h b/include/linux/vtime.h index f5b72b364bda..807c732cbf29 100644 --- a/include/linux/vtime.h +++ b/include/linux/vtime.h @@ -19,7 +19,7 @@ static inline bool vtime_accounting_enabled(void) { return true; } #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN static inline bool vtime_accounting_enabled(void) { - if (static_key_false(&context_tracking_enabled)) { + if (context_tracking_is_enabled()) { if (context_tracking_active()) return true; } diff --git a/kernel/context_tracking.c b/kernel/context_tracking.c index e5f3917aa05b..6cb20d2e7ee0 100644 --- a/kernel/context_tracking.c +++ b/kernel/context_tracking.c @@ -53,10 
+53,10 @@ void context_tracking_user_enter(void) /* * Repeat the user_enter() check here because some archs may be calling * this from asm and if no CPU needs context tracking, they shouldn't - * go further. Repeat the check here until they support the static key - * check. + * go further. Repeat the check here until they support the inline static + * key check. */ - if (!static_key_false(&context_tracking_enabled)) + if (!context_tracking_is_enabled()) return; /* @@ -160,7 +160,7 @@ void context_tracking_user_exit(void) { unsigned long flags; - if (!static_key_false(&context_tracking_enabled)) + if (!context_tracking_is_enabled()) return; if (in_interrupt()) -- cgit v1.2.3 From d4283c654130c3d01b6842d3821dbdc3c15ceb46 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 6 Nov 2013 15:42:04 +0100 Subject: posix-timers: Spare workqueue if there is no full dynticks CPU to kick After a posix cpu timer is set, a workqueue is scheduled in order to kick the full dynticks CPUs and let them restart their tick if necessary in case the task they are running is concerned by the new timer. This kick is implemented by way of IPIs, which require interrupts to be enabled, hence the need for a workqueue to raise them because the posix cpu timer set path has interrupts disabled. Now if there is no full dynticks CPU on the system, the workqueue is still scheduled but it simply won't send any IPI and return immediately. So lets spare that worqueue when it is not needed. Signed-off-by: Frederic Weisbecker Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Oleg Nesterov Cc: Steven Rostedt --- kernel/posix-cpu-timers.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index c7f31aa272f7..35509c5a3ffb 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -608,7 +608,8 @@ static DECLARE_WORK(nohz_kick_work, nohz_kick_work_fn); */ static void posix_cpu_timer_kick_nohz(void) { - schedule_work(&nohz_kick_work); + if (context_tracking_is_enabled()) + schedule_work(&nohz_kick_work); } bool posix_cpu_timers_can_stop_tick(struct task_struct *tsk) -- cgit v1.2.3 From c925077c33fc9a546e7cf6c3be2adf4a2afe2608 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 6 Nov 2013 17:18:30 +0100 Subject: posix-timers: Fix full dynticks CPUs kick on timer rescheduling A posix CPU timer can be rearmed while it is firing or after it is notified with a signal. This can happen for example with timers that were set with a non zero interval in timer_settime(). This rearming can happen in two places: 1) On timer firing time, which happens on the target's tick. If the timer can't trigger a signal because it is ignored, it reschedules itself to honour the timer interval. 2) On signal handling from the timer's notification target. This one can be a different task than the timer's target itself. Once the signal is notified, the notification target rearms the timer, again to honour the timer interval. When a timer is rearmed, we need to notify the full dynticks CPUs such that they restart their tick in case they are running tasks that may have a share in elapsing this timer. Now the 1st case above handles full dynticks CPUs with a call to posix_cpu_timer_kick_nohz() from the posix cpu timer firing code. But the second case ignores the fact that some CPUs may run non-idle tasks with their tick off. 
As a result, when a timer is resheduled after its signal notification, the full dynticks CPUs may completely ignore it and not tick on the timer as expected This patch fixes this bug by handling both cases in one. All we need is to move the kick to the rearming common code in posix_cpu_timer_schedule(). Signed-off-by: Frederic Weisbecker Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Oleg Nesterov Cc: Steven Rostedt Cc: Olivier Langlois --- kernel/posix-cpu-timers.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 35509c5a3ffb..79747b7d9420 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -1091,7 +1091,8 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) put_task_struct(p); timer->it.cpu.task = p = NULL; timer->it.cpu.expires = 0; - goto out_unlock; + read_unlock(&tasklist_lock); + goto out; } else if (unlikely(p->exit_state) && thread_group_empty(p)) { /* * We've noticed that the thread is dead, but @@ -1100,7 +1101,8 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) */ cpu_timer_sample_group(timer->it_clock, p, &now); clear_dead_task(timer, now); - goto out_unlock; + read_unlock(&tasklist_lock); + goto out; } spin_lock(&p->sighand->siglock); cpu_timer_sample_group(timer->it_clock, p, &now); @@ -1114,10 +1116,11 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) BUG_ON(!irqs_disabled()); arm_timer(timer); spin_unlock(&p->sighand->siglock); - -out_unlock: read_unlock(&tasklist_lock); + /* Kick full dynticks CPUs in case they need to tick on the new timer */ + posix_cpu_timer_kick_nohz(); + out: timer->it_overrun_last = timer->it_overrun; timer->it_overrun = -1; @@ -1257,13 +1260,6 @@ void run_posix_cpu_timers(struct task_struct *tsk) cpu_timer_fire(timer); spin_unlock(&timer->it_lock); } - - /* - * In case some timers were rescheduled after the queue got emptied, - * wake up full dynticks CPUs. - */ - if (tsk->signal->cputimer.running) - posix_cpu_timer_kick_nohz(); } /* -- cgit v1.2.3 From 724a371396e8162cc883a40cd8e525dfc7e5f3ff Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 10 Oct 2013 16:55:57 +0200 Subject: posix-timers: Remove dead thread posix cpu timers caching When a task is exiting or has exited, its posix cpu timers don't tick anymore and won't elapse further. It's too late for them to expire. So any further call to timer_gettime() on these timers will return the same remaining expiry time. The current code optimize this by caching the remaining delta and storing it where we use to save the absolute expiration time. This way, the future calls to timer_gettime() won't need to compute the difference between the absolute expiration time and the current time anymore. Now this optimization doesn't seem to bring much value. Computing the timer remaining delta is not very costly. Fetching the timer value OTOH can be costly in two ways: * CPUCLOCK_SCHED read requires to lock the target's rq. But some optimizations are on the way to make task_sched_runtime() not holding the rq lock of a non-running target. * CPUCLOCK_VIRT/CPUCLOCK_PROF read simply consist in fetching current->utime/current->stime except when the system uses full dynticks cputime accounting. The latter requires a per task lock in order to correctly compute user and system time. But once the target is dead, this lock shouldn't be contended anyway. All in one this caching doesn't seem to be justified. 
Given that it complicates the code significantly for few wins, let's remove it on single thread timers. Signed-off-by: Frederic Weisbecker Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Oleg Nesterov Cc: Kosaki Motohiro Cc: Andrew Morton --- kernel/posix-cpu-timers.c | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 79747b7d9420..3b7df8653913 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -788,7 +788,6 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) { unsigned long long now; struct task_struct *p = timer->it.cpu.task; - int clear_dead; /* * Easy part: convert the reload time. @@ -802,6 +801,7 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) } if (unlikely(p == NULL)) { + WARN_ON_ONCE(CPUCLOCK_PERTHREAD(timer->it_clock)); /* * This task already died and the timer will never fire. * In this case, expires is actually the dead value. @@ -817,7 +817,6 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) */ if (CPUCLOCK_PERTHREAD(timer->it_clock)) { cpu_clock_sample(timer->it_clock, p, &now); - clear_dead = p->exit_state; } else { read_lock(&tasklist_lock); if (unlikely(p->sighand == NULL)) { @@ -833,22 +832,20 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) goto dead; } else { cpu_timer_sample_group(timer->it_clock, p, &now); - clear_dead = (unlikely(p->exit_state) && - thread_group_empty(p)); + if (unlikely(p->exit_state) && thread_group_empty(p)) { + read_unlock(&tasklist_lock); + /* + * We've noticed that the thread is dead, but + * not yet reaped. Take this opportunity to + * drop our task ref. + */ + clear_dead_task(timer, now); + goto dead; + } } read_unlock(&tasklist_lock); } - if (unlikely(clear_dead)) { - /* - * We've noticed that the thread is dead, but - * not yet reaped. Take this opportunity to - * drop our task ref. - */ - clear_dead_task(timer, now); - goto dead; - } - if (now < timer->it.cpu.expires) { sample_to_timespec(timer->it_clock, timer->it.cpu.expires - now, @@ -1063,11 +1060,13 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) struct task_struct *p = timer->it.cpu.task; unsigned long long now; - if (unlikely(p == NULL)) + if (unlikely(p == NULL)) { + WARN_ON_ONCE(CPUCLOCK_PERTHREAD(timer->it_clock)); /* * The task was cleaned up already, no future firings. */ goto out; + } /* * Fetch the current sample and update the timer's expiry time. @@ -1075,10 +1074,9 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) if (CPUCLOCK_PERTHREAD(timer->it_clock)) { cpu_clock_sample(timer->it_clock, p, &now); bump_cpu_timer(timer, now); - if (unlikely(p->exit_state)) { - clear_dead_task(timer, now); + if (unlikely(p->exit_state)) goto out; - } + read_lock(&tasklist_lock); /* arm_timer needs it. */ spin_lock(&p->sighand->siglock); } else { -- cgit v1.2.3 From d430b9173a9a50a83e10d1c70baead3e625b522f Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 10 Oct 2013 16:55:57 +0200 Subject: posix-timers: Remove dead process posix cpu timers caching Now that we removed dead thread posix cpu timers caching, lets remove the dead process wide version. This caching is similar to the per thread version but it should be even more rare: * If the process id dead, we are not reading its timers status from a thread belonging to its group since they are all dead. 
So this caching only concern remote process timers reads. Now posix cpu timers using itimers or timer_settime() can't do remote process timers anyway so it's not even clear if there is actually a user for this caching. * Unlike per thread timers caching, this only applies to zombies targets. Buried targets' process wide timers return 0 values. But then again, timer_gettime() can't read remote process timers, so if the process is dead, there can't be any reader left anyway. Then again this caching seem to complicate the code for corner cases that are probably not worth it. So lets get rid of it. Also remove the sample snapshot on dying process timer that is now useless, as suggested by Kosaki. Signed-off-by: Frederic Weisbecker Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Oleg Nesterov Cc: Kosaki Motohiro Cc: Andrew Morton --- kernel/posix-cpu-timers.c | 35 +---------------------------------- 1 file changed, 1 insertion(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 3b7df8653913..c5d1ef530268 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -453,23 +453,6 @@ void posix_cpu_timers_exit_group(struct task_struct *tsk) tsk->se.sum_exec_runtime + sig->sum_sched_runtime); } -static void clear_dead_task(struct k_itimer *itimer, unsigned long long now) -{ - struct cpu_timer_list *timer = &itimer->it.cpu; - - /* - * That's all for this thread or process. - * We leave our residual in expires to be reported. - */ - put_task_struct(timer->task); - timer->task = NULL; - if (timer->expires < now) { - timer->expires = 0; - } else { - timer->expires -= now; - } -} - static inline int expires_gt(cputime_t expires, cputime_t new_exp) { return expires == 0 || expires > new_exp; @@ -832,16 +815,6 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) goto dead; } else { cpu_timer_sample_group(timer->it_clock, p, &now); - if (unlikely(p->exit_state) && thread_group_empty(p)) { - read_unlock(&tasklist_lock); - /* - * We've noticed that the thread is dead, but - * not yet reaped. Take this opportunity to - * drop our task ref. - */ - clear_dead_task(timer, now); - goto dead; - } } read_unlock(&tasklist_lock); } @@ -1092,14 +1065,8 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) read_unlock(&tasklist_lock); goto out; } else if (unlikely(p->exit_state) && thread_group_empty(p)) { - /* - * We've noticed that the thread is dead, but - * not yet reaped. Take this opportunity to - * drop our task ref. - */ - cpu_timer_sample_group(timer->it_clock, p, &now); - clear_dead_task(timer, now); read_unlock(&tasklist_lock); + /* Optimizations: if the process is dying, no need to rearm */ goto out; } spin_lock(&p->sighand->siglock); -- cgit v1.2.3 From e26d70d271ee1a68a925796b411cb0239394c7a1 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 11 Oct 2013 00:27:19 +0200 Subject: posix-timers: Cleanup reaped target handling When a timer's target is seen to be buried, for example on calls to timer_gettime(), the posix cpu timers code behaves a bit like a garbage collector and releases early the reference to the task. Then again, this optimization complicates the code for no much value: it's up to the user to release the timer and its associated ressources by calling timer_delete() after it buries the target tasks. Remove this to simplify the code. 
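For readers looking at this from the user-space side, a minimal sketch of the lifecycle the patch relies on follows. The helper name and the one-second expiry are illustrative only; the calls are standard POSIX timer APIs. The point is that the kernel no longer drops the timer's task reference behind the caller's back: the caller owns the timer until it explicitly calls timer_delete(), even after the clock's target has exited.

#include <signal.h>
#include <time.h>

/* Illustrative only: arm a CPU-clock timer, then release it explicitly. */
static int cpu_timer_lifecycle(void)
{
	timer_t tid;
	struct sigevent sev = {
		.sigev_notify = SIGEV_SIGNAL,
		.sigev_signo  = SIGRTMIN,
	};
	struct itimerspec its = {
		.it_value = { .tv_sec = 1 },	/* one-shot, after 1s of CPU time */
	};

	if (timer_create(CLOCK_PROCESS_CPUTIME_ID, &sev, &tid))
		return -1;
	if (timer_settime(tid, 0, &its, NULL))
		return -1;
	/*
	 * Whatever happens to the clock's target afterwards, the timer
	 * object persists until its owner tears it down.
	 */
	return timer_delete(tid);
}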
Signed-off-by: Frederic Weisbecker Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Oleg Nesterov Cc: Kosaki Motohiro Cc: Andrew Morton --- kernel/posix-cpu-timers.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index c5d1ef530268..dc4355b967db 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -639,8 +639,6 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, */ if (unlikely(p->sighand == NULL)) { read_unlock(&tasklist_lock); - put_task_struct(p); - timer->it.cpu.task = NULL; return -ESRCH; } @@ -808,8 +806,6 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) * We can't even collect a sample any more. * Call the timer disarmed, nothing else to do. */ - put_task_struct(p); - timer->it.cpu.task = NULL; timer->it.cpu.expires = 0; read_unlock(&tasklist_lock); goto dead; @@ -1059,8 +1055,6 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) * The process has been reaped. * We can't even collect a sample any more. */ - put_task_struct(p); - timer->it.cpu.task = p = NULL; timer->it.cpu.expires = 0; read_unlock(&tasklist_lock); goto out; -- cgit v1.2.3 From a3222f88fa4f2ebec4632aef527dd2c9a41b997d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 11 Oct 2013 00:37:39 +0200 Subject: posix-timers: Remove dead task special case Now that we've removed all the optimizations that could result in NULL timer's targets, we can remove all the associated special case handling. Also add some warnings on NULL targets to spot any possible leftover. Signed-off-by: Frederic Weisbecker Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Oleg Nesterov Cc: Kosaki Motohiro Cc: Andrew Morton --- kernel/posix-cpu-timers.c | 70 +++++++++++++++++------------------------------ 1 file changed, 25 insertions(+), 45 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index dc4355b967db..ab9911b54faf 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -374,27 +374,27 @@ static int posix_cpu_timer_del(struct k_itimer *timer) struct task_struct *p = timer->it.cpu.task; int ret = 0; - if (likely(p != NULL)) { - read_lock(&tasklist_lock); - if (unlikely(p->sighand == NULL)) { - /* - * We raced with the reaping of the task. - * The deletion should have cleared us off the list. - */ - BUG_ON(!list_empty(&timer->it.cpu.entry)); - } else { - spin_lock(&p->sighand->siglock); - if (timer->it.cpu.firing) - ret = TIMER_RETRY; - else - list_del(&timer->it.cpu.entry); - spin_unlock(&p->sighand->siglock); - } - read_unlock(&tasklist_lock); + WARN_ON_ONCE(p == NULL); - if (!ret) - put_task_struct(p); + read_lock(&tasklist_lock); + if (unlikely(p->sighand == NULL)) { + /* + * We raced with the reaping of the task. + * The deletion should have cleared us off the list. + */ + BUG_ON(!list_empty(&timer->it.cpu.entry)); + } else { + spin_lock(&p->sighand->siglock); + if (timer->it.cpu.firing) + ret = TIMER_RETRY; + else + list_del(&timer->it.cpu.entry); + spin_unlock(&p->sighand->siglock); } + read_unlock(&tasklist_lock); + + if (!ret) + put_task_struct(p); return ret; } @@ -622,12 +622,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, unsigned long long old_expires, new_expires, old_incr, val; int ret; - if (unlikely(p == NULL)) { - /* - * Timer refers to a dead task's clock. 
- */ - return -ESRCH; - } + WARN_ON_ONCE(p == NULL); new_expires = timespec_to_sample(timer->it_clock, &new->it_value); @@ -770,6 +765,8 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) unsigned long long now; struct task_struct *p = timer->it.cpu.task; + WARN_ON_ONCE(p == NULL); + /* * Easy part: convert the reload time. */ @@ -781,18 +778,6 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) return; } - if (unlikely(p == NULL)) { - WARN_ON_ONCE(CPUCLOCK_PERTHREAD(timer->it_clock)); - /* - * This task already died and the timer will never fire. - * In this case, expires is actually the dead value. - */ - dead: - sample_to_timespec(timer->it_clock, timer->it.cpu.expires, - &itp->it_value); - return; - } - /* * Sample the clock to take the difference with the expiry time. */ @@ -807,8 +792,9 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) * Call the timer disarmed, nothing else to do. */ timer->it.cpu.expires = 0; + sample_to_timespec(timer->it_clock, timer->it.cpu.expires, + &itp->it_value); read_unlock(&tasklist_lock); - goto dead; } else { cpu_timer_sample_group(timer->it_clock, p, &now); } @@ -1029,13 +1015,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) struct task_struct *p = timer->it.cpu.task; unsigned long long now; - if (unlikely(p == NULL)) { - WARN_ON_ONCE(CPUCLOCK_PERTHREAD(timer->it_clock)); - /* - * The task was cleaned up already, no future firings. - */ - goto out; - } + WARN_ON_ONCE(p == NULL); /* * Fetch the current sample and update the timer's expiry time. -- cgit v1.2.3 From af82eb3c3068877a6b1989796a06b846b1e9e1c3 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 11 Oct 2013 16:11:43 +0200 Subject: posix-timers: Remove useless clock sample on timers cleanup a0b2062b0904ef07944c4a6e4d0f88ee44f1e9f2 ("posix_timers: fix racy timer delta caching on task exit") forgot to remove the arguments used for timer caching. Fix this leftover. Signed-off-by: Frederic Weisbecker Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Oleg Nesterov Cc: Kosaki Motohiro Cc: Andrew Morton --- kernel/posix-cpu-timers.c | 30 +++++++----------------------- 1 file changed, 7 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index ab9911b54faf..e6389f915bcb 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -399,8 +399,7 @@ static int posix_cpu_timer_del(struct k_itimer *timer) return ret; } -static void cleanup_timers_list(struct list_head *head, - unsigned long long curr) +static void cleanup_timers_list(struct list_head *head) { struct cpu_timer_list *timer, *next; @@ -414,16 +413,11 @@ static void cleanup_timers_list(struct list_head *head, * time for later timer_gettime calls to return. * This must be called with the siglock held. 
*/ -static void cleanup_timers(struct list_head *head, - cputime_t utime, cputime_t stime, - unsigned long long sum_exec_runtime) +static void cleanup_timers(struct list_head *head) { - - cputime_t ptime = utime + stime; - - cleanup_timers_list(head, cputime_to_expires(ptime)); - cleanup_timers_list(++head, cputime_to_expires(utime)); - cleanup_timers_list(++head, sum_exec_runtime); + cleanup_timers_list(head); + cleanup_timers_list(++head); + cleanup_timers_list(++head); } /* @@ -433,24 +427,14 @@ static void cleanup_timers(struct list_head *head, */ void posix_cpu_timers_exit(struct task_struct *tsk) { - cputime_t utime, stime; - add_device_randomness((const void*) &tsk->se.sum_exec_runtime, sizeof(unsigned long long)); - task_cputime(tsk, &utime, &stime); - cleanup_timers(tsk->cpu_timers, - utime, stime, tsk->se.sum_exec_runtime); + cleanup_timers(tsk->cpu_timers); } void posix_cpu_timers_exit_group(struct task_struct *tsk) { - struct signal_struct *const sig = tsk->signal; - cputime_t utime, stime; - - task_cputime(tsk, &utime, &stime); - cleanup_timers(tsk->signal->cpu_timers, - utime + sig->utime, stime + sig->stime, - tsk->se.sum_exec_runtime + sig->sum_sched_runtime); + cleanup_timers(tsk->signal->cpu_timers); } static inline int expires_gt(cputime_t expires, cputime_t new_exp) -- cgit v1.2.3 From 33ab0fec33527e8b5ab124cff6aefd4746508e04 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 11 Oct 2013 17:41:11 +0200 Subject: posix-timers: Consolidate posix_cpu_clock_get() Consolidate the clock sampling common code used for both local and remote targets. Note that this introduces a tiny user ABI change: if a PID is passed to clock_gettime() along the clockid, we used to forbid a process wide clock sample when that PID doesn't belong to a group leader. Now after this patch we allow process wide clock samples if that PID belongs to the current task, even if the current task is not the group leader. But local process wide clock samples are allowed if PID == 0 (current task) even if the current task is not the group leader. So in the end this should be no big deal as this actually harmonize the behaviour when the remote sample is actually a local one. 
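For reference, the user-visible path being discussed is a clock_gettime() call on a CPU-time clockid that encodes a PID. A hedged user-space sketch (error handling trimmed, function name mine) of how such a sample is typically requested; whether the call succeeds for a given PID follows the rules described above:

#include <sys/types.h>
#include <time.h>

/* Sample the process-wide CPU clock of another task (sketch only). */
static int read_process_cputime(pid_t pid, struct timespec *ts)
{
	clockid_t cid;

	/* Builds the PID-encoded clockid consumed by posix_cpu_clock_get(). */
	if (clock_getcpuclockid(pid, &cid))
		return -1;
	return clock_gettime(cid, ts);
}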
Signed-off-by: Frederic Weisbecker Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Oleg Nesterov Cc: Kosaki Motohiro Cc: Andrew Morton --- kernel/posix-cpu-timers.c | 64 ++++++++++++++++++++++------------------------- 1 file changed, 30 insertions(+), 34 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index e6389f915bcb..03c5d6c3e614 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -260,30 +260,43 @@ static int cpu_clock_sample_group(const clockid_t which_clock, return 0; } +static int posix_cpu_clock_get_task(struct task_struct *tsk, + const clockid_t which_clock, + struct timespec *tp) +{ + int err = -EINVAL; + unsigned long long rtn; + + if (CPUCLOCK_PERTHREAD(which_clock)) { + if (same_thread_group(tsk, current)) + err = cpu_clock_sample(which_clock, tsk, &rtn); + } else { + read_lock(&tasklist_lock); + + if (tsk->sighand && (tsk == current || thread_group_leader(tsk))) + err = cpu_clock_sample_group(which_clock, tsk, &rtn); + + read_unlock(&tasklist_lock); + } + + if (!err) + sample_to_timespec(which_clock, rtn, tp); + + return err; +} + static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) { const pid_t pid = CPUCLOCK_PID(which_clock); - int error = -EINVAL; - unsigned long long rtn; + int err = -EINVAL; if (pid == 0) { /* * Special case constant value for our own clocks. * We don't have to do any lookup to find ourselves. */ - if (CPUCLOCK_PERTHREAD(which_clock)) { - /* - * Sampling just ourselves we can do with no locking. - */ - error = cpu_clock_sample(which_clock, - current, &rtn); - } else { - read_lock(&tasklist_lock); - error = cpu_clock_sample_group(which_clock, - current, &rtn); - read_unlock(&tasklist_lock); - } + err = posix_cpu_clock_get_task(current, which_clock, tp); } else { /* * Find the given PID, and validate that the caller @@ -292,29 +305,12 @@ static int posix_cpu_clock_get(const clockid_t which_clock, struct timespec *tp) struct task_struct *p; rcu_read_lock(); p = find_task_by_vpid(pid); - if (p) { - if (CPUCLOCK_PERTHREAD(which_clock)) { - if (same_thread_group(p, current)) { - error = cpu_clock_sample(which_clock, - p, &rtn); - } - } else { - read_lock(&tasklist_lock); - if (thread_group_leader(p) && p->sighand) { - error = - cpu_clock_sample_group(which_clock, - p, &rtn); - } - read_unlock(&tasklist_lock); - } - } + if (p) + err = posix_cpu_clock_get_task(p, which_clock, tp); rcu_read_unlock(); } - if (error) - return error; - sample_to_timespec(which_clock, rtn, tp); - return 0; + return err; } -- cgit v1.2.3 From 50875788a1d4a3f662a27ed13cd05282d835939a Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 11 Oct 2013 17:41:11 +0200 Subject: posix-timers: Use sighand lock instead of tasklist_lock for task clock sample There is no need for the tasklist_lock just to take a process wide clock sample. All we need is to get a coherent sample that doesn't race with exit() and exec(): * exit() may be concurrently reaping a task and flushing its time * sighand is unstable under exit() and exec(), and the latter also result in group leader that can change To protect against these, locking the target's sighand is enough. 
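Seen in isolation, the locking pattern the patch switches to looks as follows. This is a paraphrase of the hunk below, in the context of kernel/posix-cpu-timers.c, with a hypothetical wrapper name; it is not additional kernel code:

/* Sketch: take a coherent group sample under the target's sighand lock. */
static int sample_group_clock(struct task_struct *tsk,
			      const clockid_t which_clock,
			      unsigned long long *sample)
{
	struct sighand_struct *sighand;
	unsigned long flags;
	int err = -EINVAL;

	sighand = lock_task_sighand(tsk, &flags);
	if (!sighand)
		return err;	/* target already reaped, nothing to sample */

	if (tsk == current || thread_group_leader(tsk))
		err = cpu_clock_sample_group(which_clock, tsk, sample);

	unlock_task_sighand(tsk, &flags);
	return err;
}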
Signed-off-by: Frederic Weisbecker Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Oleg Nesterov Cc: Kosaki Motohiro Cc: Andrew Morton --- kernel/posix-cpu-timers.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 03c5d6c3e614..71a07699a36b 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -271,12 +271,22 @@ static int posix_cpu_clock_get_task(struct task_struct *tsk, if (same_thread_group(tsk, current)) err = cpu_clock_sample(which_clock, tsk, &rtn); } else { - read_lock(&tasklist_lock); + unsigned long flags; + struct sighand_struct *sighand; - if (tsk->sighand && (tsk == current || thread_group_leader(tsk))) + /* + * while_each_thread() is not yet entirely RCU safe, + * keep locking the group while sampling process + * clock for now. + */ + sighand = lock_task_sighand(tsk, &flags); + if (!sighand) + return err; + + if (tsk == current || thread_group_leader(tsk)) err = cpu_clock_sample_group(which_clock, tsk, &rtn); - read_unlock(&tasklist_lock); + unlock_task_sighand(tsk, &flags); } if (!err) -- cgit v1.2.3 From 3d7a1427e4ce545e949e9bccb75d0ca8d941d93c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 11 Oct 2013 17:41:11 +0200 Subject: posix-timers: Use sighand lock instead of tasklist_lock on timer deletion Timer deletion doesn't need the tasklist lock. We need to protect against: * concurrent access to the lists p->cputime_expires and p->sighand->cputime_expires * task reaping that may also delete the timer list entry * timer firing We already hold the timer lock which protects us against concurrent timer firing. The rest only need the targets sighand to be locked. So hold it and drop the use of tasklist_lock there. Signed-off-by: Frederic Weisbecker Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Oleg Nesterov Cc: Kosaki Motohiro Cc: Andrew Morton --- kernel/posix-cpu-timers.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 71a07699a36b..9641958ddb3e 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -377,27 +377,32 @@ static int posix_cpu_timer_create(struct k_itimer *new_timer) */ static int posix_cpu_timer_del(struct k_itimer *timer) { - struct task_struct *p = timer->it.cpu.task; int ret = 0; + unsigned long flags; + struct sighand_struct *sighand; + struct task_struct *p = timer->it.cpu.task; WARN_ON_ONCE(p == NULL); - read_lock(&tasklist_lock); - if (unlikely(p->sighand == NULL)) { + /* + * Protect against sighand release/switch in exit/exec and process/ + * thread timer list entry concurrent read/writes. + */ + sighand = lock_task_sighand(p, &flags); + if (unlikely(sighand == NULL)) { /* * We raced with the reaping of the task. * The deletion should have cleared us off the list. 
*/ BUG_ON(!list_empty(&timer->it.cpu.entry)); } else { - spin_lock(&p->sighand->siglock); if (timer->it.cpu.firing) ret = TIMER_RETRY; else list_del(&timer->it.cpu.entry); - spin_unlock(&p->sighand->siglock); + + unlock_task_sighand(p, &flags); } - read_unlock(&tasklist_lock); if (!ret) put_task_struct(p); -- cgit v1.2.3 From e73d84e33f15c099ed1df60437700093cb14e46e Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 11 Oct 2013 18:56:49 +0200 Subject: posix-timers: Remove remaining uses of tasklist_lock The remaining uses of tasklist_lock were mostly about synchronizing against sighand modifications, getting coherent and safe group samples and also thread/process wide timers list handling. All of this is already safely synchronizable with the target's sighand lock. Let's use it on these places instead. Also update the comments about locking. Signed-off-by: Frederic Weisbecker Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Oleg Nesterov Cc: Kosaki Motohiro Cc: Andrew Morton --- kernel/posix-cpu-timers.c | 76 +++++++++++++++++++++++++++-------------------- 1 file changed, 44 insertions(+), 32 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index 9641958ddb3e..d9dc5edc318c 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -233,7 +233,8 @@ void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times) /* * Sample a process (thread group) clock for the given group_leader task. - * Must be called with tasklist_lock held for reading. + * Must be called with task sighand lock held for safe while_each_thread() + * traversal. */ static int cpu_clock_sample_group(const clockid_t which_clock, struct task_struct *p, @@ -455,8 +456,7 @@ static inline int expires_gt(cputime_t expires, cputime_t new_exp) /* * Insert the timer on the appropriate list before any timers that - * expire later. This must be called with the tasklist_lock held - * for reading, interrupts disabled and p->sighand->siglock taken. + * expire later. This must be called with the sighand lock held. */ static void arm_timer(struct k_itimer *timer) { @@ -547,7 +547,8 @@ static void cpu_timer_fire(struct k_itimer *timer) /* * Sample a process (thread group) timer for the given group_leader task. - * Must be called with tasklist_lock held for reading. + * Must be called with task sighand lock held for safe while_each_thread() + * traversal. */ static int cpu_timer_sample_group(const clockid_t which_clock, struct task_struct *p, @@ -610,9 +611,11 @@ static inline void posix_cpu_timer_kick_nohz(void) { } * If we return TIMER_RETRY, it's necessary to release the timer's lock * and try again. (This happens when the timer is in the middle of firing.) */ -static int posix_cpu_timer_set(struct k_itimer *timer, int flags, +static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, struct itimerspec *new, struct itimerspec *old) { + unsigned long flags; + struct sighand_struct *sighand; struct task_struct *p = timer->it.cpu.task; unsigned long long old_expires, new_expires, old_incr, val; int ret; @@ -621,14 +624,16 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, new_expires = timespec_to_sample(timer->it_clock, &new->it_value); - read_lock(&tasklist_lock); /* - * We need the tasklist_lock to protect against reaping that - * clears p->sighand. 
If p has just been reaped, we can no + * Protect against sighand release/switch in exit/exec and p->cpu_timers + * and p->signal->cpu_timers read/write in arm_timer() + */ + sighand = lock_task_sighand(p, &flags); + /* + * If p has just been reaped, we can no * longer get any information about it at all. */ - if (unlikely(p->sighand == NULL)) { - read_unlock(&tasklist_lock); + if (unlikely(sighand == NULL)) { return -ESRCH; } @@ -639,7 +644,6 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, ret = 0; old_incr = timer->it.cpu.incr; - spin_lock(&p->sighand->siglock); old_expires = timer->it.cpu.expires; if (unlikely(timer->it.cpu.firing)) { timer->it.cpu.firing = -1; @@ -696,12 +700,11 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, * disable this firing since we are already reporting * it as an overrun (thanks to bump_cpu_timer above). */ - spin_unlock(&p->sighand->siglock); - read_unlock(&tasklist_lock); + unlock_task_sighand(p, &flags); goto out; } - if (new_expires != 0 && !(flags & TIMER_ABSTIME)) { + if (new_expires != 0 && !(timer_flags & TIMER_ABSTIME)) { new_expires += val; } @@ -715,9 +718,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int flags, arm_timer(timer); } - spin_unlock(&p->sighand->siglock); - read_unlock(&tasklist_lock); - + unlock_task_sighand(p, &flags); /* * Install the new reload setting, and * set up the signal and overrun bookkeeping. @@ -779,8 +780,16 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) if (CPUCLOCK_PERTHREAD(timer->it_clock)) { cpu_clock_sample(timer->it_clock, p, &now); } else { - read_lock(&tasklist_lock); - if (unlikely(p->sighand == NULL)) { + struct sighand_struct *sighand; + unsigned long flags; + + /* + * Protect against sighand release/switch in exit/exec and + * also make timer sampling safe if it ends up calling + * thread_group_cputime(). + */ + sighand = lock_task_sighand(p, &flags); + if (unlikely(sighand == NULL)) { /* * The process has been reaped. * We can't even collect a sample any more. @@ -789,11 +798,10 @@ static void posix_cpu_timer_get(struct k_itimer *timer, struct itimerspec *itp) timer->it.cpu.expires = 0; sample_to_timespec(timer->it_clock, timer->it.cpu.expires, &itp->it_value); - read_unlock(&tasklist_lock); } else { cpu_timer_sample_group(timer->it_clock, p, &now); + unlock_task_sighand(p, &flags); } - read_unlock(&tasklist_lock); } if (now < timer->it.cpu.expires) { @@ -1007,6 +1015,8 @@ static void check_process_timers(struct task_struct *tsk, */ void posix_cpu_timer_schedule(struct k_itimer *timer) { + struct sighand_struct *sighand; + unsigned long flags; struct task_struct *p = timer->it.cpu.task; unsigned long long now; @@ -1021,27 +1031,31 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) if (unlikely(p->exit_state)) goto out; - read_lock(&tasklist_lock); /* arm_timer needs it. */ - spin_lock(&p->sighand->siglock); + /* Protect timer list r/w in arm_timer() */ + sighand = lock_task_sighand(p, &flags); + if (!sighand) + goto out; } else { - read_lock(&tasklist_lock); - if (unlikely(p->sighand == NULL)) { + /* + * Protect arm_timer() and timer sampling in case of call to + * thread_group_cputime(). + */ + sighand = lock_task_sighand(p, &flags); + if (unlikely(sighand == NULL)) { /* * The process has been reaped. * We can't even collect a sample any more. 
*/ timer->it.cpu.expires = 0; - read_unlock(&tasklist_lock); goto out; } else if (unlikely(p->exit_state) && thread_group_empty(p)) { - read_unlock(&tasklist_lock); + unlock_task_sighand(p, &flags); /* Optimizations: if the process is dying, no need to rearm */ goto out; } - spin_lock(&p->sighand->siglock); cpu_timer_sample_group(timer->it_clock, p, &now); bump_cpu_timer(timer, now); - /* Leave the tasklist_lock locked for the call below. */ + /* Leave the sighand locked for the call below. */ } /* @@ -1049,12 +1063,10 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) */ BUG_ON(!irqs_disabled()); arm_timer(timer); - spin_unlock(&p->sighand->siglock); - read_unlock(&tasklist_lock); + unlock_task_sighand(p, &flags); /* Kick full dynticks CPUs in case they need to tick on the new timer */ posix_cpu_timer_kick_nohz(); - out: timer->it_overrun_last = timer->it_overrun; timer->it_overrun = -1; -- cgit v1.2.3 From 531f64fd6f46a3f2a3edb1b97ecc827c775932c5 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 11 Oct 2013 17:58:08 +0200 Subject: posix-timers: Convert abuses of BUG_ON to WARN_ON The posix cpu timers code makes a heavy use of BUG_ON() but none of these concern fatal issues that require to stop the machine. So let's just warn the user when some internal state slips out of our hands. Signed-off-by: Frederic Weisbecker Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Peter Zijlstra Cc: Oleg Nesterov Cc: Kosaki Motohiro Cc: Andrew Morton --- kernel/posix-cpu-timers.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c index d9dc5edc318c..3b8946416a5f 100644 --- a/kernel/posix-cpu-timers.c +++ b/kernel/posix-cpu-timers.c @@ -395,7 +395,7 @@ static int posix_cpu_timer_del(struct k_itimer *timer) * We raced with the reaping of the task. * The deletion should have cleared us off the list. */ - BUG_ON(!list_empty(&timer->it.cpu.entry)); + WARN_ON_ONCE(!list_empty(&timer->it.cpu.entry)); } else { if (timer->it.cpu.firing) ret = TIMER_RETRY; @@ -640,7 +640,7 @@ static int posix_cpu_timer_set(struct k_itimer *timer, int timer_flags, /* * Disarm any old timer after extracting its expiry time. */ - BUG_ON(!irqs_disabled()); + WARN_ON_ONCE(!irqs_disabled()); ret = 0; old_incr = timer->it.cpu.incr; @@ -1061,7 +1061,7 @@ void posix_cpu_timer_schedule(struct k_itimer *timer) /* * Now re-arm for the new expiry time. */ - BUG_ON(!irqs_disabled()); + WARN_ON_ONCE(!irqs_disabled()); arm_timer(timer); unlock_task_sighand(p, &flags); @@ -1150,7 +1150,7 @@ void run_posix_cpu_timers(struct task_struct *tsk) struct k_itimer *timer, *next; unsigned long flags; - BUG_ON(!irqs_disabled()); + WARN_ON_ONCE(!irqs_disabled()); /* * The fast path checks that there are no expired thread or thread @@ -1217,7 +1217,7 @@ void set_process_cpu_timer(struct task_struct *tsk, unsigned int clock_idx, { unsigned long long now; - BUG_ON(clock_idx == CPUCLOCK_SCHED); + WARN_ON_ONCE(clock_idx == CPUCLOCK_SCHED); cpu_timer_sample_group(clock_idx, tsk, &now); if (oldval) { -- cgit v1.2.3 From f55c07607a38f84b5c7e6066ee1cfe433fa5643c Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 11 Dec 2013 18:50:25 -0800 Subject: timekeeping: Fix lost updates to tai adjustment Since 48cdc135d4840 (Implement a shadow timekeeper), we have to call timekeeping_update() after any adjustment to the timekeeping structure in order to make sure that any adjustments to the structure persist. 
Unfortunately, the updates to the tai offset via adjtimex do not trigger this update, causing adjustments to the tai offset to be made and then over-written by the previous value at the next update_wall_time() call. This patch resovles the issue by calling timekeeping_update() right after setting the tai offset. Cc: Sasha Levin Cc: Thomas Gleixner Cc: Prarit Bhargava Cc: Richard Cochran Cc: Ingo Molnar Cc: stable #3.10+ Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 3abf53418b67..7488f0b97dee 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -610,6 +610,7 @@ void timekeeping_set_tai_offset(s32 tai_offset) raw_spin_lock_irqsave(&timekeeper_lock, flags); write_seqcount_begin(&timekeeper_seq); __timekeeping_set_tai_offset(tk, tai_offset); + timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); clock_was_set(); @@ -1698,7 +1699,7 @@ int do_adjtimex(struct timex *txc) if (tai != orig_tai) { __timekeeping_set_tai_offset(tk, tai); - update_pvclock_gtod(tk, true); + timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); clock_was_set_delayed(); } write_seqcount_end(&timekeeper_seq); -- cgit v1.2.3 From 5258d3f25c76f6ab86e9333abf97a55a877d3870 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 11 Dec 2013 20:07:49 -0800 Subject: timekeeping: Fix potential lost pv notification of time change In 780427f0e11 (Indicate that clock was set in the pvclock gtod notifier), logic was added to pass a CLOCK_WAS_SET notification to the pvclock notifier chain. While that patch added a action flag returned from accumulate_nsecs_to_secs(), it only uses the returned value in one location, and not in the logarithmic accumulation. This means if a leap second triggered during the logarithmic accumulation (which is most likely where it would happen), the notification that the clock was set would not make it to the pv notifiers. This patch extends the logarithmic_accumulation pass down that action flag so proper notification will occur. This patch also changes the varialbe action -> clock_set per Ingo's suggestion. Cc: Sasha Levin Cc: Thomas Gleixner Cc: Ingo Molnar Cc: David Vrabel Cc: Konrad Rzeszutek Wilk Cc: Prarit Bhargava Cc: Richard Cochran Cc: Cc: stable #3.11+ Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 7488f0b97dee..051855fe68bc 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1256,7 +1256,7 @@ out_adjust: static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) { u64 nsecps = (u64)NSEC_PER_SEC << tk->shift; - unsigned int action = 0; + unsigned int clock_set = 0; while (tk->xtime_nsec >= nsecps) { int leap; @@ -1279,10 +1279,10 @@ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) __timekeeping_set_tai_offset(tk, tk->tai_offset - leap); clock_was_set_delayed(); - action = TK_CLOCK_WAS_SET; + clock_set = TK_CLOCK_WAS_SET; } } - return action; + return clock_set; } /** @@ -1295,7 +1295,8 @@ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) * Returns the unconsumed cycles. 
*/ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, - u32 shift) + u32 shift, + unsigned int *clock_set) { cycle_t interval = tk->cycle_interval << shift; u64 raw_nsecs; @@ -1309,7 +1310,7 @@ static cycle_t logarithmic_accumulation(struct timekeeper *tk, cycle_t offset, tk->cycle_last += interval; tk->xtime_nsec += tk->xtime_interval << shift; - accumulate_nsecs_to_secs(tk); + *clock_set |= accumulate_nsecs_to_secs(tk); /* Accumulate raw time */ raw_nsecs = (u64)tk->raw_interval << shift; @@ -1367,7 +1368,7 @@ static void update_wall_time(void) struct timekeeper *tk = &shadow_timekeeper; cycle_t offset; int shift = 0, maxshift; - unsigned int action; + unsigned int clock_set = 0; unsigned long flags; raw_spin_lock_irqsave(&timekeeper_lock, flags); @@ -1402,7 +1403,8 @@ static void update_wall_time(void) maxshift = (64 - (ilog2(ntp_tick_length())+1)) - 1; shift = min(shift, maxshift); while (offset >= tk->cycle_interval) { - offset = logarithmic_accumulation(tk, offset, shift); + offset = logarithmic_accumulation(tk, offset, shift, + &clock_set); if (offset < tk->cycle_interval<cycle_last with the new value */ @@ -1436,7 +1438,7 @@ static void update_wall_time(void) * updating. */ memcpy(real_tk, tk, sizeof(*tk)); - timekeeping_update(real_tk, action); + timekeeping_update(real_tk, clock_set); write_seqcount_end(&timekeeper_seq); out: raw_spin_unlock_irqrestore(&timekeeper_lock, flags); -- cgit v1.2.3 From 6fdda9a9c5db367130cf32df5d6618d08b89f46a Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 10 Dec 2013 17:18:18 -0800 Subject: timekeeping: Avoid possible deadlock from clock_was_set_delayed As part of normal operaions, the hrtimer subsystem frequently calls into the timekeeping code, creating a locking order of hrtimer locks -> timekeeping locks clock_was_set_delayed() was suppoed to allow us to avoid deadlocks between the timekeeping the hrtimer subsystem, so that we could notify the hrtimer subsytem the time had changed while holding the timekeeping locks. This was done by scheduling delayed work that would run later once we were out of the timekeeing code. But unfortunately the lock chains are complex enoguh that in scheduling delayed work, we end up eventually trying to grab an hrtimer lock. Sasha Levin noticed this in testing when the new seqlock lockdep enablement triggered the following (somewhat abrieviated) message: [ 251.100221] ====================================================== [ 251.100221] [ INFO: possible circular locking dependency detected ] [ 251.100221] 3.13.0-rc2-next-20131206-sasha-00005-g8be2375-dirty #4053 Not tainted [ 251.101967] ------------------------------------------------------- [ 251.101967] kworker/10:1/4506 is trying to acquire lock: [ 251.101967] (timekeeper_seq){----..}, at: [] retrigger_next_event+0x56/0x70 [ 251.101967] [ 251.101967] but task is already holding lock: [ 251.101967] (hrtimer_bases.lock#11){-.-...}, at: [] retrigger_next_event+0x3c/0x70 [ 251.101967] [ 251.101967] which lock already depends on the new lock. 
[ 251.101967] [ 251.101967] [ 251.101967] the existing dependency chain (in reverse order) is: [ 251.101967] -> #5 (hrtimer_bases.lock#11){-.-...}: [snipped] -> #4 (&rt_b->rt_runtime_lock){-.-...}: [snipped] -> #3 (&rq->lock){-.-.-.}: [snipped] -> #2 (&p->pi_lock){-.-.-.}: [snipped] -> #1 (&(&pool->lock)->rlock){-.-...}: [ 251.101967] [] validate_chain+0x6c3/0x7b0 [ 251.101967] [] __lock_acquire+0x4ad/0x580 [ 251.101967] [] lock_acquire+0x182/0x1d0 [ 251.101967] [] _raw_spin_lock+0x40/0x80 [ 251.101967] [] __queue_work+0x1a9/0x3f0 [ 251.101967] [] queue_work_on+0x98/0x120 [ 251.101967] [] clock_was_set_delayed+0x21/0x30 [ 251.101967] [] do_adjtimex+0x111/0x160 [ 251.101967] [] compat_sys_adjtimex+0x41/0x70 [ 251.101967] [] ia32_sysret+0x0/0x5 [ 251.101967] -> #0 (timekeeper_seq){----..}: [snipped] [ 251.101967] other info that might help us debug this: [ 251.101967] [ 251.101967] Chain exists of: timekeeper_seq --> &rt_b->rt_runtime_lock --> hrtimer_bases.lock#11 [ 251.101967] Possible unsafe locking scenario: [ 251.101967] [ 251.101967] CPU0 CPU1 [ 251.101967] ---- ---- [ 251.101967] lock(hrtimer_bases.lock#11); [ 251.101967] lock(&rt_b->rt_runtime_lock); [ 251.101967] lock(hrtimer_bases.lock#11); [ 251.101967] lock(timekeeper_seq); [ 251.101967] [ 251.101967] *** DEADLOCK *** [ 251.101967] [ 251.101967] 3 locks held by kworker/10:1/4506: [ 251.101967] #0: (events){.+.+.+}, at: [] process_one_work+0x200/0x530 [ 251.101967] #1: (hrtimer_work){+.+...}, at: [] process_one_work+0x200/0x530 [ 251.101967] #2: (hrtimer_bases.lock#11){-.-...}, at: [] retrigger_next_event+0x3c/0x70 [ 251.101967] [ 251.101967] stack backtrace: [ 251.101967] CPU: 10 PID: 4506 Comm: kworker/10:1 Not tainted 3.13.0-rc2-next-20131206-sasha-00005-g8be2375-dirty #4053 [ 251.101967] Workqueue: events clock_was_set_work So the best solution is to avoid calling clock_was_set_delayed() while holding the timekeeping lock, and instead using a flag variable to decide if we should call clock_was_set() once we've released the locks. This works for the case here, where the do_adjtimex() was the deadlock trigger point. Unfortuantely, in update_wall_time() we still hold the jiffies lock, which would deadlock with the ipi triggered by clock_was_set(), preventing us from calling it even after we drop the timekeeping lock. So instead call clock_was_set_delayed() at that point. Cc: Thomas Gleixner Cc: Prarit Bhargava Cc: Richard Cochran Cc: Ingo Molnar Cc: Sasha Levin Cc: stable #3.10+ Reported-by: Sasha Levin Tested-by: Sasha Levin Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 051855fe68bc..d62682b6df4a 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1278,7 +1278,6 @@ static inline unsigned int accumulate_nsecs_to_secs(struct timekeeper *tk) __timekeeping_set_tai_offset(tk, tk->tai_offset - leap); - clock_was_set_delayed(); clock_set = TK_CLOCK_WAS_SET; } } @@ -1442,6 +1441,19 @@ static void update_wall_time(void) write_seqcount_end(&timekeeper_seq); out: raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + if (clock_was_set) { + /* + * XXX - I'd rather we just call clock_was_set(), but + * since we're currently holding the jiffies lock, calling + * clock_was_set would trigger an ipi which would then grab + * the jiffies lock and we'd deadlock. 
:( + * The right solution should probably be droping + * the jiffies lock before calling update_wall_time + * but that requires some rework of the tick sched + * code. + */ + clock_was_set_delayed(); + } } /** @@ -1702,11 +1714,13 @@ int do_adjtimex(struct timex *txc) if (tai != orig_tai) { __timekeeping_set_tai_offset(tk, tai); timekeeping_update(tk, TK_MIRROR | TK_CLOCK_WAS_SET); - clock_was_set_delayed(); } write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); + if (tai != orig_tai) + clock_was_set(); + ntp_notify_cmos_timer(); return ret; -- cgit v1.2.3 From 47a1b796306356f358e515149d86baf0cc6bf007 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Thu, 12 Dec 2013 13:10:55 -0800 Subject: tick/timekeeping: Call update_wall_time outside the jiffies lock Since the xtime lock was split into the timekeeping lock and the jiffies lock, we no longer need to call update_wall_time() while holding the jiffies lock. Thus, this patch splits update_wall_time() out from do_timer(). This allows us to get away from calling clock_was_set_delayed() in update_wall_time() and instead use the standard clock_was_set() call, which previously would have deadlocked because it causes the jiffies lock to be acquired. Cc: Sasha Levin Cc: Thomas Gleixner Cc: Prarit Bhargava Cc: Richard Cochran Cc: Ingo Molnar Signed-off-by: John Stultz --- kernel/time/tick-common.c | 1 + kernel/time/tick-internal.h | 1 + kernel/time/tick-sched.c | 1 + kernel/time/timekeeping.c | 19 ++++--------------- 4 files changed, 7 insertions(+), 15 deletions(-) (limited to 'kernel') diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 64522ecdfe0e..91c5f27e82a3 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -70,6 +70,7 @@ static void tick_periodic(int cpu) do_timer(1); write_sequnlock(&jiffies_lock); + update_wall_time(); } update_process_times(user_mode(get_irq_regs())); diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index e2bced59b6dd..8329669b51ec 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -155,3 +155,4 @@ static inline int tick_device_is_functional(struct clock_event_device *dev) #endif extern void do_timer(unsigned long ticks); +extern void update_wall_time(void); diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 2afd43fca93b..c58b03d89951 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -86,6 +86,7 @@ static void tick_do_update_jiffies64(ktime_t now) tick_next_period = ktime_add(last_jiffies_update, tick_period); } write_sequnlock(&jiffies_lock); + update_wall_time(); } /* diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index d62682b6df4a..44b7e6bb081b 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1360,7 +1360,7 @@ static inline void old_vsyscall_fixup(struct timekeeper *tk) * update_wall_time - Uses the current clocksource to increment the wall time * */ -static void update_wall_time(void) +void update_wall_time(void) { struct clocksource *clock; struct timekeeper *real_tk = &timekeeper; @@ -1441,19 +1441,8 @@ static void update_wall_time(void) write_seqcount_end(&timekeeper_seq); out: raw_spin_unlock_irqrestore(&timekeeper_lock, flags); - if (clock_was_set) { - /* - * XXX - I'd rather we just call clock_was_set(), but - * since we're currently holding the jiffies lock, calling - * clock_was_set would trigger an ipi which would then grab - * the jiffies lock and we'd deadlock.
:( - * The right solution should probably be droping - * the jiffies lock before calling update_wall_time - * but that requires some rework of the tick sched - * code. - */ - clock_was_set_delayed(); - } + if (clock_set) + clock_was_set(); } /** @@ -1598,7 +1587,6 @@ struct timespec get_monotonic_coarse(void) void do_timer(unsigned long ticks) { jiffies_64 += ticks; - update_wall_time(); calc_global_load(ticks); } @@ -1756,4 +1744,5 @@ void xtime_update(unsigned long ticks) write_seqlock(&jiffies_lock); do_timer(ticks); write_sequnlock(&jiffies_lock); + update_wall_time(); } -- cgit v1.2.3 From 04005f6011e3b504cd4d791d9769f7cb9a3b2eae Mon Sep 17 00:00:00 2001 From: John Stultz Date: Tue, 10 Dec 2013 17:13:35 -0800 Subject: timekeeping: Fix CLOCK_TAI timer/nanosleep delays A think-o in the calculation of the monotonic -> tai time offset causes CLOCK_TAI timers and nanosleeps to expire late (the latency is ~2x the tai offset). Fix this by adding the tai offset to the realtime offset instead of subtracting it. Cc: Sasha Levin Cc: Thomas Gleixner Cc: Prarit Bhargava Cc: Richard Cochran Cc: Ingo Molnar Cc: stable #3.10+ Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 44b7e6bb081b..3f6a827680fa 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -77,7 +77,7 @@ static void tk_set_wall_to_mono(struct timekeeper *tk, struct timespec wtm) tk->wall_to_monotonic = wtm; set_normalized_timespec(&tmp, -wtm.tv_sec, -wtm.tv_nsec); tk->offs_real = timespec_to_ktime(tmp); - tk->offs_tai = ktime_sub(tk->offs_real, ktime_set(tk->tai_offset, 0)); + tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tk->tai_offset, 0)); } static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t) @@ -595,7 +595,7 @@ s32 timekeeping_get_tai_offset(void) static void __timekeeping_set_tai_offset(struct timekeeper *tk, s32 tai_offset) { tk->tai_offset = tai_offset; - tk->offs_tai = ktime_sub(tk->offs_real, ktime_set(tai_offset, 0)); + tk->offs_tai = ktime_add(tk->offs_real, ktime_set(tai_offset, 0)); } /** -- cgit v1.2.3 From 330a1617b0a6268d427aa5922c94d082b1d3e96d Mon Sep 17 00:00:00 2001 From: John Stultz Date: Wed, 11 Dec 2013 19:10:36 -0800 Subject: timekeeping: Fix missing timekeeping_update in suspend path Since 48cdc135d4840 (Implement a shadow timekeeper), we have to call timekeeping_update() after any adjustment to the timekeeping structure in order to make sure that any adjustments to the structure persist. In the timekeeping suspend path, we update the timekeeper structure, so we should be sure to update the shadow-timekeeper before releasing the timekeeping locks. Currently this isn't done. In most cases, the next time-related code to run would be timekeeping_resume, which does update the shadow-timekeeper, but in an abundance of caution, this patch adds the call to timekeeping_update() in the suspend path.
Cc: Sasha Levin Cc: Thomas Gleixner Cc: Prarit Bhargava Cc: Richard Cochran Cc: Ingo Molnar Cc: stable #3.10+ Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 3f6a827680fa..2793c4382529 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1024,6 +1024,8 @@ static int timekeeping_suspend(void) timekeeping_suspend_time = timespec_add(timekeeping_suspend_time, delta_delta); } + + timekeeping_update(tk, TK_MIRROR); write_seqcount_end(&timekeeper_seq); raw_spin_unlock_irqrestore(&timekeeper_lock, flags); -- cgit v1.2.3 From d26e4fe0dbe95778b9dbe80b6aa884d71fb6f459 Mon Sep 17 00:00:00 2001 From: Yijing Wang Date: Thu, 28 Nov 2013 16:28:55 +0800 Subject: timekeeper: fix comment typo for tk_setup_internals() Fix trivial comment typo for tk_setup_internals(). Signed-off-by: Yijing Wang Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 2793c4382529..3ff30640fc9d 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -90,8 +90,9 @@ static void tk_set_sleep_time(struct timekeeper *tk, struct timespec t) } /** - * timekeeper_setup_internals - Set up internals to use clocksource clock. + * tk_setup_internals - Set up internals to use clocksource clock. + * + * @tk: The target timekeeper to setup. * @clock: Pointer to clocksource. * * Calculates a fixed cycle/nsec interval for a given clocksource/adjustment -- cgit v1.2.3 From 38aef31ce7773624c8f09ff58c4c27b3b955faaf Mon Sep 17 00:00:00 2001 From: John Stultz Date: Mon, 23 Dec 2013 12:53:22 -0800 Subject: timekeeping: Remove comment that's mostly out of date Prior to 92bb1fcf57a0c2e45f7e67fbf0a8ed475a749236 (Only do nanosecond rounding on GENERIC_TIME_VSYSCALL_OLD systems), the comment here was accurate, but now we can mostly avoid the extra rounding which caused the likely() below to actually be unlikely. So remove the out-of-date comment. Signed-off-by: John Stultz --- kernel/time/timekeeping.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'kernel') diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 3ff30640fc9d..abfa4e86ac54 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1134,16 +1134,6 @@ static void timekeeping_adjust(struct timekeeper *tk, s64 offset) * we can adjust by 1. */ error >>= 2; - /* - * XXX - In update_wall_time, we round up to the next - * nanosecond, and store the amount rounded up into - * the error. This causes the likely below to be unlikely. - * - * The proper fix is to avoid rounding up by using - * the high precision tk->xtime_nsec instead of - * xtime.tv_nsec everywhere. Fixing this will take some - * time. - */ if (likely(error <= interval)) adj = 1; else -- cgit v1.2.3
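The series above converges on one pattern: while the timekeeping locks are held, only record that the clock was stepped (the clock_set / TK_CLOCK_WAS_SET flag), and fire the notification only after every lock the notification path might need has been dropped. Below is a minimal userspace sketch of that pattern, not kernel code; the mutex, the helper names, and the simulated clock-setting event are invented for the illustration.

/*
 * Sketch of "collect events under the lock, notify after unlock".
 * Assumes POSIX threads; all names are illustrative only.
 */
#include <pthread.h>
#include <stdio.h>

#define TK_CLOCK_WAS_SET	0x1	/* same flag value the patches use */

static pthread_mutex_t timekeeper_lock = PTHREAD_MUTEX_INITIALIZER;

/* Stand-in for clock_was_set(): in the kernel this ends up sending an IPI
 * and taking hrtimer locks, so it must never run under timekeeper_lock. */
static void notify_clock_was_set(void)
{
	printf("clock was set: retrigger hrtimer bases\n");
}

/* Stand-in for accumulate_nsecs_to_secs(): report a clock-setting event
 * (e.g. a leap second) through the return value instead of notifying here. */
static unsigned int accumulate(void)
{
	return TK_CLOCK_WAS_SET;
}

static void update_wall_time(void)
{
	unsigned int clock_set = 0;

	pthread_mutex_lock(&timekeeper_lock);
	clock_set |= accumulate();		/* collect events under the lock */
	pthread_mutex_unlock(&timekeeper_lock);

	if (clock_set & TK_CLOCK_WAS_SET)	/* notify with no locks held */
		notify_clock_was_set();
}

int main(void)
{
	update_wall_time();
	return 0;
}

The CLOCK_TAI fix is independent of the locking rework: TAI runs ahead of UTC by tai_offset seconds, so the monotonic-to-TAI offset is offs_real plus the TAI offset, which is why the patch turns both ktime_sub() calls into ktime_add().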