From 44a12806d010944a5727f1dc99123121e3e2c8c6 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Mon, 7 Aug 2017 21:25:01 +1000
Subject: Revert "powerpc/64: Avoid restore_math call if possible in syscall
 exit"

This reverts commit bc4f65e4cf9d6cc43e0e9ba0b8648cf9201cd55f.

As reported by Andreas, this commit is causing unrecoverable SLB misses in the
system call exit path:

  Unrecoverable exception 4100 at c00000000000a1ec
  Oops: Unrecoverable exception, sig: 6 [#1]
  SMP NR_CPUS=2 PowerMac
  ...
  CPU: 0 PID: 18626 Comm: rm Not tainted 4.13.0-rc3 #1
  task: c00000018335e080 task.stack: c000000139e50000
  NIP: c00000000000a1ec LR: c00000000000a118 CTR: 0000000000000000
  REGS: c000000139e53bb0 TRAP: 4100   Not tainted  (4.13.0-rc3)
  MSR: 9000000000001030 <SF,HV,ME,IR,DR> CR: 24000044  XER: 20000000 SOFTE: 1
  GPR00: 0000000000000000 c000000139e53e30 c000000000abb500 fffffffffffffffe
  GPR04: c0000001eb866298 0000000000000000 0000000000000000 c00000018335e080
  GPR08: 900000000000d032 0000000000000000 0000000000000002 fffffffffffff001
  GPR12: c000000139e50000 c00000000ffff000 00003fffa8c0dca0 00003fffa8c0dc88
  GPR16: 0000000010000000 0000000000000001 00003fffa8c0eaa0 0000000000000000
  GPR20: 00003fffa8c27528 00003fffa8c27b00 0000000000000000 0000000000000000
  GPR24: 00003fffa8c0d918 00003ffff1b3efa0 00003fffa8c26d68 0000000000000000
  GPR28: 00003fffa8c249e8 00003fffa8c263d0 00003fffa8c27550 00003ffff1b3ef10
  NIP [c00000000000a1ec] system_call_exit+0xc0/0x21c
  LR [c00000000000a118] system_call+0x58/0x6c
  Call Trace:
  [c000000139e53e30] [c00000000000a118] system_call+0x58/0x6c (unreliable)
  Instruction dump:
  64a51000 7c6300d0 f8a101a0 4bffff9c 3c000000 60000006 780007c6 64000000
  60000000 7c004039 4082001c e8ed0170 <88070b78> 88c70b79 7c003214 2c200000

This is caused by us trying to load THREAD_LOAD_FP with MSR_RI=0, and taking an
SLB miss on the thread struct.

Reported-by: Andreas Schwab <schwab@linux-m68k.org>
Diagnosed-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/entry_64.S | 60 +++++++++++++-----------------------------
 arch/powerpc/kernel/process.c  |  4 ---
 2 files changed, 18 insertions(+), 46 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 49d8422767b4..e925c1c99c71 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -223,17 +223,27 @@ system_call_exit:
 	andi.	r0,r9,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP|_TIF_USER_WORK_MASK|_TIF_PERSYSCALL_MASK)
 	bne-	.Lsyscall_exit_work
 
-	/* If MSR_FP and MSR_VEC are set in user msr, then no need to restore */
-	li	r7,MSR_FP
+	andi.	r0,r8,MSR_FP
+	beq 2f
 #ifdef CONFIG_ALTIVEC
-	oris	r7,r7,MSR_VEC@h
+	andis.	r0,r8,MSR_VEC@h
+	bne	3f
 #endif
-	and	r0,r8,r7
-	cmpd	r0,r7
-	bne	.Lsyscall_restore_math
-.Lsyscall_restore_math_cont:
+2:	addi    r3,r1,STACK_FRAME_OVERHEAD
+#ifdef CONFIG_PPC_BOOK3S
+	li	r10,MSR_RI
+	mtmsrd	r10,1		/* Restore RI */
+#endif
+	bl	restore_math
+#ifdef CONFIG_PPC_BOOK3S
+	li	r11,0
+	mtmsrd	r11,1
+#endif
+	ld	r8,_MSR(r1)
+	ld	r3,RESULT(r1)
+	li	r11,-MAX_ERRNO
 
-	cmpld	r3,r11
+3:	cmpld	r3,r11
 	ld	r5,_CCR(r1)
 	bge-	.Lsyscall_error
 .Lsyscall_error_cont:
@@ -267,40 +277,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 	std	r5,_CCR(r1)
 	b	.Lsyscall_error_cont
 
-.Lsyscall_restore_math:
-	/*
-	 * Some initial tests from restore_math to avoid the heavyweight
-	 * C code entry and MSR manipulations.
-	 */
-	LOAD_REG_IMMEDIATE(r0, MSR_TS_MASK)
-	and.	r0,r0,r8
-	bne	1f
-
-	ld	r7,PACACURRENT(r13)
-	lbz	r0,THREAD+THREAD_LOAD_FP(r7)
-#ifdef CONFIG_ALTIVEC
-	lbz	r6,THREAD+THREAD_LOAD_VEC(r7)
-	add	r0,r0,r6
-#endif
-	cmpdi	r0,0
-	beq	.Lsyscall_restore_math_cont
-
-1:	addi    r3,r1,STACK_FRAME_OVERHEAD
-#ifdef CONFIG_PPC_BOOK3S
-	li	r10,MSR_RI
-	mtmsrd	r10,1		/* Restore RI */
-#endif
-	bl	restore_math
-#ifdef CONFIG_PPC_BOOK3S
-	li	r11,0
-	mtmsrd	r11,1
-#endif
-	/* Restore volatiles, reload MSR from updated one */
-	ld	r8,_MSR(r1)
-	ld	r3,RESULT(r1)
-	li	r11,-MAX_ERRNO
-	b	.Lsyscall_restore_math_cont
-
 /* Traced system call support */
 .Lsyscall_dotrace:
 	bl	save_nvgprs
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 9f3e2c932dcc..ec480966f9bf 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -511,10 +511,6 @@ void restore_math(struct pt_regs *regs)
 {
 	unsigned long msr;
 
-	/*
-	 * Syscall exit makes a similar initial check before branching
-	 * to restore_math. Keep them in synch.
-	 */
 	if (!msr_tm_active(regs->msr) &&
 		!current->thread.load_fp && !loadvec(current->thread))
 		return;
-- 
cgit v1.2.3


From 785a12afdb4a52903447fd890633c82fdda4b6f7 Mon Sep 17 00:00:00 2001
From: "Gautham R. Shenoy" <ego@linux.vnet.ibm.com>
Date: Tue, 8 Aug 2017 14:13:15 +0530
Subject: powerpc/powernv/idle: Disable LOSE_FULL_CONTEXT states when stop-api
 fails

Currently, we use the opal call opal_slw_set_reg() to inform the
Sleep-Winkle Engine (SLW) to restore the contents of some of the
Hypervisor state on wakeup from deep idle states that lose full
hypervisor context (characterized by the flag
OPAL_PM_LOSE_FULL_CONTEXT).

However, the current code has a bug in that if opal_slw_set_reg()
fails, we don't disable the use of these deep states (winkle on
POWER8, stop4 onwards on POWER9).

This patch fixes this bug by ensuring that if programing the
sleep-winkle engine to restore the hypervisor states in
pnv_save_sprs_for_deep_states() fails, then we exclude such states by
clearing the OPAL_PM_LOSE_FULL_CONTEXT flag from
supported_cpuidle_states. As a result POWER8 will be prevented from
using winkle for CPU-Hotplug, and POWER9 will put the offlined CPUs to
the default stop state when available.

Further, we ensure in the initialization of the cpuidle-powernv driver
to only include those states whose flags are present in
supported_cpuidle_states, thereby skipping OPAL_PM_LOSE_FULL_CONTEXT
states when they have been disabled due to stop-api failure.

Fixes: 1e1601b38e6 ("powerpc/powernv/idle: Restore SPRs for deep idle
states via stop API.")

Signed-off-by: Gautham R. Shenoy <ego@linux.vnet.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/platforms/powernv/idle.c | 41 ++++++++++++++++++++++++++++++++---
 drivers/cpuidle/cpuidle-powernv.c     | 10 +++++++++
 2 files changed, 48 insertions(+), 3 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/platforms/powernv/idle.c b/arch/powerpc/platforms/powernv/idle.c
index 2abee070373f..a553aeea7af6 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -56,6 +56,7 @@ u64 pnv_first_deep_stop_state = MAX_STOP_STATE;
  */
 static u64 pnv_deepest_stop_psscr_val;
 static u64 pnv_deepest_stop_psscr_mask;
+static u64 pnv_deepest_stop_flag;
 static bool deepest_stop_found;
 
 static int pnv_save_sprs_for_deep_states(void)
@@ -185,8 +186,40 @@ static void pnv_alloc_idle_core_states(void)
 
 	update_subcore_sibling_mask();
 
-	if (supported_cpuidle_states & OPAL_PM_LOSE_FULL_CONTEXT)
-		pnv_save_sprs_for_deep_states();
+	if (supported_cpuidle_states & OPAL_PM_LOSE_FULL_CONTEXT) {
+		int rc = pnv_save_sprs_for_deep_states();
+
+		if (likely(!rc))
+			return;
+
+		/*
+		 * The stop-api is unable to restore hypervisor
+		 * resources on wakeup from platform idle states which
+		 * lose full context. So disable such states.
+		 */
+		supported_cpuidle_states &= ~OPAL_PM_LOSE_FULL_CONTEXT;
+		pr_warn("cpuidle-powernv: Disabling idle states that lose full context\n");
+		pr_warn("cpuidle-powernv: Idle power-savings, CPU-Hotplug affected\n");
+
+		if (cpu_has_feature(CPU_FTR_ARCH_300) &&
+		    (pnv_deepest_stop_flag & OPAL_PM_LOSE_FULL_CONTEXT)) {
+			/*
+			 * Use the default stop state for CPU-Hotplug
+			 * if available.
+			 */
+			if (default_stop_found) {
+				pnv_deepest_stop_psscr_val =
+					pnv_default_stop_val;
+				pnv_deepest_stop_psscr_mask =
+					pnv_default_stop_mask;
+				pr_warn("cpuidle-powernv: Offlined CPUs will stop with psscr = 0x%016llx\n",
+					pnv_deepest_stop_psscr_val);
+			} else { /* Fallback to snooze loop for CPU-Hotplug */
+				deepest_stop_found = false;
+				pr_warn("cpuidle-powernv: Offlined CPUs will busy wait\n");
+			}
+		}
+	}
 }
 
 u32 pnv_get_supported_cpuidle_states(void)
@@ -375,7 +408,8 @@ unsigned long pnv_cpu_offline(unsigned int cpu)
 						pnv_deepest_stop_psscr_val;
 		srr1 = power9_idle_stop(psscr);
 
-	} else if (idle_states & OPAL_PM_WINKLE_ENABLED) {
+	} else if ((idle_states & OPAL_PM_WINKLE_ENABLED) &&
+		   (idle_states & OPAL_PM_LOSE_FULL_CONTEXT)) {
 		srr1 = power7_idle_insn(PNV_THREAD_WINKLE);
 	} else if ((idle_states & OPAL_PM_SLEEP_ENABLED) ||
 		   (idle_states & OPAL_PM_SLEEP_ENABLED_ER1)) {
@@ -553,6 +587,7 @@ static int __init pnv_power9_idle_init(struct device_node *np, u32 *flags,
 			max_residency_ns = residency_ns[i];
 			pnv_deepest_stop_psscr_val = psscr_val[i];
 			pnv_deepest_stop_psscr_mask = psscr_mask[i];
+			pnv_deepest_stop_flag = flags[i];
 			deepest_stop_found = true;
 		}
 
diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c
index 37b0698b7193..42896a67aeae 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -235,6 +235,7 @@ static inline int validate_dt_prop_sizes(const char *prop1, int prop1_len,
 	return -1;
 }
 
+extern u32 pnv_get_supported_cpuidle_states(void);
 static int powernv_add_idle_states(void)
 {
 	struct device_node *power_mgt;
@@ -248,6 +249,8 @@ static int powernv_add_idle_states(void)
 	const char *names[CPUIDLE_STATE_MAX];
 	u32 has_stop_states = 0;
 	int i, rc;
+	u32 supported_flags = pnv_get_supported_cpuidle_states();
+
 
 	/* Currently we have snooze statically defined */
 
@@ -362,6 +365,13 @@ static int powernv_add_idle_states(void)
 	for (i = 0; i < dt_idle_states; i++) {
 		unsigned int exit_latency, target_residency;
 		bool stops_timebase = false;
+
+		/*
+		 * Skip the platform idle state whose flag isn't in
+		 * the supported_cpuidle_states flag mask.
+		 */
+		if ((flags[i] & supported_flags) != flags[i])
+			continue;
 		/*
 		 * If an idle state has exit latency beyond
 		 * POWERNV_THRESHOLD_LATENCY_NS then don't use it
-- 
cgit v1.2.3


From 7310d5c8c55e8987a854507f71022f8de676bbf4 Mon Sep 17 00:00:00 2001
From: Michael Ellerman <mpe@ellerman.id.au>
Date: Wed, 9 Aug 2017 20:57:55 +1000
Subject: powerpc/configs: Re-enable HARD/SOFT lockup detectors

In commit 05a4a9527931 ("kernel/watchdog: split up config options"),
CONFIG_LOCKUP_DETECTOR was split into two separate config options,
CONFIG_HARDLOCKUP_DETECTOR and CONFIG_SOFTLOCKUP_DETECTOR.

Our defconfigs still have CONFIG_LOCKUP_DETECTOR=y, but that is no longer
user selectable, and we don't mention the new options, so we end up with
none of them enabled.

So update the defconfigs to turn on the new SOFT and HARD options, the
end result being the same as what we had previously.

Fixes: 05a4a9527931 ("kernel/watchdog: split up config options")
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/configs/powernv_defconfig | 3 ++-
 arch/powerpc/configs/ppc64_defconfig   | 3 ++-
 arch/powerpc/configs/pseries_defconfig | 3 ++-
 3 files changed, 6 insertions(+), 3 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/configs/powernv_defconfig b/arch/powerpc/configs/powernv_defconfig
index 0695ce047d56..34fc9bbfca9e 100644
--- a/arch/powerpc/configs/powernv_defconfig
+++ b/arch/powerpc/configs/powernv_defconfig
@@ -293,7 +293,8 @@ CONFIG_MAGIC_SYSRQ=y
 CONFIG_DEBUG_KERNEL=y
 CONFIG_DEBUG_STACK_USAGE=y
 CONFIG_DEBUG_STACKOVERFLOW=y
-CONFIG_LOCKUP_DETECTOR=y
+CONFIG_SOFTLOCKUP_DETECTOR=y
+CONFIG_HARDLOCKUP_DETECTOR=y
 CONFIG_LATENCYTOP=y
 CONFIG_SCHED_TRACER=y
 CONFIG_BLK_DEV_IO_TRACE=y
diff --git a/arch/powerpc/configs/ppc64_defconfig b/arch/powerpc/configs/ppc64_defconfig
index 5175028c56ce..c5246d29f385 100644
--- a/arch/powerpc/configs/ppc64_defconfig
+++ b/arch/powerpc/configs/ppc64_defconfig
@@ -324,7 +324,8 @@ CONFIG_MAGIC_SYSRQ=y
 CONFIG_DEBUG_KERNEL=y
 CONFIG_DEBUG_STACK_USAGE=y
 CONFIG_DEBUG_STACKOVERFLOW=y
-CONFIG_LOCKUP_DETECTOR=y
+CONFIG_SOFTLOCKUP_DETECTOR=y
+CONFIG_HARDLOCKUP_DETECTOR=y
 CONFIG_DEBUG_MUTEXES=y
 CONFIG_LATENCYTOP=y
 CONFIG_SCHED_TRACER=y
diff --git a/arch/powerpc/configs/pseries_defconfig b/arch/powerpc/configs/pseries_defconfig
index 1a61aa20dfba..fd5d98a0b95c 100644
--- a/arch/powerpc/configs/pseries_defconfig
+++ b/arch/powerpc/configs/pseries_defconfig
@@ -291,7 +291,8 @@ CONFIG_MAGIC_SYSRQ=y
 CONFIG_DEBUG_KERNEL=y
 CONFIG_DEBUG_STACK_USAGE=y
 CONFIG_DEBUG_STACKOVERFLOW=y
-CONFIG_LOCKUP_DETECTOR=y
+CONFIG_SOFTLOCKUP_DETECTOR=y
+CONFIG_HARDLOCKUP_DETECTOR=y
 CONFIG_LATENCYTOP=y
 CONFIG_SCHED_TRACER=y
 CONFIG_BLK_DEV_IO_TRACE=y
-- 
cgit v1.2.3


From 0459ddfdb31e7d812b555a2530ecbacdf96961a6 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Wed, 9 Aug 2017 22:41:21 +1000
Subject: powerpc: NMI IPI improve lock primitive

When the NMI IPI lock is contended, spin at low SMT priority, using
loads only, and with interrupts enabled (where possible). This
improves behaviour under high contention (e.g., a system crash when
a number of CPUs are trying to enter the debugger).

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/smp.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/smp.c b/arch/powerpc/kernel/smp.c
index cf0e1245b8cc..8d3320562c70 100644
--- a/arch/powerpc/kernel/smp.c
+++ b/arch/powerpc/kernel/smp.c
@@ -351,7 +351,7 @@ static void nmi_ipi_lock_start(unsigned long *flags)
 	hard_irq_disable();
 	while (atomic_cmpxchg(&__nmi_ipi_lock, 0, 1) == 1) {
 		raw_local_irq_restore(*flags);
-		cpu_relax();
+		spin_until_cond(atomic_read(&__nmi_ipi_lock) == 0);
 		raw_local_irq_save(*flags);
 		hard_irq_disable();
 	}
@@ -360,7 +360,7 @@ static void nmi_ipi_lock_start(unsigned long *flags)
 static void nmi_ipi_lock(void)
 {
 	while (atomic_cmpxchg(&__nmi_ipi_lock, 0, 1) == 1)
-		cpu_relax();
+		spin_until_cond(atomic_read(&__nmi_ipi_lock) == 0);
 }
 
 static void nmi_ipi_unlock(void)
@@ -475,7 +475,7 @@ int smp_send_nmi_ipi(int cpu, void (*fn)(struct pt_regs *), u64 delay_us)
 	nmi_ipi_lock_start(&flags);
 	while (nmi_ipi_busy_count) {
 		nmi_ipi_unlock_end(&flags);
-		cpu_relax();
+		spin_until_cond(nmi_ipi_busy_count == 0);
 		nmi_ipi_lock_start(&flags);
 	}
 
-- 
cgit v1.2.3


From d8e2a4053574002135fbb032c2e74f1d1dbb2103 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Wed, 9 Aug 2017 22:41:22 +1000
Subject: powerpc/watchdog: Improve watchdog lock primitive

- Hard-disable interrupts before taking the lock, which prevents
  soft-NMI re-entrancy and therefore can prevent deadlocks.
- Use raw_ variants of local_irq_disable to avoid irq debugging.
- When the lock is contended, spin at low SMT priority, using
  loads only, and with interrupts enabled (where possible).

Some stalls have been noticed at high loads that go away with improved
locking. There should not be so much locking contention in the first
place (which is addressed in a subsequent patch), but locking should
still be improved.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/watchdog.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
index b67f8b03a32d..fda4d044d326 100644
--- a/arch/powerpc/kernel/watchdog.c
+++ b/arch/powerpc/kernel/watchdog.c
@@ -71,15 +71,20 @@ static inline void wd_smp_lock(unsigned long *flags)
 	 * This may be called from low level interrupt handlers at some
 	 * point in future.
 	 */
-	local_irq_save(*flags);
-	while (unlikely(test_and_set_bit_lock(0, &__wd_smp_lock)))
-		cpu_relax();
+	raw_local_irq_save(*flags);
+	hard_irq_disable(); /* Make it soft-NMI safe */
+	while (unlikely(test_and_set_bit_lock(0, &__wd_smp_lock))) {
+		raw_local_irq_restore(*flags);
+		spin_until_cond(!test_bit(0, &__wd_smp_lock));
+		raw_local_irq_save(*flags);
+		hard_irq_disable();
+	}
 }
 
 static inline void wd_smp_unlock(unsigned long *flags)
 {
 	clear_bit_unlock(0, &__wd_smp_lock);
-	local_irq_restore(*flags);
+	raw_local_irq_restore(*flags);
 }
 
 static void wd_lockup_ipi(struct pt_regs *regs)
-- 
cgit v1.2.3


From 26c5c6e129ee725f103938262a034861ada467ae Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Wed, 9 Aug 2017 22:41:23 +1000
Subject: powerpc/watchdog: Moderate touch_nmi_watchdog overhead

Some code can go into a tight loop calling touch_nmi_watchdog (e.g.,
stop_machine CPU hotplug code). This can cause contention on watchdog
locks particularly if all CPUs with watchdog enabled are spinning in
the loops.

Avoid this storm of activity by running the watchdog timer callback
from this path if we have exceeded the timer period since it was last
run.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/watchdog.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
index fda4d044d326..426dd34891d6 100644
--- a/arch/powerpc/kernel/watchdog.c
+++ b/arch/powerpc/kernel/watchdog.c
@@ -263,9 +263,11 @@ static void wd_timer_fn(unsigned long data)
 
 void arch_touch_nmi_watchdog(void)
 {
+	unsigned long ticks = tb_ticks_per_usec * wd_timer_period_ms * 1000;
 	int cpu = smp_processor_id();
 
-	watchdog_timer_interrupt(cpu);
+	if (get_tb() - per_cpu(wd_timer_tb, cpu) >= ticks)
+		watchdog_timer_interrupt(cpu);
 }
 EXPORT_SYMBOL(arch_touch_nmi_watchdog);
 
-- 
cgit v1.2.3


From 8e23692175ad465628b8c86c1acc154fecad97be Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Wed, 9 Aug 2017 22:41:24 +1000
Subject: powerpc/watchdog: Fix final-check recovered case

When the watchdog decides to panic, it takes the lock and double
checks everything (to avoid races with the CPU being unstuck or
panic()ed by something else).

The exit label was misplaced and would result in all-CPUs backtrace
and watchdog panic even in the case that the condition was found to be
resolved.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/watchdog.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
index 426dd34891d6..7d16dafa1bb6 100644
--- a/arch/powerpc/kernel/watchdog.c
+++ b/arch/powerpc/kernel/watchdog.c
@@ -144,7 +144,6 @@ static void watchdog_smp_panic(int cpu, u64 tb)
 	for_each_cpu(c, &wd_smp_cpus_pending)
 		set_cpu_stuck(c, tb);
 
-out:
 	wd_smp_unlock(&flags);
 
 	printk_safe_flush();
@@ -157,6 +156,11 @@ out:
 
 	if (hardlockup_panic)
 		nmi_panic(NULL, "Hard LOCKUP");
+
+	return;
+
+out:
+	wd_smp_unlock(&flags);
 }
 
 static void wd_smp_clear_cpu_pending(int cpu, u64 tb)
-- 
cgit v1.2.3


From 87607a30be92f1ecee3af6f4a5779e179db98118 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Wed, 9 Aug 2017 22:41:25 +1000
Subject: powerpc/watchdog: Fix marking of stuck CPUs

When the SMP detector finds other CPUs stuck, it iterates over
them and marks them as stuck. This pulls them out of the pending
mask and allows the detector to continue with remaining good
CPUs (if nmi_watchdog=panic is not enabled).

The code to dothat was buggy because when setting a CPU stuck,
if the pending mask became empty, it resets it to keep the
watchdog running. However the iterator will continue to run
over the new pending mask and mark remaining good CPUs sas stuck.

Fix this by doing it with cpumask bitwise operations.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/watchdog.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
index 7d16dafa1bb6..12e90ae712fe 100644
--- a/arch/powerpc/kernel/watchdog.c
+++ b/arch/powerpc/kernel/watchdog.c
@@ -101,10 +101,10 @@ static void wd_lockup_ipi(struct pt_regs *regs)
 		nmi_panic(regs, "Hard LOCKUP");
 }
 
-static void set_cpu_stuck(int cpu, u64 tb)
+static void set_cpumask_stuck(const struct cpumask *cpumask, u64 tb)
 {
-	cpumask_set_cpu(cpu, &wd_smp_cpus_stuck);
-	cpumask_clear_cpu(cpu, &wd_smp_cpus_pending);
+	cpumask_or(&wd_smp_cpus_stuck, &wd_smp_cpus_stuck, cpumask);
+	cpumask_andnot(&wd_smp_cpus_pending, &wd_smp_cpus_pending, cpumask);
 	if (cpumask_empty(&wd_smp_cpus_pending)) {
 		wd_smp_last_reset_tb = tb;
 		cpumask_andnot(&wd_smp_cpus_pending,
@@ -112,6 +112,10 @@ static void set_cpu_stuck(int cpu, u64 tb)
 				&wd_smp_cpus_stuck);
 	}
 }
+static void set_cpu_stuck(int cpu, u64 tb)
+{
+	set_cpumask_stuck(cpumask_of(cpu), tb);
+}
 
 static void watchdog_smp_panic(int cpu, u64 tb)
 {
@@ -140,9 +144,8 @@ static void watchdog_smp_panic(int cpu, u64 tb)
 	}
 	smp_flush_nmi_ipi(1000000);
 
-	/* Take the stuck CPU out of the watch group */
-	for_each_cpu(c, &wd_smp_cpus_pending)
-		set_cpu_stuck(c, tb);
+	/* Take the stuck CPUs out of the watch group */
+	set_cpumask_stuck(&wd_smp_cpus_pending, tb);
 
 	wd_smp_unlock(&flags);
 
-- 
cgit v1.2.3


From 96ea91e7b6ee2c406598d859e7348b4829404eea Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Wed, 9 Aug 2017 22:41:26 +1000
Subject: powerpc/watchdog: add locking around init/exit functions

When CPUs start and stop the watchdog, they manipulate shared data
that is normally protected by the lock. Other CPUs can be running
concurrently at this time, so it's a good idea to use locking here
to be on the safe side.

Remove the barrier which is undocumented and didn't do anything.

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/watchdog.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c
index 12e90ae712fe..34721a257a77 100644
--- a/arch/powerpc/kernel/watchdog.c
+++ b/arch/powerpc/kernel/watchdog.c
@@ -297,6 +297,8 @@ static void stop_watchdog_timer_on(unsigned int cpu)
 
 static int start_wd_on_cpu(unsigned int cpu)
 {
+	unsigned long flags;
+
 	if (cpumask_test_cpu(cpu, &wd_cpus_enabled)) {
 		WARN_ON(1);
 		return 0;
@@ -311,12 +313,14 @@ static int start_wd_on_cpu(unsigned int cpu)
 	if (!cpumask_test_cpu(cpu, &watchdog_cpumask))
 		return 0;
 
+	wd_smp_lock(&flags);
 	cpumask_set_cpu(cpu, &wd_cpus_enabled);
 	if (cpumask_weight(&wd_cpus_enabled) == 1) {
 		cpumask_set_cpu(cpu, &wd_smp_cpus_pending);
 		wd_smp_last_reset_tb = get_tb();
 	}
-	smp_wmb();
+	wd_smp_unlock(&flags);
+
 	start_watchdog_timer_on(cpu);
 
 	return 0;
@@ -324,12 +328,17 @@ static int start_wd_on_cpu(unsigned int cpu)
 
 static int stop_wd_on_cpu(unsigned int cpu)
 {
+	unsigned long flags;
+
 	if (!cpumask_test_cpu(cpu, &wd_cpus_enabled))
 		return 0; /* Can happen in CPU unplug case */
 
 	stop_watchdog_timer_on(cpu);
 
+	wd_smp_lock(&flags);
 	cpumask_clear_cpu(cpu, &wd_cpus_enabled);
+	wd_smp_unlock(&flags);
+
 	wd_smp_clear_cpu_pending(cpu, get_tb());
 
 	return 0;
-- 
cgit v1.2.3


From 5a69aec945d27e78abac9fd032533d3aaebf7c1e Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Wed, 16 Aug 2017 16:01:14 +1000
Subject: powerpc: Fix VSX enabling/flushing to also test MSR_FP and MSR_VEC

VSX uses a combination of the old vector registers, the old FP
registers and new "second halves" of the FP registers.

Thus when we need to see the VSX state in the thread struct
(flush_vsx_to_thread()) or when we'll use the VSX in the kernel
(enable_kernel_vsx()) we need to ensure they are all flushed into
the thread struct if either of them is individually enabled.

Unfortunately we only tested if the whole VSX was enabled, not if they
were individually enabled.

Fixes: 72cd7b44bc99 ("powerpc: Uncomment and make enable_kernel_vsx() routine available")
Cc: stable@vger.kernel.org # v4.3+
Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
---
 arch/powerpc/kernel/process.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index ec480966f9bf..1f0fd361e09b 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -362,7 +362,8 @@ void enable_kernel_vsx(void)
 
 	cpumsr = msr_check_and_set(MSR_FP|MSR_VEC|MSR_VSX);
 
-	if (current->thread.regs && (current->thread.regs->msr & MSR_VSX)) {
+	if (current->thread.regs &&
+	    (current->thread.regs->msr & (MSR_VSX|MSR_VEC|MSR_FP))) {
 		check_if_tm_restore_required(current);
 		/*
 		 * If a thread has already been reclaimed then the
@@ -386,7 +387,7 @@ void flush_vsx_to_thread(struct task_struct *tsk)
 {
 	if (tsk->thread.regs) {
 		preempt_disable();
-		if (tsk->thread.regs->msr & MSR_VSX) {
+		if (tsk->thread.regs->msr & (MSR_VSX|MSR_VEC|MSR_FP)) {
 			BUG_ON(tsk != current);
 			giveup_vsx(tsk);
 		}
-- 
cgit v1.2.3


From 92e5aae457787d0bc6b255200d2fb116edf69794 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Fri, 18 Aug 2017 15:15:51 -0700
Subject: kernel/watchdog: fix Kconfig constraints for perf hardlockup watchdog

Commit 05a4a9527931 ("kernel/watchdog: split up config options") lost
the perf-based hardlockup detector's dependency on PERF_EVENTS, which
can result in broken builds with some powerpc configurations.

Restore the dependency.  Add it in for x86 too, despite x86 always
selecting PERF_EVENTS it seems reasonable to make the dependency
explicit.

Link: http://lkml.kernel.org/r/20170810114452.6673-1-npiggin@gmail.com
Fixes: 05a4a9527931 ("kernel/watchdog: split up config options")
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Acked-by: Don Zickus <dzickus@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
---
 arch/powerpc/Kconfig | 2 +-
 arch/x86/Kconfig     | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig
index 36f858c37ca7..81b0031f909f 100644
--- a/arch/powerpc/Kconfig
+++ b/arch/powerpc/Kconfig
@@ -199,7 +199,7 @@ config PPC
 	select HAVE_OPTPROBES			if PPC64
 	select HAVE_PERF_EVENTS
 	select HAVE_PERF_EVENTS_NMI		if PPC64
-	select HAVE_HARDLOCKUP_DETECTOR_PERF	if HAVE_PERF_EVENTS_NMI && !HAVE_HARDLOCKUP_DETECTOR_ARCH
+	select HAVE_HARDLOCKUP_DETECTOR_PERF	if PERF_EVENTS && HAVE_PERF_EVENTS_NMI && !HAVE_HARDLOCKUP_DETECTOR_ARCH
 	select HAVE_PERF_REGS
 	select HAVE_PERF_USER_STACK_DUMP
 	select HAVE_RCU_TABLE_FREE		if SMP
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 781521b7cf9e..29a1bf85e507 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -163,7 +163,7 @@ config X86
 	select HAVE_PCSPKR_PLATFORM
 	select HAVE_PERF_EVENTS
 	select HAVE_PERF_EVENTS_NMI
-	select HAVE_HARDLOCKUP_DETECTOR_PERF	if HAVE_PERF_EVENTS_NMI
+	select HAVE_HARDLOCKUP_DETECTOR_PERF	if PERF_EVENTS && HAVE_PERF_EVENTS_NMI
 	select HAVE_PERF_REGS
 	select HAVE_PERF_USER_STACK_DUMP
 	select HAVE_REGS_AND_STACK_ACCESS_API
-- 
cgit v1.2.3


From bd0fdb191c8523a9126bb14ac1b22cb47698ebf5 Mon Sep 17 00:00:00 2001
From: Nicholas Piggin <npiggin@gmail.com>
Date: Mon, 13 Mar 2017 03:03:49 +1000
Subject: KVM: PPC: Book3S HV: Use msgsync with hypervisor doorbells on POWER9

When msgsnd is used for IPIs to other cores, msgsync must be executed by
the target to order stores performed on the source before its msgsnd
(provided the source executes the appropriate sync).

Fixes: 1704a81ccebc ("KVM: PPC: Book3S HV: Use msgsnd for IPIs to other cores on POWER9")
Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 3 +++
 1 file changed, 3 insertions(+)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index c52184a8efdf..9c9c983b864f 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -1291,6 +1291,9 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 	/* Hypervisor doorbell - exit only if host IPI flag set */
 	cmpwi	r12, BOOK3S_INTERRUPT_H_DOORBELL
 	bne	3f
+BEGIN_FTR_SECTION
+	PPC_MSGSYNC
+END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300)
 	lbz	r0, HSTATE_HOST_IPI(r13)
 	cmpwi	r0, 0
 	beq	4f
-- 
cgit v1.2.3


From 2c4fb78f78b6e420604ee1b05bdfb5c1d637869f Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Fri, 18 Aug 2017 12:10:52 +1000
Subject: KVM: PPC: Book3S HV: Workaround POWER9 DD1.0 bug causing IPB bit loss

This adds a workaround for a bug in POWER9 DD1 chips where changing
the CPPR (Current Processor Priority Register) can cause bits in the
IPB (Interrupt Pending Buffer) to get lost.  Thankfully it only
happens when manually manipulating CPPR which is quite rare.  When it
does happen it can cause interrupts to be delayed or lost.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/kvm/book3s_xive_template.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kvm/book3s_xive_template.c b/arch/powerpc/kvm/book3s_xive_template.c
index 4636ca6e7d38..150be86b1018 100644
--- a/arch/powerpc/kvm/book3s_xive_template.c
+++ b/arch/powerpc/kvm/book3s_xive_template.c
@@ -16,7 +16,16 @@ static void GLUE(X_PFX,ack_pending)(struct kvmppc_xive_vcpu *xc)
 	u8 cppr;
 	u16 ack;
 
-	/* XXX DD1 bug workaround: Check PIPR vs. CPPR first ! */
+	/*
+	 * DD1 bug workaround: If PIPR is less favored than CPPR
+	 * ignore the interrupt or we might incorrectly lose an IPB
+	 * bit.
+	 */
+	if (cpu_has_feature(CPU_FTR_POWER9_DD1)) {
+		u8 pipr = __x_readb(__x_tima + TM_QW1_OS + TM_PIPR);
+		if (pipr >= xc->hw_cppr)
+			return;
+	}
 
 	/* Perform the acknowledge OS to register cycle. */
 	ack = be16_to_cpu(__x_readw(__x_tima + TM_SPC_ACK_OS_REG));
-- 
cgit v1.2.3


From bb9b52bd51dcb17b965a30167d0812902c1b9927 Mon Sep 17 00:00:00 2001
From: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Date: Fri, 18 Aug 2017 12:10:58 +1000
Subject: KVM: PPC: Book3S HV: Add missing barriers to XIVE code and document
 them

This adds missing memory barriers to order updates/tests of
the virtual CPPR and MFRR, thus fixing a lost IPI problem.

While at it also document all barriers in this file.

This fixes a bug causing guest IPIs to occasionally get lost.  The
symptom then is hangs or stalls in the guest.

Signed-off-by: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Tested-by: Guilherme G. Piccoli <gpiccoli@linux.vnet.ibm.com>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/kvm/book3s_xive_template.c | 57 +++++++++++++++++++++++++++++++--
 1 file changed, 55 insertions(+), 2 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kvm/book3s_xive_template.c b/arch/powerpc/kvm/book3s_xive_template.c
index 150be86b1018..d1ed2c41b5d2 100644
--- a/arch/powerpc/kvm/book3s_xive_template.c
+++ b/arch/powerpc/kvm/book3s_xive_template.c
@@ -16,6 +16,12 @@ static void GLUE(X_PFX,ack_pending)(struct kvmppc_xive_vcpu *xc)
 	u8 cppr;
 	u16 ack;
 
+	/*
+	 * Ensure any previous store to CPPR is ordered vs.
+	 * the subsequent loads from PIPR or ACK.
+	 */
+	eieio();
+
 	/*
 	 * DD1 bug workaround: If PIPR is less favored than CPPR
 	 * ignore the interrupt or we might incorrectly lose an IPB
@@ -244,6 +250,11 @@ skip_ipi:
 	/*
 	 * If we found an interrupt, adjust what the guest CPPR should
 	 * be as if we had just fetched that interrupt from HW.
+	 *
+	 * Note: This can only make xc->cppr smaller as the previous
+	 * loop will only exit with hirq != 0 if prio is lower than
+	 * the current xc->cppr. Thus we don't need to re-check xc->mfrr
+	 * for pending IPIs.
 	 */
 	if (hirq)
 		xc->cppr = prio;
@@ -389,6 +400,12 @@ X_STATIC int GLUE(X_PFX,h_cppr)(struct kvm_vcpu *vcpu, unsigned long cppr)
 	old_cppr = xc->cppr;
 	xc->cppr = cppr;
 
+	/*
+	 * Order the above update of xc->cppr with the subsequent
+	 * read of xc->mfrr inside push_pending_to_hw()
+	 */
+	smp_mb();
+
 	/*
 	 * We are masking less, we need to look for pending things
 	 * to deliver and set VP pending bits accordingly to trigger
@@ -429,21 +446,37 @@ X_STATIC int GLUE(X_PFX,h_eoi)(struct kvm_vcpu *vcpu, unsigned long xirr)
 	 * used to signal MFRR changes is EOId when fetched from
 	 * the queue.
 	 */
-	if (irq == XICS_IPI || irq == 0)
+	if (irq == XICS_IPI || irq == 0) {
+		/*
+		 * This barrier orders the setting of xc->cppr vs.
+		 * subsquent test of xc->mfrr done inside
+		 * scan_interrupts and push_pending_to_hw
+		 */
+		smp_mb();
 		goto bail;
+	}
 
 	/* Find interrupt source */
 	sb = kvmppc_xive_find_source(xive, irq, &src);
 	if (!sb) {
 		pr_devel(" source not found !\n");
 		rc = H_PARAMETER;
+		/* Same as above */
+		smp_mb();
 		goto bail;
 	}
 	state = &sb->irq_state[src];
 	kvmppc_xive_select_irq(state, &hw_num, &xd);
 
 	state->in_eoi = true;
-	mb();
+
+	/*
+	 * This barrier orders both setting of in_eoi above vs,
+	 * subsequent test of guest_priority, and the setting
+	 * of xc->cppr vs. subsquent test of xc->mfrr done inside
+	 * scan_interrupts and push_pending_to_hw
+	 */
+	smp_mb();
 
 again:
 	if (state->guest_priority == MASKED) {
@@ -470,6 +503,14 @@ again:
 
 	}
 
+	/*
+	 * This barrier orders the above guest_priority check
+	 * and spin_lock/unlock with clearing in_eoi below.
+	 *
+	 * It also has to be a full mb() as it must ensure
+	 * the MMIOs done in source_eoi() are completed before
+	 * state->in_eoi is visible.
+	 */
 	mb();
 	state->in_eoi = false;
 bail:
@@ -504,6 +545,18 @@ X_STATIC int GLUE(X_PFX,h_ipi)(struct kvm_vcpu *vcpu, unsigned long server,
 	/* Locklessly write over MFRR */
 	xc->mfrr = mfrr;
 
+	/*
+	 * The load of xc->cppr below and the subsequent MMIO store
+	 * to the IPI must happen after the above mfrr update is
+	 * globally visible so that:
+	 *
+	 * - Synchronize with another CPU doing an H_EOI or a H_CPPR
+	 *   updating xc->cppr then reading xc->mfrr.
+	 *
+	 * - The target of the IPI sees the xc->mfrr update
+	 */
+	mb();
+
 	/* Shoot the IPI if most favored than target cppr */
 	if (mfrr < xc->cppr)
 		__x_writeq(0, __x_trig_page(&xc->vp_ipi_data));
-- 
cgit v1.2.3


From 47c5310a8dbe7c2cb9f0083daa43ceed76c257fa Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Thu, 24 Aug 2017 19:14:47 +1000
Subject: KVM: PPC: Book3S: Fix race and leak in
 kvm_vm_ioctl_create_spapr_tce()

Nixiaoming pointed out that there is a memory leak in
kvm_vm_ioctl_create_spapr_tce() if the call to anon_inode_getfd()
fails; the memory allocated for the kvmppc_spapr_tce_table struct
is not freed, and nor are the pages allocated for the iommu
tables.  In addition, we have already incremented the process's
count of locked memory pages, and this doesn't get restored on
error.

David Hildenbrand pointed out that there is a race in that the
function checks early on that there is not already an entry in the
stt->iommu_tables list with the same LIOBN, but an entry with the
same LIOBN could get added between then and when the new entry is
added to the list.

This fixes all three problems.  To simplify things, we now call
anon_inode_getfd() before placing the new entry in the list.  The
check for an existing entry is done while holding the kvm->lock
mutex, immediately before adding the new entry to the list.
Finally, on failure we now call kvmppc_account_memlimit to
decrement the process's count of locked memory pages.

Reported-by: Nixiaoming <nixiaoming@huawei.com>
Reported-by: David Hildenbrand <david@redhat.com>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
---
 arch/powerpc/kvm/book3s_64_vio.c | 56 ++++++++++++++++++++++++----------------
 1 file changed, 34 insertions(+), 22 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index a160c14304eb..53766e2bc029 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -294,32 +294,26 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
 				   struct kvm_create_spapr_tce_64 *args)
 {
 	struct kvmppc_spapr_tce_table *stt = NULL;
+	struct kvmppc_spapr_tce_table *siter;
 	unsigned long npages, size;
 	int ret = -ENOMEM;
 	int i;
+	int fd = -1;
 
 	if (!args->size)
 		return -EINVAL;
 
-	/* Check this LIOBN hasn't been previously allocated */
-	list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
-		if (stt->liobn == args->liobn)
-			return -EBUSY;
-	}
-
 	size = _ALIGN_UP(args->size, PAGE_SIZE >> 3);
 	npages = kvmppc_tce_pages(size);
 	ret = kvmppc_account_memlimit(kvmppc_stt_pages(npages), true);
-	if (ret) {
-		stt = NULL;
-		goto fail;
-	}
+	if (ret)
+		return ret;
 
 	ret = -ENOMEM;
 	stt = kzalloc(sizeof(*stt) + npages * sizeof(struct page *),
 		      GFP_KERNEL);
 	if (!stt)
-		goto fail;
+		goto fail_acct;
 
 	stt->liobn = args->liobn;
 	stt->page_shift = args->page_shift;
@@ -334,24 +328,42 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
 			goto fail;
 	}
 
-	kvm_get_kvm(kvm);
+	ret = fd = anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops,
+				    stt, O_RDWR | O_CLOEXEC);
+	if (ret < 0)
+		goto fail;
 
 	mutex_lock(&kvm->lock);
-	list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables);
+
+	/* Check this LIOBN hasn't been previously allocated */
+	ret = 0;
+	list_for_each_entry(siter, &kvm->arch.spapr_tce_tables, list) {
+		if (siter->liobn == args->liobn) {
+			ret = -EBUSY;
+			break;
+		}
+	}
+
+	if (!ret) {
+		list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables);
+		kvm_get_kvm(kvm);
+	}
 
 	mutex_unlock(&kvm->lock);
 
-	return anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops,
-				stt, O_RDWR | O_CLOEXEC);
+	if (!ret)
+		return fd;
 
-fail:
-	if (stt) {
-		for (i = 0; i < npages; i++)
-			if (stt->pages[i])
-				__free_page(stt->pages[i]);
+	put_unused_fd(fd);
 
-		kfree(stt);
-	}
+ fail:
+	for (i = 0; i < npages; i++)
+		if (stt->pages[i])
+			__free_page(stt->pages[i]);
+
+	kfree(stt);
+ fail_acct:
+	kvmppc_account_memlimit(kvmppc_stt_pages(npages), false);
 	return ret;
 }
 
-- 
cgit v1.2.3


From edd03602d97236e8fea13cd76886c576186aa307 Mon Sep 17 00:00:00 2001
From: Paul Mackerras <paulus@ozlabs.org>
Date: Mon, 28 Aug 2017 14:31:24 +1000
Subject: KVM: PPC: Book3S HV: Protect updates to spapr_tce_tables list

Al Viro pointed out that while one thread of a process is executing
in kvm_vm_ioctl_create_spapr_tce(), another thread could guess the
file descriptor returned by anon_inode_getfd() and close() it before
the first thread has added it to the kvm->arch.spapr_tce_tables list.
That highlights a more general problem: there is no mutual exclusion
between writers to the spapr_tce_tables list, leading to the
possibility of the list becoming corrupted, which could cause a
host kernel crash.

To fix the mutual exclusion problem, we add a mutex_lock/unlock
pair around the list_del_rce in kvm_spapr_tce_release().  Also,
this moves the call to anon_inode_getfd() inside the region
protected by the kvm->lock mutex, after we have done the check for
a duplicate LIOBN.  This means that if another thread does guess the
file descriptor and closes it, its call to kvm_spapr_tce_release()
will not do any harm because it will have to wait until the first
thread has released kvm->lock.  With this, there are no failure
points in kvm_vm_ioctl_create_spapr_tce() after the call to
anon_inode_getfd().

The other things that the second thread could do with the guessed
file descriptor are to mmap it or to pass it as a parameter to a
KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE ioctl on a KVM device fd.  An mmap
call won't cause any harm because kvm_spapr_tce_mmap() and
kvm_spapr_tce_fault() don't access the spapr_tce_tables list or
the kvmppc_spapr_tce_table.list field, and the fields that they do use
have been properly initialized by the time of the anon_inode_getfd()
call.

The KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE ioctl calls
kvm_spapr_tce_attach_iommu_group(), which scans the spapr_tce_tables
list looking for the kvmppc_spapr_tce_table struct corresponding to
the fd given as the parameter.  Either it will find the new entry
or it won't; if it doesn't, it just returns an error, and if it
does, it will function normally.  So, in each case there is no
harmful effect.

Reviewed-by: David Gibson <david@gibson.dropbear.id.au>
Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---
 arch/powerpc/kvm/book3s_64_vio.c | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

(limited to 'arch/powerpc')

diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index 53766e2bc029..8f2da8bba737 100644
--- a/arch/powerpc/kvm/book3s_64_vio.c
+++ b/arch/powerpc/kvm/book3s_64_vio.c
@@ -265,8 +265,11 @@ static int kvm_spapr_tce_release(struct inode *inode, struct file *filp)
 {
 	struct kvmppc_spapr_tce_table *stt = filp->private_data;
 	struct kvmppc_spapr_tce_iommu_table *stit, *tmp;
+	struct kvm *kvm = stt->kvm;
 
+	mutex_lock(&kvm->lock);
 	list_del_rcu(&stt->list);
+	mutex_unlock(&kvm->lock);
 
 	list_for_each_entry_safe(stit, tmp, &stt->iommu_tables, next) {
 		WARN_ON(!kref_read(&stit->kref));
@@ -298,7 +301,6 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
 	unsigned long npages, size;
 	int ret = -ENOMEM;
 	int i;
-	int fd = -1;
 
 	if (!args->size)
 		return -EINVAL;
@@ -328,11 +330,6 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
 			goto fail;
 	}
 
-	ret = fd = anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops,
-				    stt, O_RDWR | O_CLOEXEC);
-	if (ret < 0)
-		goto fail;
-
 	mutex_lock(&kvm->lock);
 
 	/* Check this LIOBN hasn't been previously allocated */
@@ -344,17 +341,19 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
 		}
 	}
 
-	if (!ret) {
+	if (!ret)
+		ret = anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops,
+				       stt, O_RDWR | O_CLOEXEC);
+
+	if (ret >= 0) {
 		list_add_rcu(&stt->list, &kvm->arch.spapr_tce_tables);
 		kvm_get_kvm(kvm);
 	}
 
 	mutex_unlock(&kvm->lock);
 
-	if (!ret)
-		return fd;
-
-	put_unused_fd(fd);
+	if (ret >= 0)
+		return ret;
 
  fail:
 	for (i = 0; i < npages; i++)
-- 
cgit v1.2.3