From 442302820356977237e32a76a211e7942255003a Mon Sep 17 00:00:00 2001
From: Heiko Carstens
Date: Mon, 8 Sep 2014 08:20:43 +0200
Subject: s390/spinlock: optimize spin_unlock code

Use a memory barrier + store sequence instead of a load + compare
and swap sequence to unlock a spinlock and an rw lock.
For the spinlock case this saves us two memory reads and an unneeded
cpu serialization after the compare and swap instruction stored the
new value.

The kernel size (performance_defconfig) gets reduced by ~14k.
Average execution time of a tight inlined spin_unlock loop drops from
5.8ns to 0.7ns on a zEC12 machine.

An artificial stress test case, where several counters are protected
with a single spinlock and are only incremented while holding the
spinlock, shows ~30% improvement on a 4 cpu machine.

Signed-off-by: Heiko Carstens
Signed-off-by: Martin Schwidefsky
---
 arch/s390/include/asm/barrier.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/s390/include/asm/barrier.h b/arch/s390/include/asm/barrier.h
index 19ff956b752b..b5dce6544d76 100644
--- a/arch/s390/include/asm/barrier.h
+++ b/arch/s390/include/asm/barrier.h
@@ -15,11 +15,13 @@
 
 #ifdef CONFIG_HAVE_MARCH_Z196_FEATURES
 /* Fast-BCR without checkpoint synchronization */
-#define mb() do { asm volatile("bcr 14,0" : : : "memory"); } while (0)
+#define __ASM_BARRIER "bcr 14,0\n"
 #else
-#define mb() do { asm volatile("bcr 15,0" : : : "memory"); } while (0)
+#define __ASM_BARRIER "bcr 15,0\n"
 #endif
+#define mb() do { asm volatile(__ASM_BARRIER : : : "memory"); } while (0)
+
 #define rmb() mb()
 #define wmb() mb()
 #define read_barrier_depends() do { } while(0)
-- 
cgit v1.2.3
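
As a rough illustration of the two unlock strategies the commit message
contrasts, here is a minimal sketch using portable C11 atomics instead of
the s390 inline assembly. The lock type and helper names (demo_spinlock_t,
unlock_with_cas, unlock_with_barrier_store) are invented for this sketch,
and atomic_thread_fence stands in for the BCR serialization instruction
that __ASM_BARRIER emits on real hardware.

#include <stdatomic.h>

typedef struct {
	atomic_uint owner;	/* 0 = unlocked, non-zero = held */
} demo_spinlock_t;

/*
 * Old style: load the lock word, then compare-and-swap it back to zero.
 * This costs extra memory reads, and the compare-and-swap serializes
 * the cpu again after it has stored the new value.
 */
static void unlock_with_cas(demo_spinlock_t *lp)
{
	unsigned int old = atomic_load_explicit(&lp->owner,
						memory_order_relaxed);

	atomic_compare_exchange_strong_explicit(&lp->owner, &old, 0,
						memory_order_release,
						memory_order_relaxed);
}

/*
 * New style: one barrier so that all stores made inside the critical
 * section are visible before the lock word changes, then a plain store
 * of zero -- no load and no compare-and-swap.
 */
static void unlock_with_barrier_store(demo_spinlock_t *lp)
{
	atomic_thread_fence(memory_order_release);
	atomic_store_explicit(&lp->owner, 0, memory_order_relaxed);
}

The barrier-plus-store version is safe for unlocking because only the
lock holder can write the lock word at that point (waiters merely spin
attempting a compare-and-swap from zero), so the store itself needs no
atomic read-modify-write; the barrier alone has to order the critical
section's accesses before the releasing store.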