From b6ccb9803e90c16b212cf4ed62913a7591e79a39 Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Fri, 7 Feb 2014 19:12:27 +0100
Subject: ARM: 7954/1: mm: remove remaining domain support from ARMv6

CPU_32v6 currently selects CPU_USE_DOMAINS if CPU_V6 and MMU. This is
because ARM 1136 r0pX CPUs lack the v6k extensions, and therefore do
not have hardware thread registers. The lack of these registers requires
the kernel to update the vectors page at each context switch in order to
write a new TLS pointer. This write must be done via the userspace
mapping, since aliasing caches can lead to expensive flushing when using
kmap. Finally, this requires the vectors page to be mapped r/w for
kernel and r/o for user, which has implications for things like put_user
which must trigger CoW appropriately when targetting user pages.

The upshot of all this is that a v6/v7 kernel makes use of domains to
segregate kernel and user memory accesses. This has the nasty
side-effect of making device mappings executable, which has been
observed to cause subtle bugs on recent cores (e.g. Cortex-A15
performing a speculative instruction fetch from the GIC and acking an
interrupt in the process).

This patch solves this problem by removing the remaining domain support
from ARMv6. A new memory type is added specifically for the vectors page
which allows that page (and only that page) to be mapped as user r/o,
kernel r/w. All other user r/o pages are mapped also as kernel r/o.
Patch co-developed with Russell King.

Cc: <stable@vger.kernel.org>
Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/include/asm/futex.h | 6 ------
 1 file changed, 6 deletions(-)

(limited to 'arch/arm/include/asm/futex.h')

diff --git a/arch/arm/include/asm/futex.h b/arch/arm/include/asm/futex.h
index e42cf597f6e6..2aff798fbef4 100644
--- a/arch/arm/include/asm/futex.h
+++ b/arch/arm/include/asm/futex.h
@@ -3,11 +3,6 @@
 
 #ifdef __KERNEL__
 
-#if defined(CONFIG_CPU_USE_DOMAINS) && defined(CONFIG_SMP)
-/* ARM doesn't provide unprivileged exclusive memory accessors */
-#include <asm-generic/futex.h>
-#else
-
 #include <linux/futex.h>
 #include <linux/uaccess.h>
 #include <asm/errno.h>
@@ -164,6 +159,5 @@ futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr)
 	return ret;
 }
 
-#endif /* !(CPU_USE_DOMAINS && SMP) */
 #endif /* __KERNEL__ */
 #endif /* _ASM_ARM_FUTEX_H */
-- 
cgit v1.2.3


From c32ffce0f66e5d1d4856254516e24f5ef275cd00 Mon Sep 17 00:00:00 2001
From: Will Deacon <will.deacon@arm.com>
Date: Fri, 21 Feb 2014 17:01:48 +0100
Subject: ARM: 7984/1: prefetch: add prefetchw invocations for barriered
 atomics

After a bunch of benchmarking on the interaction between dmb and pldw,
it turns out that issuing the pldw *after* the dmb instruction can
give modest performance gains (~3% atomic_add_return improvement on a
dual A15).

This patch adds prefetchw invocations to our barriered atomic operations
including cmpxchg, test_and_xxx and futexes.

Signed-off-by: Will Deacon <will.deacon@arm.com>
Signed-off-by: Russell King <rmk+kernel@arm.linux.org.uk>
---
 arch/arm/include/asm/atomic.h  | 9 +++++++++
 arch/arm/include/asm/cmpxchg.h | 6 ++++++
 arch/arm/include/asm/futex.h   | 3 +++
 arch/arm/lib/bitops.h          | 5 +++++
 4 files changed, 23 insertions(+)

(limited to 'arch/arm/include/asm/futex.h')

diff --git a/arch/arm/include/asm/atomic.h b/arch/arm/include/asm/atomic.h
index 62d2cb53b069..6e410090896e 100644
--- a/arch/arm/include/asm/atomic.h
+++ b/arch/arm/include/asm/atomic.h
@@ -60,6 +60,7 @@ static inline int atomic_add_return(int i, atomic_t *v)
 	int result;
 
 	smp_mb();
+	prefetchw(&v->counter);
 
 	__asm__ __volatile__("@ atomic_add_return\n"
 "1:	ldrex	%0, [%3]\n"
@@ -99,6 +100,7 @@ static inline int atomic_sub_return(int i, atomic_t *v)
 	int result;
 
 	smp_mb();
+	prefetchw(&v->counter);
 
 	__asm__ __volatile__("@ atomic_sub_return\n"
 "1:	ldrex	%0, [%3]\n"
@@ -121,6 +123,7 @@ static inline int atomic_cmpxchg(atomic_t *ptr, int old, int new)
 	unsigned long res;
 
 	smp_mb();
+	prefetchw(&ptr->counter);
 
 	do {
 		__asm__ __volatile__("@ atomic_cmpxchg\n"
@@ -299,6 +302,7 @@ static inline long long atomic64_add_return(long long i, atomic64_t *v)
 	unsigned long tmp;
 
 	smp_mb();
+	prefetchw(&v->counter);
 
 	__asm__ __volatile__("@ atomic64_add_return\n"
 "1:	ldrexd	%0, %H0, [%3]\n"
@@ -340,6 +344,7 @@ static inline long long atomic64_sub_return(long long i, atomic64_t *v)
 	unsigned long tmp;
 
 	smp_mb();
+	prefetchw(&v->counter);
 
 	__asm__ __volatile__("@ atomic64_sub_return\n"
 "1:	ldrexd	%0, %H0, [%3]\n"
@@ -364,6 +369,7 @@ static inline long long atomic64_cmpxchg(atomic64_t *ptr, long long old,
 	unsigned long res;
 
 	smp_mb();
+	prefetchw(&ptr->counter);
 
 	do {
 		__asm__ __volatile__("@ atomic64_cmpxchg\n"
@@ -388,6 +394,7 @@ static inline long long atomic64_xchg(atomic64_t *ptr, long long new)
 	unsigned long tmp;
 
 	smp_mb();
+	prefetchw(&ptr->counter);
 
 	__asm__ __volatile__("@ atomic64_xchg\n"
 "1:	ldrexd	%0, %H0, [%3]\n"
@@ -409,6 +416,7 @@ static inline long long atomic64_dec_if_positive(atomic64_t *v)
 	unsigned long tmp;
 
 	smp_mb();
+	prefetchw(&v->counter);
 
 	__asm__ __volatile__("@ atomic64_dec_if_positive\n"
 "1:	ldrexd	%0, %H0, [%3]\n"
@@ -436,6 +444,7 @@ static inline int atomic64_add_unless(atomic64_t *v, long long a, long long u)
 	int ret = 1;
 
 	smp_mb();
+	prefetchw(&v->counter);
 
 	__asm__ __volatile__("@ atomic64_add_unless\n"
 "1:	ldrexd	%0, %H0, [%4]\n"
diff --git a/arch/arm/include/asm/cmpxchg.h b/arch/arm/include/asm/cmpxchg.h
index df2fbba7efc8..abb2c3769b01 100644
--- a/arch/arm/include/asm/cmpxchg.h
+++ b/arch/arm/include/asm/cmpxchg.h
@@ -2,6 +2,7 @@
 #define __ASM_ARM_CMPXCHG_H
 
 #include <linux/irqflags.h>
+#include <linux/prefetch.h>
 #include <asm/barrier.h>
 
 #if defined(CONFIG_CPU_SA1100) || defined(CONFIG_CPU_SA110)
@@ -35,6 +36,7 @@ static inline unsigned long __xchg(unsigned long x, volatile void *ptr, int size
 #endif
 
 	smp_mb();
+	prefetchw((const void *)ptr);
 
 	switch (size) {
 #if __LINUX_ARM_ARCH__ >= 6
@@ -138,6 +140,8 @@ static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old,
 {
 	unsigned long oldval, res;
 
+	prefetchw((const void *)ptr);
+
 	switch (size) {
 #ifndef CONFIG_CPU_V6	/* min ARCH >= ARMv6K */
 	case 1:
@@ -230,6 +234,8 @@ static inline unsigned long long __cmpxchg64(unsigned long long *ptr,
 	unsigned long long oldval;
 	unsigned long res;
 
+	prefetchw(ptr);
+
 	__asm__ __volatile__(
 "1:	ldrexd		%1, %H1, [%3]\n"
 "	teq		%1, %4\n"
diff --git a/arch/arm/include/asm/futex.h b/arch/arm/include/asm/futex.h
index 2aff798fbef4..53e69dae796f 100644
--- a/arch/arm/include/asm/futex.h
+++ b/arch/arm/include/asm/futex.h
@@ -23,6 +23,7 @@
 
 #define __futex_atomic_op(insn, ret, oldval, tmp, uaddr, oparg)	\
 	smp_mb();						\
+	prefetchw(uaddr);					\
 	__asm__ __volatile__(					\
 	"1:	ldrex	%1, [%3]\n"				\
 	"	" insn "\n"					\
@@ -46,6 +47,8 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr,
 		return -EFAULT;
 
 	smp_mb();
+	/* Prefetching cannot fault */
+	prefetchw(uaddr);
 	__asm__ __volatile__("@futex_atomic_cmpxchg_inatomic\n"
 	"1:	ldrex	%1, [%4]\n"
 	"	teq	%1, %2\n"
diff --git a/arch/arm/lib/bitops.h b/arch/arm/lib/bitops.h
index 52886b89706c..9f12ed1eea86 100644
--- a/arch/arm/lib/bitops.h
+++ b/arch/arm/lib/bitops.h
@@ -37,6 +37,11 @@ UNWIND(	.fnstart	)
 	add	r1, r1, r0, lsl #2	@ Get word offset
 	mov	r3, r2, lsl r3		@ create mask
 	smp_dmb
+#if __LINUX_ARM_ARCH__ >= 7 && defined(CONFIG_SMP)
+	.arch_extension	mp
+	ALT_SMP(W(pldw)	[r1])
+	ALT_UP(W(nop))
+#endif
 1:	ldrex	r2, [r1]
 	ands	r0, r2, r3		@ save old value of bit
 	\instr	r2, r2, r3		@ toggle bit
-- 
cgit v1.2.3