From 8a4e3a9ead7e37ce1505602b564c15da09ac039f Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 28 Feb 2013 17:47:36 +0100 Subject: ARM: 7659/1: mm: make mm->context.id an atomic64_t variable mm->context.id is updated under asid_lock when a new ASID is allocated to an mm_struct. However, it is also read without the lock when a task is being scheduled and checking whether or not the current ASID generation is up-to-date. If two threads of the same process are being scheduled in parallel and the bottom bits of the generation in their mm->context.id match the current generation (that is, the mm_struct has not been used for ~2^24 rollovers) then the non-atomic, lockless access to mm->context.id may yield the incorrect ASID. This patch fixes this issue by making mm->context.id and atomic64_t, ensuring that the generation is always read consistently. For code that only requires access to the ASID bits (e.g. TLB flushing by mm), then the value is accessed directly, which GCC converts to an ldrb. Cc: # 3.8 Reviewed-by: Catalin Marinas Signed-off-by: Will Deacon Signed-off-by: Russell King --- arch/arm/include/asm/mmu.h | 8 ++++---- arch/arm/include/asm/mmu_context.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'arch/arm/include') diff --git a/arch/arm/include/asm/mmu.h b/arch/arm/include/asm/mmu.h index 9f77e7804f3b..e3d55547e755 100644 --- a/arch/arm/include/asm/mmu.h +++ b/arch/arm/include/asm/mmu.h @@ -5,15 +5,15 @@ typedef struct { #ifdef CONFIG_CPU_HAS_ASID - u64 id; + atomic64_t id; #endif - unsigned int vmalloc_seq; + unsigned int vmalloc_seq; } mm_context_t; #ifdef CONFIG_CPU_HAS_ASID #define ASID_BITS 8 #define ASID_MASK ((~0ULL) << ASID_BITS) -#define ASID(mm) ((mm)->context.id & ~ASID_MASK) +#define ASID(mm) ((mm)->context.id.counter & ~ASID_MASK) #else #define ASID(mm) (0) #endif @@ -26,7 +26,7 @@ typedef struct { * modified for 2.6 by Hyok S. Choi */ typedef struct { - unsigned long end_brk; + unsigned long end_brk; } mm_context_t; #endif diff --git a/arch/arm/include/asm/mmu_context.h b/arch/arm/include/asm/mmu_context.h index e1f644bc7cc5..863a6611323c 100644 --- a/arch/arm/include/asm/mmu_context.h +++ b/arch/arm/include/asm/mmu_context.h @@ -25,7 +25,7 @@ void __check_vmalloc_seq(struct mm_struct *mm); #ifdef CONFIG_CPU_HAS_ASID void check_and_switch_context(struct mm_struct *mm, struct task_struct *tsk); -#define init_new_context(tsk,mm) ({ mm->context.id = 0; }) +#define init_new_context(tsk,mm) ({ atomic64_set(&mm->context.id, 0); 0; }) #else /* !CONFIG_CPU_HAS_ASID */ -- cgit v1.2.3 From 862c588f062fe9339a180cf6429e4df1855c376a Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Thu, 28 Feb 2013 17:48:11 +0100 Subject: ARM: 7660/1: tlb: add branch predictor maintenance operations The ARM architecture requires explicit branch predictor maintenance when updating an instruction stream for a given virtual address. In reality, this isn't so much of a burden because the branch predictor is flushed during the cache maintenance required to make the new instructions visible to the I-side of the processor. However, there are still some cases where explicit flushing is required, so add a local_bp_flush_all operation to deal with this. Reviewed-by: Catalin Marinas Signed-off-by: Will Deacon Signed-off-by: Russell King --- arch/arm/include/asm/tlbflush.h | 34 ++++++++++++++++++++++++++++------ arch/arm/kernel/smp_tlb.c | 12 ++++++++++++ 2 files changed, 40 insertions(+), 6 deletions(-) (limited to 'arch/arm/include') diff --git a/arch/arm/include/asm/tlbflush.h b/arch/arm/include/asm/tlbflush.h index 6e924d3a77eb..4db8c8820f0d 100644 --- a/arch/arm/include/asm/tlbflush.h +++ b/arch/arm/include/asm/tlbflush.h @@ -34,10 +34,13 @@ #define TLB_V6_D_ASID (1 << 17) #define TLB_V6_I_ASID (1 << 18) +#define TLB_V6_BP (1 << 19) + /* Unified Inner Shareable TLB operations (ARMv7 MP extensions) */ -#define TLB_V7_UIS_PAGE (1 << 19) -#define TLB_V7_UIS_FULL (1 << 20) -#define TLB_V7_UIS_ASID (1 << 21) +#define TLB_V7_UIS_PAGE (1 << 20) +#define TLB_V7_UIS_FULL (1 << 21) +#define TLB_V7_UIS_ASID (1 << 22) +#define TLB_V7_UIS_BP (1 << 23) #define TLB_BARRIER (1 << 28) #define TLB_L2CLEAN_FR (1 << 29) /* Feroceon */ @@ -150,7 +153,8 @@ #define v6wbi_tlb_flags (TLB_WB | TLB_DCLEAN | TLB_BARRIER | \ TLB_V6_I_FULL | TLB_V6_D_FULL | \ TLB_V6_I_PAGE | TLB_V6_D_PAGE | \ - TLB_V6_I_ASID | TLB_V6_D_ASID) + TLB_V6_I_ASID | TLB_V6_D_ASID | \ + TLB_V6_BP) #ifdef CONFIG_CPU_TLB_V6 # define v6wbi_possible_flags v6wbi_tlb_flags @@ -166,9 +170,11 @@ #endif #define v7wbi_tlb_flags_smp (TLB_WB | TLB_DCLEAN | TLB_BARRIER | \ - TLB_V7_UIS_FULL | TLB_V7_UIS_PAGE | TLB_V7_UIS_ASID) + TLB_V7_UIS_FULL | TLB_V7_UIS_PAGE | \ + TLB_V7_UIS_ASID | TLB_V7_UIS_BP) #define v7wbi_tlb_flags_up (TLB_WB | TLB_DCLEAN | TLB_BARRIER | \ - TLB_V6_U_FULL | TLB_V6_U_PAGE | TLB_V6_U_ASID) + TLB_V6_U_FULL | TLB_V6_U_PAGE | \ + TLB_V6_U_ASID | TLB_V6_BP) #ifdef CONFIG_CPU_TLB_V7 @@ -430,6 +436,20 @@ static inline void local_flush_tlb_kernel_page(unsigned long kaddr) } } +static inline void local_flush_bp_all(void) +{ + const int zero = 0; + const unsigned int __tlb_flag = __cpu_tlb_flags; + + if (tlb_flag(TLB_V7_UIS_BP)) + asm("mcr p15, 0, %0, c7, c1, 6" : : "r" (zero)); + else if (tlb_flag(TLB_V6_BP)) + asm("mcr p15, 0, %0, c7, c5, 6" : : "r" (zero)); + + if (tlb_flag(TLB_BARRIER)) + isb(); +} + /* * flush_pmd_entry * @@ -480,6 +500,7 @@ static inline void clean_pmd_entry(void *pmd) #define flush_tlb_kernel_page local_flush_tlb_kernel_page #define flush_tlb_range local_flush_tlb_range #define flush_tlb_kernel_range local_flush_tlb_kernel_range +#define flush_bp_all local_flush_bp_all #else extern void flush_tlb_all(void); extern void flush_tlb_mm(struct mm_struct *mm); @@ -487,6 +508,7 @@ extern void flush_tlb_page(struct vm_area_struct *vma, unsigned long uaddr); extern void flush_tlb_kernel_page(unsigned long kaddr); extern void flush_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); extern void flush_tlb_kernel_range(unsigned long start, unsigned long end); +extern void flush_bp_all(void); #endif /* diff --git a/arch/arm/kernel/smp_tlb.c b/arch/arm/kernel/smp_tlb.c index 02c5d2ce23bf..bd0300531399 100644 --- a/arch/arm/kernel/smp_tlb.c +++ b/arch/arm/kernel/smp_tlb.c @@ -64,6 +64,11 @@ static inline void ipi_flush_tlb_kernel_range(void *arg) local_flush_tlb_kernel_range(ta->ta_start, ta->ta_end); } +static inline void ipi_flush_bp_all(void *ignored) +{ + local_flush_bp_all(); +} + void flush_tlb_all(void) { if (tlb_ops_need_broadcast()) @@ -127,3 +132,10 @@ void flush_tlb_kernel_range(unsigned long start, unsigned long end) local_flush_tlb_kernel_range(start, end); } +void flush_bp_all(void) +{ + if (tlb_ops_need_broadcast()) + on_each_cpu(ipi_flush_bp_all, NULL, 1); + else + local_flush_bp_all(); +} -- cgit v1.2.3 From 3f7d1fe108dbaefd0c57a41753fc2c90b395f458 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 28 Feb 2013 21:53:35 +0100 Subject: ARM: 7665/1: Wire up kcmp syscall Wire up kcmp syscall for ability to proceed checkpoint/restore procedure on ARM platform. Signed-off-by: Alexander Kartashov Signed-off-by: Cyrill Gorcunov Acked-by: Arnd Bergmann Signed-off-by: Russell King --- arch/arm/include/uapi/asm/unistd.h | 2 +- arch/arm/kernel/calls.S | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'arch/arm/include') diff --git a/arch/arm/include/uapi/asm/unistd.h b/arch/arm/include/uapi/asm/unistd.h index 4da7cde70b5d..af33b44990ed 100644 --- a/arch/arm/include/uapi/asm/unistd.h +++ b/arch/arm/include/uapi/asm/unistd.h @@ -404,7 +404,7 @@ #define __NR_setns (__NR_SYSCALL_BASE+375) #define __NR_process_vm_readv (__NR_SYSCALL_BASE+376) #define __NR_process_vm_writev (__NR_SYSCALL_BASE+377) - /* 378 for kcmp */ +#define __NR_kcmp (__NR_SYSCALL_BASE+378) #define __NR_finit_module (__NR_SYSCALL_BASE+379) /* diff --git a/arch/arm/kernel/calls.S b/arch/arm/kernel/calls.S index 0cc57611fc4f..c6ca7e376773 100644 --- a/arch/arm/kernel/calls.S +++ b/arch/arm/kernel/calls.S @@ -387,7 +387,7 @@ /* 375 */ CALL(sys_setns) CALL(sys_process_vm_readv) CALL(sys_process_vm_writev) - CALL(sys_ni_syscall) /* reserved for sys_kcmp */ + CALL(sys_kcmp) CALL(sys_finit_module) #ifndef syscalls_counted .equ syscalls_padding, ((NR_syscalls + 3) & ~3) - NR_syscalls -- cgit v1.2.3 From 85323a991d40681023822e86ca95f38a75262026 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Thu, 7 Mar 2013 07:17:25 +0000 Subject: xen: arm: mandate EABI and use generic atomic operations. Rob Herring has observed that c81611c4e96f "xen: event channel arrays are xen_ulong_t and not unsigned long" introduced a compile failure when building without CONFIG_AEABI: /tmp/ccJaIZOW.s: Assembler messages: /tmp/ccJaIZOW.s:831: Error: even register required -- `ldrexd r5,r6,[r4]' Will Deacon pointed out that this is because OABI does not require even base registers for 64-bit values. We can avoid this by simply using the existing atomic64_xchg operation and the same containerof trick as used by the cmpxchg macros. However since this code is used on memory which is shared with the hypervisor we require proper atomic instructions and cannot use the generic atomic64 callbacks (which are based on spinlocks), therefore add a dependency on !GENERIC_ATOMIC64. Since we already depend on !CPU_V6 there isn't much downside to this. While thinking about this we also observed that OABI has different struct alignment requirements to EABI, which is a problem for hypercall argument structs which are shared with the hypervisor and which must be in EABI layout. Since I don't expect people to want to run OABI kernels on Xen depend on CONFIG_AEABI explicitly too (although it also happens to be enforced by the !GENERIC_ATOMIC64 requirement too). Signed-off-by: Ian Campbell Cc: Will Deacon Cc: Rob Herring Acked-by: Stefano Stabellini Cc: Konrad Rzeszutek Wilk Signed-off-by: Konrad Rzeszutek Wilk --- arch/arm/Kconfig | 3 ++- arch/arm/include/asm/xen/events.h | 25 ++++--------------------- 2 files changed, 6 insertions(+), 22 deletions(-) (limited to 'arch/arm/include') diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index a955d89ed836..caf00998cfa7 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1866,8 +1866,9 @@ config XEN_DOM0 config XEN bool "Xen guest support on ARM (EXPERIMENTAL)" - depends on ARM && OF + depends on ARM && AEABI && OF depends on CPU_V7 && !CPU_V6 + depends on !GENERIC_ATOMIC64 help Say Y if you want to run Linux in a Virtual Machine on Xen on ARM. diff --git a/arch/arm/include/asm/xen/events.h b/arch/arm/include/asm/xen/events.h index 5c27696de14f..8b1f37bfeeec 100644 --- a/arch/arm/include/asm/xen/events.h +++ b/arch/arm/include/asm/xen/events.h @@ -2,6 +2,7 @@ #define _ASM_ARM_XEN_EVENTS_H #include +#include enum ipi_vector { XEN_PLACEHOLDER_VECTOR, @@ -15,26 +16,8 @@ static inline int xen_irqs_disabled(struct pt_regs *regs) return raw_irqs_disabled_flags(regs->ARM_cpsr); } -/* - * We cannot use xchg because it does not support 8-byte - * values. However it is safe to use {ldr,dtd}exd directly because all - * platforms which Xen can run on support those instructions. - */ -static inline xen_ulong_t xchg_xen_ulong(xen_ulong_t *ptr, xen_ulong_t val) -{ - xen_ulong_t oldval; - unsigned int tmp; - - wmb(); - asm volatile("@ xchg_xen_ulong\n" - "1: ldrexd %0, %H0, [%3]\n" - " strexd %1, %2, %H2, [%3]\n" - " teq %1, #0\n" - " bne 1b" - : "=&r" (oldval), "=&r" (tmp) - : "r" (val), "r" (ptr) - : "memory", "cc"); - return oldval; -} +#define xchg_xen_ulong(ptr, val) atomic64_xchg(container_of((ptr), \ + atomic64_t, \ + counter), (val)) #endif /* _ASM_ARM_XEN_EVENTS_H */ -- cgit v1.2.3 From 0c91e7e07ebf08092bf8e28d8cd8d420732fc716 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Tue, 23 Apr 2013 16:45:40 -0400 Subject: ARM: cacheflush: add synchronization helpers for mixed cache state accesses Algorithms used by the MCPM layer rely on state variables which are accessed while the cache is either active or inactive, depending on the code path and the active state. This patch introduces generic cache maintenance helpers to provide the necessary cache synchronization for such state variables to always hit main memory in an ordered way. Signed-off-by: Nicolas Pitre Acked-by: Russell King Acked-by: Dave Martin --- arch/arm/include/asm/cacheflush.h | 75 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) (limited to 'arch/arm/include') diff --git a/arch/arm/include/asm/cacheflush.h b/arch/arm/include/asm/cacheflush.h index e1489c54cd12..bff71388e72a 100644 --- a/arch/arm/include/asm/cacheflush.h +++ b/arch/arm/include/asm/cacheflush.h @@ -363,4 +363,79 @@ static inline void flush_cache_vunmap(unsigned long start, unsigned long end) flush_cache_all(); } +/* + * Memory synchronization helpers for mixed cached vs non cached accesses. + * + * Some synchronization algorithms have to set states in memory with the + * cache enabled or disabled depending on the code path. It is crucial + * to always ensure proper cache maintenance to update main memory right + * away in that case. + * + * Any cached write must be followed by a cache clean operation. + * Any cached read must be preceded by a cache invalidate operation. + * Yet, in the read case, a cache flush i.e. atomic clean+invalidate + * operation is needed to avoid discarding possible concurrent writes to the + * accessed memory. + * + * Also, in order to prevent a cached writer from interfering with an + * adjacent non-cached writer, each state variable must be located to + * a separate cache line. + */ + +/* + * This needs to be >= the max cache writeback size of all + * supported platforms included in the current kernel configuration. + * This is used to align state variables to their own cache lines. + */ +#define __CACHE_WRITEBACK_ORDER 6 /* guessed from existing platforms */ +#define __CACHE_WRITEBACK_GRANULE (1 << __CACHE_WRITEBACK_ORDER) + +/* + * There is no __cpuc_clean_dcache_area but we use it anyway for + * code intent clarity, and alias it to __cpuc_flush_dcache_area. + */ +#define __cpuc_clean_dcache_area __cpuc_flush_dcache_area + +/* + * Ensure preceding writes to *p by this CPU are visible to + * subsequent reads by other CPUs: + */ +static inline void __sync_cache_range_w(volatile void *p, size_t size) +{ + char *_p = (char *)p; + + __cpuc_clean_dcache_area(_p, size); + outer_clean_range(__pa(_p), __pa(_p + size)); +} + +/* + * Ensure preceding writes to *p by other CPUs are visible to + * subsequent reads by this CPU. We must be careful not to + * discard data simultaneously written by another CPU, hence the + * usage of flush rather than invalidate operations. + */ +static inline void __sync_cache_range_r(volatile void *p, size_t size) +{ + char *_p = (char *)p; + +#ifdef CONFIG_OUTER_CACHE + if (outer_cache.flush_range) { + /* + * Ensure dirty data migrated from other CPUs into our cache + * are cleaned out safely before the outer cache is cleaned: + */ + __cpuc_clean_dcache_area(_p, size); + + /* Clean and invalidate stale data for *p from outer ... */ + outer_flush_range(__pa(_p), __pa(_p + size)); + } +#endif + + /* ... and inner cache: */ + __cpuc_flush_dcache_area(_p, size); +} + +#define sync_cache_w(ptr) __sync_cache_range_w(ptr, sizeof *(ptr)) +#define sync_cache_r(ptr) __sync_cache_range_r(ptr, sizeof *(ptr)) + #endif -- cgit v1.2.3 From e8db288e05e588ad3f416b3a24354d60d02f35f2 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Thu, 12 Apr 2012 02:45:22 -0400 Subject: ARM: multi-cluster PM: secondary kernel entry code CPUs in cluster based systems, such as big.LITTLE, have special needs when entering the kernel due to a hotplug event, or when resuming from a deep sleep mode. This is vectorized so multiple CPUs can enter the kernel in parallel without serialization. The mcpm prefix stands for "multi cluster power management", however this is usable on single cluster systems as well. Only the basic structure is introduced here. This will be extended with later patches. In order not to complexify things more than they currently have to, the planned work to make runtime adjusted MPIDR based indexing and dynamic memory allocation for cluster states is postponed to a later cycle. The MAX_NR_CLUSTERS and MAX_CPUS_PER_CLUSTER static definitions should be sufficient for those systems expected to be available in the near future. Signed-off-by: Nicolas Pitre Reviewed-by: Santosh Shilimkar Reviewed-by: Will Deacon --- arch/arm/Kconfig | 8 +++++ arch/arm/common/Makefile | 1 + arch/arm/common/mcpm_entry.c | 22 ++++++++++++ arch/arm/common/mcpm_head.S | 86 ++++++++++++++++++++++++++++++++++++++++++++ arch/arm/include/asm/mcpm.h | 42 ++++++++++++++++++++++ 5 files changed, 159 insertions(+) create mode 100644 arch/arm/common/mcpm_entry.c create mode 100644 arch/arm/common/mcpm_head.S create mode 100644 arch/arm/include/asm/mcpm.h (limited to 'arch/arm/include') diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 2c3bdce15134..2d17275b5df2 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -1599,6 +1599,14 @@ config HAVE_ARM_TWD help This options enables support for the ARM timer and watchdog unit +config MCPM + bool "Multi-Cluster Power Management" + depends on CPU_V7 && SMP + help + This option provides the common power management infrastructure + for (multi-)cluster based systems, such as big.LITTLE based + systems. + choice prompt "Memory split" default VMSPLIT_3G diff --git a/arch/arm/common/Makefile b/arch/arm/common/Makefile index dc8dd0de5c0f..b070671033ae 100644 --- a/arch/arm/common/Makefile +++ b/arch/arm/common/Makefile @@ -11,3 +11,4 @@ obj-$(CONFIG_SHARP_PARAM) += sharpsl_param.o obj-$(CONFIG_SHARP_SCOOP) += scoop.o obj-$(CONFIG_PCI_HOST_ITE8152) += it8152.o obj-$(CONFIG_ARM_TIMER_SP804) += timer-sp.o +obj-$(CONFIG_MCPM) += mcpm_head.o mcpm_entry.o diff --git a/arch/arm/common/mcpm_entry.c b/arch/arm/common/mcpm_entry.c new file mode 100644 index 000000000000..7cbf70051ea7 --- /dev/null +++ b/arch/arm/common/mcpm_entry.c @@ -0,0 +1,22 @@ +/* + * arch/arm/common/mcpm_entry.c -- entry point for multi-cluster PM + * + * Created by: Nicolas Pitre, March 2012 + * Copyright: (C) 2012-2013 Linaro Limited + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include + +extern unsigned long mcpm_entry_vectors[MAX_NR_CLUSTERS][MAX_CPUS_PER_CLUSTER]; + +void mcpm_set_entry_vector(unsigned cpu, unsigned cluster, void *ptr) +{ + unsigned long val = ptr ? virt_to_phys(ptr) : 0; + mcpm_entry_vectors[cluster][cpu] = val; + sync_cache_w(&mcpm_entry_vectors[cluster][cpu]); +} diff --git a/arch/arm/common/mcpm_head.S b/arch/arm/common/mcpm_head.S new file mode 100644 index 000000000000..68c9903075a9 --- /dev/null +++ b/arch/arm/common/mcpm_head.S @@ -0,0 +1,86 @@ +/* + * arch/arm/common/mcpm_head.S -- kernel entry point for multi-cluster PM + * + * Created by: Nicolas Pitre, March 2012 + * Copyright: (C) 2012-2013 Linaro Limited + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include + + .macro pr_dbg string +#if defined(CONFIG_DEBUG_LL) && defined(DEBUG) + b 1901f +1902: .asciz "CPU" +1903: .asciz " cluster" +1904: .asciz ": \string" + .align +1901: adr r0, 1902b + bl printascii + mov r0, r9 + bl printhex8 + adr r0, 1903b + bl printascii + mov r0, r10 + bl printhex8 + adr r0, 1904b + bl printascii +#endif + .endm + + .arm + .align + +ENTRY(mcpm_entry_point) + + THUMB( adr r12, BSYM(1f) ) + THUMB( bx r12 ) + THUMB( .thumb ) +1: + mrc p15, 0, r0, c0, c0, 5 @ MPIDR + ubfx r9, r0, #0, #8 @ r9 = cpu + ubfx r10, r0, #8, #8 @ r10 = cluster + mov r3, #MAX_CPUS_PER_CLUSTER + mla r4, r3, r10, r9 @ r4 = canonical CPU index + cmp r4, #(MAX_CPUS_PER_CLUSTER * MAX_NR_CLUSTERS) + blo 2f + + /* We didn't expect this CPU. Try to cheaply make it quiet. */ +1: wfi + wfe + b 1b + +2: pr_dbg "kernel mcpm_entry_point\n" + + /* + * MMU is off so we need to get to mcpm_entry_vectors in a + * position independent way. + */ + adr r5, 3f + ldr r6, [r5] + add r6, r5, r6 @ r6 = mcpm_entry_vectors + +mcpm_entry_gated: + ldr r5, [r6, r4, lsl #2] @ r5 = CPU entry vector + cmp r5, #0 + wfeeq + beq mcpm_entry_gated + pr_dbg "released\n" + bx r5 + + .align 2 + +3: .word mcpm_entry_vectors - . + +ENDPROC(mcpm_entry_point) + + .bss + .align 5 + + .type mcpm_entry_vectors, #object +ENTRY(mcpm_entry_vectors) + .space 4 * MAX_NR_CLUSTERS * MAX_CPUS_PER_CLUSTER diff --git a/arch/arm/include/asm/mcpm.h b/arch/arm/include/asm/mcpm.h new file mode 100644 index 000000000000..470a417d1351 --- /dev/null +++ b/arch/arm/include/asm/mcpm.h @@ -0,0 +1,42 @@ +/* + * arch/arm/include/asm/mcpm.h + * + * Created by: Nicolas Pitre, April 2012 + * Copyright: (C) 2012-2013 Linaro Limited + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#ifndef MCPM_H +#define MCPM_H + +/* + * Maximum number of possible clusters / CPUs per cluster. + * + * This should be sufficient for quite a while, while keeping the + * (assembly) code simpler. When this starts to grow then we'll have + * to consider dynamic allocation. + */ +#define MAX_CPUS_PER_CLUSTER 4 +#define MAX_NR_CLUSTERS 2 + +#ifndef __ASSEMBLY__ + +/* + * Platform specific code should use this symbol to set up secondary + * entry location for processors to use when released from reset. + */ +extern void mcpm_entry_point(void); + +/* + * This is used to indicate where the given CPU from given cluster should + * branch once it is ready to re-enter the kernel using ptr, or NULL if it + * should be gated. A gated CPU is held in a WFE loop until its vector + * becomes non NULL. + */ +void mcpm_set_entry_vector(unsigned cpu, unsigned cluster, void *ptr); + +#endif /* ! __ASSEMBLY__ */ +#endif -- cgit v1.2.3 From 7c2b860534d02d11923dd0504b961f21508173f1 Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Thu, 20 Sep 2012 16:05:37 -0400 Subject: ARM: mcpm: introduce the CPU/cluster power API This is the basic API used to handle the powering up/down of individual CPUs in a (multi-)cluster system. The platform specific backend implementation has the responsibility to also handle the cluster level power as well when the first/last CPU in a cluster is brought up/down. Signed-off-by: Nicolas Pitre Reviewed-by: Santosh Shilimkar Reviewed-by: Will Deacon --- arch/arm/common/mcpm_entry.c | 91 +++++++++++++++++++++++++++++++++++++++++++ arch/arm/include/asm/mcpm.h | 92 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 183 insertions(+) (limited to 'arch/arm/include') diff --git a/arch/arm/common/mcpm_entry.c b/arch/arm/common/mcpm_entry.c index 7cbf70051ea7..5d72889a58a4 100644 --- a/arch/arm/common/mcpm_entry.c +++ b/arch/arm/common/mcpm_entry.c @@ -9,8 +9,13 @@ * published by the Free Software Foundation. */ +#include +#include +#include + #include #include +#include extern unsigned long mcpm_entry_vectors[MAX_NR_CLUSTERS][MAX_CPUS_PER_CLUSTER]; @@ -20,3 +25,89 @@ void mcpm_set_entry_vector(unsigned cpu, unsigned cluster, void *ptr) mcpm_entry_vectors[cluster][cpu] = val; sync_cache_w(&mcpm_entry_vectors[cluster][cpu]); } + +static const struct mcpm_platform_ops *platform_ops; + +int __init mcpm_platform_register(const struct mcpm_platform_ops *ops) +{ + if (platform_ops) + return -EBUSY; + platform_ops = ops; + return 0; +} + +int mcpm_cpu_power_up(unsigned int cpu, unsigned int cluster) +{ + if (!platform_ops) + return -EUNATCH; /* try not to shadow power_up errors */ + might_sleep(); + return platform_ops->power_up(cpu, cluster); +} + +typedef void (*phys_reset_t)(unsigned long); + +void mcpm_cpu_power_down(void) +{ + phys_reset_t phys_reset; + + BUG_ON(!platform_ops); + BUG_ON(!irqs_disabled()); + + /* + * Do this before calling into the power_down method, + * as it might not always be safe to do afterwards. + */ + setup_mm_for_reboot(); + + platform_ops->power_down(); + + /* + * It is possible for a power_up request to happen concurrently + * with a power_down request for the same CPU. In this case the + * power_down method might not be able to actually enter a + * powered down state with the WFI instruction if the power_up + * method has removed the required reset condition. The + * power_down method is then allowed to return. We must perform + * a re-entry in the kernel as if the power_up method just had + * deasserted reset on the CPU. + * + * To simplify race issues, the platform specific implementation + * must accommodate for the possibility of unordered calls to + * power_down and power_up with a usage count. Therefore, if a + * call to power_up is issued for a CPU that is not down, then + * the next call to power_down must not attempt a full shutdown + * but only do the minimum (normally disabling L1 cache and CPU + * coherency) and return just as if a concurrent power_up request + * had happened as described above. + */ + + phys_reset = (phys_reset_t)(unsigned long)virt_to_phys(cpu_reset); + phys_reset(virt_to_phys(mcpm_entry_point)); + + /* should never get here */ + BUG(); +} + +void mcpm_cpu_suspend(u64 expected_residency) +{ + phys_reset_t phys_reset; + + BUG_ON(!platform_ops); + BUG_ON(!irqs_disabled()); + + /* Very similar to mcpm_cpu_power_down() */ + setup_mm_for_reboot(); + platform_ops->suspend(expected_residency); + phys_reset = (phys_reset_t)(unsigned long)virt_to_phys(cpu_reset); + phys_reset(virt_to_phys(mcpm_entry_point)); + BUG(); +} + +int mcpm_cpu_powered_up(void) +{ + if (!platform_ops) + return -EUNATCH; + if (platform_ops->powered_up) + platform_ops->powered_up(); + return 0; +} diff --git a/arch/arm/include/asm/mcpm.h b/arch/arm/include/asm/mcpm.h index 470a417d1351..627761fce780 100644 --- a/arch/arm/include/asm/mcpm.h +++ b/arch/arm/include/asm/mcpm.h @@ -38,5 +38,97 @@ extern void mcpm_entry_point(void); */ void mcpm_set_entry_vector(unsigned cpu, unsigned cluster, void *ptr); +/* + * CPU/cluster power operations API for higher subsystems to use. + */ + +/** + * mcpm_cpu_power_up - make given CPU in given cluster runable + * + * @cpu: CPU number within given cluster + * @cluster: cluster number for the CPU + * + * The identified CPU is brought out of reset. If the cluster was powered + * down then it is brought up as well, taking care not to let the other CPUs + * in the cluster run, and ensuring appropriate cluster setup. + * + * Caller must ensure the appropriate entry vector is initialized with + * mcpm_set_entry_vector() prior to calling this. + * + * This must be called in a sleepable context. However, the implementation + * is strongly encouraged to return early and let the operation happen + * asynchronously, especially when significant delays are expected. + * + * If the operation cannot be performed then an error code is returned. + */ +int mcpm_cpu_power_up(unsigned int cpu, unsigned int cluster); + +/** + * mcpm_cpu_power_down - power the calling CPU down + * + * The calling CPU is powered down. + * + * If this CPU is found to be the "last man standing" in the cluster + * then the cluster is prepared for power-down too. + * + * This must be called with interrupts disabled. + * + * This does not return. Re-entry in the kernel is expected via + * mcpm_entry_point. + */ +void mcpm_cpu_power_down(void); + +/** + * mcpm_cpu_suspend - bring the calling CPU in a suspended state + * + * @expected_residency: duration in microseconds the CPU is expected + * to remain suspended, or 0 if unknown/infinity. + * + * The calling CPU is suspended. The expected residency argument is used + * as a hint by the platform specific backend to implement the appropriate + * sleep state level according to the knowledge it has on wake-up latency + * for the given hardware. + * + * If this CPU is found to be the "last man standing" in the cluster + * then the cluster may be prepared for power-down too, if the expected + * residency makes it worthwhile. + * + * This must be called with interrupts disabled. + * + * This does not return. Re-entry in the kernel is expected via + * mcpm_entry_point. + */ +void mcpm_cpu_suspend(u64 expected_residency); + +/** + * mcpm_cpu_powered_up - housekeeping workafter a CPU has been powered up + * + * This lets the platform specific backend code perform needed housekeeping + * work. This must be called by the newly activated CPU as soon as it is + * fully operational in kernel space, before it enables interrupts. + * + * If the operation cannot be performed then an error code is returned. + */ +int mcpm_cpu_powered_up(void); + +/* + * Platform specific methods used in the implementation of the above API. + */ +struct mcpm_platform_ops { + int (*power_up)(unsigned int cpu, unsigned int cluster); + void (*power_down)(void); + void (*suspend)(u64); + void (*powered_up)(void); +}; + +/** + * mcpm_platform_register - register platform specific power methods + * + * @ops: mcpm_platform_ops structure to register + * + * An error is returned if the registration has been done previously. + */ +int __init mcpm_platform_register(const struct mcpm_platform_ops *ops); + #endif /* ! __ASSEMBLY__ */ #endif -- cgit v1.2.3 From 7fe31d28e839f9565c8176ec584676a045970802 Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Tue, 17 Jul 2012 14:25:42 +0100 Subject: ARM: mcpm: introduce helpers for platform coherency exit/setup This provides helper methods to coordinate between CPUs coming down and CPUs going up, as well as documentation on the used algorithms, so that cluster teardown and setup operations are not done for a cluster simultaneously. For use in the power_down() implementation: * __mcpm_cpu_going_down(unsigned int cluster, unsigned int cpu) * __mcpm_outbound_enter_critical(unsigned int cluster) * __mcpm_outbound_leave_critical(unsigned int cluster) * __mcpm_cpu_down(unsigned int cluster, unsigned int cpu) The power_up_setup() helper should do platform-specific setup in preparation for turning the CPU on, such as invalidating local caches or entering coherency. It must be assembler for now, since it must run before the MMU can be switched on. It is passed the affinity level for which initialization should be performed. Because the mcpm_sync_struct content is looked-up and modified with the cache enabled or disabled depending on the code path, it is crucial to always ensure proper cache maintenance to update main memory right away. The sync_cache_*() helpers are used to that end. Also, in order to prevent a cached writer from interfering with an adjacent non-cached writer, we ensure each state variable is located to a separate cache line. Thanks to Nicolas Pitre and Achin Gupta for the help with this patch. Signed-off-by: Dave Martin Signed-off-by: Nicolas Pitre Reviewed-by: Will Deacon --- Documentation/arm/cluster-pm-race-avoidance.txt | 498 ++++++++++++++++++++++++ arch/arm/common/mcpm_entry.c | 150 +++++++ arch/arm/common/mcpm_head.S | 106 ++++- arch/arm/include/asm/mcpm.h | 73 ++++ arch/arm/kernel/asm-offsets.c | 3 + 5 files changed, 828 insertions(+), 2 deletions(-) create mode 100644 Documentation/arm/cluster-pm-race-avoidance.txt (limited to 'arch/arm/include') diff --git a/Documentation/arm/cluster-pm-race-avoidance.txt b/Documentation/arm/cluster-pm-race-avoidance.txt new file mode 100644 index 000000000000..750b6fc24af9 --- /dev/null +++ b/Documentation/arm/cluster-pm-race-avoidance.txt @@ -0,0 +1,498 @@ +Cluster-wide Power-up/power-down race avoidance algorithm +========================================================= + +This file documents the algorithm which is used to coordinate CPU and +cluster setup and teardown operations and to manage hardware coherency +controls safely. + +The section "Rationale" explains what the algorithm is for and why it is +needed. "Basic model" explains general concepts using a simplified view +of the system. The other sections explain the actual details of the +algorithm in use. + + +Rationale +--------- + +In a system containing multiple CPUs, it is desirable to have the +ability to turn off individual CPUs when the system is idle, reducing +power consumption and thermal dissipation. + +In a system containing multiple clusters of CPUs, it is also desirable +to have the ability to turn off entire clusters. + +Turning entire clusters off and on is a risky business, because it +involves performing potentially destructive operations affecting a group +of independently running CPUs, while the OS continues to run. This +means that we need some coordination in order to ensure that critical +cluster-level operations are only performed when it is truly safe to do +so. + +Simple locking may not be sufficient to solve this problem, because +mechanisms like Linux spinlocks may rely on coherency mechanisms which +are not immediately enabled when a cluster powers up. Since enabling or +disabling those mechanisms may itself be a non-atomic operation (such as +writing some hardware registers and invalidating large caches), other +methods of coordination are required in order to guarantee safe +power-down and power-up at the cluster level. + +The mechanism presented in this document describes a coherent memory +based protocol for performing the needed coordination. It aims to be as +lightweight as possible, while providing the required safety properties. + + +Basic model +----------- + +Each cluster and CPU is assigned a state, as follows: + + DOWN + COMING_UP + UP + GOING_DOWN + + +---------> UP ----------+ + | v + + COMING_UP GOING_DOWN + + ^ | + +--------- DOWN <--------+ + + +DOWN: The CPU or cluster is not coherent, and is either powered off or + suspended, or is ready to be powered off or suspended. + +COMING_UP: The CPU or cluster has committed to moving to the UP state. + It may be part way through the process of initialisation and + enabling coherency. + +UP: The CPU or cluster is active and coherent at the hardware + level. A CPU in this state is not necessarily being used + actively by the kernel. + +GOING_DOWN: The CPU or cluster has committed to moving to the DOWN + state. It may be part way through the process of teardown and + coherency exit. + + +Each CPU has one of these states assigned to it at any point in time. +The CPU states are described in the "CPU state" section, below. + +Each cluster is also assigned a state, but it is necessary to split the +state value into two parts (the "cluster" state and "inbound" state) and +to introduce additional states in order to avoid races between different +CPUs in the cluster simultaneously modifying the state. The cluster- +level states are described in the "Cluster state" section. + +To help distinguish the CPU states from cluster states in this +discussion, the state names are given a CPU_ prefix for the CPU states, +and a CLUSTER_ or INBOUND_ prefix for the cluster states. + + +CPU state +--------- + +In this algorithm, each individual core in a multi-core processor is +referred to as a "CPU". CPUs are assumed to be single-threaded: +therefore, a CPU can only be doing one thing at a single point in time. + +This means that CPUs fit the basic model closely. + +The algorithm defines the following states for each CPU in the system: + + CPU_DOWN + CPU_COMING_UP + CPU_UP + CPU_GOING_DOWN + + cluster setup and + CPU setup complete policy decision + +-----------> CPU_UP ------------+ + | v + + CPU_COMING_UP CPU_GOING_DOWN + + ^ | + +----------- CPU_DOWN <----------+ + policy decision CPU teardown complete + or hardware event + + +The definitions of the four states correspond closely to the states of +the basic model. + +Transitions between states occur as follows. + +A trigger event (spontaneous) means that the CPU can transition to the +next state as a result of making local progress only, with no +requirement for any external event to happen. + + +CPU_DOWN: + + A CPU reaches the CPU_DOWN state when it is ready for + power-down. On reaching this state, the CPU will typically + power itself down or suspend itself, via a WFI instruction or a + firmware call. + + Next state: CPU_COMING_UP + Conditions: none + + Trigger events: + + a) an explicit hardware power-up operation, resulting + from a policy decision on another CPU; + + b) a hardware event, such as an interrupt. + + +CPU_COMING_UP: + + A CPU cannot start participating in hardware coherency until the + cluster is set up and coherent. If the cluster is not ready, + then the CPU will wait in the CPU_COMING_UP state until the + cluster has been set up. + + Next state: CPU_UP + Conditions: The CPU's parent cluster must be in CLUSTER_UP. + Trigger events: Transition of the parent cluster to CLUSTER_UP. + + Refer to the "Cluster state" section for a description of the + CLUSTER_UP state. + + +CPU_UP: + When a CPU reaches the CPU_UP state, it is safe for the CPU to + start participating in local coherency. + + This is done by jumping to the kernel's CPU resume code. + + Note that the definition of this state is slightly different + from the basic model definition: CPU_UP does not mean that the + CPU is coherent yet, but it does mean that it is safe to resume + the kernel. The kernel handles the rest of the resume + procedure, so the remaining steps are not visible as part of the + race avoidance algorithm. + + The CPU remains in this state until an explicit policy decision + is made to shut down or suspend the CPU. + + Next state: CPU_GOING_DOWN + Conditions: none + Trigger events: explicit policy decision + + +CPU_GOING_DOWN: + + While in this state, the CPU exits coherency, including any + operations required to achieve this (such as cleaning data + caches). + + Next state: CPU_DOWN + Conditions: local CPU teardown complete + Trigger events: (spontaneous) + + +Cluster state +------------- + +A cluster is a group of connected CPUs with some common resources. +Because a cluster contains multiple CPUs, it can be doing multiple +things at the same time. This has some implications. In particular, a +CPU can start up while another CPU is tearing the cluster down. + +In this discussion, the "outbound side" is the view of the cluster state +as seen by a CPU tearing the cluster down. The "inbound side" is the +view of the cluster state as seen by a CPU setting the CPU up. + +In order to enable safe coordination in such situations, it is important +that a CPU which is setting up the cluster can advertise its state +independently of the CPU which is tearing down the cluster. For this +reason, the cluster state is split into two parts: + + "cluster" state: The global state of the cluster; or the state + on the outbound side: + + CLUSTER_DOWN + CLUSTER_UP + CLUSTER_GOING_DOWN + + "inbound" state: The state of the cluster on the inbound side. + + INBOUND_NOT_COMING_UP + INBOUND_COMING_UP + + + The different pairings of these states results in six possible + states for the cluster as a whole: + + CLUSTER_UP + +==========> INBOUND_NOT_COMING_UP -------------+ + # | + | + CLUSTER_UP <----+ | + INBOUND_COMING_UP | v + + ^ CLUSTER_GOING_DOWN CLUSTER_GOING_DOWN + # INBOUND_COMING_UP <=== INBOUND_NOT_COMING_UP + + CLUSTER_DOWN | | + INBOUND_COMING_UP <----+ | + | + ^ | + +=========== CLUSTER_DOWN <------------+ + INBOUND_NOT_COMING_UP + + Transitions -----> can only be made by the outbound CPU, and + only involve changes to the "cluster" state. + + Transitions ===##> can only be made by the inbound CPU, and only + involve changes to the "inbound" state, except where there is no + further transition possible on the outbound side (i.e., the + outbound CPU has put the cluster into the CLUSTER_DOWN state). + + The race avoidance algorithm does not provide a way to determine + which exact CPUs within the cluster play these roles. This must + be decided in advance by some other means. Refer to the section + "Last man and first man selection" for more explanation. + + + CLUSTER_DOWN/INBOUND_NOT_COMING_UP is the only state where the + cluster can actually be powered down. + + The parallelism of the inbound and outbound CPUs is observed by + the existence of two different paths from CLUSTER_GOING_DOWN/ + INBOUND_NOT_COMING_UP (corresponding to GOING_DOWN in the basic + model) to CLUSTER_DOWN/INBOUND_COMING_UP (corresponding to + COMING_UP in the basic model). The second path avoids cluster + teardown completely. + + CLUSTER_UP/INBOUND_COMING_UP is equivalent to UP in the basic + model. The final transition to CLUSTER_UP/INBOUND_NOT_COMING_UP + is trivial and merely resets the state machine ready for the + next cycle. + + Details of the allowable transitions follow. + + The next state in each case is notated + + / () + + where the is the side on which the transition + can occur; either the inbound or the outbound side. + + +CLUSTER_DOWN/INBOUND_NOT_COMING_UP: + + Next state: CLUSTER_DOWN/INBOUND_COMING_UP (inbound) + Conditions: none + Trigger events: + + a) an explicit hardware power-up operation, resulting + from a policy decision on another CPU; + + b) a hardware event, such as an interrupt. + + +CLUSTER_DOWN/INBOUND_COMING_UP: + + In this state, an inbound CPU sets up the cluster, including + enabling of hardware coherency at the cluster level and any + other operations (such as cache invalidation) which are required + in order to achieve this. + + The purpose of this state is to do sufficient cluster-level + setup to enable other CPUs in the cluster to enter coherency + safely. + + Next state: CLUSTER_UP/INBOUND_COMING_UP (inbound) + Conditions: cluster-level setup and hardware coherency complete + Trigger events: (spontaneous) + + +CLUSTER_UP/INBOUND_COMING_UP: + + Cluster-level setup is complete and hardware coherency is + enabled for the cluster. Other CPUs in the cluster can safely + enter coherency. + + This is a transient state, leading immediately to + CLUSTER_UP/INBOUND_NOT_COMING_UP. All other CPUs on the cluster + should consider treat these two states as equivalent. + + Next state: CLUSTER_UP/INBOUND_NOT_COMING_UP (inbound) + Conditions: none + Trigger events: (spontaneous) + + +CLUSTER_UP/INBOUND_NOT_COMING_UP: + + Cluster-level setup is complete and hardware coherency is + enabled for the cluster. Other CPUs in the cluster can safely + enter coherency. + + The cluster will remain in this state until a policy decision is + made to power the cluster down. + + Next state: CLUSTER_GOING_DOWN/INBOUND_NOT_COMING_UP (outbound) + Conditions: none + Trigger events: policy decision to power down the cluster + + +CLUSTER_GOING_DOWN/INBOUND_NOT_COMING_UP: + + An outbound CPU is tearing the cluster down. The selected CPU + must wait in this state until all CPUs in the cluster are in the + CPU_DOWN state. + + When all CPUs are in the CPU_DOWN state, the cluster can be torn + down, for example by cleaning data caches and exiting + cluster-level coherency. + + To avoid wasteful unnecessary teardown operations, the outbound + should check the inbound cluster state for asynchronous + transitions to INBOUND_COMING_UP. Alternatively, individual + CPUs can be checked for entry into CPU_COMING_UP or CPU_UP. + + + Next states: + + CLUSTER_DOWN/INBOUND_NOT_COMING_UP (outbound) + Conditions: cluster torn down and ready to power off + Trigger events: (spontaneous) + + CLUSTER_GOING_DOWN/INBOUND_COMING_UP (inbound) + Conditions: none + Trigger events: + + a) an explicit hardware power-up operation, + resulting from a policy decision on another + CPU; + + b) a hardware event, such as an interrupt. + + +CLUSTER_GOING_DOWN/INBOUND_COMING_UP: + + The cluster is (or was) being torn down, but another CPU has + come online in the meantime and is trying to set up the cluster + again. + + If the outbound CPU observes this state, it has two choices: + + a) back out of teardown, restoring the cluster to the + CLUSTER_UP state; + + b) finish tearing the cluster down and put the cluster + in the CLUSTER_DOWN state; the inbound CPU will + set up the cluster again from there. + + Choice (a) permits the removal of some latency by avoiding + unnecessary teardown and setup operations in situations where + the cluster is not really going to be powered down. + + + Next states: + + CLUSTER_UP/INBOUND_COMING_UP (outbound) + Conditions: cluster-level setup and hardware + coherency complete + Trigger events: (spontaneous) + + CLUSTER_DOWN/INBOUND_COMING_UP (outbound) + Conditions: cluster torn down and ready to power off + Trigger events: (spontaneous) + + +Last man and First man selection +-------------------------------- + +The CPU which performs cluster tear-down operations on the outbound side +is commonly referred to as the "last man". + +The CPU which performs cluster setup on the inbound side is commonly +referred to as the "first man". + +The race avoidance algorithm documented above does not provide a +mechanism to choose which CPUs should play these roles. + + +Last man: + +When shutting down the cluster, all the CPUs involved are initially +executing Linux and hence coherent. Therefore, ordinary spinlocks can +be used to select a last man safely, before the CPUs become +non-coherent. + + +First man: + +Because CPUs may power up asynchronously in response to external wake-up +events, a dynamic mechanism is needed to make sure that only one CPU +attempts to play the first man role and do the cluster-level +initialisation: any other CPUs must wait for this to complete before +proceeding. + +Cluster-level initialisation may involve actions such as configuring +coherency controls in the bus fabric. + +The current implementation in mcpm_head.S uses a separate mutual exclusion +mechanism to do this arbitration. This mechanism is documented in +detail in vlocks.txt. + + +Features and Limitations +------------------------ + +Implementation: + + The current ARM-based implementation is split between + arch/arm/common/mcpm_head.S (low-level inbound CPU operations) and + arch/arm/common/mcpm_entry.c (everything else): + + __mcpm_cpu_going_down() signals the transition of a CPU to the + CPU_GOING_DOWN state. + + __mcpm_cpu_down() signals the transition of a CPU to the CPU_DOWN + state. + + A CPU transitions to CPU_COMING_UP and then to CPU_UP via the + low-level power-up code in mcpm_head.S. This could + involve CPU-specific setup code, but in the current + implementation it does not. + + __mcpm_outbound_enter_critical() and __mcpm_outbound_leave_critical() + handle transitions from CLUSTER_UP to CLUSTER_GOING_DOWN + and from there to CLUSTER_DOWN or back to CLUSTER_UP (in + the case of an aborted cluster power-down). + + These functions are more complex than the __mcpm_cpu_*() + functions due to the extra inter-CPU coordination which + is needed for safe transitions at the cluster level. + + A cluster transitions from CLUSTER_DOWN back to CLUSTER_UP via + the low-level power-up code in mcpm_head.S. This + typically involves platform-specific setup code, + provided by the platform-specific power_up_setup + function registered via mcpm_sync_init. + +Deep topologies: + + As currently described and implemented, the algorithm does not + support CPU topologies involving more than two levels (i.e., + clusters of clusters are not supported). The algorithm could be + extended by replicating the cluster-level states for the + additional topological levels, and modifying the transition + rules for the intermediate (non-outermost) cluster levels. + + +Colophon +-------- + +Originally created and documented by Dave Martin for Linaro Limited, in +collaboration with Nicolas Pitre and Achin Gupta. + +Copyright (C) 2012-2013 Linaro Limited +Distributed under the terms of Version 2 of the GNU General Public +License, as defined in linux/COPYING. diff --git a/arch/arm/common/mcpm_entry.c b/arch/arm/common/mcpm_entry.c index 5d72889a58a4..370236dd1a03 100644 --- a/arch/arm/common/mcpm_entry.c +++ b/arch/arm/common/mcpm_entry.c @@ -16,6 +16,7 @@ #include #include #include +#include extern unsigned long mcpm_entry_vectors[MAX_NR_CLUSTERS][MAX_CPUS_PER_CLUSTER]; @@ -111,3 +112,152 @@ int mcpm_cpu_powered_up(void) platform_ops->powered_up(); return 0; } + +struct sync_struct mcpm_sync; + +/* + * __mcpm_cpu_going_down: Indicates that the cpu is being torn down. + * This must be called at the point of committing to teardown of a CPU. + * The CPU cache (SCTRL.C bit) is expected to still be active. + */ +void __mcpm_cpu_going_down(unsigned int cpu, unsigned int cluster) +{ + mcpm_sync.clusters[cluster].cpus[cpu].cpu = CPU_GOING_DOWN; + sync_cache_w(&mcpm_sync.clusters[cluster].cpus[cpu].cpu); +} + +/* + * __mcpm_cpu_down: Indicates that cpu teardown is complete and that the + * cluster can be torn down without disrupting this CPU. + * To avoid deadlocks, this must be called before a CPU is powered down. + * The CPU cache (SCTRL.C bit) is expected to be off. + * However L2 cache might or might not be active. + */ +void __mcpm_cpu_down(unsigned int cpu, unsigned int cluster) +{ + dmb(); + mcpm_sync.clusters[cluster].cpus[cpu].cpu = CPU_DOWN; + sync_cache_w(&mcpm_sync.clusters[cluster].cpus[cpu].cpu); + dsb_sev(); +} + +/* + * __mcpm_outbound_leave_critical: Leave the cluster teardown critical section. + * @state: the final state of the cluster: + * CLUSTER_UP: no destructive teardown was done and the cluster has been + * restored to the previous state (CPU cache still active); or + * CLUSTER_DOWN: the cluster has been torn-down, ready for power-off + * (CPU cache disabled, L2 cache either enabled or disabled). + */ +void __mcpm_outbound_leave_critical(unsigned int cluster, int state) +{ + dmb(); + mcpm_sync.clusters[cluster].cluster = state; + sync_cache_w(&mcpm_sync.clusters[cluster].cluster); + dsb_sev(); +} + +/* + * __mcpm_outbound_enter_critical: Enter the cluster teardown critical section. + * This function should be called by the last man, after local CPU teardown + * is complete. CPU cache expected to be active. + * + * Returns: + * false: the critical section was not entered because an inbound CPU was + * observed, or the cluster is already being set up; + * true: the critical section was entered: it is now safe to tear down the + * cluster. + */ +bool __mcpm_outbound_enter_critical(unsigned int cpu, unsigned int cluster) +{ + unsigned int i; + struct mcpm_sync_struct *c = &mcpm_sync.clusters[cluster]; + + /* Warn inbound CPUs that the cluster is being torn down: */ + c->cluster = CLUSTER_GOING_DOWN; + sync_cache_w(&c->cluster); + + /* Back out if the inbound cluster is already in the critical region: */ + sync_cache_r(&c->inbound); + if (c->inbound == INBOUND_COMING_UP) + goto abort; + + /* + * Wait for all CPUs to get out of the GOING_DOWN state, so that local + * teardown is complete on each CPU before tearing down the cluster. + * + * If any CPU has been woken up again from the DOWN state, then we + * shouldn't be taking the cluster down at all: abort in that case. + */ + sync_cache_r(&c->cpus); + for (i = 0; i < MAX_CPUS_PER_CLUSTER; i++) { + int cpustate; + + if (i == cpu) + continue; + + while (1) { + cpustate = c->cpus[i].cpu; + if (cpustate != CPU_GOING_DOWN) + break; + + wfe(); + sync_cache_r(&c->cpus[i].cpu); + } + + switch (cpustate) { + case CPU_DOWN: + continue; + + default: + goto abort; + } + } + + return true; + +abort: + __mcpm_outbound_leave_critical(cluster, CLUSTER_UP); + return false; +} + +int __mcpm_cluster_state(unsigned int cluster) +{ + sync_cache_r(&mcpm_sync.clusters[cluster].cluster); + return mcpm_sync.clusters[cluster].cluster; +} + +extern unsigned long mcpm_power_up_setup_phys; + +int __init mcpm_sync_init( + void (*power_up_setup)(unsigned int affinity_level)) +{ + unsigned int i, j, mpidr, this_cluster; + + BUILD_BUG_ON(MCPM_SYNC_CLUSTER_SIZE * MAX_NR_CLUSTERS != sizeof mcpm_sync); + BUG_ON((unsigned long)&mcpm_sync & (__CACHE_WRITEBACK_GRANULE - 1)); + + /* + * Set initial CPU and cluster states. + * Only one cluster is assumed to be active at this point. + */ + for (i = 0; i < MAX_NR_CLUSTERS; i++) { + mcpm_sync.clusters[i].cluster = CLUSTER_DOWN; + mcpm_sync.clusters[i].inbound = INBOUND_NOT_COMING_UP; + for (j = 0; j < MAX_CPUS_PER_CLUSTER; j++) + mcpm_sync.clusters[i].cpus[j].cpu = CPU_DOWN; + } + mpidr = read_cpuid_mpidr(); + this_cluster = MPIDR_AFFINITY_LEVEL(mpidr, 1); + for_each_online_cpu(i) + mcpm_sync.clusters[this_cluster].cpus[i].cpu = CPU_UP; + mcpm_sync.clusters[this_cluster].cluster = CLUSTER_UP; + sync_cache_w(&mcpm_sync); + + if (power_up_setup) { + mcpm_power_up_setup_phys = virt_to_phys(power_up_setup); + sync_cache_w(&mcpm_power_up_setup_phys); + } + + return 0; +} diff --git a/arch/arm/common/mcpm_head.S b/arch/arm/common/mcpm_head.S index 68c9903075a9..7d729bd72674 100644 --- a/arch/arm/common/mcpm_head.S +++ b/arch/arm/common/mcpm_head.S @@ -7,11 +7,19 @@ * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. + * + * + * Refer to Documentation/arm/cluster-pm-race-avoidance.txt + * for details of the synchronisation algorithms used here. */ #include #include +.if MCPM_SYNC_CLUSTER_CPUS +.error "cpus must be the first member of struct mcpm_sync_struct" +.endif + .macro pr_dbg string #if defined(CONFIG_DEBUG_LL) && defined(DEBUG) b 1901f @@ -57,24 +65,114 @@ ENTRY(mcpm_entry_point) 2: pr_dbg "kernel mcpm_entry_point\n" /* - * MMU is off so we need to get to mcpm_entry_vectors in a + * MMU is off so we need to get to various variables in a * position independent way. */ adr r5, 3f - ldr r6, [r5] + ldmia r5, {r6, r7, r8} add r6, r5, r6 @ r6 = mcpm_entry_vectors + ldr r7, [r5, r7] @ r7 = mcpm_power_up_setup_phys + add r8, r5, r8 @ r8 = mcpm_sync + + mov r0, #MCPM_SYNC_CLUSTER_SIZE + mla r8, r0, r10, r8 @ r8 = sync cluster base + + @ Signal that this CPU is coming UP: + mov r0, #CPU_COMING_UP + mov r5, #MCPM_SYNC_CPU_SIZE + mla r5, r9, r5, r8 @ r5 = sync cpu address + strb r0, [r5] + + @ At this point, the cluster cannot unexpectedly enter the GOING_DOWN + @ state, because there is at least one active CPU (this CPU). + + @ Note: the following is racy as another CPU might be testing + @ the same flag at the same moment. That'll be fixed later. + ldrb r0, [r8, #MCPM_SYNC_CLUSTER_CLUSTER] + cmp r0, #CLUSTER_UP @ cluster already up? + bne mcpm_setup @ if not, set up the cluster + + @ Otherwise, skip setup: + b mcpm_setup_complete + +mcpm_setup: + @ Control dependency implies strb not observable before previous ldrb. + + @ Signal that the cluster is being brought up: + mov r0, #INBOUND_COMING_UP + strb r0, [r8, #MCPM_SYNC_CLUSTER_INBOUND] + dmb + + @ Any CPU trying to take the cluster into CLUSTER_GOING_DOWN from this + @ point onwards will observe INBOUND_COMING_UP and abort. + + @ Wait for any previously-pending cluster teardown operations to abort + @ or complete: +mcpm_teardown_wait: + ldrb r0, [r8, #MCPM_SYNC_CLUSTER_CLUSTER] + cmp r0, #CLUSTER_GOING_DOWN + bne first_man_setup + wfe + b mcpm_teardown_wait + +first_man_setup: + dmb + + @ If the outbound gave up before teardown started, skip cluster setup: + + cmp r0, #CLUSTER_UP + beq mcpm_setup_leave + + @ power_up_setup is now responsible for setting up the cluster: + + cmp r7, #0 + mov r0, #1 @ second (cluster) affinity level + blxne r7 @ Call power_up_setup if defined + dmb + + mov r0, #CLUSTER_UP + strb r0, [r8, #MCPM_SYNC_CLUSTER_CLUSTER] + dmb + +mcpm_setup_leave: + @ Leave the cluster setup critical section: + + mov r0, #INBOUND_NOT_COMING_UP + strb r0, [r8, #MCPM_SYNC_CLUSTER_INBOUND] + dsb + sev + +mcpm_setup_complete: + @ If a platform-specific CPU setup hook is needed, it is + @ called from here. + + cmp r7, #0 + mov r0, #0 @ first (CPU) affinity level + blxne r7 @ Call power_up_setup if defined + dmb + + @ Mark the CPU as up: + + mov r0, #CPU_UP + strb r0, [r5] + + @ Observability order of CPU_UP and opening of the gate does not matter. mcpm_entry_gated: ldr r5, [r6, r4, lsl #2] @ r5 = CPU entry vector cmp r5, #0 wfeeq beq mcpm_entry_gated + dmb + pr_dbg "released\n" bx r5 .align 2 3: .word mcpm_entry_vectors - . + .word mcpm_power_up_setup_phys - 3b + .word mcpm_sync - 3b ENDPROC(mcpm_entry_point) @@ -84,3 +182,7 @@ ENDPROC(mcpm_entry_point) .type mcpm_entry_vectors, #object ENTRY(mcpm_entry_vectors) .space 4 * MAX_NR_CLUSTERS * MAX_CPUS_PER_CLUSTER + + .type mcpm_power_up_setup_phys, #object +ENTRY(mcpm_power_up_setup_phys) + .space 4 @ set by mcpm_sync_init() diff --git a/arch/arm/include/asm/mcpm.h b/arch/arm/include/asm/mcpm.h index 627761fce780..3046e90210cb 100644 --- a/arch/arm/include/asm/mcpm.h +++ b/arch/arm/include/asm/mcpm.h @@ -24,6 +24,9 @@ #ifndef __ASSEMBLY__ +#include +#include + /* * Platform specific code should use this symbol to set up secondary * entry location for processors to use when released from reset. @@ -130,5 +133,75 @@ struct mcpm_platform_ops { */ int __init mcpm_platform_register(const struct mcpm_platform_ops *ops); +/* Synchronisation structures for coordinating safe cluster setup/teardown: */ + +/* + * When modifying this structure, make sure you update the MCPM_SYNC_ defines + * to match. + */ +struct mcpm_sync_struct { + /* individual CPU states */ + struct { + s8 cpu __aligned(__CACHE_WRITEBACK_GRANULE); + } cpus[MAX_CPUS_PER_CLUSTER]; + + /* cluster state */ + s8 cluster __aligned(__CACHE_WRITEBACK_GRANULE); + + /* inbound-side state */ + s8 inbound __aligned(__CACHE_WRITEBACK_GRANULE); +}; + +struct sync_struct { + struct mcpm_sync_struct clusters[MAX_NR_CLUSTERS]; +}; + +extern unsigned long sync_phys; /* physical address of *mcpm_sync */ + +void __mcpm_cpu_going_down(unsigned int cpu, unsigned int cluster); +void __mcpm_cpu_down(unsigned int cpu, unsigned int cluster); +void __mcpm_outbound_leave_critical(unsigned int cluster, int state); +bool __mcpm_outbound_enter_critical(unsigned int this_cpu, unsigned int cluster); +int __mcpm_cluster_state(unsigned int cluster); + +int __init mcpm_sync_init( + void (*power_up_setup)(unsigned int affinity_level)); + +#else + +/* + * asm-offsets.h causes trouble when included in .c files, and cacheflush.h + * cannot be included in asm files. Let's work around the conflict like this. + */ +#include +#define __CACHE_WRITEBACK_GRANULE CACHE_WRITEBACK_GRANULE + #endif /* ! __ASSEMBLY__ */ + +/* Definitions for mcpm_sync_struct */ +#define CPU_DOWN 0x11 +#define CPU_COMING_UP 0x12 +#define CPU_UP 0x13 +#define CPU_GOING_DOWN 0x14 + +#define CLUSTER_DOWN 0x21 +#define CLUSTER_UP 0x22 +#define CLUSTER_GOING_DOWN 0x23 + +#define INBOUND_NOT_COMING_UP 0x31 +#define INBOUND_COMING_UP 0x32 + +/* + * Offsets for the mcpm_sync_struct members, for use in asm. + * We don't want to make them global to the kernel via asm-offsets.c. + */ +#define MCPM_SYNC_CLUSTER_CPUS 0 +#define MCPM_SYNC_CPU_SIZE __CACHE_WRITEBACK_GRANULE +#define MCPM_SYNC_CLUSTER_CLUSTER \ + (MCPM_SYNC_CLUSTER_CPUS + MCPM_SYNC_CPU_SIZE * MAX_CPUS_PER_CLUSTER) +#define MCPM_SYNC_CLUSTER_INBOUND \ + (MCPM_SYNC_CLUSTER_CLUSTER + __CACHE_WRITEBACK_GRANULE) +#define MCPM_SYNC_CLUSTER_SIZE \ + (MCPM_SYNC_CLUSTER_INBOUND + __CACHE_WRITEBACK_GRANULE) + #endif diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c index 923eec7105cf..1bed82a0a9e0 100644 --- a/arch/arm/kernel/asm-offsets.c +++ b/arch/arm/kernel/asm-offsets.c @@ -149,6 +149,9 @@ int main(void) DEFINE(DMA_BIDIRECTIONAL, DMA_BIDIRECTIONAL); DEFINE(DMA_TO_DEVICE, DMA_TO_DEVICE); DEFINE(DMA_FROM_DEVICE, DMA_FROM_DEVICE); + BLANK(); + DEFINE(CACHE_WRITEBACK_GRANULE, __CACHE_WRITEBACK_GRANULE); + BLANK(); #ifdef CONFIG_KVM_ARM_HOST DEFINE(VCPU_KVM, offsetof(struct kvm_vcpu, kvm)); DEFINE(VCPU_MIDR, offsetof(struct kvm_vcpu, arch.midr)); -- cgit v1.2.3 From a7eb7c6f9a657a01a8359edae31bbeacd18b072c Mon Sep 17 00:00:00 2001 From: Nicolas Pitre Date: Tue, 9 Apr 2013 01:29:17 -0400 Subject: ARM: mcpm: provide an interface to set the SMP ops at run time This is cleaner than exporting the mcpm_smp_ops structure. Signed-off-by: Nicolas Pitre Acked-by: Jon Medhurst --- arch/arm/common/mcpm_platsmp.c | 7 ++++++- arch/arm/include/asm/mcpm.h | 2 ++ 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'arch/arm/include') diff --git a/arch/arm/common/mcpm_platsmp.c b/arch/arm/common/mcpm_platsmp.c index e7e3235b8b64..52b88d81b7bb 100644 --- a/arch/arm/common/mcpm_platsmp.c +++ b/arch/arm/common/mcpm_platsmp.c @@ -76,7 +76,7 @@ static void mcpm_cpu_die(unsigned int cpu) #endif -struct smp_operations __initdata mcpm_smp_ops = { +static struct smp_operations __initdata mcpm_smp_ops = { .smp_init_cpus = simple_smp_init_cpus, .smp_boot_secondary = mcpm_boot_secondary, .smp_secondary_init = mcpm_secondary_init, @@ -85,3 +85,8 @@ struct smp_operations __initdata mcpm_smp_ops = { .cpu_die = mcpm_cpu_die, #endif }; + +void __init mcpm_smp_set_ops(void) +{ + smp_set_ops(&mcpm_smp_ops); +} diff --git a/arch/arm/include/asm/mcpm.h b/arch/arm/include/asm/mcpm.h index 3046e90210cb..0f7b7620e9a5 100644 --- a/arch/arm/include/asm/mcpm.h +++ b/arch/arm/include/asm/mcpm.h @@ -167,6 +167,8 @@ int __mcpm_cluster_state(unsigned int cluster); int __init mcpm_sync_init( void (*power_up_setup)(unsigned int affinity_level)); +void __init mcpm_smp_set_ops(void); + #else /* -- cgit v1.2.3