From b7182d1a788461c36bad5a13b78d0779a6376f5c Mon Sep 17 00:00:00 2001 From: Baruch Siach Date: Tue, 11 Aug 2015 12:08:32 +0300 Subject: MAINTAINERS: add git tree for the arc architecture Signed-off-by: Baruch Siach Signed-off-by: Vineet Gupta --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index a9ae6c105520..d7ab7363b263 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9876,6 +9876,7 @@ S: Supported F: arch/arc/ F: Documentation/devicetree/bindings/arc/ F: drivers/tty/serial/arc_uart.c +T: git git://git.kernel.org/pub/scm/linux/kernel/git/vgupta/arc.git SYNOPSYS ARC SDP platform support M: Alexey Brodkin -- cgit v1.2.3 From 2a4401687c11def29fe5a9b23ab98bf7ab1dce61 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Sat, 8 Aug 2015 17:51:58 +0530 Subject: ARC: Enable optimistic spinning for LLSC config Suggested-by: Peter Zijlstra (Intel) Signed-off-by: Vineet Gupta --- arch/arc/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index bd4670d1b89b..e119d42f92ce 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -8,6 +8,7 @@ config ARC def_bool y + select ARCH_SUPPORTS_ATOMIC_RMW if ARC_HAS_LLSC select BUILDTIME_EXTABLE_SORT select COMMON_CLK select CLONE_BACKWARDS -- cgit v1.2.3 From f2b0b25a37a6db12580dcdfdf00f020e5e0e3a43 Mon Sep 17 00:00:00 2001 From: Alexey Brodkin Date: Mon, 25 May 2015 19:54:28 +0300 Subject: ARCv2: Support IO Coherency and permutations involving L1 and L2 caches In case of ARCv2 CPU there're could be following configurations that affect cache handling for data exchanged with peripherals via DMA: [1] Only L1 cache exists [2] Both L1 and L2 exist, but no IO coherency unit [3] L1, L2 caches and IO coherency unit exist Current implementation takes care of [1] and [2]. Moreover support of [2] is implemented with run-time check for SLC existence which is not super optimal. This patch introduces support of [3] and rework of DMA ops usage. Instead of doing run-time check every time a particular DMA op is executed we'll have 3 different implementations of DMA ops and select appropriate one during init. As for IOC support for it we need: [a] Implement empty DMA ops because IOC takes care of cache coherency with DMAed data [b] Route dma_alloc_coherent() via dma_alloc_noncoherent() This is required to make IOC work in first place and also serves as optimization as LD/ST to coherent buffers can be srviced from caches w/o going all the way to memory Signed-off-by: Alexey Brodkin [vgupta: -Added some comments about IOC gains -Marked dma ops as static, -Massaged changelog a bit] Signed-off-by: Vineet Gupta --- arch/arc/include/asm/arcregs.h | 1 + arch/arc/include/asm/cache.h | 8 +++ arch/arc/mm/cache.c | 114 +++++++++++++++++++++++++++++++++++------ arch/arc/mm/dma.c | 18 +++++++ 4 files changed, 125 insertions(+), 16 deletions(-) diff --git a/arch/arc/include/asm/arcregs.h b/arch/arc/include/asm/arcregs.h index c8f57b8449dc..d8023bc8d1ad 100644 --- a/arch/arc/include/asm/arcregs.h +++ b/arch/arc/include/asm/arcregs.h @@ -35,6 +35,7 @@ #define ARC_REG_RTT_BCR 0xF2 #define ARC_REG_IRQ_BCR 0xF3 #define ARC_REG_SMART_BCR 0xFF +#define ARC_REG_CLUSTER_BCR 0xcf /* status32 Bits Positions */ #define STATUS_AE_BIT 5 /* Exception active */ diff --git a/arch/arc/include/asm/cache.h b/arch/arc/include/asm/cache.h index d67345d3e2d4..e23ea6e7633a 100644 --- a/arch/arc/include/asm/cache.h +++ b/arch/arc/include/asm/cache.h @@ -53,6 +53,8 @@ extern void arc_cache_init(void); extern char *arc_cache_mumbojumbo(int cpu_id, char *buf, int len); extern void read_decode_cache_bcr(void); +extern int ioc_exists; + #endif /* !__ASSEMBLY__ */ /* Instruction cache related Auxiliary registers */ @@ -94,4 +96,10 @@ extern void read_decode_cache_bcr(void); #define SLC_CTRL_BUSY 0x100 #define SLC_CTRL_RGN_OP_INV 0x200 +/* IO coherency related Auxiliary registers */ +#define ARC_REG_IO_COH_ENABLE 0x500 +#define ARC_REG_IO_COH_PARTIAL 0x501 +#define ARC_REG_IO_COH_AP0_BASE 0x508 +#define ARC_REG_IO_COH_AP0_SIZE 0x509 + #endif /* _ASM_CACHE_H */ diff --git a/arch/arc/mm/cache.c b/arch/arc/mm/cache.c index 1cd6695b6ab5..25e7077d4c04 100644 --- a/arch/arc/mm/cache.c +++ b/arch/arc/mm/cache.c @@ -22,10 +22,15 @@ #include static int l2_line_sz; +int ioc_exists; void (*_cache_line_loop_ic_fn)(unsigned long paddr, unsigned long vaddr, unsigned long sz, const int cacheop); +void (*__dma_cache_wback_inv)(unsigned long start, unsigned long sz); +void (*__dma_cache_inv)(unsigned long start, unsigned long sz); +void (*__dma_cache_wback)(unsigned long start, unsigned long sz); + char *arc_cache_mumbojumbo(int c, char *buf, int len) { int n = 0; @@ -50,6 +55,9 @@ char *arc_cache_mumbojumbo(int c, char *buf, int len) n += scnprintf(buf + n, len - n, "SLC\t\t: %uK, %uB Line\n", p->sz_k, p->line_len); + if (ioc_exists) + n += scnprintf(buf + n, len - n, "IOC\t\t: exists\n"); + return buf; } @@ -80,6 +88,14 @@ void read_decode_cache_bcr(void) #endif } slc_cfg; + struct bcr_clust_cfg { +#ifdef CONFIG_CPU_BIG_ENDIAN + unsigned int pad:7, c:1, num_entries:8, num_cores:8, ver:8; +#else + unsigned int ver:8, num_cores:8, num_entries:8, c:1, pad:7; +#endif + } cbcr; + p_ic = &cpuinfo_arc700[cpu].icache; READ_BCR(ARC_REG_IC_BCR, ibcr); @@ -133,6 +149,10 @@ slc_chk: p_slc->sz_k = 128 << slc_cfg.sz; l2_line_sz = p_slc->line_len = (slc_cfg.lsz == 0) ? 128 : 64; } + + READ_BCR(ARC_REG_CLUSTER_BCR, cbcr); + if (cbcr.c) + ioc_exists = 1; } /* @@ -516,11 +536,6 @@ noinline void slc_op(unsigned long paddr, unsigned long sz, const int op) #endif } -static inline int need_slc_flush(void) -{ - return is_isa_arcv2() && l2_line_sz; -} - /*********************************************************** * Exported APIs */ @@ -569,30 +584,74 @@ void flush_dcache_page(struct page *page) } EXPORT_SYMBOL(flush_dcache_page); -void dma_cache_wback_inv(unsigned long start, unsigned long sz) +/* + * DMA ops for systems with L1 cache only + * Make memory coherent with L1 cache by flushing/invalidating L1 lines + */ +static void __dma_cache_wback_inv_l1(unsigned long start, unsigned long sz) { __dc_line_op_k(start, sz, OP_FLUSH_N_INV); +} - if (need_slc_flush()) - slc_op(start, sz, OP_FLUSH_N_INV); +static void __dma_cache_inv_l1(unsigned long start, unsigned long sz) +{ + __dc_line_op_k(start, sz, OP_INV); } -EXPORT_SYMBOL(dma_cache_wback_inv); -void dma_cache_inv(unsigned long start, unsigned long sz) +static void __dma_cache_wback_l1(unsigned long start, unsigned long sz) +{ + __dc_line_op_k(start, sz, OP_FLUSH); +} + +/* + * DMA ops for systems with both L1 and L2 caches, but without IOC + * Both L1 and L2 lines need to be explicity flushed/invalidated + */ +static void __dma_cache_wback_inv_slc(unsigned long start, unsigned long sz) +{ + __dc_line_op_k(start, sz, OP_FLUSH_N_INV); + slc_op(start, sz, OP_FLUSH_N_INV); +} + +static void __dma_cache_inv_slc(unsigned long start, unsigned long sz) { __dc_line_op_k(start, sz, OP_INV); + slc_op(start, sz, OP_INV); +} - if (need_slc_flush()) - slc_op(start, sz, OP_INV); +static void __dma_cache_wback_slc(unsigned long start, unsigned long sz) +{ + __dc_line_op_k(start, sz, OP_FLUSH); + slc_op(start, sz, OP_FLUSH); +} + +/* + * DMA ops for systems with IOC + * IOC hardware snoops all DMA traffic keeping the caches consistent with + * memory - eliding need for any explicit cache maintenance of DMA buffers + */ +static void __dma_cache_wback_inv_ioc(unsigned long start, unsigned long sz) {} +static void __dma_cache_inv_ioc(unsigned long start, unsigned long sz) {} +static void __dma_cache_wback_ioc(unsigned long start, unsigned long sz) {} + +/* + * Exported DMA API + */ +void dma_cache_wback_inv(unsigned long start, unsigned long sz) +{ + __dma_cache_wback_inv(start, sz); +} +EXPORT_SYMBOL(dma_cache_wback_inv); + +void dma_cache_inv(unsigned long start, unsigned long sz) +{ + __dma_cache_inv(start, sz); } EXPORT_SYMBOL(dma_cache_inv); void dma_cache_wback(unsigned long start, unsigned long sz) { - __dc_line_op_k(start, sz, OP_FLUSH); - - if (need_slc_flush()) - slc_op(start, sz, OP_FLUSH); + __dma_cache_wback(start, sz); } EXPORT_SYMBOL(dma_cache_wback); @@ -848,4 +907,27 @@ void arc_cache_init(void) panic("Disable CONFIG_ARC_CACHE_VIPT_ALIASING\n"); } } + + if (is_isa_arcv2() && ioc_exists) { + /* IO coherency base - 0x8z */ + write_aux_reg(ARC_REG_IO_COH_AP0_BASE, 0x80000); + /* IO coherency aperture size - 512Mb: 0x8z-0xAz */ + write_aux_reg(ARC_REG_IO_COH_AP0_SIZE, 0x11); + /* Enable partial writes */ + write_aux_reg(ARC_REG_IO_COH_PARTIAL, 1); + /* Enable IO coherency */ + write_aux_reg(ARC_REG_IO_COH_ENABLE, 1); + + __dma_cache_wback_inv = __dma_cache_wback_inv_ioc; + __dma_cache_inv = __dma_cache_inv_ioc; + __dma_cache_wback = __dma_cache_wback_ioc; + } else if (is_isa_arcv2() && l2_line_sz) { + __dma_cache_wback_inv = __dma_cache_wback_inv_slc; + __dma_cache_inv = __dma_cache_inv_slc; + __dma_cache_wback = __dma_cache_wback_slc; + } else { + __dma_cache_wback_inv = __dma_cache_wback_inv_l1; + __dma_cache_inv = __dma_cache_inv_l1; + __dma_cache_wback = __dma_cache_wback_l1; + } } diff --git a/arch/arc/mm/dma.c b/arch/arc/mm/dma.c index 57706a9c6948..e039fac163f8 100644 --- a/arch/arc/mm/dma.c +++ b/arch/arc/mm/dma.c @@ -19,6 +19,7 @@ #include #include #include +#include #include /* @@ -53,6 +54,20 @@ void *dma_alloc_coherent(struct device *dev, size_t size, { void *paddr, *kvaddr; + /* + * IOC relies on all data (even coherent DMA data) being in cache + * Thus allocate normal cached memory + * + * The gains with IOC are two pronged: + * -For streaming data, elides needs for cache maintenance, saving + * cycles in flush code, and bus bandwidth as all the lines of a + * buffer need to be flushed out to memory + * -For coherent data, Read/Write to buffers terminate early in cache + * (vs. always going to memory - thus are faster) + */ + if (ioc_exists) + return dma_alloc_noncoherent(dev, size, dma_handle, gfp); + /* This is linear addr (0x8000_0000 based) */ paddr = alloc_pages_exact(size, gfp); if (!paddr) @@ -85,6 +100,9 @@ EXPORT_SYMBOL(dma_alloc_coherent); void dma_free_coherent(struct device *dev, size_t size, void *kvaddr, dma_addr_t dma_handle) { + if (ioc_exists) + return dma_free_noncoherent(dev, size, kvaddr, dma_handle); + iounmap((void __force __iomem *)kvaddr); free_pages_exact((void *)dma_handle, size); -- cgit v1.2.3 From 79335a2ca03fdd883823e068b5e2f89a8ee47839 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Thu, 4 Jun 2015 18:30:23 +0530 Subject: ARCv2: SLC: Allow boot time disable Signed-off-by: Vineet Gupta --- arch/arc/mm/cache.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/arch/arc/mm/cache.c b/arch/arc/mm/cache.c index 25e7077d4c04..7c424e3f980d 100644 --- a/arch/arc/mm/cache.c +++ b/arch/arc/mm/cache.c @@ -23,6 +23,7 @@ static int l2_line_sz; int ioc_exists; +volatile int slc_enable = 1; void (*_cache_line_loop_ic_fn)(unsigned long paddr, unsigned long vaddr, unsigned long sz, const int cacheop); @@ -36,6 +37,7 @@ char *arc_cache_mumbojumbo(int c, char *buf, int len) int n = 0; struct cpuinfo_arc_cache *p; +#define IS_USED_RUN(v) ((v) ? "" : "(disabled) ") #define PR_CACHE(p, cfg, str) \ if (!(p)->ver) \ n += scnprintf(buf + n, len - n, str"\t\t: N/A\n"); \ @@ -53,7 +55,8 @@ char *arc_cache_mumbojumbo(int c, char *buf, int len) p = &cpuinfo_arc700[c].slc; if (p->ver) n += scnprintf(buf + n, len - n, - "SLC\t\t: %uK, %uB Line\n", p->sz_k, p->line_len); + "SLC\t\t: %uK, %uB Line%s\n", + p->sz_k, p->line_len, IS_USED_RUN(slc_enable)); if (ioc_exists) n += scnprintf(buf + n, len - n, "IOC\t\t: exists\n"); @@ -908,6 +911,20 @@ void arc_cache_init(void) } } + if (is_isa_arcv2() && l2_line_sz && !slc_enable) { + + /* IM set : flush before invalidate */ + write_aux_reg(ARC_REG_SLC_CTRL, + read_aux_reg(ARC_REG_SLC_CTRL) | SLC_CTRL_IM); + + write_aux_reg(ARC_REG_SLC_INVALIDATE, 1); + + /* Important to wait for flush to complete */ + while (read_aux_reg(ARC_REG_SLC_CTRL) & SLC_CTRL_BUSY); + write_aux_reg(ARC_REG_SLC_CTRL, + read_aux_reg(ARC_REG_SLC_CTRL) | SLC_CTRL_DISABLE); + } + if (is_isa_arcv2() && ioc_exists) { /* IO coherency base - 0x8z */ write_aux_reg(ARC_REG_IO_COH_AP0_BASE, 0x80000); @@ -921,7 +938,7 @@ void arc_cache_init(void) __dma_cache_wback_inv = __dma_cache_wback_inv_ioc; __dma_cache_inv = __dma_cache_inv_ioc; __dma_cache_wback = __dma_cache_wback_ioc; - } else if (is_isa_arcv2() && l2_line_sz) { + } else if (is_isa_arcv2() && l2_line_sz && slc_enable) { __dma_cache_wback_inv = __dma_cache_wback_inv_slc; __dma_cache_inv = __dma_cache_inv_slc; __dma_cache_wback = __dma_cache_wback_slc; -- cgit v1.2.3 From 1648c70d301e669ba03aa1c70fff46ec2c400414 Mon Sep 17 00:00:00 2001 From: Alexey Brodkin Date: Tue, 9 Jun 2015 11:25:22 +0300 Subject: ARCv2: IOC: Allow boot time disable Signed-off-by: Alexey Brodkin Signed-off-by: Vineet Gupta --- arch/arc/mm/cache.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/arc/mm/cache.c b/arch/arc/mm/cache.c index 7c424e3f980d..5c825c8ebe10 100644 --- a/arch/arc/mm/cache.c +++ b/arch/arc/mm/cache.c @@ -23,7 +23,7 @@ static int l2_line_sz; int ioc_exists; -volatile int slc_enable = 1; +volatile int slc_enable = 1, ioc_enable = 1; void (*_cache_line_loop_ic_fn)(unsigned long paddr, unsigned long vaddr, unsigned long sz, const int cacheop); @@ -59,7 +59,8 @@ char *arc_cache_mumbojumbo(int c, char *buf, int len) p->sz_k, p->line_len, IS_USED_RUN(slc_enable)); if (ioc_exists) - n += scnprintf(buf + n, len - n, "IOC\t\t: exists\n"); + n += scnprintf(buf + n, len - n, "IOC\t\t:%s\n", + IS_USED_RUN(ioc_enable)); return buf; } @@ -154,7 +155,7 @@ slc_chk: } READ_BCR(ARC_REG_CLUSTER_BCR, cbcr); - if (cbcr.c) + if (cbcr.c && ioc_enable) ioc_exists = 1; } -- cgit v1.2.3 From 31d30c8208a38a0442cc01a9c7f6542489c76353 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Wed, 5 Aug 2015 19:10:02 +0530 Subject: ARC: add barriers to futex code The atomic ops on futex need to provide the full barrier just like regular atomics in kernel. Also remove pagefault_enable/disable in futex_atomic_cmpxchg_inatomic() as core code already does that Cc: David Hildenbrand Cc: Peter Zijlstra (Intel) Cc: Thomas Gleixner Cc: Michel Lespinasse Signed-off-by: Vineet Gupta --- arch/arc/include/asm/futex.h | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/arch/arc/include/asm/futex.h b/arch/arc/include/asm/futex.h index 70cfe16b742d..9de18a526aff 100644 --- a/arch/arc/include/asm/futex.h +++ b/arch/arc/include/asm/futex.h @@ -20,6 +20,7 @@ #define __futex_atomic_op(insn, ret, oldval, uaddr, oparg)\ \ + smp_mb(); \ __asm__ __volatile__( \ "1: llock %1, [%2] \n" \ insn "\n" \ @@ -40,12 +41,14 @@ \ : "=&r" (ret), "=&r" (oldval) \ : "r" (uaddr), "r" (oparg), "ir" (-EFAULT) \ - : "cc", "memory") + : "cc", "memory"); \ + smp_mb() \ #else /* !CONFIG_ARC_HAS_LLSC */ #define __futex_atomic_op(insn, ret, oldval, uaddr, oparg)\ \ + smp_mb(); \ __asm__ __volatile__( \ "1: ld %1, [%2] \n" \ insn "\n" \ @@ -65,7 +68,8 @@ \ : "=&r" (ret), "=&r" (oldval) \ : "r" (uaddr), "r" (oparg), "ir" (-EFAULT) \ - : "cc", "memory") + : "cc", "memory"); \ + smp_mb() \ #endif @@ -134,13 +138,8 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) return ret; } -/* Compare-xchg with pagefaults disabled. - * Notes: - * -Best-Effort: Exchg happens only if compare succeeds. - * If compare fails, returns; leaving retry/looping to upper layers - * -successful cmp-xchg: return orig value in @addr (same as cmp val) - * -Compare fails: return orig value in @addr - * -user access r/w fails: return -EFAULT +/* + * cmpxchg of futex (pagefaults disabled by caller) */ static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval, @@ -151,7 +150,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval, if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) return -EFAULT; - pagefault_disable(); + smp_mb(); __asm__ __volatile__( #ifdef CONFIG_ARC_HAS_LLSC @@ -178,7 +177,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval, : "r"(oldval), "r"(newval), "r"(uaddr), "ir"(-EFAULT) : "cc", "memory"); - pagefault_enable(); + smp_mb(); *uval = val; return val; -- cgit v1.2.3 From ed574e2bbd81ec20134059fb5e17acbc76387270 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Wed, 5 Aug 2015 19:23:34 +0530 Subject: ARC: futex cosmetics Cc: Peter Zijlstra (Intel) Cc: Thomas Gleixner Cc: Michel Lespinasse Signed-off-by: Vineet Gupta --- arch/arc/include/asm/futex.h | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/arch/arc/include/asm/futex.h b/arch/arc/include/asm/futex.h index 9de18a526aff..14b1c9aaf455 100644 --- a/arch/arc/include/asm/futex.h +++ b/arch/arc/include/asm/futex.h @@ -94,6 +94,7 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) __futex_atomic_op("mov %0, %3", ret, oldval, uaddr, oparg); break; case FUTEX_OP_ADD: + /* oldval = *uaddr; *uaddr += oparg ; ret = *uaddr */ __futex_atomic_op("add %0, %1, %3", ret, oldval, uaddr, oparg); break; case FUTEX_OP_OR: @@ -142,12 +143,12 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) * cmpxchg of futex (pagefaults disabled by caller) */ static inline int -futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval, - u32 newval) +futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 expval, + u32 newval) { - u32 val; + u32 existval; - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) + if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; smp_mb(); @@ -173,14 +174,14 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 oldval, " .word 1b, 4b \n" " .word 2b, 4b \n" " .previous\n" - : "=&r"(val) - : "r"(oldval), "r"(newval), "r"(uaddr), "ir"(-EFAULT) + : "=&r"(existval) + : "r"(expval), "r"(newval), "r"(uaddr), "ir"(-EFAULT) : "cc", "memory"); smp_mb(); - *uval = val; - return val; + *uval = existval; + return existval; } #endif -- cgit v1.2.3 From 882a95ae0a4f8fc303257acf5c6ff305df34d04b Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Thu, 6 Aug 2015 17:03:17 +0530 Subject: ARC: make futex_atomic_cmpxchg_inatomic() return bimodal Callers of cmpxchg_futex_value_locked() in futex code expect bimodal return value: !0 (essentially -EFAULT as failure) 0 (success) Before this patch, the success return value was old value of futex, which could very well be non zero, causing caller to possibly take the failure path erroneously. Fix that by returning 0 for success (This fix was done back in 2011 for all upstream arches, which ARC obviously missed) Cc: Peter Zijlstra (Intel) Cc: Thomas Gleixner Cc: Michel Lespinasse Signed-off-by: Vineet Gupta --- arch/arc/include/asm/futex.h | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/arch/arc/include/asm/futex.h b/arch/arc/include/asm/futex.h index 14b1c9aaf455..0ea8bcc7b846 100644 --- a/arch/arc/include/asm/futex.h +++ b/arch/arc/include/asm/futex.h @@ -141,11 +141,13 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) /* * cmpxchg of futex (pagefaults disabled by caller) + * Return 0 for success, -EFAULT otherwise */ static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 expval, u32 newval) { + int ret = 0; u32 existval; if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) @@ -155,18 +157,18 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 expval, __asm__ __volatile__( #ifdef CONFIG_ARC_HAS_LLSC - "1: llock %0, [%3] \n" - " brne %0, %1, 3f \n" - "2: scond %2, [%3] \n" + "1: llock %1, [%4] \n" + " brne %1, %2, 3f \n" + "2: scond %3, [%4] \n" " bnz 1b \n" #else - "1: ld %0, [%3] \n" - " brne %0, %1, 3f \n" - "2: st %2, [%3] \n" + "1: ld %1, [%4] \n" + " brne %1, %2, 3f \n" + "2: st %3, [%4] \n" #endif "3: \n" " .section .fixup,\"ax\" \n" - "4: mov %0, %4 \n" + "4: mov %0, %5 \n" " b 3b \n" " .previous \n" " .section __ex_table,\"a\" \n" @@ -174,14 +176,14 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 expval, " .word 1b, 4b \n" " .word 2b, 4b \n" " .previous\n" - : "=&r"(existval) + : "+&r"(ret), "=&r"(existval) : "r"(expval), "r"(newval), "r"(uaddr), "ir"(-EFAULT) : "cc", "memory"); smp_mb(); *uval = existval; - return existval; + return ret; } #endif -- cgit v1.2.3 From 5e0574292ad48dcdf48ef90a47da862c21d649a6 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Thu, 6 Aug 2015 17:55:34 +0530 Subject: ARC: Enable HAVE_FUTEX_CMPXCHG ARC doesn't need the runtime detection of futex cmpxchg op Cc: Peter Zijlstra (Intel) Cc: Thomas Gleixner Signed-off-by: Vineet Gupta --- arch/arc/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index e119d42f92ce..78c0621d5819 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -23,6 +23,7 @@ config ARC select GENERIC_SMP_IDLE_THREAD select HAVE_ARCH_KGDB select HAVE_ARCH_TRACEHOOK + select HAVE_FUTEX_CMPXCHG select HAVE_IOREMAP_PROT select HAVE_KPROBES select HAVE_KRETPROBES -- cgit v1.2.3 From eb2cd8b72b08fe56998600aee8a5dff93f7be5a2 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Thu, 6 Aug 2015 19:11:06 +0530 Subject: ARC: ensure futex ops are atomic in !LLSC config W/o hardware assisted atomic r-m-w the best we can do is to disable preemption. Cc: David Hildenbrand Cc: Peter Zijlstra (Intel) Cc: Thomas Gleixner Cc: Michel Lespinasse Signed-off-by: Vineet Gupta --- arch/arc/include/asm/futex.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/arch/arc/include/asm/futex.h b/arch/arc/include/asm/futex.h index 0ea8bcc7b846..8f449982523b 100644 --- a/arch/arc/include/asm/futex.h +++ b/arch/arc/include/asm/futex.h @@ -87,6 +87,9 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) return -EFAULT; +#ifndef CONFIG_ARC_HAS_LLSC + preempt_disable(); /* to guarantee atomic r-m-w of futex op */ +#endif pagefault_disable(); switch (op) { @@ -111,6 +114,9 @@ static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) } pagefault_enable(); +#ifndef CONFIG_ARC_HAS_LLSC + preempt_enable(); +#endif if (!ret) { switch (cmp) { @@ -153,6 +159,9 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 expval, if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) return -EFAULT; +#ifndef CONFIG_ARC_HAS_LLSC + preempt_disable(); /* to guarantee atomic r-m-w of futex op */ +#endif smp_mb(); __asm__ __volatile__( @@ -182,6 +191,9 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 expval, smp_mb(); +#ifndef CONFIG_ARC_HAS_LLSC + preempt_enable(); +#endif *uval = existval; return ret; } -- cgit v1.2.3 From 6de6066c0d24a66df465cf87a4041ef7ef35ba6f Mon Sep 17 00:00:00 2001 From: Yuriy Kolerov Date: Wed, 12 Aug 2015 17:23:32 +0300 Subject: ARC: change some branchs to jumps to resolve linkage errors When kernel's binary becomes large enough (32M and more) errors may occur during the final linkage stage. It happens because the build system uses short relocations for ARC by default. This problem may be easily resolved by passing -mlong-calls option to GCC to use long absolute jumps (j) instead of short relative branchs (b). But there are fragments of pure assembler code exist which use branchs in inappropriate places and cause a linkage error because of relocations overflow. First of these fragments is .fixup insertion in futex.h and unaligned.c. It inserts a code in the separate section (.fixup) with branch instruction. It leads to the linkage error when kernel becomes large. Second of these fragments is calling scheduler's functions (common kernel code) from entry.S of ARC's code. When kernel's binary becomes large it may lead to the linkage error because scheduler may occur far enough from ARC's code in the final binary. Signed-off-by: Yuriy Kolerov Signed-off-by: Vineet Gupta --- arch/arc/include/asm/futex.h | 6 +++--- arch/arc/kernel/entry.S | 6 +++--- arch/arc/kernel/unaligned.c | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/arc/include/asm/futex.h b/arch/arc/include/asm/futex.h index 8f449982523b..11e1b1f3acda 100644 --- a/arch/arc/include/asm/futex.h +++ b/arch/arc/include/asm/futex.h @@ -31,7 +31,7 @@ " .section .fixup,\"ax\" \n" \ " .align 4 \n" \ "4: mov %0, %4 \n" \ - " b 3b \n" \ + " j 3b \n" \ " .previous \n" \ " .section __ex_table,\"a\" \n" \ " .align 4 \n" \ @@ -58,7 +58,7 @@ " .section .fixup,\"ax\" \n" \ " .align 4 \n" \ "4: mov %0, %4 \n" \ - " b 3b \n" \ + " j 3b \n" \ " .previous \n" \ " .section __ex_table,\"a\" \n" \ " .align 4 \n" \ @@ -178,7 +178,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, u32 expval, "3: \n" " .section .fixup,\"ax\" \n" "4: mov %0, %5 \n" - " b 3b \n" + " j 3b \n" " .previous \n" " .section __ex_table,\"a\" \n" " .align 4 \n" diff --git a/arch/arc/kernel/entry.S b/arch/arc/kernel/entry.S index f7a82fd4d601..589abf5172d6 100644 --- a/arch/arc/kernel/entry.S +++ b/arch/arc/kernel/entry.S @@ -42,7 +42,7 @@ ENTRY(ret_from_fork) ; when the forked child comes here from the __switch_to function ; r0 has the last task pointer. ; put last task in scheduler queue - bl @schedule_tail + jl @schedule_tail ld r9, [sp, PT_status32] brne r9, 0, 1f @@ -320,7 +320,7 @@ resume_user_mode_begin: ; --- (Slow Path #1) task preemption --- bbit0 r9, TIF_NEED_RESCHED, .Lchk_pend_signals mov blink, resume_user_mode_begin ; tail-call to U mode ret chks - b @schedule ; BTST+Bnz causes relo error in link + j @schedule ; BTST+Bnz causes relo error in link .Lchk_pend_signals: IRQ_ENABLE r10 @@ -381,7 +381,7 @@ resume_kernel_mode: bbit0 r9, TIF_NEED_RESCHED, .Lrestore_regs ; Invoke PREEMPTION - bl preempt_schedule_irq + jl preempt_schedule_irq ; preempt_schedule_irq() always returns with IRQ disabled #endif diff --git a/arch/arc/kernel/unaligned.c b/arch/arc/kernel/unaligned.c index 74db59b6f392..abd961f3e763 100644 --- a/arch/arc/kernel/unaligned.c +++ b/arch/arc/kernel/unaligned.c @@ -34,7 +34,7 @@ " .section .fixup,\"ax\"\n" \ " .align 4\n" \ "3: mov %0, 1\n" \ - " b 2b\n" \ + " j 2b\n" \ " .previous\n" \ " .section __ex_table,\"a\"\n" \ " .align 4\n" \ @@ -82,7 +82,7 @@ " .section .fixup,\"ax\"\n" \ " .align 4\n" \ "4: mov %0, 1\n" \ - " b 3b\n" \ + " j 3b\n" \ " .previous\n" \ " .section __ex_table,\"a\"\n" \ " .align 4\n" \ @@ -113,7 +113,7 @@ " .section .fixup,\"ax\"\n" \ " .align 4\n" \ "6: mov %0, 1\n" \ - " b 5b\n" \ + " j 5b\n" \ " .previous\n" \ " .section __ex_table,\"a\"\n" \ " .align 4\n" \ -- cgit v1.2.3 From 090749502ff20d7d9ec244036fe636b6bf0433b6 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Wed, 19 Aug 2015 17:23:58 +0530 Subject: ARC: add/fix some comments in code - no functional change Signed-off-by: Vineet Gupta --- arch/arc/boot/dts/axc003.dtsi | 13 +++++++------ arch/arc/include/asm/cmpxchg.h | 22 +++++++++++----------- arch/arc/include/asm/perf_event.h | 2 +- arch/arc/kernel/perf_event.c | 4 ++-- arch/arc/kernel/process.c | 2 +- arch/arc/plat-axs10x/axs10x.c | 2 +- 6 files changed, 23 insertions(+), 22 deletions(-) diff --git a/arch/arc/boot/dts/axc003.dtsi b/arch/arc/boot/dts/axc003.dtsi index 1cd5e82f5dc2..846481f37eef 100644 --- a/arch/arc/boot/dts/axc003.dtsi +++ b/arch/arc/boot/dts/axc003.dtsi @@ -72,12 +72,13 @@ }; /* - * This INTC is actually connected to DW APB GPIO - * which acts as a wire between MB INTC and CPU INTC. - * GPIO INTC is configured in platform init code - * and here we mimic direct connection from MB INTC to - * CPU INTC, thus we set "interrupts = <7>" instead of - * "interrupts = <12>" + * The DW APB ICTL intc on MB is connected to CPU intc via a + * DT "invisible" DW APB GPIO block, configured to simply pass thru + * interrupts - setup accordinly in platform init (plat-axs10x/ax10x.c) + * + * So here we mimic a direct connection betwen them, ignoring the + * ABPG GPIO. Thus set "interrupts = <24>" (DW APB GPIO to core) + * instead of "interrupts = <12>" (DW APB ICTL to DW APB GPIO) * * This intc actually resides on MB, but we move it here to * avoid duplicating the MB dtsi file given that IRQ from diff --git a/arch/arc/include/asm/cmpxchg.h b/arch/arc/include/asm/cmpxchg.h index 44fd531f4d7b..af7a2db139c9 100644 --- a/arch/arc/include/asm/cmpxchg.h +++ b/arch/arc/include/asm/cmpxchg.h @@ -110,18 +110,18 @@ static inline unsigned long __xchg(unsigned long val, volatile void *ptr, sizeof(*(ptr)))) /* - * On ARC700, EX insn is inherently atomic, so by default "vanilla" xchg() need - * not require any locking. However there's a quirk. - * ARC lacks native CMPXCHG, thus emulated (see above), using external locking - - * incidently it "reuses" the same atomic_ops_lock used by atomic APIs. - * Now, llist code uses cmpxchg() and xchg() on same data, so xchg() needs to - * abide by same serializing rules, thus ends up using atomic_ops_lock as well. + * xchg() maps directly to ARC EX instruction which guarantees atomicity. + * However in !LLSC config, it also needs to be use @atomic_ops_lock spinlock + * due to a subtle reason: + * - For !LLSC, cmpxchg() needs to use that lock (see above) and there is lot + * of kernel code which calls xchg()/cmpxchg() on same data (see llist.h) + * Hence xchg() needs to follow same locking rules. * - * This however is only relevant if SMP and/or ARC lacks LLSC - * if (UP or LLSC) - * xchg doesn't need serialization - * else <==> !(UP or LLSC) <==> (!UP and !LLSC) <==> (SMP and !LLSC) - * xchg needs serialization + * Technically the lock is also needed for UP (boils down to irq save/restore) + * but we can cheat a bit since cmpxchg() atomic_ops_lock() would cause irqs to + * be disabled thus can't possibly be interrpted/preempted/clobbered by xchg() + * Other way around, xchg is one instruction anyways, so can't be interrupted + * as such */ #if !defined(CONFIG_ARC_HAS_LLSC) && defined(CONFIG_SMP) diff --git a/arch/arc/include/asm/perf_event.h b/arch/arc/include/asm/perf_event.h index 2b8880e953a2..e2eaf6fb0468 100644 --- a/arch/arc/include/asm/perf_event.h +++ b/arch/arc/include/asm/perf_event.h @@ -95,7 +95,7 @@ static const char * const arc_pmu_ev_hw_map[] = { /* counts condition */ [PERF_COUNT_HW_INSTRUCTIONS] = "iall", - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = "ijmp", + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = "ijmp", /* Excludes ZOL jumps */ [PERF_COUNT_ARC_BPOK] = "bpok", /* NP-NT, PT-T, PNT-NT */ [PERF_COUNT_HW_BRANCH_MISSES] = "bpfail", /* NP-T, PT-NT, PNT-T */ diff --git a/arch/arc/kernel/perf_event.c b/arch/arc/kernel/perf_event.c index 1287388c258a..79ab199a9778 100644 --- a/arch/arc/kernel/perf_event.c +++ b/arch/arc/kernel/perf_event.c @@ -199,8 +199,8 @@ static void arc_pmu_start(struct perf_event *event, int flags) event->hw.state = 0; /* enable ARC pmu here */ - write_aux_reg(ARC_REG_PCT_INDEX, idx); - write_aux_reg(ARC_REG_PCT_CONFIG, hwc->config); + write_aux_reg(ARC_REG_PCT_INDEX, idx); /* counter # */ + write_aux_reg(ARC_REG_PCT_CONFIG, hwc->config); /* condition */ } static void arc_pmu_stop(struct perf_event *event, int flags) diff --git a/arch/arc/kernel/process.c b/arch/arc/kernel/process.c index 44092456776f..91d5a0f1f3f7 100644 --- a/arch/arc/kernel/process.c +++ b/arch/arc/kernel/process.c @@ -65,7 +65,7 @@ asmlinkage void ret_from_fork(void); * ------------------ * | r25 | <==== top of Stack (thread.ksp) * ~ ~ - * | --to-- | (CALLEE Regs of user mode) + * | --to-- | (CALLEE Regs of kernel mode) * | r13 | * ------------------ * | fp | diff --git a/arch/arc/plat-axs10x/axs10x.c b/arch/arc/plat-axs10x/axs10x.c index e7769c3ab5f2..ad9825d4026a 100644 --- a/arch/arc/plat-axs10x/axs10x.c +++ b/arch/arc/plat-axs10x/axs10x.c @@ -46,7 +46,7 @@ static void __init axs10x_enable_gpio_intc_wire(void) * ------------------- ------------------- * | snps,dw-apb-gpio | | snps,dw-apb-gpio | * ------------------- ------------------- - * | | + * | #12 | * | [ Debug UART on cpu card ] * | * ------------------------ -- cgit v1.2.3 From fd0881a24ac9ab2be6c052d30ca779597c0bd3bc Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Fri, 21 Aug 2015 15:06:43 +0530 Subject: ARC: Eliminate some ARCv2 specific code for ARCompact build Signed-off-by: Vineet Gupta --- arch/arc/mm/cache.c | 58 +++++++++++++++++++++++++++++------------------------ arch/arc/mm/dma.c | 4 ++-- 2 files changed, 34 insertions(+), 28 deletions(-) diff --git a/arch/arc/mm/cache.c b/arch/arc/mm/cache.c index 5c825c8ebe10..0d1a6e96839f 100644 --- a/arch/arc/mm/cache.c +++ b/arch/arc/mm/cache.c @@ -52,6 +52,9 @@ char *arc_cache_mumbojumbo(int c, char *buf, int len) PR_CACHE(&cpuinfo_arc700[c].icache, CONFIG_ARC_HAS_ICACHE, "I-Cache"); PR_CACHE(&cpuinfo_arc700[c].dcache, CONFIG_ARC_HAS_DCACHE, "D-Cache"); + if (!is_isa_arcv2()) + return buf; + p = &cpuinfo_arc700[c].slc; if (p->ver) n += scnprintf(buf + n, len - n, @@ -70,18 +73,9 @@ char *arc_cache_mumbojumbo(int c, char *buf, int len) * the cpuinfo structure for later use. * No Validation done here, simply read/convert the BCRs */ -void read_decode_cache_bcr(void) +static void read_decode_cache_bcr_arcv2(int cpu) { - struct cpuinfo_arc_cache *p_ic, *p_dc, *p_slc; - unsigned int cpu = smp_processor_id(); - struct bcr_cache { -#ifdef CONFIG_CPU_BIG_ENDIAN - unsigned int pad:12, line_len:4, sz:4, config:4, ver:8; -#else - unsigned int ver:8, config:4, sz:4, line_len:4, pad:12; -#endif - } ibcr, dbcr; - + struct cpuinfo_arc_cache *p_slc = &cpuinfo_arc700[cpu].slc; struct bcr_generic sbcr; struct bcr_slc_cfg { @@ -100,6 +94,31 @@ void read_decode_cache_bcr(void) #endif } cbcr; + READ_BCR(ARC_REG_SLC_BCR, sbcr); + if (sbcr.ver) { + READ_BCR(ARC_REG_SLC_CFG, slc_cfg); + p_slc->ver = sbcr.ver; + p_slc->sz_k = 128 << slc_cfg.sz; + l2_line_sz = p_slc->line_len = (slc_cfg.lsz == 0) ? 128 : 64; + } + + READ_BCR(ARC_REG_CLUSTER_BCR, cbcr); + if (cbcr.c && ioc_enable) + ioc_exists = 1; +} + +void read_decode_cache_bcr(void) +{ + struct cpuinfo_arc_cache *p_ic, *p_dc; + unsigned int cpu = smp_processor_id(); + struct bcr_cache { +#ifdef CONFIG_CPU_BIG_ENDIAN + unsigned int pad:12, line_len:4, sz:4, config:4, ver:8; +#else + unsigned int ver:8, config:4, sz:4, line_len:4, pad:12; +#endif + } ibcr, dbcr; + p_ic = &cpuinfo_arc700[cpu].icache; READ_BCR(ARC_REG_IC_BCR, ibcr); @@ -142,21 +161,8 @@ dc_chk: p_dc->ver = dbcr.ver; slc_chk: - if (!is_isa_arcv2()) - return; - - p_slc = &cpuinfo_arc700[cpu].slc; - READ_BCR(ARC_REG_SLC_BCR, sbcr); - if (sbcr.ver) { - READ_BCR(ARC_REG_SLC_CFG, slc_cfg); - p_slc->ver = sbcr.ver; - p_slc->sz_k = 128 << slc_cfg.sz; - l2_line_sz = p_slc->line_len = (slc_cfg.lsz == 0) ? 128 : 64; - } - - READ_BCR(ARC_REG_CLUSTER_BCR, cbcr); - if (cbcr.c && ioc_enable) - ioc_exists = 1; + if (is_isa_arcv2()) + read_decode_cache_bcr_arcv2(cpu); } /* diff --git a/arch/arc/mm/dma.c b/arch/arc/mm/dma.c index e039fac163f8..29a46bb198cc 100644 --- a/arch/arc/mm/dma.c +++ b/arch/arc/mm/dma.c @@ -65,7 +65,7 @@ void *dma_alloc_coherent(struct device *dev, size_t size, * -For coherent data, Read/Write to buffers terminate early in cache * (vs. always going to memory - thus are faster) */ - if (ioc_exists) + if (is_isa_arcv2() && ioc_exists) return dma_alloc_noncoherent(dev, size, dma_handle, gfp); /* This is linear addr (0x8000_0000 based) */ @@ -100,7 +100,7 @@ EXPORT_SYMBOL(dma_alloc_coherent); void dma_free_coherent(struct device *dev, size_t size, void *kvaddr, dma_addr_t dma_handle) { - if (ioc_exists) + if (is_isa_arcv2() && ioc_exists) return dma_free_noncoherent(dev, size, kvaddr, dma_handle); iounmap((void __force __iomem *)kvaddr); -- cgit v1.2.3 From fb7c57255168d34ae34300bcf78f50aebdeae4dc Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Mon, 24 Aug 2015 13:37:01 +0300 Subject: ARC: perf: cap the number of counters to hardware max of 32 The number of counters in PCT can never be more than 32 (while countable conditions could be 100+) for both ARCompact and ARCv2 And while at it update copyright dates. Acked-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Signed-off-by: Vineet Gupta --- arch/arc/include/asm/perf_event.h | 5 +++-- arch/arc/kernel/perf_event.c | 6 +++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/arc/include/asm/perf_event.h b/arch/arc/include/asm/perf_event.h index e2eaf6fb0468..3c9bf285e96d 100644 --- a/arch/arc/include/asm/perf_event.h +++ b/arch/arc/include/asm/perf_event.h @@ -1,6 +1,7 @@ /* * Linux performance counter support for ARC * + * Copyright (C) 2014-2015 Synopsys, Inc. (www.synopsys.com) * Copyright (C) 2011-2013 Synopsys, Inc. (www.synopsys.com) * * This program is free software; you can redistribute it and/or modify @@ -12,8 +13,8 @@ #ifndef __ASM_PERF_EVENT_H #define __ASM_PERF_EVENT_H -/* real maximum varies per CPU, this is the maximum supported by the driver */ -#define ARC_PMU_MAX_HWEVENTS 64 +/* Max number of counters that PCT block may ever have */ +#define ARC_PERF_MAX_COUNTERS 32 #define ARC_REG_CC_BUILD 0xF6 #define ARC_REG_CC_INDEX 0x240 diff --git a/arch/arc/kernel/perf_event.c b/arch/arc/kernel/perf_event.c index 79ab199a9778..c55543738ddc 100644 --- a/arch/arc/kernel/perf_event.c +++ b/arch/arc/kernel/perf_event.c @@ -1,7 +1,7 @@ /* * Linux performance counter support for ARC700 series * - * Copyright (C) 2013 Synopsys, Inc. (www.synopsys.com) + * Copyright (C) 2013-2015 Synopsys, Inc. (www.synopsys.com) * * This code is inspired by the perf support of various other architectures. * @@ -22,7 +22,7 @@ struct arc_pmu { struct pmu pmu; int counter_size; /* in bits */ int n_counters; - unsigned long used_mask[BITS_TO_LONGS(ARC_PMU_MAX_HWEVENTS)]; + unsigned long used_mask[BITS_TO_LONGS(ARC_PERF_MAX_COUNTERS)]; int ev_hw_idx[PERF_COUNT_ARC_HW_MAX]; }; @@ -284,7 +284,7 @@ static int arc_pmu_device_probe(struct platform_device *pdev) pr_err("This core does not have performance counters!\n"); return -ENODEV; } - BUG_ON(pct_bcr.c > ARC_PMU_MAX_HWEVENTS); + BUG_ON(pct_bcr.c > ARC_PERF_MAX_COUNTERS); READ_BCR(ARC_REG_CC_BUILD, cc_bcr); BUG_ON(!cc_bcr.v); /* Counters exist but No countable conditions ? */ -- cgit v1.2.3 From 1fe8bfa5ff3b2e97f26add89b20768fb7c4188c0 Mon Sep 17 00:00:00 2001 From: Alexey Brodkin Date: Mon, 24 Aug 2015 13:42:27 +0300 Subject: ARCv2: perf: implement "event_set_period" This generalization prepares for support of overflow interrupts. Hardware event counters on ARC work that way: Each counter counts from programmed start value (set in ARC_REG_PCT_COUNT) to a limit value (set in ARC_REG_PCT_INT_CNT) and once limit value is reached this timer generates an interrupt. Even though this hardware implementation allows for more flexibility, in Linux kernel we decided to mimic behavior of other architectures this way: [1] Set limit value as half of counter's max value (to allow counter to run after reaching it limit, see below for more explanation): ---------->8----------- arc_pmu->max_period = (1ULL << counter_size) / 2 - 1ULL; ---------->8----------- [2] Set start value as "arc_pmu->max_period - sample_period" and then count up to the limit Our event counters don't stop on reaching max value (the one we set in ARC_REG_PCT_INT_CNT) but continue to count until kernel explicitly stops each of them. And setting a limit as half of counter capacity is done to allow capturing of additional events in between moment when interrupt was triggered until we're actually processing PMU interrupts. That way we're trying to be more precise. For example if we count CPU cycles we keep track of cycles while running through generic IRQ handling code: [1] We set counter period as say 100_000 events of type "crun" [2] Counter reaches that limit and raises its interrupt [3] Once we get in PMU IRQ handler we read current counter value from ARC_REG_PCT_SNAP ans see there something like 105_000. If counters stop on reaching a limit value then we would miss additional 5000 cycles. Acked-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Signed-off-by: Alexey Brodkin Signed-off-by: Vineet Gupta --- arch/arc/kernel/perf_event.c | 79 +++++++++++++++++++++++++++++++++++--------- 1 file changed, 63 insertions(+), 16 deletions(-) diff --git a/arch/arc/kernel/perf_event.c b/arch/arc/kernel/perf_event.c index c55543738ddc..3626c56376cf 100644 --- a/arch/arc/kernel/perf_event.c +++ b/arch/arc/kernel/perf_event.c @@ -20,9 +20,9 @@ struct arc_pmu { struct pmu pmu; - int counter_size; /* in bits */ int n_counters; unsigned long used_mask[BITS_TO_LONGS(ARC_PERF_MAX_COUNTERS)]; + u64 max_period; int ev_hw_idx[PERF_COUNT_ARC_HW_MAX]; }; @@ -88,18 +88,15 @@ static uint64_t arc_pmu_read_counter(int idx) static void arc_perf_event_update(struct perf_event *event, struct hw_perf_event *hwc, int idx) { - uint64_t prev_raw_count, new_raw_count; - int64_t delta; - - do { - prev_raw_count = local64_read(&hwc->prev_count); - new_raw_count = arc_pmu_read_counter(idx); - } while (local64_cmpxchg(&hwc->prev_count, prev_raw_count, - new_raw_count) != prev_raw_count); - - delta = (new_raw_count - prev_raw_count) & - ((1ULL << arc_pmu->counter_size) - 1ULL); + uint64_t prev_raw_count = local64_read(&hwc->prev_count); + uint64_t new_raw_count = arc_pmu_read_counter(idx); + int64_t delta = new_raw_count - prev_raw_count; + /* + * We don't afaraid of hwc->prev_count changing beneath our feet + * because there's no way for us to re-enter this function anytime. + */ + local64_set(&hwc->prev_count, new_raw_count); local64_add(delta, &event->count); local64_sub(delta, &hwc->period_left); } @@ -142,6 +139,10 @@ static int arc_pmu_event_init(struct perf_event *event) struct hw_perf_event *hwc = &event->hw; int ret; + hwc->sample_period = arc_pmu->max_period; + hwc->last_period = hwc->sample_period; + local64_set(&hwc->period_left, hwc->sample_period); + switch (event->attr.type) { case PERF_TYPE_HARDWARE: if (event->attr.config >= PERF_COUNT_HW_MAX) @@ -153,6 +154,7 @@ static int arc_pmu_event_init(struct perf_event *event) (int) event->attr.config, (int) hwc->config, arc_pmu_ev_hw_map[event->attr.config]); return 0; + case PERF_TYPE_HW_CACHE: ret = arc_pmu_cache_event(event->attr.config); if (ret < 0) @@ -180,6 +182,47 @@ static void arc_pmu_disable(struct pmu *pmu) write_aux_reg(ARC_REG_PCT_CONTROL, (tmp & 0xffff0000) | 0x0); } +static int arc_pmu_event_set_period(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + s64 left = local64_read(&hwc->period_left); + s64 period = hwc->sample_period; + int idx = hwc->idx; + int overflow = 0; + u64 value; + + if (unlikely(left <= -period)) { + /* left underflowed by more than period. */ + left = period; + local64_set(&hwc->period_left, left); + hwc->last_period = period; + overflow = 1; + } else if (unlikely(left <= 0)) { + /* left underflowed by less than period. */ + left += period; + local64_set(&hwc->period_left, left); + hwc->last_period = period; + overflow = 1; + } + + if (left > arc_pmu->max_period) + left = arc_pmu->max_period; + + value = arc_pmu->max_period - left; + local64_set(&hwc->prev_count, value); + + /* Select counter */ + write_aux_reg(ARC_REG_PCT_INDEX, idx); + + /* Write value */ + write_aux_reg(ARC_REG_PCT_COUNTL, (u32)value); + write_aux_reg(ARC_REG_PCT_COUNTH, (value >> 32)); + + perf_event_update_userpage(event); + + return overflow; +} + /* * Assigns hardware counter to hardware condition. * Note that there is no separate start/stop mechanism; @@ -194,9 +237,11 @@ static void arc_pmu_start(struct perf_event *event, int flags) return; if (flags & PERF_EF_RELOAD) - WARN_ON_ONCE(!(event->hw.state & PERF_HES_UPTODATE)); + WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE)); + + hwc->state = 0; - event->hw.state = 0; + arc_pmu_event_set_period(event); /* enable ARC pmu here */ write_aux_reg(ARC_REG_PCT_INDEX, idx); /* counter # */ @@ -269,6 +314,7 @@ static int arc_pmu_device_probe(struct platform_device *pdev) struct arc_reg_pct_build pct_bcr; struct arc_reg_cc_build cc_bcr; int i, j; + int counter_size; /* in bits */ union cc_name { struct { @@ -294,10 +340,11 @@ static int arc_pmu_device_probe(struct platform_device *pdev) return -ENOMEM; arc_pmu->n_counters = pct_bcr.c; - arc_pmu->counter_size = 32 + (pct_bcr.s << 4); + counter_size = 32 + (pct_bcr.s << 4); + arc_pmu->max_period = (1ULL << counter_size) / 2 - 1ULL; pr_info("ARC perf\t: %d counters (%d bits), %d countable conditions\n", - arc_pmu->n_counters, arc_pmu->counter_size, cc_bcr.c); + arc_pmu->n_counters, counter_size, cc_bcr.c); cc_name.str[8] = 0; for (i = 0; i < PERF_COUNT_ARC_HW_MAX; i++) -- cgit v1.2.3 From 36481cf7fbcc666699d54cb267088d2b415ff164 Mon Sep 17 00:00:00 2001 From: Alexey Brodkin Date: Mon, 24 Aug 2015 13:48:06 +0300 Subject: ARCv2: perf: Support sampling events using overflow interrupts In times of ARC 700 performance counters didn't have support of interrupt an so for ARC we only had support of non-sampling events. Put simply only "perf stat" was functional. Now with ARC HS we have support of interrupts in performance counters which this change introduces support of. ARC performance counters act in the following way in regard of interrupts generation. [1] A counter counts starting from value set in PCT_COUNT register pair [2] Once counter reaches value set in PCT_INT_CNT interrupt is raised Basic setup look like this: [1] PCT_COUNT = 0; [2] PCT_INT_CNT = __limit_value__; [3] Enable interrupts for that counter and let it run [4] Let counter reach its limit [5] Handle interrupt when it happens Note that PCT HW block is build in CPU core and so ints interrupt line (which is basically OR of all counters IRQs) is wired directly to top-level IRQC. That means do de-assert PCT interrupt it's required to reset IRQs from all counters that have reached their limit values. Acked-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Signed-off-by: Alexey Brodkin Signed-off-by: Vineet Gupta --- arch/arc/include/asm/perf_event.h | 8 ++- arch/arc/kernel/perf_event.c | 128 +++++++++++++++++++++++++++++++++++--- 2 files changed, 126 insertions(+), 10 deletions(-) diff --git a/arch/arc/include/asm/perf_event.h b/arch/arc/include/asm/perf_event.h index 3c9bf285e96d..3522d095b615 100644 --- a/arch/arc/include/asm/perf_event.h +++ b/arch/arc/include/asm/perf_event.h @@ -29,15 +29,19 @@ #define ARC_REG_PCT_CONFIG 0x254 #define ARC_REG_PCT_CONTROL 0x255 #define ARC_REG_PCT_INDEX 0x256 +#define ARC_REG_PCT_INT_CNTL 0x25C +#define ARC_REG_PCT_INT_CNTH 0x25D +#define ARC_REG_PCT_INT_CTRL 0x25E +#define ARC_REG_PCT_INT_ACT 0x25F #define ARC_REG_PCT_CONTROL_CC (1 << 16) /* clear counts */ #define ARC_REG_PCT_CONTROL_SN (1 << 17) /* snapshot */ struct arc_reg_pct_build { #ifdef CONFIG_CPU_BIG_ENDIAN - unsigned int m:8, c:8, r:6, s:2, v:8; + unsigned int m:8, c:8, r:5, i:1, s:2, v:8; #else - unsigned int v:8, s:2, r:6, c:8, m:8; + unsigned int v:8, s:2, i:1, r:5, c:8, m:8; #endif }; diff --git a/arch/arc/kernel/perf_event.c b/arch/arc/kernel/perf_event.c index 3626c56376cf..208c9546e8f0 100644 --- a/arch/arc/kernel/perf_event.c +++ b/arch/arc/kernel/perf_event.c @@ -11,6 +11,7 @@ * */ #include +#include #include #include #include @@ -24,6 +25,7 @@ struct arc_pmu { unsigned long used_mask[BITS_TO_LONGS(ARC_PERF_MAX_COUNTERS)]; u64 max_period; int ev_hw_idx[PERF_COUNT_ARC_HW_MAX]; + struct perf_event *act_counter[ARC_PERF_MAX_COUNTERS]; }; struct arc_callchain_trace { @@ -139,9 +141,11 @@ static int arc_pmu_event_init(struct perf_event *event) struct hw_perf_event *hwc = &event->hw; int ret; - hwc->sample_period = arc_pmu->max_period; - hwc->last_period = hwc->sample_period; - local64_set(&hwc->period_left, hwc->sample_period); + if (!is_sampling_event(event)) { + hwc->sample_period = arc_pmu->max_period; + hwc->last_period = hwc->sample_period; + local64_set(&hwc->period_left, hwc->sample_period); + } switch (event->attr.type) { case PERF_TYPE_HARDWARE: @@ -243,6 +247,11 @@ static void arc_pmu_start(struct perf_event *event, int flags) arc_pmu_event_set_period(event); + /* Enable interrupt for this counter */ + if (is_sampling_event(event)) + write_aux_reg(ARC_REG_PCT_INT_CTRL, + read_aux_reg(ARC_REG_PCT_INT_CTRL) | (1 << idx)); + /* enable ARC pmu here */ write_aux_reg(ARC_REG_PCT_INDEX, idx); /* counter # */ write_aux_reg(ARC_REG_PCT_CONFIG, hwc->config); /* condition */ @@ -253,6 +262,17 @@ static void arc_pmu_stop(struct perf_event *event, int flags) struct hw_perf_event *hwc = &event->hw; int idx = hwc->idx; + /* Disable interrupt for this counter */ + if (is_sampling_event(event)) { + /* + * Reset interrupt flag by writing of 1. This is required + * to make sure pending interrupt was not left. + */ + write_aux_reg(ARC_REG_PCT_INT_ACT, 1 << idx); + write_aux_reg(ARC_REG_PCT_INT_CTRL, + read_aux_reg(ARC_REG_PCT_INT_CTRL) & ~(1 << idx)); + } + if (!(event->hw.state & PERF_HES_STOPPED)) { /* stop ARC pmu here */ write_aux_reg(ARC_REG_PCT_INDEX, idx); @@ -275,6 +295,8 @@ static void arc_pmu_del(struct perf_event *event, int flags) arc_pmu_stop(event, PERF_EF_UPDATE); __clear_bit(event->hw.idx, arc_pmu->used_mask); + arc_pmu->act_counter[event->hw.idx] = 0; + perf_event_update_userpage(event); } @@ -295,6 +317,16 @@ static int arc_pmu_add(struct perf_event *event, int flags) } write_aux_reg(ARC_REG_PCT_INDEX, idx); + + arc_pmu->act_counter[idx] = event; + + if (is_sampling_event(event)) { + /* Mimic full counter overflow as other arches do */ + write_aux_reg(ARC_REG_PCT_INT_CNTL, (u32)arc_pmu->max_period); + write_aux_reg(ARC_REG_PCT_INT_CNTH, + (arc_pmu->max_period >> 32)); + } + write_aux_reg(ARC_REG_PCT_CONFIG, 0); write_aux_reg(ARC_REG_PCT_COUNTL, 0); write_aux_reg(ARC_REG_PCT_COUNTH, 0); @@ -309,11 +341,70 @@ static int arc_pmu_add(struct perf_event *event, int flags) return 0; } +#ifdef CONFIG_ISA_ARCV2 +static irqreturn_t arc_pmu_intr(int irq, void *dev) +{ + struct perf_sample_data data; + struct arc_pmu *arc_pmu = (struct arc_pmu *)dev; + struct pt_regs *regs; + int active_ints; + int idx; + + arc_pmu_disable(&arc_pmu->pmu); + + active_ints = read_aux_reg(ARC_REG_PCT_INT_ACT); + + regs = get_irq_regs(); + + for (idx = 0; idx < arc_pmu->n_counters; idx++) { + struct perf_event *event = arc_pmu->act_counter[idx]; + struct hw_perf_event *hwc; + + if (!(active_ints & (1 << idx))) + continue; + + /* Reset interrupt flag by writing of 1 */ + write_aux_reg(ARC_REG_PCT_INT_ACT, 1 << idx); + + /* + * On reset of "interrupt active" bit corresponding + * "interrupt enable" bit gets automatically reset as well. + * Now we need to re-enable interrupt for the counter. + */ + write_aux_reg(ARC_REG_PCT_INT_CTRL, + read_aux_reg(ARC_REG_PCT_INT_CTRL) | (1 << idx)); + + hwc = &event->hw; + + WARN_ON_ONCE(hwc->idx != idx); + + arc_perf_event_update(event, &event->hw, event->hw.idx); + perf_sample_data_init(&data, 0, hwc->last_period); + if (!arc_pmu_event_set_period(event)) + continue; + + if (perf_event_overflow(event, &data, regs)) + arc_pmu_stop(event, 0); + } + + arc_pmu_enable(&arc_pmu->pmu); + + return IRQ_HANDLED; +} +#else + +static irqreturn_t arc_pmu_intr(int irq, void *dev) +{ + return IRQ_NONE; +} + +#endif /* CONFIG_ISA_ARCV2 */ + static int arc_pmu_device_probe(struct platform_device *pdev) { struct arc_reg_pct_build pct_bcr; struct arc_reg_cc_build cc_bcr; - int i, j; + int i, j, has_interrupts; int counter_size; /* in bits */ union cc_name { @@ -339,12 +430,16 @@ static int arc_pmu_device_probe(struct platform_device *pdev) if (!arc_pmu) return -ENOMEM; + has_interrupts = is_isa_arcv2() ? pct_bcr.i : 0; + arc_pmu->n_counters = pct_bcr.c; counter_size = 32 + (pct_bcr.s << 4); + arc_pmu->max_period = (1ULL << counter_size) / 2 - 1ULL; - pr_info("ARC perf\t: %d counters (%d bits), %d countable conditions\n", - arc_pmu->n_counters, counter_size, cc_bcr.c); + pr_info("ARC perf\t: %d counters (%d bits), %d conditions%s\n", + arc_pmu->n_counters, counter_size, cc_bcr.c, + has_interrupts ? ", [overflow IRQ support]":""); cc_name.str[8] = 0; for (i = 0; i < PERF_COUNT_ARC_HW_MAX; i++) @@ -379,8 +474,25 @@ static int arc_pmu_device_probe(struct platform_device *pdev) .read = arc_pmu_read, }; - /* ARC 700 PMU does not support sampling events */ - arc_pmu->pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT; + if (has_interrupts) { + int irq = platform_get_irq(pdev, 0); + + if (irq < 0) { + pr_err("Cannot get IRQ number for the platform\n"); + return -ENODEV; + } + + ret = devm_request_irq(&pdev->dev, irq, arc_pmu_intr, 0, + "arc-pmu", arc_pmu); + if (ret) { + pr_err("could not allocate PMU IRQ\n"); + return ret; + } + + /* Clean all pending interrupt flags */ + write_aux_reg(ARC_REG_PCT_INT_ACT, 0xffffffff); + } else + arc_pmu->pmu.capabilities |= PERF_PMU_CAP_NO_INTERRUPT; return perf_pmu_register(&arc_pmu->pmu, pdev->name, PERF_TYPE_RAW); } -- cgit v1.2.3 From e6b1d126bb748103824087189e30febc88c4db73 Mon Sep 17 00:00:00 2001 From: Alexey Brodkin Date: Mon, 24 Aug 2015 13:53:36 +0300 Subject: ARCv2: perf: implement exclusion of event counting in user or kernel mode Acked-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Signed-off-by: Alexey Brodkin Signed-off-by: Vineet Gupta --- arch/arc/include/asm/perf_event.h | 3 +++ arch/arc/kernel/perf_event.c | 16 ++++++++++++++-- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/arch/arc/include/asm/perf_event.h b/arch/arc/include/asm/perf_event.h index 3522d095b615..5824ab46cb71 100644 --- a/arch/arc/include/asm/perf_event.h +++ b/arch/arc/include/asm/perf_event.h @@ -34,6 +34,9 @@ #define ARC_REG_PCT_INT_CTRL 0x25E #define ARC_REG_PCT_INT_ACT 0x25F +#define ARC_REG_PCT_CONFIG_USER (1 << 18) /* count in user mode */ +#define ARC_REG_PCT_CONFIG_KERN (1 << 19) /* count in kernel mode */ + #define ARC_REG_PCT_CONTROL_CC (1 << 16) /* clear counts */ #define ARC_REG_PCT_CONTROL_SN (1 << 17) /* snapshot */ diff --git a/arch/arc/kernel/perf_event.c b/arch/arc/kernel/perf_event.c index 208c9546e8f0..5b94cebe0535 100644 --- a/arch/arc/kernel/perf_event.c +++ b/arch/arc/kernel/perf_event.c @@ -147,13 +147,25 @@ static int arc_pmu_event_init(struct perf_event *event) local64_set(&hwc->period_left, hwc->sample_period); } + hwc->config = 0; + + if (is_isa_arcv2()) { + /* "exclude user" means "count only kernel" */ + if (event->attr.exclude_user) + hwc->config |= ARC_REG_PCT_CONFIG_KERN; + + /* "exclude kernel" means "count only user" */ + if (event->attr.exclude_kernel) + hwc->config |= ARC_REG_PCT_CONFIG_USER; + } + switch (event->attr.type) { case PERF_TYPE_HARDWARE: if (event->attr.config >= PERF_COUNT_HW_MAX) return -ENOENT; if (arc_pmu->ev_hw_idx[event->attr.config] < 0) return -ENOENT; - hwc->config = arc_pmu->ev_hw_idx[event->attr.config]; + hwc->config |= arc_pmu->ev_hw_idx[event->attr.config]; pr_debug("init event %d with h/w %d \'%s\'\n", (int) event->attr.config, (int) hwc->config, arc_pmu_ev_hw_map[event->attr.config]); @@ -163,7 +175,7 @@ static int arc_pmu_event_init(struct perf_event *event) ret = arc_pmu_cache_event(event->attr.config); if (ret < 0) return ret; - hwc->config = arc_pmu->ev_hw_idx[ret]; + hwc->config |= arc_pmu->ev_hw_idx[ret]; return 0; default: return -ENOENT; -- cgit v1.2.3 From e525c37f8413b19130d0499c7467fed45a94579b Mon Sep 17 00:00:00 2001 From: Alexey Brodkin Date: Mon, 24 Aug 2015 14:03:30 +0300 Subject: ARCv2: perf: SMP support * split off pmu info into singleton and per-cpu bits * setup PMU on all cores Acked-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Signed-off-by: Alexey Brodkin Signed-off-by: Vineet Gupta --- arch/arc/kernel/perf_event.c | 69 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 54 insertions(+), 15 deletions(-) diff --git a/arch/arc/kernel/perf_event.c b/arch/arc/kernel/perf_event.c index 5b94cebe0535..743065251e23 100644 --- a/arch/arc/kernel/perf_event.c +++ b/arch/arc/kernel/perf_event.c @@ -21,10 +21,22 @@ struct arc_pmu { struct pmu pmu; + unsigned int irq; int n_counters; - unsigned long used_mask[BITS_TO_LONGS(ARC_PERF_MAX_COUNTERS)]; u64 max_period; int ev_hw_idx[PERF_COUNT_ARC_HW_MAX]; +}; + +struct arc_pmu_cpu { + /* + * A 1 bit for an index indicates that the counter is being used for + * an event. A 0 means that the counter can be used. + */ + unsigned long used_mask[BITS_TO_LONGS(ARC_PERF_MAX_COUNTERS)]; + + /* + * The events that are active on the PMU for the given index. + */ struct perf_event *act_counter[ARC_PERF_MAX_COUNTERS]; }; @@ -67,6 +79,7 @@ perf_callchain_user(struct perf_callchain_entry *entry, struct pt_regs *regs) } static struct arc_pmu *arc_pmu; +static DEFINE_PER_CPU(struct arc_pmu_cpu, arc_pmu_cpu); /* read counter #idx; note that counter# != event# on ARC! */ static uint64_t arc_pmu_read_counter(int idx) @@ -304,10 +317,12 @@ static void arc_pmu_stop(struct perf_event *event, int flags) static void arc_pmu_del(struct perf_event *event, int flags) { + struct arc_pmu_cpu *pmu_cpu = this_cpu_ptr(&arc_pmu_cpu); + arc_pmu_stop(event, PERF_EF_UPDATE); - __clear_bit(event->hw.idx, arc_pmu->used_mask); + __clear_bit(event->hw.idx, pmu_cpu->used_mask); - arc_pmu->act_counter[event->hw.idx] = 0; + pmu_cpu->act_counter[event->hw.idx] = 0; perf_event_update_userpage(event); } @@ -315,22 +330,23 @@ static void arc_pmu_del(struct perf_event *event, int flags) /* allocate hardware counter and optionally start counting */ static int arc_pmu_add(struct perf_event *event, int flags) { + struct arc_pmu_cpu *pmu_cpu = this_cpu_ptr(&arc_pmu_cpu); struct hw_perf_event *hwc = &event->hw; int idx = hwc->idx; - if (__test_and_set_bit(idx, arc_pmu->used_mask)) { - idx = find_first_zero_bit(arc_pmu->used_mask, + if (__test_and_set_bit(idx, pmu_cpu->used_mask)) { + idx = find_first_zero_bit(pmu_cpu->used_mask, arc_pmu->n_counters); if (idx == arc_pmu->n_counters) return -EAGAIN; - __set_bit(idx, arc_pmu->used_mask); + __set_bit(idx, pmu_cpu->used_mask); hwc->idx = idx; } write_aux_reg(ARC_REG_PCT_INDEX, idx); - arc_pmu->act_counter[idx] = event; + pmu_cpu->act_counter[idx] = event; if (is_sampling_event(event)) { /* Mimic full counter overflow as other arches do */ @@ -357,7 +373,7 @@ static int arc_pmu_add(struct perf_event *event, int flags) static irqreturn_t arc_pmu_intr(int irq, void *dev) { struct perf_sample_data data; - struct arc_pmu *arc_pmu = (struct arc_pmu *)dev; + struct arc_pmu_cpu *pmu_cpu = this_cpu_ptr(&arc_pmu_cpu); struct pt_regs *regs; int active_ints; int idx; @@ -369,7 +385,7 @@ static irqreturn_t arc_pmu_intr(int irq, void *dev) regs = get_irq_regs(); for (idx = 0; idx < arc_pmu->n_counters; idx++) { - struct perf_event *event = arc_pmu->act_counter[idx]; + struct perf_event *event = pmu_cpu->act_counter[idx]; struct hw_perf_event *hwc; if (!(active_ints & (1 << idx))) @@ -412,6 +428,17 @@ static irqreturn_t arc_pmu_intr(int irq, void *dev) #endif /* CONFIG_ISA_ARCV2 */ +void arc_cpu_pmu_irq_init(void) +{ + struct arc_pmu_cpu *pmu_cpu = this_cpu_ptr(&arc_pmu_cpu); + + arc_request_percpu_irq(arc_pmu->irq, smp_processor_id(), arc_pmu_intr, + "ARC perf counters", pmu_cpu); + + /* Clear all pending interrupt flags */ + write_aux_reg(ARC_REG_PCT_INT_ACT, 0xffffffff); +} + static int arc_pmu_device_probe(struct platform_device *pdev) { struct arc_reg_pct_build pct_bcr; @@ -488,18 +515,30 @@ static int arc_pmu_device_probe(struct platform_device *pdev) if (has_interrupts) { int irq = platform_get_irq(pdev, 0); + unsigned long flags; if (irq < 0) { pr_err("Cannot get IRQ number for the platform\n"); return -ENODEV; } - ret = devm_request_irq(&pdev->dev, irq, arc_pmu_intr, 0, - "arc-pmu", arc_pmu); - if (ret) { - pr_err("could not allocate PMU IRQ\n"); - return ret; - } + arc_pmu->irq = irq; + + /* + * arc_cpu_pmu_irq_init() needs to be called on all cores for + * their respective local PMU. + * However we use opencoded on_each_cpu() to ensure it is called + * on core0 first, so that arc_request_percpu_irq() sets up + * AUTOEN etc. Otherwise enable_percpu_irq() fails to enable + * perf IRQ on non master cores. + * see arc_request_percpu_irq() + */ + preempt_disable(); + local_irq_save(flags); + arc_cpu_pmu_irq_init(); + local_irq_restore(flags); + smp_call_function((smp_call_func_t)arc_cpu_pmu_irq_init, 0, 1); + preempt_enable(); /* Clean all pending interrupt flags */ write_aux_reg(ARC_REG_PCT_INT_ACT, 0xffffffff); -- cgit v1.2.3 From 9b28829d6da391f67a76dbba07a167e2b554bd10 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Tue, 18 Nov 2014 17:36:11 +0530 Subject: ARCv2: perf: Finally introduce HS perf unit With all features in place, the ARC HS pct block can now be effectively allowed to be probed/used Acked-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Signed-off-by: Alexey Brodkin Signed-off-by: Vineet Gupta --- Documentation/devicetree/bindings/arc/archs-pct.txt | 17 +++++++++++++++++ MAINTAINERS | 2 +- arch/arc/include/asm/perf_event.h | 5 ++++- arch/arc/kernel/perf_event.c | 3 ++- 4 files changed, 24 insertions(+), 3 deletions(-) create mode 100644 Documentation/devicetree/bindings/arc/archs-pct.txt diff --git a/Documentation/devicetree/bindings/arc/archs-pct.txt b/Documentation/devicetree/bindings/arc/archs-pct.txt new file mode 100644 index 000000000000..1ae98b87c640 --- /dev/null +++ b/Documentation/devicetree/bindings/arc/archs-pct.txt @@ -0,0 +1,17 @@ +* ARC HS Performance Counters + +The ARC HS can be configured with a pipeline performance monitor for counting +CPU and cache events like cache misses and hits. Like conventional PCT there +are 100+ hardware conditions dynamically mapped to upto 32 counters. +It also supports overflow interrupts. + +Required properties: + +- compatible : should contain + "snps,archs-pct" + +Example: + +pmu { + compatible = "snps,archs-pct"; +}; diff --git a/MAINTAINERS b/MAINTAINERS index d7ab7363b263..d4cda4273131 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9874,7 +9874,7 @@ SYNOPSYS ARC ARCHITECTURE M: Vineet Gupta S: Supported F: arch/arc/ -F: Documentation/devicetree/bindings/arc/ +F: Documentation/devicetree/bindings/arc/* F: drivers/tty/serial/arc_uart.c T: git git://git.kernel.org/pub/scm/linux/kernel/git/vgupta/arc.git diff --git a/arch/arc/include/asm/perf_event.h b/arch/arc/include/asm/perf_event.h index 5824ab46cb71..5f071762fb1c 100644 --- a/arch/arc/include/asm/perf_event.h +++ b/arch/arc/include/asm/perf_event.h @@ -105,8 +105,11 @@ static const char * const arc_pmu_ev_hw_map[] = { [PERF_COUNT_HW_INSTRUCTIONS] = "iall", [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = "ijmp", /* Excludes ZOL jumps */ [PERF_COUNT_ARC_BPOK] = "bpok", /* NP-NT, PT-T, PNT-NT */ +#ifdef CONFIG_ISA_ARCV2 + [PERF_COUNT_HW_BRANCH_MISSES] = "bpmp", +#else [PERF_COUNT_HW_BRANCH_MISSES] = "bpfail", /* NP-T, PT-NT, PNT-T */ - +#endif [PERF_COUNT_ARC_LDC] = "imemrdc", /* Instr: mem read cached */ [PERF_COUNT_ARC_STC] = "imemwrc", /* Instr: mem write cached */ diff --git a/arch/arc/kernel/perf_event.c b/arch/arc/kernel/perf_event.c index 743065251e23..0c08bb1ce15a 100644 --- a/arch/arc/kernel/perf_event.c +++ b/arch/arc/kernel/perf_event.c @@ -551,6 +551,7 @@ static int arc_pmu_device_probe(struct platform_device *pdev) #ifdef CONFIG_OF static const struct of_device_id arc_pmu_match[] = { { .compatible = "snps,arc700-pct" }, + { .compatible = "snps,archs-pct" }, {}, }; MODULE_DEVICE_TABLE(of, arc_pmu_match); @@ -558,7 +559,7 @@ MODULE_DEVICE_TABLE(of, arc_pmu_match); static struct platform_driver arc_pmu_driver = { .driver = { - .name = "arc700-pct", + .name = "arc-pct", .of_match_table = of_match_ptr(arc_pmu_match), }, .probe = arc_pmu_device_probe, -- cgit v1.2.3 From 3d5926599a6bc551efc0c8b244469a711f0d0166 Mon Sep 17 00:00:00 2001 From: Vineet Gupta Date: Thu, 27 Aug 2015 16:25:07 +0530 Subject: ARCv2: entry: Fix reserved handler Signed-off-by: Vineet Gupta --- arch/arc/kernel/entry-arcv2.S | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/arch/arc/kernel/entry-arcv2.S b/arch/arc/kernel/entry-arcv2.S index bd7105d3172f..8fa76567e402 100644 --- a/arch/arc/kernel/entry-arcv2.S +++ b/arch/arc/kernel/entry-arcv2.S @@ -57,13 +57,8 @@ VECTOR handle_interrupt ; (23) End of fixed IRQs .section .text, "ax",@progbits -res_service: ; processor restart - flag 0x1 ; not implemented - nop - nop - -reserved: ; processor restart - rtie ; jump to processor initializations +reserved: + flag 1 ; Unexpected event, halt ;##################### Interrupt Handling ############################## -- cgit v1.2.3