diff options
Diffstat (limited to 'arch')
51 files changed, 768 insertions, 369 deletions
diff --git a/arch/alpha/include/asm/cmpxchg.h b/arch/alpha/include/asm/cmpxchg.h index 429e8cd0d78e..e5117766529e 100644 --- a/arch/alpha/include/asm/cmpxchg.h +++ b/arch/alpha/include/asm/cmpxchg.h @@ -66,6 +66,4 @@ #undef __ASM__MB #undef ____cmpxchg -#define __HAVE_ARCH_CMPXCHG 1 - #endif /* _ALPHA_CMPXCHG_H */ diff --git a/arch/arm/include/asm/barrier.h b/arch/arm/include/asm/barrier.h index d2f81e6b8c1c..6c2327e1c732 100644 --- a/arch/arm/include/asm/barrier.h +++ b/arch/arm/include/asm/barrier.h @@ -81,7 +81,7 @@ do { \ #define read_barrier_depends() do { } while(0) #define smp_read_barrier_depends() do { } while(0) -#define set_mb(var, value) do { var = value; smp_mb(); } while (0) +#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); smp_mb(); } while (0) #define smp_mb__before_atomic() smp_mb() #define smp_mb__after_atomic() smp_mb() diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h index 71f19c4dc0de..0fa47c4275cb 100644 --- a/arch/arm64/include/asm/barrier.h +++ b/arch/arm64/include/asm/barrier.h @@ -114,7 +114,7 @@ do { \ #define read_barrier_depends() do { } while(0) #define smp_read_barrier_depends() do { } while(0) -#define set_mb(var, value) do { var = value; smp_mb(); } while (0) +#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); smp_mb(); } while (0) #define nop() asm volatile("nop"); #define smp_mb__before_atomic() smp_mb() diff --git a/arch/avr32/include/asm/cmpxchg.h b/arch/avr32/include/asm/cmpxchg.h index 962a6aeab787..366bbeaeb405 100644 --- a/arch/avr32/include/asm/cmpxchg.h +++ b/arch/avr32/include/asm/cmpxchg.h @@ -70,8 +70,6 @@ extern unsigned long __cmpxchg_u64_unsupported_on_32bit_kernels( if something tries to do an invalid cmpxchg(). */ extern void __cmpxchg_called_with_bad_pointer(void); -#define __HAVE_ARCH_CMPXCHG 1 - static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old, unsigned long new, int size) { diff --git a/arch/hexagon/include/asm/cmpxchg.h b/arch/hexagon/include/asm/cmpxchg.h index 9e7802911a57..a6e34e2acbba 100644 --- a/arch/hexagon/include/asm/cmpxchg.h +++ b/arch/hexagon/include/asm/cmpxchg.h @@ -64,7 +64,6 @@ static inline unsigned long __xchg(unsigned long x, volatile void *ptr, * looks just like atomic_cmpxchg on our arch currently with a bunch of * variable casting. */ -#define __HAVE_ARCH_CMPXCHG 1 #define cmpxchg(ptr, old, new) \ ({ \ diff --git a/arch/ia64/include/asm/barrier.h b/arch/ia64/include/asm/barrier.h index f6769eb2bbf9..843ba435e43b 100644 --- a/arch/ia64/include/asm/barrier.h +++ b/arch/ia64/include/asm/barrier.h @@ -77,12 +77,7 @@ do { \ ___p1; \ }) -/* - * XXX check on this ---I suspect what Linus really wants here is - * acquire vs release semantics but we can't discuss this stuff with - * Linus just yet. Grrr... - */ -#define set_mb(var, value) do { (var) = (value); mb(); } while (0) +#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); mb(); } while (0) /* * The group barrier in front of the rsm & ssm are necessary to ensure diff --git a/arch/ia64/include/uapi/asm/cmpxchg.h b/arch/ia64/include/uapi/asm/cmpxchg.h index f35109b1d907..a0e3620f8f13 100644 --- a/arch/ia64/include/uapi/asm/cmpxchg.h +++ b/arch/ia64/include/uapi/asm/cmpxchg.h @@ -61,8 +61,6 @@ extern void ia64_xchg_called_with_bad_pointer(void); * indicated by comparing RETURN with OLD. */ -#define __HAVE_ARCH_CMPXCHG 1 - /* * This function doesn't exist, so you'll get a linker error * if something tries to do an invalid cmpxchg(). diff --git a/arch/m32r/include/asm/cmpxchg.h b/arch/m32r/include/asm/cmpxchg.h index de651db20b43..14bf9b739dd2 100644 --- a/arch/m32r/include/asm/cmpxchg.h +++ b/arch/m32r/include/asm/cmpxchg.h @@ -107,8 +107,6 @@ __xchg_local(unsigned long x, volatile void *ptr, int size) ((__typeof__(*(ptr)))__xchg_local((unsigned long)(x), (ptr), \ sizeof(*(ptr)))) -#define __HAVE_ARCH_CMPXCHG 1 - static inline unsigned long __cmpxchg_u32(volatile unsigned int *p, unsigned int old, unsigned int new) { diff --git a/arch/m68k/include/asm/cmpxchg.h b/arch/m68k/include/asm/cmpxchg.h index bc755bc620ad..83b1df80f0ac 100644 --- a/arch/m68k/include/asm/cmpxchg.h +++ b/arch/m68k/include/asm/cmpxchg.h @@ -90,7 +90,6 @@ extern unsigned long __invalid_cmpxchg_size(volatile void *, * indicated by comparing RETURN with OLD. */ #ifdef CONFIG_RMW_INSNS -#define __HAVE_ARCH_CMPXCHG 1 static inline unsigned long __cmpxchg(volatile void *p, unsigned long old, unsigned long new, int size) diff --git a/arch/metag/include/asm/barrier.h b/arch/metag/include/asm/barrier.h index d703d8e26a65..5a696e507930 100644 --- a/arch/metag/include/asm/barrier.h +++ b/arch/metag/include/asm/barrier.h @@ -84,7 +84,7 @@ static inline void fence(void) #define read_barrier_depends() do { } while (0) #define smp_read_barrier_depends() do { } while (0) -#define set_mb(var, value) do { var = value; smp_mb(); } while (0) +#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); smp_mb(); } while (0) #define smp_store_release(p, v) \ do { \ diff --git a/arch/metag/include/asm/cmpxchg.h b/arch/metag/include/asm/cmpxchg.h index b1bc1be8540f..be29e3e44321 100644 --- a/arch/metag/include/asm/cmpxchg.h +++ b/arch/metag/include/asm/cmpxchg.h @@ -51,8 +51,6 @@ static inline unsigned long __cmpxchg(volatile void *ptr, unsigned long old, return old; } -#define __HAVE_ARCH_CMPXCHG 1 - #define cmpxchg(ptr, o, n) \ ({ \ __typeof__(*(ptr)) _o_ = (o); \ diff --git a/arch/mips/include/asm/barrier.h b/arch/mips/include/asm/barrier.h index 2b8bbbcb9be0..7ecba84656d4 100644 --- a/arch/mips/include/asm/barrier.h +++ b/arch/mips/include/asm/barrier.h @@ -112,8 +112,8 @@ #define __WEAK_LLSC_MB " \n" #endif -#define set_mb(var, value) \ - do { var = value; smp_mb(); } while (0) +#define smp_store_mb(var, value) \ + do { WRITE_ONCE(var, value); smp_mb(); } while (0) #define smp_llsc_mb() __asm__ __volatile__(__WEAK_LLSC_MB : : :"memory") diff --git a/arch/mips/include/asm/cmpxchg.h b/arch/mips/include/asm/cmpxchg.h index 412f945f1f5e..b71ab4a5fd50 100644 --- a/arch/mips/include/asm/cmpxchg.h +++ b/arch/mips/include/asm/cmpxchg.h @@ -138,8 +138,6 @@ static inline unsigned long __xchg(unsigned long x, volatile void * ptr, int siz __xchg((unsigned long)(x), (ptr), sizeof(*(ptr)))); \ }) -#define __HAVE_ARCH_CMPXCHG 1 - #define __cmpxchg_asm(ld, st, m, old, new) \ ({ \ __typeof(*(m)) __ret; \ diff --git a/arch/parisc/include/asm/cmpxchg.h b/arch/parisc/include/asm/cmpxchg.h index dbd13354ec41..0a90b965cccb 100644 --- a/arch/parisc/include/asm/cmpxchg.h +++ b/arch/parisc/include/asm/cmpxchg.h @@ -46,8 +46,6 @@ __xchg(unsigned long x, __volatile__ void *ptr, int size) #define xchg(ptr, x) \ ((__typeof__(*(ptr)))__xchg((unsigned long)(x), (ptr), sizeof(*(ptr)))) -#define __HAVE_ARCH_CMPXCHG 1 - /* bug catcher for when unsupported size is used - won't link */ extern void __cmpxchg_called_with_bad_pointer(void); diff --git a/arch/powerpc/include/asm/barrier.h b/arch/powerpc/include/asm/barrier.h index a3bf5be111ff..51ccc7232042 100644 --- a/arch/powerpc/include/asm/barrier.h +++ b/arch/powerpc/include/asm/barrier.h @@ -34,7 +34,7 @@ #define rmb() __asm__ __volatile__ ("sync" : : : "memory") #define wmb() __asm__ __volatile__ ("sync" : : : "memory") -#define set_mb(var, value) do { var = value; mb(); } while (0) +#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); mb(); } while (0) #ifdef __SUBARCH_HAS_LWSYNC # define SMPWMB LWSYNC @@ -89,5 +89,6 @@ do { \ #define smp_mb__before_atomic() smp_mb() #define smp_mb__after_atomic() smp_mb() +#define smp_mb__before_spinlock() smp_mb() #endif /* _ASM_POWERPC_BARRIER_H */ diff --git a/arch/powerpc/include/asm/cmpxchg.h b/arch/powerpc/include/asm/cmpxchg.h index d463c68fe7f0..ad6263cffb0f 100644 --- a/arch/powerpc/include/asm/cmpxchg.h +++ b/arch/powerpc/include/asm/cmpxchg.h @@ -144,7 +144,6 @@ __xchg_local(volatile void *ptr, unsigned long x, unsigned int size) * Compare and exchange - if *p == old, set it to new, * and return the old value of *p. */ -#define __HAVE_ARCH_CMPXCHG 1 static __always_inline unsigned long __cmpxchg_u32(volatile unsigned int *p, unsigned long old, unsigned long new) diff --git a/arch/s390/include/asm/barrier.h b/arch/s390/include/asm/barrier.h index 8d724718ec21..e6f8615a11eb 100644 --- a/arch/s390/include/asm/barrier.h +++ b/arch/s390/include/asm/barrier.h @@ -36,7 +36,7 @@ #define smp_mb__before_atomic() smp_mb() #define smp_mb__after_atomic() smp_mb() -#define set_mb(var, value) do { var = value; mb(); } while (0) +#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); mb(); } while (0) #define smp_store_release(p, v) \ do { \ diff --git a/arch/s390/include/asm/cmpxchg.h b/arch/s390/include/asm/cmpxchg.h index 4eadec466b8c..411464f4c97a 100644 --- a/arch/s390/include/asm/cmpxchg.h +++ b/arch/s390/include/asm/cmpxchg.h @@ -32,8 +32,6 @@ __old; \ }) -#define __HAVE_ARCH_CMPXCHG - #define __cmpxchg_double_op(p1, p2, o1, o2, n1, n2, insn) \ ({ \ register __typeof__(*(p1)) __old1 asm("2") = (o1); \ diff --git a/arch/score/include/asm/cmpxchg.h b/arch/score/include/asm/cmpxchg.h index f384839c3ee5..cc3f6420b71c 100644 --- a/arch/score/include/asm/cmpxchg.h +++ b/arch/score/include/asm/cmpxchg.h @@ -42,8 +42,6 @@ static inline unsigned long __cmpxchg(volatile unsigned long *m, (unsigned long)(o), \ (unsigned long)(n))) -#define __HAVE_ARCH_CMPXCHG 1 - #include <asm-generic/cmpxchg-local.h> #endif /* _ASM_SCORE_CMPXCHG_H */ diff --git a/arch/sh/include/asm/barrier.h b/arch/sh/include/asm/barrier.h index 43715308b068..bf91037db4e0 100644 --- a/arch/sh/include/asm/barrier.h +++ b/arch/sh/include/asm/barrier.h @@ -32,7 +32,7 @@ #define ctrl_barrier() __asm__ __volatile__ ("nop;nop;nop;nop;nop;nop;nop;nop") #endif -#define set_mb(var, value) do { (void)xchg(&var, value); } while (0) +#define smp_store_mb(var, value) do { (void)xchg(&var, value); } while (0) #include <asm-generic/barrier.h> diff --git a/arch/sh/include/asm/cmpxchg.h b/arch/sh/include/asm/cmpxchg.h index f6bd1406b897..85c97b188d71 100644 --- a/arch/sh/include/asm/cmpxchg.h +++ b/arch/sh/include/asm/cmpxchg.h @@ -46,8 +46,6 @@ extern void __xchg_called_with_bad_pointer(void); * if something tries to do an invalid cmpxchg(). */ extern void __cmpxchg_called_with_bad_pointer(void); -#define __HAVE_ARCH_CMPXCHG 1 - static inline unsigned long __cmpxchg(volatile void * ptr, unsigned long old, unsigned long new, int size) { diff --git a/arch/sparc/include/asm/barrier_64.h b/arch/sparc/include/asm/barrier_64.h index 76648941fea7..809941e33e12 100644 --- a/arch/sparc/include/asm/barrier_64.h +++ b/arch/sparc/include/asm/barrier_64.h @@ -40,8 +40,8 @@ do { __asm__ __volatile__("ba,pt %%xcc, 1f\n\t" \ #define dma_rmb() rmb() #define dma_wmb() wmb() -#define set_mb(__var, __value) \ - do { __var = __value; membar_safe("#StoreLoad"); } while(0) +#define smp_store_mb(__var, __value) \ + do { WRITE_ONCE(__var, __value); membar_safe("#StoreLoad"); } while(0) #ifdef CONFIG_SMP #define smp_mb() mb() diff --git a/arch/sparc/include/asm/cmpxchg_32.h b/arch/sparc/include/asm/cmpxchg_32.h index d38b52dca216..83ffb83c5397 100644 --- a/arch/sparc/include/asm/cmpxchg_32.h +++ b/arch/sparc/include/asm/cmpxchg_32.h @@ -34,7 +34,6 @@ static inline unsigned long __xchg(unsigned long x, __volatile__ void * ptr, int * * Cribbed from <asm-parisc/atomic.h> */ -#define __HAVE_ARCH_CMPXCHG 1 /* bug catcher for when unsupported size is used - won't link */ void __cmpxchg_called_with_bad_pointer(void); diff --git a/arch/sparc/include/asm/cmpxchg_64.h b/arch/sparc/include/asm/cmpxchg_64.h index 0e1ed6cfbf68..faa2f61058c2 100644 --- a/arch/sparc/include/asm/cmpxchg_64.h +++ b/arch/sparc/include/asm/cmpxchg_64.h @@ -65,8 +65,6 @@ static inline unsigned long __xchg(unsigned long x, __volatile__ void * ptr, #include <asm-generic/cmpxchg-local.h> -#define __HAVE_ARCH_CMPXCHG 1 - static inline unsigned long __cmpxchg_u32(volatile int *m, int old, int new) { diff --git a/arch/tile/include/asm/atomic_64.h b/arch/tile/include/asm/atomic_64.h index 7b11c5fadd42..0496970cef82 100644 --- a/arch/tile/include/asm/atomic_64.h +++ b/arch/tile/include/asm/atomic_64.h @@ -105,9 +105,6 @@ static inline long atomic64_add_unless(atomic64_t *v, long a, long u) #define atomic64_inc_not_zero(v) atomic64_add_unless((v), 1, 0) -/* Define this to indicate that cmpxchg is an efficient operation. */ -#define __HAVE_ARCH_CMPXCHG - #endif /* !__ASSEMBLY__ */ #endif /* _ASM_TILE_ATOMIC_64_H */ diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 226d5696e1d1..4e986e809861 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -127,7 +127,8 @@ config X86 select MODULES_USE_ELF_RELA if X86_64 select CLONE_BACKWARDS if X86_32 select ARCH_USE_BUILTIN_BSWAP - select ARCH_USE_QUEUE_RWLOCK + select ARCH_USE_QUEUED_SPINLOCKS + select ARCH_USE_QUEUED_RWLOCKS select OLD_SIGSUSPEND3 if X86_32 || IA32_EMULATION select OLD_SIGACTION if X86_32 select COMPAT_OLD_SIGACTION if IA32_EMULATION @@ -666,7 +667,7 @@ config PARAVIRT_DEBUG config PARAVIRT_SPINLOCKS bool "Paravirtualization layer for spinlocks" depends on PARAVIRT && SMP - select UNINLINE_SPIN_UNLOCK + select UNINLINE_SPIN_UNLOCK if !QUEUED_SPINLOCKS ---help--- Paravirtualized spinlocks allow a pvops backend to replace the spinlock implementation with something virtualization-friendly diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index 959e45b81fe2..e51a8f803f55 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -35,12 +35,12 @@ #define smp_mb() mb() #define smp_rmb() dma_rmb() #define smp_wmb() barrier() -#define set_mb(var, value) do { (void)xchg(&var, value); } while (0) +#define smp_store_mb(var, value) do { (void)xchg(&var, value); } while (0) #else /* !SMP */ #define smp_mb() barrier() #define smp_rmb() barrier() #define smp_wmb() barrier() -#define set_mb(var, value) do { var = value; barrier(); } while (0) +#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); barrier(); } while (0) #endif /* SMP */ #define read_barrier_depends() do { } while (0) diff --git a/arch/x86/include/asm/cmpxchg.h b/arch/x86/include/asm/cmpxchg.h index 99c105d78b7e..ad19841eddfe 100644 --- a/arch/x86/include/asm/cmpxchg.h +++ b/arch/x86/include/asm/cmpxchg.h @@ -4,8 +4,6 @@ #include <linux/compiler.h> #include <asm/alternative.h> /* Provides LOCK_PREFIX */ -#define __HAVE_ARCH_CMPXCHG 1 - /* * Non-existant functions to indicate usage errors at link time * (or compile-time if the compiler implements __compiletime_error(). diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 8957810ad7d1..d143bfad45d7 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -712,6 +712,31 @@ static inline void __set_fixmap(unsigned /* enum fixed_addresses */ idx, #if defined(CONFIG_SMP) && defined(CONFIG_PARAVIRT_SPINLOCKS) +#ifdef CONFIG_QUEUED_SPINLOCKS + +static __always_inline void pv_queued_spin_lock_slowpath(struct qspinlock *lock, + u32 val) +{ + PVOP_VCALL2(pv_lock_ops.queued_spin_lock_slowpath, lock, val); +} + +static __always_inline void pv_queued_spin_unlock(struct qspinlock *lock) +{ + PVOP_VCALLEE1(pv_lock_ops.queued_spin_unlock, lock); +} + +static __always_inline void pv_wait(u8 *ptr, u8 val) +{ + PVOP_VCALL2(pv_lock_ops.wait, ptr, val); +} + +static __always_inline void pv_kick(int cpu) +{ + PVOP_VCALL1(pv_lock_ops.kick, cpu); +} + +#else /* !CONFIG_QUEUED_SPINLOCKS */ + static __always_inline void __ticket_lock_spinning(struct arch_spinlock *lock, __ticket_t ticket) { @@ -724,7 +749,9 @@ static __always_inline void __ticket_unlock_kick(struct arch_spinlock *lock, PVOP_VCALL2(pv_lock_ops.unlock_kick, lock, ticket); } -#endif +#endif /* CONFIG_QUEUED_SPINLOCKS */ + +#endif /* SMP && PARAVIRT_SPINLOCKS */ #ifdef CONFIG_X86_32 #define PV_SAVE_REGS "pushl %ecx; pushl %edx;" diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index f7b0b5c112f2..8766c7c395c2 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -333,9 +333,19 @@ struct arch_spinlock; typedef u16 __ticket_t; #endif +struct qspinlock; + struct pv_lock_ops { +#ifdef CONFIG_QUEUED_SPINLOCKS + void (*queued_spin_lock_slowpath)(struct qspinlock *lock, u32 val); + struct paravirt_callee_save queued_spin_unlock; + + void (*wait)(u8 *ptr, u8 val); + void (*kick)(int cpu); +#else /* !CONFIG_QUEUED_SPINLOCKS */ struct paravirt_callee_save lock_spinning; void (*unlock_kick)(struct arch_spinlock *lock, __ticket_t ticket); +#endif /* !CONFIG_QUEUED_SPINLOCKS */ }; /* This contains all the paravirt structures: we get a convenient diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h new file mode 100644 index 000000000000..9d51fae1cba3 --- /dev/null +++ b/arch/x86/include/asm/qspinlock.h @@ -0,0 +1,57 @@ +#ifndef _ASM_X86_QSPINLOCK_H +#define _ASM_X86_QSPINLOCK_H + +#include <asm/cpufeature.h> +#include <asm-generic/qspinlock_types.h> +#include <asm/paravirt.h> + +#define queued_spin_unlock queued_spin_unlock +/** + * queued_spin_unlock - release a queued spinlock + * @lock : Pointer to queued spinlock structure + * + * A smp_store_release() on the least-significant byte. + */ +static inline void native_queued_spin_unlock(struct qspinlock *lock) +{ + smp_store_release((u8 *)lock, 0); +} + +#ifdef CONFIG_PARAVIRT_SPINLOCKS +extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val); +extern void __pv_init_lock_hash(void); +extern void __pv_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val); +extern void __raw_callee_save___pv_queued_spin_unlock(struct qspinlock *lock); + +static inline void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) +{ + pv_queued_spin_lock_slowpath(lock, val); +} + +static inline void queued_spin_unlock(struct qspinlock *lock) +{ + pv_queued_spin_unlock(lock); +} +#else +static inline void queued_spin_unlock(struct qspinlock *lock) +{ + native_queued_spin_unlock(lock); +} +#endif + +#define virt_queued_spin_lock virt_queued_spin_lock + +static inline bool virt_queued_spin_lock(struct qspinlock *lock) +{ + if (!static_cpu_has(X86_FEATURE_HYPERVISOR)) + return false; + + while (atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL) != 0) + cpu_relax(); + + return true; +} + +#include <asm-generic/qspinlock.h> + +#endif /* _ASM_X86_QSPINLOCK_H */ diff --git a/arch/x86/include/asm/qspinlock_paravirt.h b/arch/x86/include/asm/qspinlock_paravirt.h new file mode 100644 index 000000000000..b002e711ba88 --- /dev/null +++ b/arch/x86/include/asm/qspinlock_paravirt.h @@ -0,0 +1,6 @@ +#ifndef __ASM_QSPINLOCK_PARAVIRT_H +#define __ASM_QSPINLOCK_PARAVIRT_H + +PV_CALLEE_SAVE_REGS_THUNK(__pv_queued_spin_unlock); + +#endif diff --git a/arch/x86/include/asm/spinlock.h b/arch/x86/include/asm/spinlock.h index 64b611782ef0..be0a05913b91 100644 --- a/arch/x86/include/asm/spinlock.h +++ b/arch/x86/include/asm/spinlock.h @@ -42,6 +42,10 @@ extern struct static_key paravirt_ticketlocks_enabled; static __always_inline bool static_key_false(struct static_key *key); +#ifdef CONFIG_QUEUED_SPINLOCKS +#include <asm/qspinlock.h> +#else + #ifdef CONFIG_PARAVIRT_SPINLOCKS static inline void __ticket_enter_slowpath(arch_spinlock_t *lock) @@ -196,6 +200,7 @@ static inline void arch_spin_unlock_wait(arch_spinlock_t *lock) cpu_relax(); } } +#endif /* CONFIG_QUEUED_SPINLOCKS */ /* * Read-write spinlocks, allowing multiple readers diff --git a/arch/x86/include/asm/spinlock_types.h b/arch/x86/include/asm/spinlock_types.h index 5f9d7572d82b..65c3e37f879a 100644 --- a/arch/x86/include/asm/spinlock_types.h +++ b/arch/x86/include/asm/spinlock_types.h @@ -23,6 +23,9 @@ typedef u32 __ticketpair_t; #define TICKET_SHIFT (sizeof(__ticket_t) * 8) +#ifdef CONFIG_QUEUED_SPINLOCKS +#include <asm-generic/qspinlock_types.h> +#else typedef struct arch_spinlock { union { __ticketpair_t head_tail; @@ -33,6 +36,7 @@ typedef struct arch_spinlock { } arch_spinlock_t; #define __ARCH_SPIN_LOCK_UNLOCKED { { 0 } } +#endif /* CONFIG_QUEUED_SPINLOCKS */ #include <asm-generic/qrwlock_types.h> diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 20190bdac9d5..95cf78d44ab4 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -53,9 +53,12 @@ static DEFINE_MUTEX(mce_chrdev_read_mutex); #define rcu_dereference_check_mce(p) \ - rcu_dereference_index_check((p), \ - rcu_read_lock_sched_held() || \ - lockdep_is_held(&mce_chrdev_read_mutex)) +({ \ + rcu_lockdep_assert(rcu_read_lock_sched_held() || \ + lockdep_is_held(&mce_chrdev_read_mutex), \ + "suspicious rcu_dereference_check_mce() usage"); \ + smp_load_acquire(&(p)); \ +}) #define CREATE_TRACE_POINTS #include <trace/events/mce.h> @@ -1887,7 +1890,7 @@ out: static unsigned int mce_chrdev_poll(struct file *file, poll_table *wait) { poll_wait(file, &mce_chrdev_wait, wait); - if (rcu_access_index(mcelog.next)) + if (READ_ONCE(mcelog.next)) return POLLIN | POLLRDNORM; if (!mce_apei_read_done && apei_check_mce()) return POLLIN | POLLRDNORM; @@ -1932,8 +1935,8 @@ void register_mce_write_callback(ssize_t (*fn)(struct file *filp, } EXPORT_SYMBOL_GPL(register_mce_write_callback); -ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf, - size_t usize, loff_t *off) +static ssize_t mce_chrdev_write(struct file *filp, const char __user *ubuf, + size_t usize, loff_t *off) { if (mce_write) return mce_write(filp, ubuf, usize, off); diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 6be186ccef46..5801a14f7524 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -910,10 +910,7 @@ int x86_schedule_events(struct cpu_hw_events *cpuc, int n, int *assign) if (x86_pmu.commit_scheduling) x86_pmu.commit_scheduling(cpuc, i, assign[i]); } - } - - if (!assign || unsched) { - + } else { for (i = 0; i < n; i++) { e = cpuc->event_list[i]; /* @@ -1126,13 +1123,16 @@ int x86_perf_event_set_period(struct perf_event *event) per_cpu(pmc_prev_left[idx], smp_processor_id()) = left; - /* - * The hw event starts counting from this event offset, - * mark it to be able to extra future deltas: - */ - local64_set(&hwc->prev_count, (u64)-left); + if (!(hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) || + local64_read(&hwc->prev_count) != (u64)-left) { + /* + * The hw event starts counting from this event offset, + * mark it to be able to extra future deltas: + */ + local64_set(&hwc->prev_count, (u64)-left); - wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask); + wrmsrl(hwc->event_base, (u64)(-left) & x86_pmu.cntval_mask); + } /* * Due to erratum on certan cpu we need diff --git a/arch/x86/kernel/cpu/perf_event.h b/arch/x86/kernel/cpu/perf_event.h index f068695eaca0..3e7fd27dfe20 100644 --- a/arch/x86/kernel/cpu/perf_event.h +++ b/arch/x86/kernel/cpu/perf_event.h @@ -75,6 +75,8 @@ struct event_constraint { #define PERF_X86_EVENT_DYNAMIC 0x0080 /* dynamic alloc'd constraint */ #define PERF_X86_EVENT_RDPMC_ALLOWED 0x0100 /* grant rdpmc permission */ #define PERF_X86_EVENT_EXCL_ACCT 0x0200 /* accounted EXCL event */ +#define PERF_X86_EVENT_AUTO_RELOAD 0x0400 /* use PEBS auto-reload */ +#define PERF_X86_EVENT_FREERUNNING 0x0800 /* use freerunning PEBS */ struct amd_nb { @@ -88,6 +90,18 @@ struct amd_nb { #define MAX_PEBS_EVENTS 8 /* + * Flags PEBS can handle without an PMI. + * + * TID can only be handled by flushing at context switch. + * + */ +#define PEBS_FREERUNNING_FLAGS \ + (PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \ + PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \ + PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \ + PERF_SAMPLE_TRANSACTION) + +/* * A debug store configuration. * * We only support architectures that use 64bit fields. @@ -133,7 +147,6 @@ enum intel_excl_state_type { }; struct intel_excl_states { - enum intel_excl_state_type init_state[X86_PMC_IDX_MAX]; enum intel_excl_state_type state[X86_PMC_IDX_MAX]; bool sched_started; /* true if scheduling has started */ }; @@ -527,10 +540,10 @@ struct x86_pmu { void (*put_event_constraints)(struct cpu_hw_events *cpuc, struct perf_event *event); - void (*commit_scheduling)(struct cpu_hw_events *cpuc, int idx, int cntr); - void (*start_scheduling)(struct cpu_hw_events *cpuc); + void (*commit_scheduling)(struct cpu_hw_events *cpuc, int idx, int cntr); + void (*stop_scheduling)(struct cpu_hw_events *cpuc); struct event_constraint *event_constraints; @@ -870,6 +883,8 @@ void intel_pmu_pebs_enable_all(void); void intel_pmu_pebs_disable_all(void); +void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in); + void intel_ds_init(void); void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in); diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 2813ea0f142e..19980d9a6cc9 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1903,9 +1903,8 @@ static void intel_start_scheduling(struct cpu_hw_events *cpuc) { struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; - struct intel_excl_states *xl, *xlo; + struct intel_excl_states *xl; int tid = cpuc->excl_thread_id; - int o_tid = 1 - tid; /* sibling thread */ /* * nothing needed if in group validation mode @@ -1916,10 +1915,9 @@ intel_start_scheduling(struct cpu_hw_events *cpuc) /* * no exclusion needed */ - if (!excl_cntrs) + if (WARN_ON_ONCE(!excl_cntrs)) return; - xlo = &excl_cntrs->states[o_tid]; xl = &excl_cntrs->states[tid]; xl->sched_started = true; @@ -1928,22 +1926,41 @@ intel_start_scheduling(struct cpu_hw_events *cpuc) * in stop_event_scheduling() * makes scheduling appear as a transaction */ - WARN_ON_ONCE(!irqs_disabled()); raw_spin_lock(&excl_cntrs->lock); +} - /* - * save initial state of sibling thread - */ - memcpy(xlo->init_state, xlo->state, sizeof(xlo->init_state)); +static void intel_commit_scheduling(struct cpu_hw_events *cpuc, int idx, int cntr) +{ + struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; + struct event_constraint *c = cpuc->event_constraint[idx]; + struct intel_excl_states *xl; + int tid = cpuc->excl_thread_id; + + if (cpuc->is_fake || !is_ht_workaround_enabled()) + return; + + if (WARN_ON_ONCE(!excl_cntrs)) + return; + + if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) + return; + + xl = &excl_cntrs->states[tid]; + + lockdep_assert_held(&excl_cntrs->lock); + + if (c->flags & PERF_X86_EVENT_EXCL) + xl->state[cntr] = INTEL_EXCL_EXCLUSIVE; + else + xl->state[cntr] = INTEL_EXCL_SHARED; } static void intel_stop_scheduling(struct cpu_hw_events *cpuc) { struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; - struct intel_excl_states *xl, *xlo; + struct intel_excl_states *xl; int tid = cpuc->excl_thread_id; - int o_tid = 1 - tid; /* sibling thread */ /* * nothing needed if in group validation mode @@ -1953,17 +1970,11 @@ intel_stop_scheduling(struct cpu_hw_events *cpuc) /* * no exclusion needed */ - if (!excl_cntrs) + if (WARN_ON_ONCE(!excl_cntrs)) return; - xlo = &excl_cntrs->states[o_tid]; xl = &excl_cntrs->states[tid]; - /* - * make new sibling thread state visible - */ - memcpy(xlo->state, xlo->init_state, sizeof(xlo->state)); - xl->sched_started = false; /* * release shared state lock (acquired in intel_start_scheduling()) @@ -1975,12 +1986,10 @@ static struct event_constraint * intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, int idx, struct event_constraint *c) { - struct event_constraint *cx; struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; - struct intel_excl_states *xl, *xlo; - int is_excl, i; + struct intel_excl_states *xlo; int tid = cpuc->excl_thread_id; - int o_tid = 1 - tid; /* alternate */ + int is_excl, i; /* * validating a group does not require @@ -1992,27 +2001,8 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, /* * no exclusion needed */ - if (!excl_cntrs) + if (WARN_ON_ONCE(!excl_cntrs)) return c; - /* - * event requires exclusive counter access - * across HT threads - */ - is_excl = c->flags & PERF_X86_EVENT_EXCL; - if (is_excl && !(event->hw.flags & PERF_X86_EVENT_EXCL_ACCT)) { - event->hw.flags |= PERF_X86_EVENT_EXCL_ACCT; - if (!cpuc->n_excl++) - WRITE_ONCE(excl_cntrs->has_exclusive[tid], 1); - } - - /* - * xl = state of current HT - * xlo = state of sibling HT - */ - xl = &excl_cntrs->states[tid]; - xlo = &excl_cntrs->states[o_tid]; - - cx = c; /* * because we modify the constraint, we need @@ -2023,10 +2013,7 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, * been cloned (marked dynamic) */ if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) { - - /* sanity check */ - if (idx < 0) - return &emptyconstraint; + struct event_constraint *cx; /* * grab pre-allocated constraint entry @@ -2037,13 +2024,14 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, * initialize dynamic constraint * with static constraint */ - memcpy(cx, c, sizeof(*cx)); + *cx = *c; /* * mark constraint as dynamic, so we * can free it later on */ cx->flags |= PERF_X86_EVENT_DYNAMIC; + c = cx; } /* @@ -2054,6 +2042,22 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, */ /* + * state of sibling HT + */ + xlo = &excl_cntrs->states[tid ^ 1]; + + /* + * event requires exclusive counter access + * across HT threads + */ + is_excl = c->flags & PERF_X86_EVENT_EXCL; + if (is_excl && !(event->hw.flags & PERF_X86_EVENT_EXCL_ACCT)) { + event->hw.flags |= PERF_X86_EVENT_EXCL_ACCT; + if (!cpuc->n_excl++) + WRITE_ONCE(excl_cntrs->has_exclusive[tid], 1); + } + + /* * Modify static constraint with current dynamic * state of thread * @@ -2061,37 +2065,37 @@ intel_get_excl_constraints(struct cpu_hw_events *cpuc, struct perf_event *event, * SHARED : sibling counter measuring non-exclusive event * UNUSED : sibling counter unused */ - for_each_set_bit(i, cx->idxmsk, X86_PMC_IDX_MAX) { + for_each_set_bit(i, c->idxmsk, X86_PMC_IDX_MAX) { /* * exclusive event in sibling counter * our corresponding counter cannot be used * regardless of our event */ - if (xl->state[i] == INTEL_EXCL_EXCLUSIVE) - __clear_bit(i, cx->idxmsk); + if (xlo->state[i] == INTEL_EXCL_EXCLUSIVE) + __clear_bit(i, c->idxmsk); /* * if measuring an exclusive event, sibling * measuring non-exclusive, then counter cannot * be used */ - if (is_excl && xl->state[i] == INTEL_EXCL_SHARED) - __clear_bit(i, cx->idxmsk); + if (is_excl && xlo->state[i] == INTEL_EXCL_SHARED) + __clear_bit(i, c->idxmsk); } /* * recompute actual bit weight for scheduling algorithm */ - cx->weight = hweight64(cx->idxmsk64); + c->weight = hweight64(c->idxmsk64); /* * if we return an empty mask, then switch * back to static empty constraint to avoid * the cost of freeing later on */ - if (cx->weight == 0) - cx = &emptyconstraint; + if (c->weight == 0) + c = &emptyconstraint; - return cx; + return c; } static struct event_constraint * @@ -2124,10 +2128,8 @@ static void intel_put_excl_constraints(struct cpu_hw_events *cpuc, { struct hw_perf_event *hwc = &event->hw; struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; - struct intel_excl_states *xlo, *xl; - unsigned long flags = 0; /* keep compiler happy */ int tid = cpuc->excl_thread_id; - int o_tid = 1 - tid; + struct intel_excl_states *xl; /* * nothing needed if in group validation mode @@ -2135,13 +2137,9 @@ static void intel_put_excl_constraints(struct cpu_hw_events *cpuc, if (cpuc->is_fake) return; - WARN_ON_ONCE(!excl_cntrs); - - if (!excl_cntrs) + if (WARN_ON_ONCE(!excl_cntrs)) return; - xl = &excl_cntrs->states[tid]; - xlo = &excl_cntrs->states[o_tid]; if (hwc->flags & PERF_X86_EVENT_EXCL_ACCT) { hwc->flags &= ~PERF_X86_EVENT_EXCL_ACCT; if (!--cpuc->n_excl) @@ -2149,22 +2147,25 @@ static void intel_put_excl_constraints(struct cpu_hw_events *cpuc, } /* - * put_constraint may be called from x86_schedule_events() - * which already has the lock held so here make locking - * conditional + * If event was actually assigned, then mark the counter state as + * unused now. */ - if (!xl->sched_started) - raw_spin_lock_irqsave(&excl_cntrs->lock, flags); + if (hwc->idx >= 0) { + xl = &excl_cntrs->states[tid]; - /* - * if event was actually assigned, then mark the - * counter state as unused now - */ - if (hwc->idx >= 0) - xlo->state[hwc->idx] = INTEL_EXCL_UNUSED; + /* + * put_constraint may be called from x86_schedule_events() + * which already has the lock held so here make locking + * conditional. + */ + if (!xl->sched_started) + raw_spin_lock(&excl_cntrs->lock); - if (!xl->sched_started) - raw_spin_unlock_irqrestore(&excl_cntrs->lock, flags); + xl->state[hwc->idx] = INTEL_EXCL_UNUSED; + + if (!xl->sched_started) + raw_spin_unlock(&excl_cntrs->lock); + } } static void @@ -2196,41 +2197,6 @@ static void intel_put_event_constraints(struct cpu_hw_events *cpuc, intel_put_excl_constraints(cpuc, event); } -static void intel_commit_scheduling(struct cpu_hw_events *cpuc, int idx, int cntr) -{ - struct intel_excl_cntrs *excl_cntrs = cpuc->excl_cntrs; - struct event_constraint *c = cpuc->event_constraint[idx]; - struct intel_excl_states *xlo, *xl; - int tid = cpuc->excl_thread_id; - int o_tid = 1 - tid; - int is_excl; - - if (cpuc->is_fake || !c) - return; - - is_excl = c->flags & PERF_X86_EVENT_EXCL; - - if (!(c->flags & PERF_X86_EVENT_DYNAMIC)) - return; - - WARN_ON_ONCE(!excl_cntrs); - - if (!excl_cntrs) - return; - - xl = &excl_cntrs->states[tid]; - xlo = &excl_cntrs->states[o_tid]; - - WARN_ON_ONCE(!raw_spin_is_locked(&excl_cntrs->lock)); - - if (cntr >= 0) { - if (is_excl) - xlo->init_state[cntr] = INTEL_EXCL_EXCLUSIVE; - else - xlo->init_state[cntr] = INTEL_EXCL_SHARED; - } -} - static void intel_pebs_aliases_core2(struct perf_event *event) { if ((event->hw.config & X86_RAW_EVENT_MASK) == 0x003c) { @@ -2294,8 +2260,15 @@ static int intel_pmu_hw_config(struct perf_event *event) if (ret) return ret; - if (event->attr.precise_ip && x86_pmu.pebs_aliases) - x86_pmu.pebs_aliases(event); + if (event->attr.precise_ip) { + if (!event->attr.freq) { + event->hw.flags |= PERF_X86_EVENT_AUTO_RELOAD; + if (!(event->attr.sample_type & ~PEBS_FREERUNNING_FLAGS)) + event->hw.flags |= PERF_X86_EVENT_FREERUNNING; + } + if (x86_pmu.pebs_aliases) + x86_pmu.pebs_aliases(event); + } if (needs_branch_stack(event)) { ret = intel_pmu_setup_lbr_filter(event); @@ -2544,19 +2517,11 @@ struct intel_shared_regs *allocate_shared_regs(int cpu) static struct intel_excl_cntrs *allocate_excl_cntrs(int cpu) { struct intel_excl_cntrs *c; - int i; c = kzalloc_node(sizeof(struct intel_excl_cntrs), GFP_KERNEL, cpu_to_node(cpu)); if (c) { raw_spin_lock_init(&c->lock); - for (i = 0; i < X86_PMC_IDX_MAX; i++) { - c->states[0].state[i] = INTEL_EXCL_UNUSED; - c->states[0].init_state[i] = INTEL_EXCL_UNUSED; - - c->states[1].state[i] = INTEL_EXCL_UNUSED; - c->states[1].init_state[i] = INTEL_EXCL_UNUSED; - } c->core_id = -1; } return c; @@ -2677,6 +2642,15 @@ static void intel_pmu_cpu_dying(int cpu) fini_debug_store_on_cpu(cpu); } +static void intel_pmu_sched_task(struct perf_event_context *ctx, + bool sched_in) +{ + if (x86_pmu.pebs_active) + intel_pmu_pebs_sched_task(ctx, sched_in); + if (x86_pmu.lbr_nr) + intel_pmu_lbr_sched_task(ctx, sched_in); +} + PMU_FORMAT_ATTR(offcore_rsp, "config1:0-63"); PMU_FORMAT_ATTR(ldlat, "config1:0-15"); @@ -2766,7 +2740,7 @@ static __initconst const struct x86_pmu intel_pmu = { .cpu_starting = intel_pmu_cpu_starting, .cpu_dying = intel_pmu_cpu_dying, .guest_get_msrs = intel_guest_get_msrs, - .sched_task = intel_pmu_lbr_sched_task, + .sched_task = intel_pmu_sched_task, }; static __init void intel_clovertown_quirk(void) @@ -2939,8 +2913,8 @@ static __init void intel_ht_bug(void) { x86_pmu.flags |= PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED; - x86_pmu.commit_scheduling = intel_commit_scheduling; x86_pmu.start_scheduling = intel_start_scheduling; + x86_pmu.commit_scheduling = intel_commit_scheduling; x86_pmu.stop_scheduling = intel_stop_scheduling; } @@ -3398,8 +3372,8 @@ static __init int fixup_ht_bug(void) x86_pmu.flags &= ~(PMU_FL_EXCL_CNTRS | PMU_FL_EXCL_ENABLED); - x86_pmu.commit_scheduling = NULL; x86_pmu.start_scheduling = NULL; + x86_pmu.commit_scheduling = NULL; x86_pmu.stop_scheduling = NULL; watchdog_nmi_enable_all(); diff --git a/arch/x86/kernel/cpu/perf_event_intel_cqm.c b/arch/x86/kernel/cpu/perf_event_intel_cqm.c index e4d1b8b738fa..188076161c1b 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_cqm.c +++ b/arch/x86/kernel/cpu/perf_event_intel_cqm.c @@ -13,16 +13,35 @@ #define MSR_IA32_QM_CTR 0x0c8e #define MSR_IA32_QM_EVTSEL 0x0c8d -static unsigned int cqm_max_rmid = -1; +static u32 cqm_max_rmid = -1; static unsigned int cqm_l3_scale; /* supposedly cacheline size */ -struct intel_cqm_state { - raw_spinlock_t lock; - int rmid; - int cnt; +/** + * struct intel_pqr_state - State cache for the PQR MSR + * @rmid: The cached Resource Monitoring ID + * @closid: The cached Class Of Service ID + * @rmid_usecnt: The usage counter for rmid + * + * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the + * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always + * contains both parts, so we need to cache them. + * + * The cache also helps to avoid pointless updates if the value does + * not change. + */ +struct intel_pqr_state { + u32 rmid; + u32 closid; + int rmid_usecnt; }; -static DEFINE_PER_CPU(struct intel_cqm_state, cqm_state); +/* + * The cached intel_pqr_state is strictly per CPU and can never be + * updated from a remote CPU. Both functions which modify the state + * (intel_cqm_event_start and intel_cqm_event_stop) are called with + * interrupts disabled, which is sufficient for the protection. + */ +static DEFINE_PER_CPU(struct intel_pqr_state, pqr_state); /* * Protects cache_cgroups and cqm_rmid_free_lru and cqm_rmid_limbo_lru. @@ -57,7 +76,7 @@ static cpumask_t cqm_cpumask; * near-zero occupancy value, i.e. no cachelines are tagged with this * RMID, once __intel_cqm_rmid_rotate() returns. */ -static unsigned int intel_cqm_rotation_rmid; +static u32 intel_cqm_rotation_rmid; #define INVALID_RMID (-1) @@ -69,7 +88,7 @@ static unsigned int intel_cqm_rotation_rmid; * Likewise, an rmid value of -1 is used to indicate "no rmid currently * assigned" and is used as part of the rotation code. */ -static inline bool __rmid_valid(unsigned int rmid) +static inline bool __rmid_valid(u32 rmid) { if (!rmid || rmid == INVALID_RMID) return false; @@ -77,7 +96,7 @@ static inline bool __rmid_valid(unsigned int rmid) return true; } -static u64 __rmid_read(unsigned int rmid) +static u64 __rmid_read(u32 rmid) { u64 val; @@ -102,7 +121,7 @@ enum rmid_recycle_state { }; struct cqm_rmid_entry { - unsigned int rmid; + u32 rmid; enum rmid_recycle_state state; struct list_head list; unsigned long queue_time; @@ -147,7 +166,7 @@ static LIST_HEAD(cqm_rmid_limbo_lru); */ static struct cqm_rmid_entry **cqm_rmid_ptrs; -static inline struct cqm_rmid_entry *__rmid_entry(int rmid) +static inline struct cqm_rmid_entry *__rmid_entry(u32 rmid) { struct cqm_rmid_entry *entry; @@ -162,7 +181,7 @@ static inline struct cqm_rmid_entry *__rmid_entry(int rmid) * * We expect to be called with cache_mutex held. */ -static int __get_rmid(void) +static u32 __get_rmid(void) { struct cqm_rmid_entry *entry; @@ -177,7 +196,7 @@ static int __get_rmid(void) return entry->rmid; } -static void __put_rmid(unsigned int rmid) +static void __put_rmid(u32 rmid) { struct cqm_rmid_entry *entry; @@ -372,7 +391,7 @@ static bool __conflict_event(struct perf_event *a, struct perf_event *b) } struct rmid_read { - unsigned int rmid; + u32 rmid; atomic64_t value; }; @@ -381,12 +400,11 @@ static void __intel_cqm_event_count(void *info); /* * Exchange the RMID of a group of events. */ -static unsigned int -intel_cqm_xchg_rmid(struct perf_event *group, unsigned int rmid) +static u32 intel_cqm_xchg_rmid(struct perf_event *group, u32 rmid) { struct perf_event *event; - unsigned int old_rmid = group->hw.cqm_rmid; struct list_head *head = &group->hw.cqm_group_entry; + u32 old_rmid = group->hw.cqm_rmid; lockdep_assert_held(&cache_mutex); @@ -451,7 +469,7 @@ static void intel_cqm_stable(void *arg) * If we have group events waiting for an RMID that don't conflict with * events already running, assign @rmid. */ -static bool intel_cqm_sched_in_event(unsigned int rmid) +static bool intel_cqm_sched_in_event(u32 rmid) { struct perf_event *leader, *event; @@ -598,7 +616,7 @@ static bool intel_cqm_rmid_stabilize(unsigned int *available) static void __intel_cqm_pick_and_rotate(struct perf_event *next) { struct perf_event *rotor; - unsigned int rmid; + u32 rmid; lockdep_assert_held(&cache_mutex); @@ -626,7 +644,7 @@ static void __intel_cqm_pick_and_rotate(struct perf_event *next) static void intel_cqm_sched_out_conflicting_events(struct perf_event *event) { struct perf_event *group, *g; - unsigned int rmid; + u32 rmid; lockdep_assert_held(&cache_mutex); @@ -828,8 +846,8 @@ static void intel_cqm_setup_event(struct perf_event *event, struct perf_event **group) { struct perf_event *iter; - unsigned int rmid; bool conflict = false; + u32 rmid; list_for_each_entry(iter, &cache_groups, hw.cqm_groups_entry) { rmid = iter->hw.cqm_rmid; @@ -860,7 +878,7 @@ static void intel_cqm_setup_event(struct perf_event *event, static void intel_cqm_event_read(struct perf_event *event) { unsigned long flags; - unsigned int rmid; + u32 rmid; u64 val; /* @@ -961,55 +979,48 @@ out: static void intel_cqm_event_start(struct perf_event *event, int mode) { - struct intel_cqm_state *state = this_cpu_ptr(&cqm_state); - unsigned int rmid = event->hw.cqm_rmid; - unsigned long flags; + struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); + u32 rmid = event->hw.cqm_rmid; if (!(event->hw.cqm_state & PERF_HES_STOPPED)) return; event->hw.cqm_state &= ~PERF_HES_STOPPED; - raw_spin_lock_irqsave(&state->lock, flags); - - if (state->cnt++) - WARN_ON_ONCE(state->rmid != rmid); - else + if (state->rmid_usecnt++) { + if (!WARN_ON_ONCE(state->rmid != rmid)) + return; + } else { WARN_ON_ONCE(state->rmid); + } state->rmid = rmid; - wrmsrl(MSR_IA32_PQR_ASSOC, state->rmid); - - raw_spin_unlock_irqrestore(&state->lock, flags); + wrmsr(MSR_IA32_PQR_ASSOC, rmid, state->closid); } static void intel_cqm_event_stop(struct perf_event *event, int mode) { - struct intel_cqm_state *state = this_cpu_ptr(&cqm_state); - unsigned long flags; + struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); if (event->hw.cqm_state & PERF_HES_STOPPED) return; event->hw.cqm_state |= PERF_HES_STOPPED; - raw_spin_lock_irqsave(&state->lock, flags); intel_cqm_event_read(event); - if (!--state->cnt) { + if (!--state->rmid_usecnt) { state->rmid = 0; - wrmsrl(MSR_IA32_PQR_ASSOC, 0); + wrmsr(MSR_IA32_PQR_ASSOC, 0, state->closid); } else { WARN_ON_ONCE(!state->rmid); } - - raw_spin_unlock_irqrestore(&state->lock, flags); } static int intel_cqm_event_add(struct perf_event *event, int mode) { unsigned long flags; - unsigned int rmid; + u32 rmid; raw_spin_lock_irqsave(&cache_lock, flags); @@ -1024,11 +1035,6 @@ static int intel_cqm_event_add(struct perf_event *event, int mode) return 0; } -static void intel_cqm_event_del(struct perf_event *event, int mode) -{ - intel_cqm_event_stop(event, mode); -} - static void intel_cqm_event_destroy(struct perf_event *event) { struct perf_event *group_other = NULL; @@ -1057,7 +1063,7 @@ static void intel_cqm_event_destroy(struct perf_event *event) list_replace(&event->hw.cqm_groups_entry, &group_other->hw.cqm_groups_entry); } else { - unsigned int rmid = event->hw.cqm_rmid; + u32 rmid = event->hw.cqm_rmid; if (__rmid_valid(rmid)) __put_rmid(rmid); @@ -1221,7 +1227,7 @@ static struct pmu intel_cqm_pmu = { .task_ctx_nr = perf_sw_context, .event_init = intel_cqm_event_init, .add = intel_cqm_event_add, - .del = intel_cqm_event_del, + .del = intel_cqm_event_stop, .start = intel_cqm_event_start, .stop = intel_cqm_event_stop, .read = intel_cqm_event_read, @@ -1243,12 +1249,12 @@ static inline void cqm_pick_event_reader(int cpu) static void intel_cqm_cpu_prepare(unsigned int cpu) { - struct intel_cqm_state *state = &per_cpu(cqm_state, cpu); + struct intel_pqr_state *state = &per_cpu(pqr_state, cpu); struct cpuinfo_x86 *c = &cpu_data(cpu); - raw_spin_lock_init(&state->lock); state->rmid = 0; - state->cnt = 0; + state->closid = 0; + state->rmid_usecnt = 0; WARN_ON(c->x86_cache_max_rmid != cqm_max_rmid); WARN_ON(c->x86_cache_occ_scale != cqm_l3_scale); diff --git a/arch/x86/kernel/cpu/perf_event_intel_ds.c b/arch/x86/kernel/cpu/perf_event_intel_ds.c index 7f73b3553e2e..71fc40238843 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_ds.c +++ b/arch/x86/kernel/cpu/perf_event_intel_ds.c @@ -11,7 +11,7 @@ #define BTS_RECORD_SIZE 24 #define BTS_BUFFER_SIZE (PAGE_SIZE << 4) -#define PEBS_BUFFER_SIZE PAGE_SIZE +#define PEBS_BUFFER_SIZE (PAGE_SIZE << 4) #define PEBS_FIXUP_SIZE PAGE_SIZE /* @@ -250,7 +250,7 @@ static int alloc_pebs_buffer(int cpu) { struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; int node = cpu_to_node(cpu); - int max, thresh = 1; /* always use a single PEBS record */ + int max; void *buffer, *ibuffer; if (!x86_pmu.pebs) @@ -280,9 +280,6 @@ static int alloc_pebs_buffer(int cpu) ds->pebs_absolute_maximum = ds->pebs_buffer_base + max * x86_pmu.pebs_record_size; - ds->pebs_interrupt_threshold = ds->pebs_buffer_base + - thresh * x86_pmu.pebs_record_size; - return 0; } @@ -549,6 +546,19 @@ int intel_pmu_drain_bts_buffer(void) return 1; } +static inline void intel_pmu_drain_pebs_buffer(void) +{ + struct pt_regs regs; + + x86_pmu.drain_pebs(®s); +} + +void intel_pmu_pebs_sched_task(struct perf_event_context *ctx, bool sched_in) +{ + if (!sched_in) + intel_pmu_drain_pebs_buffer(); +} + /* * PEBS */ @@ -684,25 +694,66 @@ struct event_constraint *intel_pebs_constraints(struct perf_event *event) return &emptyconstraint; } +static inline bool pebs_is_enabled(struct cpu_hw_events *cpuc) +{ + return (cpuc->pebs_enabled & ((1ULL << MAX_PEBS_EVENTS) - 1)); +} + void intel_pmu_pebs_enable(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; + struct debug_store *ds = cpuc->ds; + bool first_pebs; + u64 threshold; hwc->config &= ~ARCH_PERFMON_EVENTSEL_INT; + first_pebs = !pebs_is_enabled(cpuc); cpuc->pebs_enabled |= 1ULL << hwc->idx; if (event->hw.flags & PERF_X86_EVENT_PEBS_LDLAT) cpuc->pebs_enabled |= 1ULL << (hwc->idx + 32); else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST) cpuc->pebs_enabled |= 1ULL << 63; + + /* + * When the event is constrained enough we can use a larger + * threshold and run the event with less frequent PMI. + */ + if (hwc->flags & PERF_X86_EVENT_FREERUNNING) { + threshold = ds->pebs_absolute_maximum - + x86_pmu.max_pebs_events * x86_pmu.pebs_record_size; + + if (first_pebs) + perf_sched_cb_inc(event->ctx->pmu); + } else { + threshold = ds->pebs_buffer_base + x86_pmu.pebs_record_size; + + /* + * If not all events can use larger buffer, + * roll back to threshold = 1 + */ + if (!first_pebs && + (ds->pebs_interrupt_threshold > threshold)) + perf_sched_cb_dec(event->ctx->pmu); + } + + /* Use auto-reload if possible to save a MSR write in the PMI */ + if (hwc->flags & PERF_X86_EVENT_AUTO_RELOAD) { + ds->pebs_event_reset[hwc->idx] = + (u64)(-hwc->sample_period) & x86_pmu.cntval_mask; + } + + if (first_pebs || ds->pebs_interrupt_threshold > threshold) + ds->pebs_interrupt_threshold = threshold; } void intel_pmu_pebs_disable(struct perf_event *event) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct hw_perf_event *hwc = &event->hw; + struct debug_store *ds = cpuc->ds; cpuc->pebs_enabled &= ~(1ULL << hwc->idx); @@ -711,6 +762,13 @@ void intel_pmu_pebs_disable(struct perf_event *event) else if (event->hw.flags & PERF_X86_EVENT_PEBS_ST) cpuc->pebs_enabled &= ~(1ULL << 63); + if (ds->pebs_interrupt_threshold > + ds->pebs_buffer_base + x86_pmu.pebs_record_size) { + intel_pmu_drain_pebs_buffer(); + if (!pebs_is_enabled(cpuc)) + perf_sched_cb_dec(event->ctx->pmu); + } + if (cpuc->enabled) wrmsrl(MSR_IA32_PEBS_ENABLE, cpuc->pebs_enabled); @@ -846,8 +904,10 @@ static inline u64 intel_hsw_transaction(struct pebs_record_hsw *pebs) return txn; } -static void __intel_pmu_pebs_event(struct perf_event *event, - struct pt_regs *iregs, void *__pebs) +static void setup_pebs_sample_data(struct perf_event *event, + struct pt_regs *iregs, void *__pebs, + struct perf_sample_data *data, + struct pt_regs *regs) { #define PERF_X86_EVENT_PEBS_HSW_PREC \ (PERF_X86_EVENT_PEBS_ST_HSW | \ @@ -859,13 +919,11 @@ static void __intel_pmu_pebs_event(struct perf_event *event, */ struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct pebs_record_hsw *pebs = __pebs; - struct perf_sample_data data; - struct pt_regs regs; u64 sample_type; int fll, fst, dsrc; int fl = event->hw.flags; - if (!intel_pmu_save_and_restart(event)) + if (pebs == NULL) return; sample_type = event->attr.sample_type; @@ -874,15 +932,15 @@ static void __intel_pmu_pebs_event(struct perf_event *event, fll = fl & PERF_X86_EVENT_PEBS_LDLAT; fst = fl & (PERF_X86_EVENT_PEBS_ST | PERF_X86_EVENT_PEBS_HSW_PREC); - perf_sample_data_init(&data, 0, event->hw.last_period); + perf_sample_data_init(data, 0, event->hw.last_period); - data.period = event->hw.last_period; + data->period = event->hw.last_period; /* * Use latency for weight (only avail with PEBS-LL) */ if (fll && (sample_type & PERF_SAMPLE_WEIGHT)) - data.weight = pebs->lat; + data->weight = pebs->lat; /* * data.data_src encodes the data source @@ -895,7 +953,7 @@ static void __intel_pmu_pebs_event(struct perf_event *event, val = precise_datala_hsw(event, pebs->dse); else if (fst) val = precise_store_data(pebs->dse); - data.data_src.val = val; + data->data_src.val = val; } /* @@ -908,61 +966,123 @@ static void __intel_pmu_pebs_event(struct perf_event *event, * PERF_SAMPLE_IP and PERF_SAMPLE_CALLCHAIN to function properly. * A possible PERF_SAMPLE_REGS will have to transfer all regs. */ - regs = *iregs; - regs.flags = pebs->flags; - set_linear_ip(®s, pebs->ip); - regs.bp = pebs->bp; - regs.sp = pebs->sp; + *regs = *iregs; + regs->flags = pebs->flags; + set_linear_ip(regs, pebs->ip); + regs->bp = pebs->bp; + regs->sp = pebs->sp; if (sample_type & PERF_SAMPLE_REGS_INTR) { - regs.ax = pebs->ax; - regs.bx = pebs->bx; - regs.cx = pebs->cx; - regs.dx = pebs->dx; - regs.si = pebs->si; - regs.di = pebs->di; - regs.bp = pebs->bp; - regs.sp = pebs->sp; - - regs.flags = pebs->flags; + regs->ax = pebs->ax; + regs->bx = pebs->bx; + regs->cx = pebs->cx; + regs->dx = pebs->dx; + regs->si = pebs->si; + regs->di = pebs->di; + regs->bp = pebs->bp; + regs->sp = pebs->sp; + + regs->flags = pebs->flags; #ifndef CONFIG_X86_32 - regs.r8 = pebs->r8; - regs.r9 = pebs->r9; - regs.r10 = pebs->r10; - regs.r11 = pebs->r11; - regs.r12 = pebs->r12; - regs.r13 = pebs->r13; - regs.r14 = pebs->r14; - regs.r15 = pebs->r15; + regs->r8 = pebs->r8; + regs->r9 = pebs->r9; + regs->r10 = pebs->r10; + regs->r11 = pebs->r11; + regs->r12 = pebs->r12; + regs->r13 = pebs->r13; + regs->r14 = pebs->r14; + regs->r15 = pebs->r15; #endif } if (event->attr.precise_ip > 1 && x86_pmu.intel_cap.pebs_format >= 2) { - regs.ip = pebs->real_ip; - regs.flags |= PERF_EFLAGS_EXACT; - } else if (event->attr.precise_ip > 1 && intel_pmu_pebs_fixup_ip(®s)) - regs.flags |= PERF_EFLAGS_EXACT; + regs->ip = pebs->real_ip; + regs->flags |= PERF_EFLAGS_EXACT; + } else if (event->attr.precise_ip > 1 && intel_pmu_pebs_fixup_ip(regs)) + regs->flags |= PERF_EFLAGS_EXACT; else - regs.flags &= ~PERF_EFLAGS_EXACT; + regs->flags &= ~PERF_EFLAGS_EXACT; if ((sample_type & PERF_SAMPLE_ADDR) && x86_pmu.intel_cap.pebs_format >= 1) - data.addr = pebs->dla; + data->addr = pebs->dla; if (x86_pmu.intel_cap.pebs_format >= 2) { /* Only set the TSX weight when no memory weight. */ if ((sample_type & PERF_SAMPLE_WEIGHT) && !fll) - data.weight = intel_hsw_weight(pebs); + data->weight = intel_hsw_weight(pebs); if (sample_type & PERF_SAMPLE_TRANSACTION) - data.txn = intel_hsw_transaction(pebs); + data->txn = intel_hsw_transaction(pebs); } if (has_branch_stack(event)) - data.br_stack = &cpuc->lbr_stack; + data->br_stack = &cpuc->lbr_stack; +} + +static inline void * +get_next_pebs_record_by_bit(void *base, void *top, int bit) +{ + struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); + void *at; + u64 pebs_status; + + if (base == NULL) + return NULL; + + for (at = base; at < top; at += x86_pmu.pebs_record_size) { + struct pebs_record_nhm *p = at; + + if (test_bit(bit, (unsigned long *)&p->status)) { + /* PEBS v3 has accurate status bits */ + if (x86_pmu.intel_cap.pebs_format >= 3) + return at; - if (perf_event_overflow(event, &data, ®s)) + if (p->status == (1 << bit)) + return at; + + /* clear non-PEBS bit and re-check */ + pebs_status = p->status & cpuc->pebs_enabled; + pebs_status &= (1ULL << MAX_PEBS_EVENTS) - 1; + if (pebs_status == (1 << bit)) + return at; + } + } + return NULL; +} + +static void __intel_pmu_pebs_event(struct perf_event *event, + struct pt_regs *iregs, + void *base, void *top, + int bit, int count) +{ + struct perf_sample_data data; + struct pt_regs regs; + void *at = get_next_pebs_record_by_bit(base, top, bit); + + if (!intel_pmu_save_and_restart(event) && + !(event->hw.flags & PERF_X86_EVENT_AUTO_RELOAD)) + return; + + while (count > 1) { + setup_pebs_sample_data(event, iregs, at, &data, ®s); + perf_event_output(event, &data, ®s); + at += x86_pmu.pebs_record_size; + at = get_next_pebs_record_by_bit(at, top, bit); + count--; + } + + setup_pebs_sample_data(event, iregs, at, &data, ®s); + + /* + * All but the last records are processed. + * The last one is left to be able to call the overflow handler. + */ + if (perf_event_overflow(event, &data, ®s)) { x86_pmu_stop(event, 0); + return; + } + } static void intel_pmu_drain_pebs_core(struct pt_regs *iregs) @@ -992,72 +1112,99 @@ static void intel_pmu_drain_pebs_core(struct pt_regs *iregs) if (!event->attr.precise_ip) return; - n = top - at; + n = (top - at) / x86_pmu.pebs_record_size; if (n <= 0) return; - /* - * Should not happen, we program the threshold at 1 and do not - * set a reset value. - */ - WARN_ONCE(n > 1, "bad leftover pebs %d\n", n); - at += n - 1; - - __intel_pmu_pebs_event(event, iregs, at); + __intel_pmu_pebs_event(event, iregs, at, top, 0, n); } static void intel_pmu_drain_pebs_nhm(struct pt_regs *iregs) { struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct debug_store *ds = cpuc->ds; - struct perf_event *event = NULL; - void *at, *top; - u64 status = 0; - int bit; + struct perf_event *event; + void *base, *at, *top; + short counts[MAX_PEBS_EVENTS] = {}; + short error[MAX_PEBS_EVENTS] = {}; + int bit, i; if (!x86_pmu.pebs_active) return; - at = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base; + base = (struct pebs_record_nhm *)(unsigned long)ds->pebs_buffer_base; top = (struct pebs_record_nhm *)(unsigned long)ds->pebs_index; ds->pebs_index = ds->pebs_buffer_base; - if (unlikely(at > top)) + if (unlikely(base >= top)) return; - /* - * Should not happen, we program the threshold at 1 and do not - * set a reset value. - */ - WARN_ONCE(top - at > x86_pmu.max_pebs_events * x86_pmu.pebs_record_size, - "Unexpected number of pebs records %ld\n", - (long)(top - at) / x86_pmu.pebs_record_size); - - for (; at < top; at += x86_pmu.pebs_record_size) { + for (at = base; at < top; at += x86_pmu.pebs_record_size) { struct pebs_record_nhm *p = at; - for_each_set_bit(bit, (unsigned long *)&p->status, - x86_pmu.max_pebs_events) { - event = cpuc->events[bit]; - if (!test_bit(bit, cpuc->active_mask)) - continue; - - WARN_ON_ONCE(!event); + /* PEBS v3 has accurate status bits */ + if (x86_pmu.intel_cap.pebs_format >= 3) { + for_each_set_bit(bit, (unsigned long *)&p->status, + MAX_PEBS_EVENTS) + counts[bit]++; - if (!event->attr.precise_ip) - continue; + continue; + } - if (__test_and_set_bit(bit, (unsigned long *)&status)) + bit = find_first_bit((unsigned long *)&p->status, + x86_pmu.max_pebs_events); + if (bit >= x86_pmu.max_pebs_events) + continue; + if (!test_bit(bit, cpuc->active_mask)) + continue; + /* + * The PEBS hardware does not deal well with the situation + * when events happen near to each other and multiple bits + * are set. But it should happen rarely. + * + * If these events include one PEBS and multiple non-PEBS + * events, it doesn't impact PEBS record. The record will + * be handled normally. (slow path) + * + * If these events include two or more PEBS events, the + * records for the events can be collapsed into a single + * one, and it's not possible to reconstruct all events + * that caused the PEBS record. It's called collision. + * If collision happened, the record will be dropped. + * + */ + if (p->status != (1 << bit)) { + u64 pebs_status; + + /* slow path */ + pebs_status = p->status & cpuc->pebs_enabled; + pebs_status &= (1ULL << MAX_PEBS_EVENTS) - 1; + if (pebs_status != (1 << bit)) { + for_each_set_bit(i, (unsigned long *)&pebs_status, + MAX_PEBS_EVENTS) + error[i]++; continue; - - break; + } } + counts[bit]++; + } - if (!event || bit >= x86_pmu.max_pebs_events) + for (bit = 0; bit < x86_pmu.max_pebs_events; bit++) { + if ((counts[bit] == 0) && (error[bit] == 0)) continue; + event = cpuc->events[bit]; + WARN_ON_ONCE(!event); + WARN_ON_ONCE(!event->attr.precise_ip); - __intel_pmu_pebs_event(event, iregs, at); + /* log dropped samples number */ + if (error[bit]) + perf_log_lost_samples(event, error[bit]); + + if (counts[bit]) { + __intel_pmu_pebs_event(event, iregs, base, + top, bit, counts[bit]); + } } } diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index 94e5b506caa6..452a7bd2dedb 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -96,6 +96,7 @@ enum { X86_BR_NO_TX = 1 << 14,/* not in transaction */ X86_BR_ZERO_CALL = 1 << 15,/* zero length call */ X86_BR_CALL_STACK = 1 << 16,/* call stack */ + X86_BR_IND_JMP = 1 << 17,/* indirect jump */ }; #define X86_BR_PLM (X86_BR_USER | X86_BR_KERNEL) @@ -113,6 +114,7 @@ enum { X86_BR_IRQ |\ X86_BR_ABORT |\ X86_BR_IND_CALL |\ + X86_BR_IND_JMP |\ X86_BR_ZERO_CALL) #define X86_BR_ALL (X86_BR_PLM | X86_BR_ANY) @@ -262,9 +264,6 @@ void intel_pmu_lbr_sched_task(struct perf_event_context *ctx, bool sched_in) struct cpu_hw_events *cpuc = this_cpu_ptr(&cpu_hw_events); struct x86_perf_task_context *task_ctx; - if (!x86_pmu.lbr_nr) - return; - /* * If LBR callstack feature is enabled and the stack was saved when * the task was scheduled out, restore the stack. Otherwise flush @@ -523,6 +522,9 @@ static int intel_pmu_setup_sw_lbr_filter(struct perf_event *event) X86_BR_CALL_STACK; } + if (br_type & PERF_SAMPLE_BRANCH_IND_JUMP) + mask |= X86_BR_IND_JMP; + /* * stash actual user request into reg, it may * be used by fixup code for some CPU @@ -736,7 +738,7 @@ static int branch_type(unsigned long from, unsigned long to, int abort) break; case 4: case 5: - ret = X86_BR_JMP; + ret = X86_BR_IND_JMP; break; } break; @@ -844,6 +846,7 @@ static const int nhm_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { */ [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL | LBR_IND_JMP, [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC, + [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP, }; static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { @@ -856,6 +859,7 @@ static const int snb_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { | LBR_FAR, [PERF_SAMPLE_BRANCH_IND_CALL_SHIFT] = LBR_IND_CALL, [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC, + [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP, }; static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { @@ -870,6 +874,7 @@ static const int hsw_lbr_sel_map[PERF_SAMPLE_BRANCH_MAX_SHIFT] = { [PERF_SAMPLE_BRANCH_COND_SHIFT] = LBR_JCC, [PERF_SAMPLE_BRANCH_CALL_STACK_SHIFT] = LBR_REL_CALL | LBR_IND_CALL | LBR_RETURN | LBR_CALL_STACK, + [PERF_SAMPLE_BRANCH_IND_JUMP_SHIFT] = LBR_IND_JMP, }; /* core */ diff --git a/arch/x86/kernel/cpu/perf_event_intel_pt.c b/arch/x86/kernel/cpu/perf_event_intel_pt.c index 123ff1bb2f60..159887c3a89d 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_pt.c +++ b/arch/x86/kernel/cpu/perf_event_intel_pt.c @@ -187,15 +187,6 @@ static bool pt_event_valid(struct perf_event *event) * These all are cpu affine and operate on a local PT */ -static bool pt_is_running(void) -{ - u64 ctl; - - rdmsrl(MSR_IA32_RTIT_CTL, ctl); - - return !!(ctl & RTIT_CTL_TRACEEN); -} - static void pt_config(struct perf_event *event) { u64 reg; @@ -609,7 +600,12 @@ static unsigned int pt_topa_next_entry(struct pt_buffer *buf, unsigned int pg) * @handle: Current output handle. * * Place INT and STOP marks to prevent overwriting old data that the consumer - * hasn't yet collected. + * hasn't yet collected and waking up the consumer after a certain fraction of + * the buffer has filled up. Only needed and sensible for non-snapshot counters. + * + * This obviously relies on buf::head to figure out buffer markers, so it has + * to be called after pt_buffer_reset_offsets() and before the hardware tracing + * is enabled. */ static int pt_buffer_reset_markers(struct pt_buffer *buf, struct perf_output_handle *handle) @@ -618,9 +614,6 @@ static int pt_buffer_reset_markers(struct pt_buffer *buf, unsigned long head = local64_read(&buf->head); unsigned long idx, npages, wakeup; - if (buf->snapshot) - return 0; - /* can't stop in the middle of an output region */ if (buf->output_off + handle->size + 1 < sizes(TOPA_ENTRY(buf->cur, buf->cur_idx)->size)) @@ -674,7 +667,7 @@ static void pt_buffer_setup_topa_index(struct pt_buffer *buf) struct topa *cur = buf->first, *prev = buf->last; struct topa_entry *te_cur = TOPA_ENTRY(cur, 0), *te_prev = TOPA_ENTRY(prev, prev->last - 1); - int pg = 0, idx = 0, ntopa = 0; + int pg = 0, idx = 0; while (pg < buf->nr_pages) { int tidx; @@ -689,9 +682,9 @@ static void pt_buffer_setup_topa_index(struct pt_buffer *buf) /* advance to next topa table */ idx = 0; cur = list_entry(cur->list.next, struct topa, list); - ntopa++; - } else + } else { idx++; + } te_cur = TOPA_ENTRY(cur, idx); } @@ -703,7 +696,14 @@ static void pt_buffer_setup_topa_index(struct pt_buffer *buf) * @head: Write pointer (aux_head) from AUX buffer. * * Find the ToPA table and entry corresponding to given @head and set buffer's - * "current" pointers accordingly. + * "current" pointers accordingly. This is done after we have obtained the + * current aux_head position from a successful call to perf_aux_output_begin() + * to make sure the hardware is writing to the right place. + * + * This function modifies buf::{cur,cur_idx,output_off} that will be programmed + * into PT msrs when the tracing is enabled and buf::head and buf::data_size, + * which are used to determine INT and STOP markers' locations by a subsequent + * call to pt_buffer_reset_markers(). */ static void pt_buffer_reset_offsets(struct pt_buffer *buf, unsigned long head) { @@ -901,6 +901,7 @@ void intel_pt_interrupt(void) } pt_buffer_reset_offsets(buf, pt->handle.head); + /* snapshot counters don't use PMI, so it's safe */ ret = pt_buffer_reset_markers(buf, &pt->handle); if (ret) { perf_aux_output_end(&pt->handle, 0, true); @@ -923,7 +924,7 @@ static void pt_event_start(struct perf_event *event, int mode) struct pt *pt = this_cpu_ptr(&pt_ctx); struct pt_buffer *buf = perf_get_aux(&pt->handle); - if (pt_is_running() || !buf || pt_buffer_is_full(buf, pt)) { + if (!buf || pt_buffer_is_full(buf, pt)) { event->hw.state = PERF_HES_STOPPED; return; } @@ -954,7 +955,6 @@ static void pt_event_stop(struct perf_event *event, int mode) event->hw.state = PERF_HES_STOPPED; if (mode & PERF_EF_UPDATE) { - struct pt *pt = this_cpu_ptr(&pt_ctx); struct pt_buffer *buf = perf_get_aux(&pt->handle); if (!buf) diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.c b/arch/x86/kernel/cpu/perf_event_intel_uncore.c index 90b7c501c95b..7c1de1610178 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.c @@ -922,6 +922,9 @@ static int __init uncore_pci_init(void) case 69: /* Haswell Celeron */ ret = hsw_uncore_pci_init(); break; + case 61: /* Broadwell */ + ret = bdw_uncore_pci_init(); + break; default: return 0; } diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore.h b/arch/x86/kernel/cpu/perf_event_intel_uncore.h index ceac8f5dc018..0f77f0a196e4 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore.h +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore.h @@ -325,6 +325,7 @@ extern struct event_constraint uncore_constraint_empty; int snb_uncore_pci_init(void); int ivb_uncore_pci_init(void); int hsw_uncore_pci_init(void); +int bdw_uncore_pci_init(void); void snb_uncore_cpu_init(void); void nhm_uncore_cpu_init(void); diff --git a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c index 4562e9e22c60..b005a78c7012 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c +++ b/arch/x86/kernel/cpu/perf_event_intel_uncore_snb.c @@ -7,6 +7,7 @@ #define PCI_DEVICE_ID_INTEL_IVB_E3_IMC 0x0150 #define PCI_DEVICE_ID_INTEL_HSW_IMC 0x0c00 #define PCI_DEVICE_ID_INTEL_HSW_U_IMC 0x0a04 +#define PCI_DEVICE_ID_INTEL_BDW_IMC 0x1604 /* SNB event control */ #define SNB_UNC_CTL_EV_SEL_MASK 0x000000ff @@ -486,6 +487,14 @@ static const struct pci_device_id hsw_uncore_pci_ids[] = { { /* end: all zeroes */ }, }; +static const struct pci_device_id bdw_uncore_pci_ids[] = { + { /* IMC */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_INTEL_BDW_IMC), + .driver_data = UNCORE_PCI_DEV_DATA(SNB_PCI_UNCORE_IMC, 0), + }, + { /* end: all zeroes */ }, +}; + static struct pci_driver snb_uncore_pci_driver = { .name = "snb_uncore", .id_table = snb_uncore_pci_ids, @@ -501,6 +510,11 @@ static struct pci_driver hsw_uncore_pci_driver = { .id_table = hsw_uncore_pci_ids, }; +static struct pci_driver bdw_uncore_pci_driver = { + .name = "bdw_uncore", + .id_table = bdw_uncore_pci_ids, +}; + struct imc_uncore_pci_dev { __u32 pci_id; struct pci_driver *driver; @@ -514,6 +528,7 @@ static const struct imc_uncore_pci_dev desktop_imc_pci_ids[] = { IMC_DEV(IVB_E3_IMC, &ivb_uncore_pci_driver), /* Xeon E3-1200 v2/3rd Gen Core processor */ IMC_DEV(HSW_IMC, &hsw_uncore_pci_driver), /* 4th Gen Core Processor */ IMC_DEV(HSW_U_IMC, &hsw_uncore_pci_driver), /* 4th Gen Core ULT Mobile Processor */ + IMC_DEV(BDW_IMC, &bdw_uncore_pci_driver), /* 5th Gen Core U */ { /* end marker */ } }; @@ -561,6 +576,11 @@ int hsw_uncore_pci_init(void) return imc_uncore_pci_init(); } +int bdw_uncore_pci_init(void) +{ + return imc_uncore_pci_init(); +} + /* end of Sandy Bridge uncore support */ /* Nehalem uncore support */ diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 9435620062df..1681504e44a4 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -584,6 +584,39 @@ static void kvm_kick_cpu(int cpu) kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid); } + +#ifdef CONFIG_QUEUED_SPINLOCKS + +#include <asm/qspinlock.h> + +static void kvm_wait(u8 *ptr, u8 val) +{ + unsigned long flags; + + if (in_nmi()) + return; + + local_irq_save(flags); + + if (READ_ONCE(*ptr) != val) + goto out; + + /* + * halt until it's our turn and kicked. Note that we do safe halt + * for irq enabled case to avoid hang when lock info is overwritten + * in irq spinlock slowpath and no spurious interrupt occur to save us. + */ + if (arch_irqs_disabled_flags(flags)) + halt(); + else + safe_halt(); + +out: + local_irq_restore(flags); +} + +#else /* !CONFIG_QUEUED_SPINLOCKS */ + enum kvm_contention_stat { TAKEN_SLOW, TAKEN_SLOW_PICKUP, @@ -817,6 +850,8 @@ static void kvm_unlock_kick(struct arch_spinlock *lock, __ticket_t ticket) } } +#endif /* !CONFIG_QUEUED_SPINLOCKS */ + /* * Setup pv_lock_ops to exploit KVM_FEATURE_PV_UNHALT if present. */ @@ -828,8 +863,16 @@ void __init kvm_spinlock_init(void) if (!kvm_para_has_feature(KVM_FEATURE_PV_UNHALT)) return; +#ifdef CONFIG_QUEUED_SPINLOCKS + __pv_init_lock_hash(); + pv_lock_ops.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath; + pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock); + pv_lock_ops.wait = kvm_wait; + pv_lock_ops.kick = kvm_kick_cpu; +#else /* !CONFIG_QUEUED_SPINLOCKS */ pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(kvm_lock_spinning); pv_lock_ops.unlock_kick = kvm_unlock_kick; +#endif } static __init int kvm_spinlock_init_jump(void) diff --git a/arch/x86/kernel/paravirt-spinlocks.c b/arch/x86/kernel/paravirt-spinlocks.c index bbb6c7316341..33ee3e0efd65 100644 --- a/arch/x86/kernel/paravirt-spinlocks.c +++ b/arch/x86/kernel/paravirt-spinlocks.c @@ -8,11 +8,33 @@ #include <asm/paravirt.h> +#ifdef CONFIG_QUEUED_SPINLOCKS +__visible void __native_queued_spin_unlock(struct qspinlock *lock) +{ + native_queued_spin_unlock(lock); +} + +PV_CALLEE_SAVE_REGS_THUNK(__native_queued_spin_unlock); + +bool pv_is_native_spin_unlock(void) +{ + return pv_lock_ops.queued_spin_unlock.func == + __raw_callee_save___native_queued_spin_unlock; +} +#endif + struct pv_lock_ops pv_lock_ops = { #ifdef CONFIG_SMP +#ifdef CONFIG_QUEUED_SPINLOCKS + .queued_spin_lock_slowpath = native_queued_spin_lock_slowpath, + .queued_spin_unlock = PV_CALLEE_SAVE(__native_queued_spin_unlock), + .wait = paravirt_nop, + .kick = paravirt_nop, +#else /* !CONFIG_QUEUED_SPINLOCKS */ .lock_spinning = __PV_IS_CALLEE_SAVE(paravirt_nop), .unlock_kick = paravirt_nop, -#endif +#endif /* !CONFIG_QUEUED_SPINLOCKS */ +#endif /* SMP */ }; EXPORT_SYMBOL(pv_lock_ops); diff --git a/arch/x86/kernel/paravirt_patch_32.c b/arch/x86/kernel/paravirt_patch_32.c index d9f32e6d6ab6..e1b013696dde 100644 --- a/arch/x86/kernel/paravirt_patch_32.c +++ b/arch/x86/kernel/paravirt_patch_32.c @@ -12,6 +12,10 @@ DEF_NATIVE(pv_mmu_ops, read_cr3, "mov %cr3, %eax"); DEF_NATIVE(pv_cpu_ops, clts, "clts"); DEF_NATIVE(pv_cpu_ops, read_tsc, "rdtsc"); +#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS) +DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%eax)"); +#endif + unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len) { /* arg in %eax, return in %eax */ @@ -24,6 +28,8 @@ unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len) return 0; } +extern bool pv_is_native_spin_unlock(void); + unsigned native_patch(u8 type, u16 clobbers, void *ibuf, unsigned long addr, unsigned len) { @@ -47,14 +53,22 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_mmu_ops, write_cr3); PATCH_SITE(pv_cpu_ops, clts); PATCH_SITE(pv_cpu_ops, read_tsc); - - patch_site: - ret = paravirt_patch_insns(ibuf, len, start, end); - break; +#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS) + case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock): + if (pv_is_native_spin_unlock()) { + start = start_pv_lock_ops_queued_spin_unlock; + end = end_pv_lock_ops_queued_spin_unlock; + goto patch_site; + } +#endif default: ret = paravirt_patch_default(type, clobbers, ibuf, addr, len); break; + +patch_site: + ret = paravirt_patch_insns(ibuf, len, start, end); + break; } #undef PATCH_SITE return ret; diff --git a/arch/x86/kernel/paravirt_patch_64.c b/arch/x86/kernel/paravirt_patch_64.c index a1da6737ba5b..a1fa86782186 100644 --- a/arch/x86/kernel/paravirt_patch_64.c +++ b/arch/x86/kernel/paravirt_patch_64.c @@ -21,6 +21,10 @@ DEF_NATIVE(pv_cpu_ops, swapgs, "swapgs"); DEF_NATIVE(, mov32, "mov %edi, %eax"); DEF_NATIVE(, mov64, "mov %rdi, %rax"); +#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS) +DEF_NATIVE(pv_lock_ops, queued_spin_unlock, "movb $0, (%rdi)"); +#endif + unsigned paravirt_patch_ident_32(void *insnbuf, unsigned len) { return paravirt_patch_insns(insnbuf, len, @@ -33,6 +37,8 @@ unsigned paravirt_patch_ident_64(void *insnbuf, unsigned len) start__mov64, end__mov64); } +extern bool pv_is_native_spin_unlock(void); + unsigned native_patch(u8 type, u16 clobbers, void *ibuf, unsigned long addr, unsigned len) { @@ -59,14 +65,22 @@ unsigned native_patch(u8 type, u16 clobbers, void *ibuf, PATCH_SITE(pv_cpu_ops, clts); PATCH_SITE(pv_mmu_ops, flush_tlb_single); PATCH_SITE(pv_cpu_ops, wbinvd); - - patch_site: - ret = paravirt_patch_insns(ibuf, len, start, end); - break; +#if defined(CONFIG_PARAVIRT_SPINLOCKS) && defined(CONFIG_QUEUED_SPINLOCKS) + case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock): + if (pv_is_native_spin_unlock()) { + start = start_pv_lock_ops_queued_spin_unlock; + end = end_pv_lock_ops_queued_spin_unlock; + goto patch_site; + } +#endif default: ret = paravirt_patch_default(type, clobbers, ibuf, addr, len); break; + +patch_site: + ret = paravirt_patch_insns(ibuf, len, start, end); + break; } #undef PATCH_SITE return ret; diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h index 7e8a1a650435..b9531d343134 100644 --- a/arch/x86/um/asm/barrier.h +++ b/arch/x86/um/asm/barrier.h @@ -39,7 +39,8 @@ #define smp_mb() barrier() #define smp_rmb() barrier() #define smp_wmb() barrier() -#define set_mb(var, value) do { var = value; barrier(); } while (0) + +#define smp_store_mb(var, value) do { WRITE_ONCE(var, value); barrier(); } while (0) #define read_barrier_depends() do { } while (0) #define smp_read_barrier_depends() do { } while (0) diff --git a/arch/x86/xen/spinlock.c b/arch/x86/xen/spinlock.c index 956374c1edbc..9e2ba5c6e1dd 100644 --- a/arch/x86/xen/spinlock.c +++ b/arch/x86/xen/spinlock.c @@ -17,6 +17,56 @@ #include "xen-ops.h" #include "debugfs.h" +static DEFINE_PER_CPU(int, lock_kicker_irq) = -1; +static DEFINE_PER_CPU(char *, irq_name); +static bool xen_pvspin = true; + +#ifdef CONFIG_QUEUED_SPINLOCKS + +#include <asm/qspinlock.h> + +static void xen_qlock_kick(int cpu) +{ + xen_send_IPI_one(cpu, XEN_SPIN_UNLOCK_VECTOR); +} + +/* + * Halt the current CPU & release it back to the host + */ +static void xen_qlock_wait(u8 *byte, u8 val) +{ + int irq = __this_cpu_read(lock_kicker_irq); + + /* If kicker interrupts not initialized yet, just spin */ + if (irq == -1) + return; + + /* clear pending */ + xen_clear_irq_pending(irq); + barrier(); + + /* + * We check the byte value after clearing pending IRQ to make sure + * that we won't miss a wakeup event because of the clearing. + * + * The sync_clear_bit() call in xen_clear_irq_pending() is atomic. + * So it is effectively a memory barrier for x86. + */ + if (READ_ONCE(*byte) != val) + return; + + /* + * If an interrupt happens here, it will leave the wakeup irq + * pending, which will cause xen_poll_irq() to return + * immediately. + */ + + /* Block until irq becomes pending (or perhaps a spurious wakeup) */ + xen_poll_irq(irq); +} + +#else /* CONFIG_QUEUED_SPINLOCKS */ + enum xen_contention_stat { TAKEN_SLOW, TAKEN_SLOW_PICKUP, @@ -100,12 +150,9 @@ struct xen_lock_waiting { __ticket_t want; }; -static DEFINE_PER_CPU(int, lock_kicker_irq) = -1; -static DEFINE_PER_CPU(char *, irq_name); static DEFINE_PER_CPU(struct xen_lock_waiting, lock_waiting); static cpumask_t waiting_cpus; -static bool xen_pvspin = true; __visible void xen_lock_spinning(struct arch_spinlock *lock, __ticket_t want) { int irq = __this_cpu_read(lock_kicker_irq); @@ -217,6 +264,7 @@ static void xen_unlock_kick(struct arch_spinlock *lock, __ticket_t next) } } } +#endif /* CONFIG_QUEUED_SPINLOCKS */ static irqreturn_t dummy_handler(int irq, void *dev_id) { @@ -280,8 +328,16 @@ void __init xen_init_spinlocks(void) return; } printk(KERN_DEBUG "xen: PV spinlocks enabled\n"); +#ifdef CONFIG_QUEUED_SPINLOCKS + __pv_init_lock_hash(); + pv_lock_ops.queued_spin_lock_slowpath = __pv_queued_spin_lock_slowpath; + pv_lock_ops.queued_spin_unlock = PV_CALLEE_SAVE(__pv_queued_spin_unlock); + pv_lock_ops.wait = xen_qlock_wait; + pv_lock_ops.kick = xen_qlock_kick; +#else pv_lock_ops.lock_spinning = PV_CALLEE_SAVE(xen_lock_spinning); pv_lock_ops.unlock_kick = xen_unlock_kick; +#endif } /* @@ -310,7 +366,7 @@ static __init int xen_parse_nopvspin(char *arg) } early_param("xen_nopvspin", xen_parse_nopvspin); -#ifdef CONFIG_XEN_DEBUG_FS +#if defined(CONFIG_XEN_DEBUG_FS) && !defined(CONFIG_QUEUED_SPINLOCKS) static struct dentry *d_spin_debug; |