From 3ca0ff571b092ee4d807f1168caa428d95b0173b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 23 Aug 2016 13:36:04 +0200 Subject: locking/mutex: Rework mutex::owner The current mutex implementation has an atomic lock word and a non-atomic owner field. This disparity leads to a number of issues with the current mutex code as it means that we can have a locked mutex without an explicit owner (because the owner field has not been set, or already cleared). This leads to a number of weird corner cases, esp. between the optimistic spinning and debug code. Where the optimistic spinning code needs the owner field updated inside the lock region, the debug code is more relaxed because the whole lock is serialized by the wait_lock. Also, the spinning code itself has a few corner cases where we need to deal with a held lock without an owner field. Furthermore, it becomes even more of a problem when trying to fix starvation cases in the current code. We end up stacking special case on special case. To solve this rework the basic mutex implementation to be a single atomic word that contains the owner and uses the low bits for extra state. This matches how PI futexes and rt_mutex already work. By having the owner an integral part of the lock state a lot of the problems dissapear and we get a better option to deal with starvation cases, direct owner handoff. Changing the basic mutex does however invalidate all the arch specific mutex code; this patch leaves that unused in-place, a later patch will remove that. Tested-by: Jason Low Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Will Deacon Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/locking/mutex-debug.c | 13 -- kernel/locking/mutex-debug.h | 10 -- kernel/locking/mutex.c | 371 ++++++++++++++++++------------------------- kernel/locking/mutex.h | 26 --- kernel/sched/core.c | 2 +- 5 files changed, 157 insertions(+), 265 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex-debug.c b/kernel/locking/mutex-debug.c index 9c951fade415..9aa713629387 100644 --- a/kernel/locking/mutex-debug.c +++ b/kernel/locking/mutex-debug.c @@ -73,21 +73,8 @@ void debug_mutex_unlock(struct mutex *lock) { if (likely(debug_locks)) { DEBUG_LOCKS_WARN_ON(lock->magic != lock); - - if (!lock->owner) - DEBUG_LOCKS_WARN_ON(!lock->owner); - else - DEBUG_LOCKS_WARN_ON(lock->owner != current); - DEBUG_LOCKS_WARN_ON(!lock->wait_list.prev && !lock->wait_list.next); } - - /* - * __mutex_slowpath_needs_to_unlock() is explicitly 0 for debug - * mutexes so that we can do it here after we've verified state. - */ - mutex_clear_owner(lock); - atomic_set(&lock->count, 1); } void debug_mutex_init(struct mutex *lock, const char *name, diff --git a/kernel/locking/mutex-debug.h b/kernel/locking/mutex-debug.h index 57a871ae3c81..a459faa48987 100644 --- a/kernel/locking/mutex-debug.h +++ b/kernel/locking/mutex-debug.h @@ -27,16 +27,6 @@ extern void debug_mutex_unlock(struct mutex *lock); extern void debug_mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key); -static inline void mutex_set_owner(struct mutex *lock) -{ - WRITE_ONCE(lock->owner, current); -} - -static inline void mutex_clear_owner(struct mutex *lock) -{ - WRITE_ONCE(lock->owner, NULL); -} - #define spin_lock_mutex(lock, flags) \ do { \ struct mutex *l = container_of(lock, struct mutex, wait_lock); \ diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index a70b90db3909..de1ce0bae0d5 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -27,41 +27,113 @@ #include #include -/* - * In the DEBUG case we are using the "NULL fastpath" for mutexes, - * which forces all calls into the slowpath: - */ #ifdef CONFIG_DEBUG_MUTEXES # include "mutex-debug.h" -# include -/* - * Must be 0 for the debug case so we do not do the unlock outside of the - * wait_lock region. debug_mutex_unlock() will do the actual unlock in this - * case. - */ -# undef __mutex_slowpath_needs_to_unlock -# define __mutex_slowpath_needs_to_unlock() 0 #else # include "mutex.h" -# include #endif void __mutex_init(struct mutex *lock, const char *name, struct lock_class_key *key) { - atomic_set(&lock->count, 1); + atomic_long_set(&lock->owner, 0); spin_lock_init(&lock->wait_lock); INIT_LIST_HEAD(&lock->wait_list); - mutex_clear_owner(lock); #ifdef CONFIG_MUTEX_SPIN_ON_OWNER osq_lock_init(&lock->osq); #endif debug_mutex_init(lock, name, key); } - EXPORT_SYMBOL(__mutex_init); +/* + * @owner: contains: 'struct task_struct *' to the current lock owner, + * NULL means not owned. Since task_struct pointers are aligned at + * ARCH_MIN_TASKALIGN (which is at least sizeof(void *)), we have low + * bits to store extra state. + * + * Bit0 indicates a non-empty waiter list; unlock must issue a wakeup. + */ +#define MUTEX_FLAG_WAITERS 0x01 + +#define MUTEX_FLAGS 0x03 + +static inline struct task_struct *__owner_task(unsigned long owner) +{ + return (struct task_struct *)(owner & ~MUTEX_FLAGS); +} + +static inline unsigned long __owner_flags(unsigned long owner) +{ + return owner & MUTEX_FLAGS; +} + +/* + * Actual trylock that will work on any unlocked state. + */ +static inline bool __mutex_trylock(struct mutex *lock) +{ + unsigned long owner, curr = (unsigned long)current; + + owner = atomic_long_read(&lock->owner); + for (;;) { /* must loop, can race against a flag */ + unsigned long old; + + if (__owner_task(owner)) + return false; + + old = atomic_long_cmpxchg_acquire(&lock->owner, owner, + curr | __owner_flags(owner)); + if (old == owner) + return true; + + owner = old; + } +} + +#ifndef CONFIG_DEBUG_LOCK_ALLOC +/* + * Lockdep annotations are contained to the slow paths for simplicity. + * There is nothing that would stop spreading the lockdep annotations outwards + * except more code. + */ + +/* + * Optimistic trylock that only works in the uncontended case. Make sure to + * follow with a __mutex_trylock() before failing. + */ +static __always_inline bool __mutex_trylock_fast(struct mutex *lock) +{ + unsigned long curr = (unsigned long)current; + + if (!atomic_long_cmpxchg_acquire(&lock->owner, 0UL, curr)) + return true; + + return false; +} + +static __always_inline bool __mutex_unlock_fast(struct mutex *lock) +{ + unsigned long curr = (unsigned long)current; + + if (atomic_long_cmpxchg_release(&lock->owner, curr, 0UL) == curr) + return true; + + return false; +} +#endif + +static inline void __mutex_set_flag(struct mutex *lock, unsigned long flag) +{ + atomic_long_or(flag, &lock->owner); +} + +static inline void __mutex_clear_flag(struct mutex *lock, unsigned long flag) +{ + atomic_long_andnot(flag, &lock->owner); +} + #ifndef CONFIG_DEBUG_LOCK_ALLOC /* * We split the mutex lock/unlock logic into separate fastpath and @@ -69,7 +141,7 @@ EXPORT_SYMBOL(__mutex_init); * We also put the fastpath first in the kernel image, to make sure the * branch is predicted by the CPU as default-untaken. */ -__visible void __sched __mutex_lock_slowpath(atomic_t *lock_count); +static void __sched __mutex_lock_slowpath(struct mutex *lock); /** * mutex_lock - acquire the mutex @@ -95,14 +167,10 @@ __visible void __sched __mutex_lock_slowpath(atomic_t *lock_count); void __sched mutex_lock(struct mutex *lock) { might_sleep(); - /* - * The locking fastpath is the 1->0 transition from - * 'unlocked' into 'locked' state. - */ - __mutex_fastpath_lock(&lock->count, __mutex_lock_slowpath); - mutex_set_owner(lock); -} + if (!__mutex_trylock_fast(lock)) + __mutex_lock_slowpath(lock); +} EXPORT_SYMBOL(mutex_lock); #endif @@ -149,9 +217,6 @@ static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww, /* * After acquiring lock with fastpath or when we lost out in contested * slowpath, set ctx and wake up any waiters so they can recheck. - * - * This function is never called when CONFIG_DEBUG_LOCK_ALLOC is set, - * as the fastpath and opportunistic spinning are disabled in that case. */ static __always_inline void ww_mutex_set_context_fastpath(struct ww_mutex *lock, @@ -176,7 +241,7 @@ ww_mutex_set_context_fastpath(struct ww_mutex *lock, /* * Check if lock is contended, if not there is nobody to wake up */ - if (likely(atomic_read(&lock->base.count) == 0)) + if (likely(!(atomic_long_read(&lock->base.owner) & MUTEX_FLAG_WAITERS))) return; /* @@ -227,7 +292,7 @@ bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) bool ret = true; rcu_read_lock(); - while (lock->owner == owner) { + while (__mutex_owner(lock) == owner) { /* * Ensure we emit the owner->on_cpu, dereference _after_ * checking lock->owner still matches owner. If that fails, @@ -260,26 +325,19 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock) return 0; rcu_read_lock(); - owner = READ_ONCE(lock->owner); + owner = __mutex_owner(lock); if (owner) retval = owner->on_cpu; rcu_read_unlock(); + /* - * if lock->owner is not set, the mutex owner may have just acquired - * it and not set the owner yet or the mutex has been released. + * If lock->owner is not set, the mutex has been released. Return true + * such that we'll trylock in the spin path, which is a faster option + * than the blocking slow path. */ return retval; } -/* - * Atomically try to take the lock when it is available - */ -static inline bool mutex_try_to_acquire(struct mutex *lock) -{ - return !mutex_is_locked(lock) && - (atomic_cmpxchg_acquire(&lock->count, 1, 0) == 1); -} - /* * Optimistic spinning. * @@ -288,13 +346,6 @@ static inline bool mutex_try_to_acquire(struct mutex *lock) * need to reschedule. The rationale is that if the lock owner is * running, it is likely to release the lock soon. * - * Since this needs the lock owner, and this mutex implementation - * doesn't track the owner atomically in the lock field, we need to - * track it non-atomically. - * - * We can't do this for DEBUG_MUTEXES because that relies on wait_lock - * to serialize everything. - * * The mutex spinners are queued up using MCS lock so that only one * spinner can compete for the mutex. However, if mutex spinning isn't * going to happen, there is no point in going through the lock/unlock @@ -342,35 +393,16 @@ static bool mutex_optimistic_spin(struct mutex *lock, * If there's an owner, wait for it to either * release the lock or go to sleep. */ - owner = READ_ONCE(lock->owner); + owner = __mutex_owner(lock); if (owner && !mutex_spin_on_owner(lock, owner)) break; /* Try to acquire the mutex if it is unlocked. */ - if (mutex_try_to_acquire(lock)) { - lock_acquired(&lock->dep_map, ip); - - if (use_ww_ctx) { - struct ww_mutex *ww; - ww = container_of(lock, struct ww_mutex, base); - - ww_mutex_set_context_fastpath(ww, ww_ctx); - } - - mutex_set_owner(lock); + if (__mutex_trylock(lock)) { osq_unlock(&lock->osq); return true; } - /* - * When there's no owner, we might have preempted between the - * owner acquiring the lock and setting the owner field. If - * we're an RT task that will live-lock because we won't let - * the owner complete. - */ - if (!owner && (need_resched() || rt_task(task))) - break; - /* * The cpu_relax() call is a compiler barrier which forces * everything in this loop to be re-loaded. We don't need @@ -406,8 +438,7 @@ static bool mutex_optimistic_spin(struct mutex *lock, } #endif -__visible __used noinline -void __sched __mutex_unlock_slowpath(atomic_t *lock_count); +static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigned long ip); /** * mutex_unlock - release the mutex @@ -422,21 +453,12 @@ void __sched __mutex_unlock_slowpath(atomic_t *lock_count); */ void __sched mutex_unlock(struct mutex *lock) { - /* - * The unlocking fastpath is the 0->1 transition from 'locked' - * into 'unlocked' state: - */ -#ifndef CONFIG_DEBUG_MUTEXES - /* - * When debugging is enabled we must not clear the owner before time, - * the slow path will always be taken, and that clears the owner field - * after verifying that it was indeed current. - */ - mutex_clear_owner(lock); +#ifndef CONFIG_DEBUG_LOCK_ALLOC + if (__mutex_unlock_fast(lock)) + return; #endif - __mutex_fastpath_unlock(&lock->count, __mutex_unlock_slowpath); + __mutex_unlock_slowpath(lock, _RET_IP_); } - EXPORT_SYMBOL(mutex_unlock); /** @@ -465,15 +487,7 @@ void __sched ww_mutex_unlock(struct ww_mutex *lock) lock->ctx = NULL; } -#ifndef CONFIG_DEBUG_MUTEXES - /* - * When debugging is enabled we must not clear the owner before time, - * the slow path will always be taken, and that clears the owner field - * after verifying that it was indeed current. - */ - mutex_clear_owner(&lock->base); -#endif - __mutex_fastpath_unlock(&lock->base.count, __mutex_unlock_slowpath); + mutex_unlock(&lock->base); } EXPORT_SYMBOL(ww_mutex_unlock); @@ -520,20 +534,24 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, preempt_disable(); mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); - if (mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx)) { + if (__mutex_trylock(lock) || mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx)) { /* got the lock, yay! */ + lock_acquired(&lock->dep_map, ip); + if (use_ww_ctx) { + struct ww_mutex *ww; + ww = container_of(lock, struct ww_mutex, base); + + ww_mutex_set_context_fastpath(ww, ww_ctx); + } preempt_enable(); return 0; } spin_lock_mutex(&lock->wait_lock, flags); - /* - * Once more, try to acquire the lock. Only try-lock the mutex if - * it is unlocked to reduce unnecessary xchg() operations. + * After waiting to acquire the wait_lock, try again. */ - if (!mutex_is_locked(lock) && - (atomic_xchg_acquire(&lock->count, 0) == 1)) + if (__mutex_trylock(lock)) goto skip_wait; debug_mutex_lock_common(lock, &waiter); @@ -543,21 +561,13 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, list_add_tail(&waiter.list, &lock->wait_list); waiter.task = task; + if (list_first_entry(&lock->wait_list, struct mutex_waiter, list) == &waiter) + __mutex_set_flag(lock, MUTEX_FLAG_WAITERS); + lock_contended(&lock->dep_map, ip); for (;;) { - /* - * Lets try to take the lock again - this is needed even if - * we get here for the first time (shortly after failing to - * acquire the lock), to make sure that we get a wakeup once - * it's unlocked. Later on, if we sleep, this is the - * operation that gives us the lock. We xchg it to -1, so - * that when we release the lock, we properly wake up the - * other waiters. We only attempt the xchg if the count is - * non-negative in order to avoid unnecessary xchg operations: - */ - if (atomic_read(&lock->count) >= 0 && - (atomic_xchg_acquire(&lock->count, -1) == 1)) + if (__mutex_trylock(lock)) break; /* @@ -585,15 +595,14 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, __set_task_state(task, TASK_RUNNING); mutex_remove_waiter(lock, &waiter, task); - /* set it to 0 if there are no waiters left: */ if (likely(list_empty(&lock->wait_list))) - atomic_set(&lock->count, 0); + __mutex_clear_flag(lock, MUTEX_FLAG_WAITERS); + debug_mutex_free_waiter(&waiter); skip_wait: /* got the lock - cleanup and rejoice! */ lock_acquired(&lock->dep_map, ip); - mutex_set_owner(lock); if (use_ww_ctx) { struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); @@ -631,7 +640,6 @@ _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest) __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, nest, _RET_IP_, NULL, 0); } - EXPORT_SYMBOL_GPL(_mutex_lock_nest_lock); int __sched @@ -650,7 +658,6 @@ mutex_lock_interruptible_nested(struct mutex *lock, unsigned int subclass) return __mutex_lock_common(lock, TASK_INTERRUPTIBLE, subclass, NULL, _RET_IP_, NULL, 0); } - EXPORT_SYMBOL_GPL(mutex_lock_interruptible_nested); static inline int @@ -715,29 +722,22 @@ EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible); /* * Release the lock, slowpath: */ -static inline void -__mutex_unlock_common_slowpath(struct mutex *lock, int nested) +static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigned long ip) { - unsigned long flags; + unsigned long owner, flags; WAKE_Q(wake_q); + mutex_release(&lock->dep_map, 1, ip); + /* - * As a performance measurement, release the lock before doing other - * wakeup related duties to follow. This allows other tasks to acquire - * the lock sooner, while still handling cleanups in past unlock calls. - * This can be done as we do not enforce strict equivalence between the - * mutex counter and wait_list. - * - * - * Some architectures leave the lock unlocked in the fastpath failure - * case, others need to leave it locked. In the later case we have to - * unlock it here - as the lock counter is currently 0 or negative. + * Release the lock before (potentially) taking the spinlock + * such that other contenders can get on with things ASAP. */ - if (__mutex_slowpath_needs_to_unlock()) - atomic_set(&lock->count, 1); + owner = atomic_long_fetch_and_release(MUTEX_FLAGS, &lock->owner); + if (!__owner_flags(owner)) + return; spin_lock_mutex(&lock->wait_lock, flags); - mutex_release(&lock->dep_map, nested, _RET_IP_); debug_mutex_unlock(lock); if (!list_empty(&lock->wait_list)) { @@ -754,17 +754,6 @@ __mutex_unlock_common_slowpath(struct mutex *lock, int nested) wake_up_q(&wake_q); } -/* - * Release the lock, slowpath: - */ -__visible void -__mutex_unlock_slowpath(atomic_t *lock_count) -{ - struct mutex *lock = container_of(lock_count, struct mutex, count); - - __mutex_unlock_common_slowpath(lock, 1); -} - #ifndef CONFIG_DEBUG_LOCK_ALLOC /* * Here come the less common (and hence less performance-critical) APIs: @@ -789,38 +778,30 @@ __mutex_lock_interruptible_slowpath(struct mutex *lock); */ int __sched mutex_lock_interruptible(struct mutex *lock) { - int ret; - might_sleep(); - ret = __mutex_fastpath_lock_retval(&lock->count); - if (likely(!ret)) { - mutex_set_owner(lock); + + if (__mutex_trylock_fast(lock)) return 0; - } else - return __mutex_lock_interruptible_slowpath(lock); + + return __mutex_lock_interruptible_slowpath(lock); } EXPORT_SYMBOL(mutex_lock_interruptible); int __sched mutex_lock_killable(struct mutex *lock) { - int ret; - might_sleep(); - ret = __mutex_fastpath_lock_retval(&lock->count); - if (likely(!ret)) { - mutex_set_owner(lock); + + if (__mutex_trylock_fast(lock)) return 0; - } else - return __mutex_lock_killable_slowpath(lock); + + return __mutex_lock_killable_slowpath(lock); } EXPORT_SYMBOL(mutex_lock_killable); -__visible void __sched -__mutex_lock_slowpath(atomic_t *lock_count) +static noinline void __sched +__mutex_lock_slowpath(struct mutex *lock) { - struct mutex *lock = container_of(lock_count, struct mutex, count); - __mutex_lock_common(lock, TASK_UNINTERRUPTIBLE, 0, NULL, _RET_IP_, NULL, 0); } @@ -856,37 +837,6 @@ __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock, #endif -/* - * Spinlock based trylock, we take the spinlock and check whether we - * can get the lock: - */ -static inline int __mutex_trylock_slowpath(atomic_t *lock_count) -{ - struct mutex *lock = container_of(lock_count, struct mutex, count); - unsigned long flags; - int prev; - - /* No need to trylock if the mutex is locked. */ - if (mutex_is_locked(lock)) - return 0; - - spin_lock_mutex(&lock->wait_lock, flags); - - prev = atomic_xchg_acquire(&lock->count, -1); - if (likely(prev == 1)) { - mutex_set_owner(lock); - mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); - } - - /* Set it back to 0 if there are no waiters: */ - if (likely(list_empty(&lock->wait_list))) - atomic_set(&lock->count, 0); - - spin_unlock_mutex(&lock->wait_lock, flags); - - return prev == 1; -} - /** * mutex_trylock - try to acquire the mutex, without waiting * @lock: the mutex to be acquired @@ -903,13 +853,12 @@ static inline int __mutex_trylock_slowpath(atomic_t *lock_count) */ int __sched mutex_trylock(struct mutex *lock) { - int ret; + bool locked = __mutex_trylock(lock); - ret = __mutex_fastpath_trylock(&lock->count, __mutex_trylock_slowpath); - if (ret) - mutex_set_owner(lock); + if (locked) + mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); - return ret; + return locked; } EXPORT_SYMBOL(mutex_trylock); @@ -917,36 +866,28 @@ EXPORT_SYMBOL(mutex_trylock); int __sched __ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { - int ret; - might_sleep(); - ret = __mutex_fastpath_lock_retval(&lock->base.count); - - if (likely(!ret)) { + if (__mutex_trylock_fast(&lock->base)) { ww_mutex_set_context_fastpath(lock, ctx); - mutex_set_owner(&lock->base); - } else - ret = __ww_mutex_lock_slowpath(lock, ctx); - return ret; + return 0; + } + + return __ww_mutex_lock_slowpath(lock, ctx); } EXPORT_SYMBOL(__ww_mutex_lock); int __sched __ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) { - int ret; - might_sleep(); - ret = __mutex_fastpath_lock_retval(&lock->base.count); - - if (likely(!ret)) { + if (__mutex_trylock_fast(&lock->base)) { ww_mutex_set_context_fastpath(lock, ctx); - mutex_set_owner(&lock->base); - } else - ret = __ww_mutex_lock_interruptible_slowpath(lock, ctx); - return ret; + return 0; + } + + return __ww_mutex_lock_interruptible_slowpath(lock, ctx); } EXPORT_SYMBOL(__ww_mutex_lock_interruptible); diff --git a/kernel/locking/mutex.h b/kernel/locking/mutex.h index 6cd6b8e9efd7..4410a4af42a3 100644 --- a/kernel/locking/mutex.h +++ b/kernel/locking/mutex.h @@ -16,32 +16,6 @@ #define mutex_remove_waiter(lock, waiter, task) \ __list_del((waiter)->list.prev, (waiter)->list.next) -#ifdef CONFIG_MUTEX_SPIN_ON_OWNER -/* - * The mutex owner can get read and written to locklessly. - * We should use WRITE_ONCE when writing the owner value to - * avoid store tearing, otherwise, a thread could potentially - * read a partially written and incomplete owner value. - */ -static inline void mutex_set_owner(struct mutex *lock) -{ - WRITE_ONCE(lock->owner, current); -} - -static inline void mutex_clear_owner(struct mutex *lock) -{ - WRITE_ONCE(lock->owner, NULL); -} -#else -static inline void mutex_set_owner(struct mutex *lock) -{ -} - -static inline void mutex_clear_owner(struct mutex *lock) -{ -} -#endif - #define debug_mutex_wake_waiter(lock, waiter) do { } while (0) #define debug_mutex_free_waiter(waiter) do { } while (0) #define debug_mutex_add_waiter(lock, waiter, ti) do { } while (0) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 94732d1ab00a..8912aafd09e1 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -75,11 +75,11 @@ #include #include #include +#include #include #include #include -#include #ifdef CONFIG_PARAVIRT #include #endif -- cgit v1.2.3 From a3ea3d9b865c2a8f7fe455c7fa26db4b6fd066e3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 23 Aug 2016 13:45:15 +0200 Subject: locking/mutex: Allow MUTEX_SPIN_ON_OWNER when DEBUG_MUTEXES Now that mutex::count and mutex::owner are the same field, we can allow SPIN_ON_OWNER while DEBUG_MUTEX. Tested-by: Jason Low Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/Kconfig.locks | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index ebdb0043203a..84d882f3e299 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -225,7 +225,7 @@ config ARCH_SUPPORTS_ATOMIC_RMW config MUTEX_SPIN_ON_OWNER def_bool y - depends on SMP && !DEBUG_MUTEXES && ARCH_SUPPORTS_ATOMIC_RMW + depends on SMP && ARCH_SUPPORTS_ATOMIC_RMW config RWSEM_SPIN_ON_OWNER def_bool y -- cgit v1.2.3 From 9d659ae14b545c4296e812c70493bfdc999b5c1c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 23 Aug 2016 14:40:16 +0200 Subject: locking/mutex: Add lock handoff to avoid starvation Implement lock handoff to avoid lock starvation. Lock starvation is possible because mutex_lock() allows lock stealing, where a running (or optimistic spinning) task beats the woken waiter to the acquire. Lock stealing is an important performance optimization because waiting for a waiter to wake up and get runtime can take a significant time, during which everyboy would stall on the lock. The down-side is of course that it allows for starvation. This patch has the waiter requesting a handoff if it fails to acquire the lock upon waking. This re-introduces some of the wait time, because once we do a handoff we have to wait for the waiter to wake up again. A future patch will add a round of optimistic spinning to attempt to alleviate this penalty, but if that turns out to not be enough, we can add a counter and only request handoff after multiple failed wakeups. There are a few tricky implementation details: - accepting a handoff must only be done in the wait-loop. Since the handoff condition is owner == current, it can easily cause recursive locking trouble. - accepting the handoff must be careful to provide the ACQUIRE semantics. - having the HANDOFF bit set on unlock requires care, we must not clear the owner. - we must be careful to not leave HANDOFF set after we've acquired the lock. The tricky scenario is setting the HANDOFF bit on an unlocked mutex. Tested-by: Jason Low Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Waiman Long Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 142 +++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 119 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index de1ce0bae0d5..b4ebd8b9bcd5 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -54,8 +54,10 @@ EXPORT_SYMBOL(__mutex_init); * bits to store extra state. * * Bit0 indicates a non-empty waiter list; unlock must issue a wakeup. + * Bit1 indicates unlock needs to hand the lock to the top-waiter */ #define MUTEX_FLAG_WAITERS 0x01 +#define MUTEX_FLAG_HANDOFF 0x02 #define MUTEX_FLAGS 0x03 @@ -71,20 +73,48 @@ static inline unsigned long __owner_flags(unsigned long owner) /* * Actual trylock that will work on any unlocked state. + * + * When setting the owner field, we must preserve the low flag bits. + * + * Be careful with @handoff, only set that in a wait-loop (where you set + * HANDOFF) to avoid recursive lock attempts. */ -static inline bool __mutex_trylock(struct mutex *lock) +static inline bool __mutex_trylock(struct mutex *lock, const bool handoff) { unsigned long owner, curr = (unsigned long)current; owner = atomic_long_read(&lock->owner); for (;;) { /* must loop, can race against a flag */ - unsigned long old; + unsigned long old, flags = __owner_flags(owner); + + if (__owner_task(owner)) { + if (handoff && unlikely(__owner_task(owner) == current)) { + /* + * Provide ACQUIRE semantics for the lock-handoff. + * + * We cannot easily use load-acquire here, since + * the actual load is a failed cmpxchg, which + * doesn't imply any barriers. + * + * Also, this is a fairly unlikely scenario, and + * this contains the cost. + */ + smp_mb(); /* ACQUIRE */ + return true; + } - if (__owner_task(owner)) return false; + } + + /* + * We set the HANDOFF bit, we must make sure it doesn't live + * past the point where we acquire it. This would be possible + * if we (accidentally) set the bit on an unlocked mutex. + */ + if (handoff) + flags &= ~MUTEX_FLAG_HANDOFF; - old = atomic_long_cmpxchg_acquire(&lock->owner, owner, - curr | __owner_flags(owner)); + old = atomic_long_cmpxchg_acquire(&lock->owner, owner, curr | flags); if (old == owner) return true; @@ -134,6 +164,39 @@ static inline void __mutex_clear_flag(struct mutex *lock, unsigned long flag) atomic_long_andnot(flag, &lock->owner); } +static inline bool __mutex_waiter_is_first(struct mutex *lock, struct mutex_waiter *waiter) +{ + return list_first_entry(&lock->wait_list, struct mutex_waiter, list) == waiter; +} + +/* + * Give up ownership to a specific task, when @task = NULL, this is equivalent + * to a regular unlock. Clears HANDOFF, preserves WAITERS. Provides RELEASE + * semantics like a regular unlock, the __mutex_trylock() provides matching + * ACQUIRE semantics for the handoff. + */ +static void __mutex_handoff(struct mutex *lock, struct task_struct *task) +{ + unsigned long owner = atomic_long_read(&lock->owner); + + for (;;) { + unsigned long old, new; + +#ifdef CONFIG_DEBUG_MUTEXES + DEBUG_LOCKS_WARN_ON(__owner_task(owner) != current); +#endif + + new = (owner & MUTEX_FLAG_WAITERS); + new |= (unsigned long)task; + + old = atomic_long_cmpxchg_release(&lock->owner, owner, new); + if (old == owner) + break; + + owner = old; + } +} + #ifndef CONFIG_DEBUG_LOCK_ALLOC /* * We split the mutex lock/unlock logic into separate fastpath and @@ -398,7 +461,7 @@ static bool mutex_optimistic_spin(struct mutex *lock, break; /* Try to acquire the mutex if it is unlocked. */ - if (__mutex_trylock(lock)) { + if (__mutex_trylock(lock, false)) { osq_unlock(&lock->osq); return true; } @@ -523,6 +586,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, struct task_struct *task = current; struct mutex_waiter waiter; unsigned long flags; + bool first = false; int ret; if (use_ww_ctx) { @@ -534,7 +598,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, preempt_disable(); mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); - if (__mutex_trylock(lock) || mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx)) { + if (__mutex_trylock(lock, false) || + mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx)) { /* got the lock, yay! */ lock_acquired(&lock->dep_map, ip); if (use_ww_ctx) { @@ -551,7 +616,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, /* * After waiting to acquire the wait_lock, try again. */ - if (__mutex_trylock(lock)) + if (__mutex_trylock(lock, false)) goto skip_wait; debug_mutex_lock_common(lock, &waiter); @@ -561,13 +626,13 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, list_add_tail(&waiter.list, &lock->wait_list); waiter.task = task; - if (list_first_entry(&lock->wait_list, struct mutex_waiter, list) == &waiter) + if (__mutex_waiter_is_first(lock, &waiter)) __mutex_set_flag(lock, MUTEX_FLAG_WAITERS); lock_contended(&lock->dep_map, ip); for (;;) { - if (__mutex_trylock(lock)) + if (__mutex_trylock(lock, first)) break; /* @@ -586,17 +651,20 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, } __set_task_state(task, state); - - /* didn't get the lock, go to sleep: */ spin_unlock_mutex(&lock->wait_lock, flags); schedule_preempt_disabled(); spin_lock_mutex(&lock->wait_lock, flags); + + if (!first && __mutex_waiter_is_first(lock, &waiter)) { + first = true; + __mutex_set_flag(lock, MUTEX_FLAG_HANDOFF); + } } __set_task_state(task, TASK_RUNNING); mutex_remove_waiter(lock, &waiter, task); if (likely(list_empty(&lock->wait_list))) - __mutex_clear_flag(lock, MUTEX_FLAG_WAITERS); + __mutex_clear_flag(lock, MUTEX_FLAGS); debug_mutex_free_waiter(&waiter); @@ -724,33 +792,61 @@ EXPORT_SYMBOL_GPL(__ww_mutex_lock_interruptible); */ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigned long ip) { + struct task_struct *next = NULL; unsigned long owner, flags; WAKE_Q(wake_q); mutex_release(&lock->dep_map, 1, ip); /* - * Release the lock before (potentially) taking the spinlock - * such that other contenders can get on with things ASAP. + * Release the lock before (potentially) taking the spinlock such that + * other contenders can get on with things ASAP. + * + * Except when HANDOFF, in that case we must not clear the owner field, + * but instead set it to the top waiter. */ - owner = atomic_long_fetch_and_release(MUTEX_FLAGS, &lock->owner); - if (!__owner_flags(owner)) - return; + owner = atomic_long_read(&lock->owner); + for (;;) { + unsigned long old; + +#ifdef CONFIG_DEBUG_MUTEXES + DEBUG_LOCKS_WARN_ON(__owner_task(owner) != current); +#endif + + if (owner & MUTEX_FLAG_HANDOFF) + break; + + old = atomic_long_cmpxchg_release(&lock->owner, owner, + __owner_flags(owner)); + if (old == owner) { + if (owner & MUTEX_FLAG_WAITERS) + break; + + return; + } + + owner = old; + } spin_lock_mutex(&lock->wait_lock, flags); debug_mutex_unlock(lock); - if (!list_empty(&lock->wait_list)) { /* get the first entry from the wait-list: */ struct mutex_waiter *waiter = - list_entry(lock->wait_list.next, - struct mutex_waiter, list); + list_first_entry(&lock->wait_list, + struct mutex_waiter, list); + + next = waiter->task; debug_mutex_wake_waiter(lock, waiter); - wake_q_add(&wake_q, waiter->task); + wake_q_add(&wake_q, next); } + if (owner & MUTEX_FLAG_HANDOFF) + __mutex_handoff(lock, next); + spin_unlock_mutex(&lock->wait_lock, flags); + wake_up_q(&wake_q); } @@ -853,7 +949,7 @@ __ww_mutex_lock_interruptible_slowpath(struct ww_mutex *lock, */ int __sched mutex_trylock(struct mutex *lock) { - bool locked = __mutex_trylock(lock); + bool locked = __mutex_trylock(lock, false); if (locked) mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); -- cgit v1.2.3 From 5bbd7e644378334700889fb762d5893985a7311f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 2 Sep 2016 13:42:12 +0200 Subject: locking/mutex: Restructure wait loop Doesn't really matter yet, but pull the HANDOFF and trylock out from under the wait_lock. The intention is to add an optimistic spin loop here, which requires we do not hold the wait_lock, so shuffle code around in preparation. Also clarify the purpose of taking the wait_lock in the wait loop, its tempting to want to avoid it altogether, but the cancellation cases need to to avoid losing wakeups. Suggested-by: Waiman Long Tested-by: Jason Low Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 30 +++++++++++++++++++++++++----- 1 file changed, 25 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index b4ebd8b9bcd5..8bb2304bb78d 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -631,13 +631,21 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, lock_contended(&lock->dep_map, ip); + set_task_state(task, state); for (;;) { + /* + * Once we hold wait_lock, we're serialized against + * mutex_unlock() handing the lock off to us, do a trylock + * before testing the error conditions to make sure we pick up + * the handoff. + */ if (__mutex_trylock(lock, first)) - break; + goto acquired; /* - * got a signal? (This code gets eliminated in the - * TASK_UNINTERRUPTIBLE case.) + * Check for signals and wound conditions while holding + * wait_lock. This ensures the lock cancellation is ordered + * against mutex_unlock() and wake-ups do not go missing. */ if (unlikely(signal_pending_state(state, task))) { ret = -EINTR; @@ -650,16 +658,27 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, goto err; } - __set_task_state(task, state); spin_unlock_mutex(&lock->wait_lock, flags); schedule_preempt_disabled(); - spin_lock_mutex(&lock->wait_lock, flags); if (!first && __mutex_waiter_is_first(lock, &waiter)) { first = true; __mutex_set_flag(lock, MUTEX_FLAG_HANDOFF); } + + set_task_state(task, state); + /* + * Here we order against unlock; we must either see it change + * state back to RUNNING and fall through the next schedule(), + * or we must see its unlock and acquire. + */ + if (__mutex_trylock(lock, first)) + break; + + spin_lock_mutex(&lock->wait_lock, flags); } + spin_lock_mutex(&lock->wait_lock, flags); +acquired: __set_task_state(task, TASK_RUNNING); mutex_remove_waiter(lock, &waiter, task); @@ -682,6 +701,7 @@ skip_wait: return 0; err: + __set_task_state(task, TASK_RUNNING); mutex_remove_waiter(lock, &waiter, task); spin_unlock_mutex(&lock->wait_lock, flags); debug_mutex_free_waiter(&waiter); -- cgit v1.2.3 From a40ca56577f628eb3f7af22b484e95edfdd047a2 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 26 Aug 2016 19:35:08 -0400 Subject: locking/mutex: Simplify some ww_mutex code in __mutex_lock_common() This patch removes some of the redundant ww_mutex code in __mutex_lock_common(). Tested-by: Jason Low Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Davidlohr Bueso Cc: Ding Tianhong Cc: Imre Deak Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Link: http://lkml.kernel.org/r/1472254509-27508-1-git-send-email-Waiman.Long@hpe.com Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 8bb2304bb78d..6c0d3040e4dc 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -587,10 +587,11 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, struct mutex_waiter waiter; unsigned long flags; bool first = false; + struct ww_mutex *ww; int ret; if (use_ww_ctx) { - struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); + ww = container_of(lock, struct ww_mutex, base); if (unlikely(ww_ctx == READ_ONCE(ww->ctx))) return -EALREADY; } @@ -602,12 +603,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx)) { /* got the lock, yay! */ lock_acquired(&lock->dep_map, ip); - if (use_ww_ctx) { - struct ww_mutex *ww; - ww = container_of(lock, struct ww_mutex, base); - + if (use_ww_ctx) ww_mutex_set_context_fastpath(ww, ww_ctx); - } preempt_enable(); return 0; } @@ -691,10 +688,8 @@ skip_wait: /* got the lock - cleanup and rejoice! */ lock_acquired(&lock->dep_map, ip); - if (use_ww_ctx) { - struct ww_mutex *ww = container_of(lock, struct ww_mutex, base); + if (use_ww_ctx) ww_mutex_set_context_slowpath(ww, ww_ctx); - } spin_unlock_mutex(&lock->wait_lock, flags); preempt_enable(); -- cgit v1.2.3 From b341afb325eb390f707a82cbefd65cda887302ab Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Fri, 26 Aug 2016 19:35:09 -0400 Subject: locking/mutex: Enable optimistic spinning of woken waiter This patch makes the waiter that sets the HANDOFF flag start spinning instead of sleeping until the handoff is complete or the owner sleeps. Otherwise, the handoff will cause the optimistic spinners to abort spinning as the handed-off owner may not be running. Tested-by: Jason Low Signed-off-by: Waiman Long Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Davidlohr Bueso Cc: Ding Tianhong Cc: Imre Deak Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Will Deacon Link: http://lkml.kernel.org/r/1472254509-27508-2-git-send-email-Waiman.Long@hpe.com Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 77 +++++++++++++++++++++++++++++++++++--------------- 1 file changed, 54 insertions(+), 23 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 6c0d3040e4dc..17a88e929e6a 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -416,24 +416,39 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock) * * Returns true when the lock was taken, otherwise false, indicating * that we need to jump to the slowpath and sleep. + * + * The waiter flag is set to true if the spinner is a waiter in the wait + * queue. The waiter-spinner will spin on the lock directly and concurrently + * with the spinner at the head of the OSQ, if present, until the owner is + * changed to itself. */ static bool mutex_optimistic_spin(struct mutex *lock, - struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx) + struct ww_acquire_ctx *ww_ctx, + const bool use_ww_ctx, const bool waiter) { struct task_struct *task = current; - if (!mutex_can_spin_on_owner(lock)) - goto done; + if (!waiter) { + /* + * The purpose of the mutex_can_spin_on_owner() function is + * to eliminate the overhead of osq_lock() and osq_unlock() + * in case spinning isn't possible. As a waiter-spinner + * is not going to take OSQ lock anyway, there is no need + * to call mutex_can_spin_on_owner(). + */ + if (!mutex_can_spin_on_owner(lock)) + goto fail; - /* - * In order to avoid a stampede of mutex spinners trying to - * acquire the mutex all at once, the spinners need to take a - * MCS (queued) lock first before spinning on the owner field. - */ - if (!osq_lock(&lock->osq)) - goto done; + /* + * In order to avoid a stampede of mutex spinners trying to + * acquire the mutex all at once, the spinners need to take a + * MCS (queued) lock first before spinning on the owner field. + */ + if (!osq_lock(&lock->osq)) + goto fail; + } - while (true) { + for (;;) { struct task_struct *owner; if (use_ww_ctx && ww_ctx->acquired > 0) { @@ -449,7 +464,7 @@ static bool mutex_optimistic_spin(struct mutex *lock, * performed the optimistic spinning cannot be done. */ if (READ_ONCE(ww->ctx)) - break; + goto fail_unlock; } /* @@ -457,15 +472,20 @@ static bool mutex_optimistic_spin(struct mutex *lock, * release the lock or go to sleep. */ owner = __mutex_owner(lock); - if (owner && !mutex_spin_on_owner(lock, owner)) - break; + if (owner) { + if (waiter && owner == task) { + smp_mb(); /* ACQUIRE */ + break; + } - /* Try to acquire the mutex if it is unlocked. */ - if (__mutex_trylock(lock, false)) { - osq_unlock(&lock->osq); - return true; + if (!mutex_spin_on_owner(lock, owner)) + goto fail_unlock; } + /* Try to acquire the mutex if it is unlocked. */ + if (__mutex_trylock(lock, waiter)) + break; + /* * The cpu_relax() call is a compiler barrier which forces * everything in this loop to be re-loaded. We don't need @@ -475,8 +495,17 @@ static bool mutex_optimistic_spin(struct mutex *lock, cpu_relax_lowlatency(); } - osq_unlock(&lock->osq); -done: + if (!waiter) + osq_unlock(&lock->osq); + + return true; + + +fail_unlock: + if (!waiter) + osq_unlock(&lock->osq); + +fail: /* * If we fell out of the spin path because of need_resched(), * reschedule now, before we try-lock the mutex. This avoids getting @@ -495,7 +524,8 @@ done: } #else static bool mutex_optimistic_spin(struct mutex *lock, - struct ww_acquire_ctx *ww_ctx, const bool use_ww_ctx) + struct ww_acquire_ctx *ww_ctx, + const bool use_ww_ctx, const bool waiter) { return false; } @@ -600,7 +630,7 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, mutex_acquire_nest(&lock->dep_map, subclass, 0, nest_lock, ip); if (__mutex_trylock(lock, false) || - mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx)) { + mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, false)) { /* got the lock, yay! */ lock_acquired(&lock->dep_map, ip); if (use_ww_ctx) @@ -669,7 +699,8 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, * state back to RUNNING and fall through the next schedule(), * or we must see its unlock and acquire. */ - if (__mutex_trylock(lock, first)) + if ((first && mutex_optimistic_spin(lock, ww_ctx, use_ww_ctx, true)) || + __mutex_trylock(lock, first)) break; spin_lock_mutex(&lock->wait_lock, flags); -- cgit v1.2.3 From 83f06168ef15da5dc735c7ea14fae67609ed9538 Mon Sep 17 00:00:00 2001 From: Tahsin Erdogan Date: Tue, 8 Nov 2016 00:02:07 -0800 Subject: locking/lockdep: Remove unused parameter from the add_lock_to_list() function The 'class' parameter is not used, remove it. n Signed-off-by: Tahsin Erdogan Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1478592127-4376-1-git-send-email-tahsin@google.com Signed-off-by: Ingo Molnar --- kernel/locking/lockdep.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 589d763a49b3..e74b438c850b 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -840,9 +840,9 @@ static struct lock_list *alloc_list_entry(void) /* * Add a new dependency to the head of the list: */ -static int add_lock_to_list(struct lock_class *class, struct lock_class *this, - struct list_head *head, unsigned long ip, - int distance, struct stack_trace *trace) +static int add_lock_to_list(struct lock_class *this, struct list_head *head, + unsigned long ip, int distance, + struct stack_trace *trace) { struct lock_list *entry; /* @@ -1869,14 +1869,14 @@ check_prev_add(struct task_struct *curr, struct held_lock *prev, * Ok, all validations passed, add the new lock * to the previous lock's dependency list: */ - ret = add_lock_to_list(hlock_class(prev), hlock_class(next), + ret = add_lock_to_list(hlock_class(next), &hlock_class(prev)->locks_after, next->acquire_ip, distance, &trace); if (!ret) return 0; - ret = add_lock_to_list(hlock_class(next), hlock_class(prev), + ret = add_lock_to_list(hlock_class(prev), &hlock_class(next)->locks_before, next->acquire_ip, distance, &trace); if (!ret) -- cgit v1.2.3 From bf0d31c05411f030e7df9f98b5be4d0c6fd5cd39 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Tue, 25 Oct 2016 11:03:12 +0200 Subject: locking/core, stop_machine: Yield the CPU during stop machine() Some time ago the following commit: 57f2ffe14fd125c2 ("s390: remove diag 44 calls from cpu_relax()") ... stopped cpu_relax() on s390 yielding to the hypervisor. As it turns out this made stop_machine() run really slow on virtualized overcommited systems. For example the kprobes test during bootup took several seconds instead of just running unnoticed with large guests. Therefore, yielding was reintroduced with commit: 4d92f50249eb ("s390: reintroduce diag 44 calls for cpu_relax()") ... but in fact the stop machine code seems to be the only place where this yielding was really necessary. This place is probably the most important one as it makes all but one guest CPUs wait for one guest CPU. As we now have cpu_relax_yield(), we can use this in multi_cpu_stop(). For now lets only add it here. We can add it later in other places when necessary. Signed-off-by: Christian Borntraeger Signed-off-by: Peter Zijlstra (Intel) Cc: Catalin Marinas Cc: Heiko Carstens Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Nicholas Piggin Cc: Noam Camus Cc: Peter Zijlstra Cc: Russell King Cc: Thomas Gleixner Cc: Will Deacon Cc: linuxppc-dev@lists.ozlabs.org Cc: virtualization@lists.linux-foundation.org Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1477386195-32736-3-git-send-email-borntraeger@de.ibm.com Signed-off-by: Ingo Molnar --- kernel/stop_machine.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index ec9ab2f01489..1eb82661ecdb 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -194,7 +194,7 @@ static int multi_cpu_stop(void *data) /* Simple state machine */ do { /* Chill out and ensure we re-read multi_stop_state. */ - cpu_relax(); + cpu_relax_yield(); if (msdata->state != curstate) { curstate = msdata->state; switch (curstate) { -- cgit v1.2.3 From f2f09a4cee3507dba0e24b87ba2961a5c377d3a7 Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Tue, 25 Oct 2016 11:03:14 +0200 Subject: locking/core: Remove cpu_relax_lowlatency() users With the s390 special case of a yielding cpu_relax() implementation gone, we can now remove all users of cpu_relax_lowlatency() and replace them with cpu_relax(). Signed-off-by: Christian Borntraeger Signed-off-by: Peter Zijlstra (Intel) Cc: Catalin Marinas Cc: Heiko Carstens Cc: Linus Torvalds Cc: Martin Schwidefsky Cc: Nicholas Piggin Cc: Noam Camus Cc: Peter Zijlstra Cc: Russell King Cc: Thomas Gleixner Cc: Will Deacon Cc: linuxppc-dev@lists.ozlabs.org Cc: virtualization@lists.linux-foundation.org Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1477386195-32736-5-git-send-email-borntraeger@de.ibm.com Signed-off-by: Ingo Molnar --- drivers/gpu/drm/i915/i915_gem_request.c | 2 +- drivers/vhost/net.c | 4 ++-- kernel/locking/mcs_spinlock.h | 4 ++-- kernel/locking/mutex.c | 4 ++-- kernel/locking/osq_lock.c | 6 +++--- kernel/locking/qrwlock.c | 6 +++--- kernel/locking/rwsem-xadd.c | 4 ++-- lib/lockref.c | 2 +- 8 files changed, 16 insertions(+), 16 deletions(-) (limited to 'kernel') diff --git a/drivers/gpu/drm/i915/i915_gem_request.c b/drivers/gpu/drm/i915/i915_gem_request.c index 8832f8ec1583..383d13416442 100644 --- a/drivers/gpu/drm/i915/i915_gem_request.c +++ b/drivers/gpu/drm/i915/i915_gem_request.c @@ -723,7 +723,7 @@ bool __i915_spin_request(const struct drm_i915_gem_request *req, if (busywait_stop(timeout_us, cpu)) break; - cpu_relax_lowlatency(); + cpu_relax(); } while (!need_resched()); return false; diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 5dc128a8da83..5dc34653274a 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -342,7 +342,7 @@ static int vhost_net_tx_get_vq_desc(struct vhost_net *net, endtime = busy_clock() + vq->busyloop_timeout; while (vhost_can_busy_poll(vq->dev, endtime) && vhost_vq_avail_empty(vq->dev, vq)) - cpu_relax_lowlatency(); + cpu_relax(); preempt_enable(); r = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov), out_num, in_num, NULL, NULL); @@ -533,7 +533,7 @@ static int vhost_net_rx_peek_head_len(struct vhost_net *net, struct sock *sk) while (vhost_can_busy_poll(&net->dev, endtime) && !sk_has_rx_data(sk) && vhost_vq_avail_empty(&net->dev, vq)) - cpu_relax_lowlatency(); + cpu_relax(); preempt_enable(); diff --git a/kernel/locking/mcs_spinlock.h b/kernel/locking/mcs_spinlock.h index c835270f0c2f..6a385aabcce7 100644 --- a/kernel/locking/mcs_spinlock.h +++ b/kernel/locking/mcs_spinlock.h @@ -28,7 +28,7 @@ struct mcs_spinlock { #define arch_mcs_spin_lock_contended(l) \ do { \ while (!(smp_load_acquire(l))) \ - cpu_relax_lowlatency(); \ + cpu_relax(); \ } while (0) #endif @@ -108,7 +108,7 @@ void mcs_spin_unlock(struct mcs_spinlock **lock, struct mcs_spinlock *node) return; /* Wait until the next pointer is set */ while (!(next = READ_ONCE(node->next))) - cpu_relax_lowlatency(); + cpu_relax(); } /* Pass lock to next waiter. */ diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index 17a88e929e6a..a65e09a046ac 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -369,7 +369,7 @@ bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) break; } - cpu_relax_lowlatency(); + cpu_relax(); } rcu_read_unlock(); @@ -492,7 +492,7 @@ static bool mutex_optimistic_spin(struct mutex *lock, * memory barriers as we'll eventually observe the right * values at the cost of a few extra spins. */ - cpu_relax_lowlatency(); + cpu_relax(); } if (!waiter) diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c index 05a37857ab55..4ea2710b9d6c 100644 --- a/kernel/locking/osq_lock.c +++ b/kernel/locking/osq_lock.c @@ -75,7 +75,7 @@ osq_wait_next(struct optimistic_spin_queue *lock, break; } - cpu_relax_lowlatency(); + cpu_relax(); } return next; @@ -122,7 +122,7 @@ bool osq_lock(struct optimistic_spin_queue *lock) if (need_resched()) goto unqueue; - cpu_relax_lowlatency(); + cpu_relax(); } return true; @@ -148,7 +148,7 @@ unqueue: if (smp_load_acquire(&node->locked)) return true; - cpu_relax_lowlatency(); + cpu_relax(); /* * Or we race against a concurrent unqueue()'s step-B, in which diff --git a/kernel/locking/qrwlock.c b/kernel/locking/qrwlock.c index 19248ddf37ce..cc3ed0ccdfa2 100644 --- a/kernel/locking/qrwlock.c +++ b/kernel/locking/qrwlock.c @@ -54,7 +54,7 @@ static __always_inline void rspin_until_writer_unlock(struct qrwlock *lock, u32 cnts) { while ((cnts & _QW_WMASK) == _QW_LOCKED) { - cpu_relax_lowlatency(); + cpu_relax(); cnts = atomic_read_acquire(&lock->cnts); } } @@ -130,7 +130,7 @@ void queued_write_lock_slowpath(struct qrwlock *lock) (cmpxchg_relaxed(&l->wmode, 0, _QW_WAITING) == 0)) break; - cpu_relax_lowlatency(); + cpu_relax(); } /* When no more readers, set the locked flag */ @@ -141,7 +141,7 @@ void queued_write_lock_slowpath(struct qrwlock *lock) _QW_LOCKED) == _QW_WAITING)) break; - cpu_relax_lowlatency(); + cpu_relax(); } unlock: arch_spin_unlock(&lock->wait_lock); diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 2337b4bb2366..2fa2e2e64950 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -368,7 +368,7 @@ static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem) return false; } - cpu_relax_lowlatency(); + cpu_relax(); } rcu_read_unlock(); out: @@ -423,7 +423,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) * memory barriers as we'll eventually observe the right * values at the cost of a few extra spins. */ - cpu_relax_lowlatency(); + cpu_relax(); } osq_unlock(&sem->osq); done: diff --git a/lib/lockref.c b/lib/lockref.c index 5a92189ad711..c4bfcb8836cd 100644 --- a/lib/lockref.c +++ b/lib/lockref.c @@ -20,7 +20,7 @@ if (likely(old.lock_count == prev.lock_count)) { \ SUCCESS; \ } \ - cpu_relax_lowlatency(); \ + cpu_relax(); \ } \ } while (0) -- cgit v1.2.3 From 194a6b5b9cb6b91a5f7d86984165a3bc55188599 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 17 Nov 2016 11:46:38 -0500 Subject: sched/wake_q: Rename WAKE_Q to DEFINE_WAKE_Q Currently the wake_q data structure is defined by the WAKE_Q() macro. This macro, however, looks like a function doing something as "wake" is a verb. Even checkpatch.pl was confused as it reported warnings like WARNING: Missing a blank line after declarations #548: FILE: kernel/futex.c:3665: + int ret; + WAKE_Q(wake_q); This patch renames the WAKE_Q() macro to DEFINE_WAKE_Q() which clarifies what the macro is doing and eliminates the checkpatch.pl warnings. Signed-off-by: Waiman Long Acked-by: Davidlohr Bueso Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/1479401198-1765-1-git-send-email-longman@redhat.com [ Resolved conflict and added missing rename. ] Signed-off-by: Ingo Molnar --- include/linux/sched.h | 4 ++-- ipc/mqueue.c | 4 ++-- ipc/msg.c | 8 ++++---- kernel/futex.c | 8 ++++---- kernel/locking/mutex.c | 2 +- kernel/locking/rtmutex.c | 2 +- kernel/locking/rwsem-xadd.c | 10 +++++----- 7 files changed, 19 insertions(+), 19 deletions(-) (limited to 'kernel') diff --git a/include/linux/sched.h b/include/linux/sched.h index c1aa3b02f6ac..dc37cbe2b13c 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -989,7 +989,7 @@ enum cpu_idle_type { * already in a wake queue, the wakeup will happen soon and the second * waker can just skip it. * - * The WAKE_Q macro declares and initializes the list head. + * The DEFINE_WAKE_Q macro declares and initializes the list head. * wake_up_q() does NOT reinitialize the list; it's expected to be * called near the end of a function, where the fact that the queue is * not used again will be easy to see by inspection. @@ -1009,7 +1009,7 @@ struct wake_q_head { #define WAKE_Q_TAIL ((struct wake_q_node *) 0x01) -#define WAKE_Q(name) \ +#define DEFINE_WAKE_Q(name) \ struct wake_q_head name = { WAKE_Q_TAIL, &name.first } extern void wake_q_add(struct wake_q_head *head, diff --git a/ipc/mqueue.c b/ipc/mqueue.c index 8cbd6e6894d5..7a2d8f0c8ae5 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -967,7 +967,7 @@ SYSCALL_DEFINE5(mq_timedsend, mqd_t, mqdes, const char __user *, u_msg_ptr, struct timespec ts; struct posix_msg_tree_node *new_leaf = NULL; int ret = 0; - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); if (u_abs_timeout) { int res = prepare_timeout(u_abs_timeout, &expires, &ts); @@ -1151,7 +1151,7 @@ SYSCALL_DEFINE5(mq_timedreceive, mqd_t, mqdes, char __user *, u_msg_ptr, msg_ptr = wait.msg; } } else { - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); msg_ptr = msg_get(info); diff --git a/ipc/msg.c b/ipc/msg.c index e12307d0c920..32e9bd837cde 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -235,7 +235,7 @@ static void freeque(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp) { struct msg_msg *msg, *t; struct msg_queue *msq = container_of(ipcp, struct msg_queue, q_perm); - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); expunge_all(msq, -EIDRM, &wake_q); ss_wakeup(msq, &wake_q, true); @@ -397,7 +397,7 @@ static int msgctl_down(struct ipc_namespace *ns, int msqid, int cmd, goto out_up; case IPC_SET: { - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); if (msqid64.msg_qbytes > ns->msg_ctlmnb && !capable(CAP_SYS_RESOURCE)) { @@ -634,7 +634,7 @@ long do_msgsnd(int msqid, long mtype, void __user *mtext, struct msg_msg *msg; int err; struct ipc_namespace *ns; - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); ns = current->nsproxy->ipc_ns; @@ -850,7 +850,7 @@ long do_msgrcv(int msqid, void __user *buf, size_t bufsz, long msgtyp, int msgfl struct msg_queue *msq; struct ipc_namespace *ns; struct msg_msg *msg, *copy = NULL; - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); ns = current->nsproxy->ipc_ns; diff --git a/kernel/futex.c b/kernel/futex.c index 2c4be467fecd..9246d9f593d1 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1298,7 +1298,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this, struct task_struct *new_owner; struct futex_pi_state *pi_state = this->pi_state; u32 uninitialized_var(curval), newval; - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); bool deboost; int ret = 0; @@ -1415,7 +1415,7 @@ futex_wake(u32 __user *uaddr, unsigned int flags, int nr_wake, u32 bitset) struct futex_q *this, *next; union futex_key key = FUTEX_KEY_INIT; int ret; - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); if (!bitset) return -EINVAL; @@ -1469,7 +1469,7 @@ futex_wake_op(u32 __user *uaddr1, unsigned int flags, u32 __user *uaddr2, struct futex_hash_bucket *hb1, *hb2; struct futex_q *this, *next; int ret, op_ret; - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); retry: ret = get_futex_key(uaddr1, flags & FLAGS_SHARED, &key1, VERIFY_READ); @@ -1708,7 +1708,7 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, struct futex_pi_state *pi_state = NULL; struct futex_hash_bucket *hb1, *hb2; struct futex_q *this, *next; - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); if (requeue_pi) { /* diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index a65e09a046ac..c0731685603f 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -840,7 +840,7 @@ static noinline void __sched __mutex_unlock_slowpath(struct mutex *lock, unsigne { struct task_struct *next = NULL; unsigned long owner, flags; - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); mutex_release(&lock->dep_map, 1, ip); diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 1ec0f48962b3..d8d7a153b711 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1382,7 +1382,7 @@ rt_mutex_fastunlock(struct rt_mutex *lock, bool (*slowfn)(struct rt_mutex *lock, struct wake_q_head *wqh)) { - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) { rt_mutex_deadlock_account_unlock(current); diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 2fa2e2e64950..263e7449282a 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -225,7 +225,7 @@ struct rw_semaphore __sched *rwsem_down_read_failed(struct rw_semaphore *sem) long count, adjustment = -RWSEM_ACTIVE_READ_BIAS; struct rwsem_waiter waiter; struct task_struct *tsk = current; - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); waiter.task = tsk; waiter.type = RWSEM_WAITING_FOR_READ; @@ -461,7 +461,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) bool waiting = true; /* any queued threads before us */ struct rwsem_waiter waiter; struct rw_semaphore *ret = sem; - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); /* undo write bias from down_write operation, stop active locking */ count = atomic_long_sub_return(RWSEM_ACTIVE_WRITE_BIAS, &sem->count); @@ -495,7 +495,7 @@ __rwsem_down_write_failed_common(struct rw_semaphore *sem, int state) * wake any read locks that were queued ahead of us. */ if (count > RWSEM_WAITING_BIAS) { - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); __rwsem_mark_wake(sem, RWSEM_WAKE_READERS, &wake_q); /* @@ -571,7 +571,7 @@ __visible struct rw_semaphore *rwsem_wake(struct rw_semaphore *sem) { unsigned long flags; - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); /* * If a spinner is present, it is not necessary to do the wakeup. @@ -625,7 +625,7 @@ __visible struct rw_semaphore *rwsem_downgrade_wake(struct rw_semaphore *sem) { unsigned long flags; - WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_q); raw_spin_lock_irqsave(&sem->wait_lock, flags); -- cgit v1.2.3 From 5aff60a191e579ae00ae5ca6ce16c13b687bc8a3 Mon Sep 17 00:00:00 2001 From: Pan Xinhui Date: Wed, 2 Nov 2016 05:08:29 -0400 Subject: locking/osq: Break out of spin-wait busy waiting loop for a preempted vCPU in osq_lock() An over-committed guest with more vCPUs than pCPUs has a heavy overload in osq_lock(). This is because if vCPU-A holds the osq lock and yields out, vCPU-B ends up waiting for per_cpu node->locked to be set. IOW, vCPU-B waits for vCPU-A to run and unlock the osq lock. Use the new vcpu_is_preempted(cpu) interface to detect if a vCPU is currently running or not, and break out of the spin-loop if so. test case: $ perf record -a perf bench sched messaging -g 400 -p && perf report before patch: 18.09% sched-messaging [kernel.vmlinux] [k] osq_lock 12.28% sched-messaging [kernel.vmlinux] [k] rwsem_spin_on_owner 5.27% sched-messaging [kernel.vmlinux] [k] mutex_unlock 3.89% sched-messaging [kernel.vmlinux] [k] wait_consider_task 3.64% sched-messaging [kernel.vmlinux] [k] _raw_write_lock_irq 3.41% sched-messaging [kernel.vmlinux] [k] mutex_spin_on_owner.is 2.49% sched-messaging [kernel.vmlinux] [k] system_call after patch: 20.68% sched-messaging [kernel.vmlinux] [k] mutex_spin_on_owner 8.45% sched-messaging [kernel.vmlinux] [k] mutex_unlock 4.12% sched-messaging [kernel.vmlinux] [k] system_call 3.01% sched-messaging [kernel.vmlinux] [k] system_call_common 2.83% sched-messaging [kernel.vmlinux] [k] copypage_power7 2.64% sched-messaging [kernel.vmlinux] [k] rwsem_spin_on_owner 2.00% sched-messaging [kernel.vmlinux] [k] osq_lock Suggested-by: Boqun Feng Tested-by: Juergen Gross Signed-off-by: Pan Xinhui Signed-off-by: Peter Zijlstra (Intel) Acked-by: Christian Borntraeger Acked-by: Paolo Bonzini Cc: David.Laight@ACULAB.COM Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: benh@kernel.crashing.org Cc: bsingharora@gmail.com Cc: dave@stgolabs.net Cc: kernellwp@gmail.com Cc: konrad.wilk@oracle.com Cc: linuxppc-dev@lists.ozlabs.org Cc: mpe@ellerman.id.au Cc: paulmck@linux.vnet.ibm.com Cc: paulus@samba.org Cc: rkrcmar@redhat.com Cc: virtualization@lists.linux-foundation.org Cc: will.deacon@arm.com Cc: xen-devel-request@lists.xenproject.org Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1478077718-37424-3-git-send-email-xinhui.pan@linux.vnet.ibm.com [ Translated to English. ] Signed-off-by: Ingo Molnar --- kernel/locking/osq_lock.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'kernel') diff --git a/kernel/locking/osq_lock.c b/kernel/locking/osq_lock.c index 4ea2710b9d6c..a3167941093b 100644 --- a/kernel/locking/osq_lock.c +++ b/kernel/locking/osq_lock.c @@ -21,6 +21,11 @@ static inline int encode_cpu(int cpu_nr) return cpu_nr + 1; } +static inline int node_cpu(struct optimistic_spin_node *node) +{ + return node->cpu - 1; +} + static inline struct optimistic_spin_node *decode_cpu(int encoded_cpu_val) { int cpu_nr = encoded_cpu_val - 1; @@ -118,8 +123,10 @@ bool osq_lock(struct optimistic_spin_queue *lock) while (!READ_ONCE(node->locked)) { /* * If we need to reschedule bail... so we can block. + * Use vcpu_is_preempted() to avoid waiting for a preempted + * lock holder: */ - if (need_resched()) + if (need_resched() || vcpu_is_preempted(node_cpu(node->prev))) goto unqueue; cpu_relax(); -- cgit v1.2.3 From 05ffc951392df57edecc2519327b169210c3df75 Mon Sep 17 00:00:00 2001 From: Pan Xinhui Date: Wed, 2 Nov 2016 05:08:30 -0400 Subject: locking/mutex: Break out of expensive busy-loop on {mutex,rwsem}_spin_on_owner() when owner vCPU is preempted An over-committed guest with more vCPUs than pCPUs has a heavy overload in the two spin_on_owner. This blames on the lock holder preemption issue. Break out of the loop if the vCPU is preempted: if vcpu_is_preempted(cpu) is true. test-case: perf record -a perf bench sched messaging -g 400 -p && perf report before patch: 20.68% sched-messaging [kernel.vmlinux] [k] mutex_spin_on_owner 8.45% sched-messaging [kernel.vmlinux] [k] mutex_unlock 4.12% sched-messaging [kernel.vmlinux] [k] system_call 3.01% sched-messaging [kernel.vmlinux] [k] system_call_common 2.83% sched-messaging [kernel.vmlinux] [k] copypage_power7 2.64% sched-messaging [kernel.vmlinux] [k] rwsem_spin_on_owner 2.00% sched-messaging [kernel.vmlinux] [k] osq_lock after patch: 9.99% sched-messaging [kernel.vmlinux] [k] mutex_unlock 5.28% sched-messaging [unknown] [H] 0xc0000000000768e0 4.27% sched-messaging [kernel.vmlinux] [k] __copy_tofrom_user_power7 3.77% sched-messaging [kernel.vmlinux] [k] copypage_power7 3.24% sched-messaging [kernel.vmlinux] [k] _raw_write_lock_irq 3.02% sched-messaging [kernel.vmlinux] [k] system_call 2.69% sched-messaging [kernel.vmlinux] [k] wait_consider_task Tested-by: Juergen Gross Signed-off-by: Pan Xinhui Signed-off-by: Peter Zijlstra (Intel) Acked-by: Christian Borntraeger Acked-by: Paolo Bonzini Cc: David.Laight@ACULAB.COM Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: benh@kernel.crashing.org Cc: boqun.feng@gmail.com Cc: bsingharora@gmail.com Cc: dave@stgolabs.net Cc: kernellwp@gmail.com Cc: konrad.wilk@oracle.com Cc: linuxppc-dev@lists.ozlabs.org Cc: mpe@ellerman.id.au Cc: paulmck@linux.vnet.ibm.com Cc: paulus@samba.org Cc: rkrcmar@redhat.com Cc: virtualization@lists.linux-foundation.org Cc: will.deacon@arm.com Cc: xen-devel-request@lists.xenproject.org Cc: xen-devel@lists.xenproject.org Link: http://lkml.kernel.org/r/1478077718-37424-4-git-send-email-xinhui.pan@linux.vnet.ibm.com Signed-off-by: Ingo Molnar --- kernel/locking/mutex.c | 13 +++++++++++-- kernel/locking/rwsem-xadd.c | 14 +++++++++++--- 2 files changed, 22 insertions(+), 5 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/mutex.c b/kernel/locking/mutex.c index c0731685603f..9b349619f431 100644 --- a/kernel/locking/mutex.c +++ b/kernel/locking/mutex.c @@ -364,7 +364,11 @@ bool mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner) */ barrier(); - if (!owner->on_cpu || need_resched()) { + /* + * Use vcpu_is_preempted to detect lock holder preemption issue. + */ + if (!owner->on_cpu || need_resched() || + vcpu_is_preempted(task_cpu(owner))) { ret = false; break; } @@ -389,8 +393,13 @@ static inline int mutex_can_spin_on_owner(struct mutex *lock) rcu_read_lock(); owner = __mutex_owner(lock); + + /* + * As lock holder preemption issue, we both skip spinning if task is not + * on cpu or its cpu is preempted + */ if (owner) - retval = owner->on_cpu; + retval = owner->on_cpu && !vcpu_is_preempted(task_cpu(owner)); rcu_read_unlock(); /* diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index 263e7449282a..631506004f9e 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -336,7 +336,11 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) goto done; } - ret = owner->on_cpu; + /* + * As lock holder preemption issue, we both skip spinning if task is not + * on cpu or its cpu is preempted + */ + ret = owner->on_cpu && !vcpu_is_preempted(task_cpu(owner)); done: rcu_read_unlock(); return ret; @@ -362,8 +366,12 @@ static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem) */ barrier(); - /* abort spinning when need_resched or owner is not running */ - if (!owner->on_cpu || need_resched()) { + /* + * abort spinning when need_resched or owner is not running or + * owner's cpu is preempted. + */ + if (!owner->on_cpu || need_resched() || + vcpu_is_preempted(task_cpu(owner))) { rcu_read_unlock(); return false; } -- cgit v1.2.3 From b5016e8203003c44264ec88fe2276ff54a51f689 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 30 Nov 2016 21:04:44 +0000 Subject: locking/rtmutex: Get rid of RT_MUTEX_OWNER_MASKALL This is a left over from the original rtmutex implementation which used both bit0 and bit1 in the owner pointer. Commit: 8161239a8bcc ("rtmutex: Simplify PI algorithm and make highest prio task get lock") ... removed the usage of bit1, but kept the extra mask around. This is confusing at best. Remove it and just use RT_MUTEX_HAS_WAITERS for the masking. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: David Daney Cc: Linus Torvalds Cc: Mark Rutland Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Steven Rostedt Cc: Will Deacon Link: http://lkml.kernel.org/r/20161130210030.509567906@linutronix.de Signed-off-by: Ingo Molnar --- kernel/locking/rtmutex_common.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index e317e1cbb3eb..990134617b4c 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -71,13 +71,12 @@ task_top_pi_waiter(struct task_struct *p) * lock->owner state tracking: */ #define RT_MUTEX_HAS_WAITERS 1UL -#define RT_MUTEX_OWNER_MASKALL 1UL static inline struct task_struct *rt_mutex_owner(struct rt_mutex *lock) { unsigned long owner = (unsigned long) READ_ONCE(lock->owner); - return (struct task_struct *) (owner & ~RT_MUTEX_OWNER_MASKALL); + return (struct task_struct *) (owner & ~RT_MUTEX_HAS_WAITERS); } /* -- cgit v1.2.3 From 84d82ec5b9046ecdf16031d3e93a66ef50257402 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 30 Nov 2016 21:04:45 +0000 Subject: locking/rtmutex: Explain locking rules for rt_mutex_proxy_unlock()/init_proxy_locked() While debugging the unlock vs. dequeue race which resulted in state corruption of futexes the lockless nature of rt_mutex_proxy_unlock() caused some confusion. Add commentry to explain why it is safe to do this lockless. Add matching comments to rt_mutex_init_proxy_locked() for completeness sake. Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra (Intel) Cc: David Daney Cc: Linus Torvalds Cc: Mark Rutland Cc: Peter Zijlstra Cc: Sebastian Siewior Cc: Steven Rostedt Cc: Will Deacon Link: http://lkml.kernel.org/r/20161130210030.591941927@linutronix.de Signed-off-by: Ingo Molnar --- kernel/locking/rtmutex.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) (limited to 'kernel') diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 6e6cab7ac12f..2f443ed2320a 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -1619,11 +1619,15 @@ EXPORT_SYMBOL_GPL(__rt_mutex_init); * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a * proxy owner * - * @lock: the rt_mutex to be locked + * @lock: the rt_mutex to be locked * @proxy_owner:the task to set as owner * * No locking. Caller has to do serializing itself - * Special API call for PI-futex support + * + * Special API call for PI-futex support. This initializes the rtmutex and + * assigns it to @proxy_owner. Concurrent operations on the rtmutex are not + * possible at this point because the pi_state which contains the rtmutex + * is not yet visible to other tasks. */ void rt_mutex_init_proxy_locked(struct rt_mutex *lock, struct task_struct *proxy_owner) @@ -1637,10 +1641,14 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock, /** * rt_mutex_proxy_unlock - release a lock on behalf of owner * - * @lock: the rt_mutex to be locked + * @lock: the rt_mutex to be locked * * No locking. Caller has to do serializing itself - * Special API call for PI-futex support + * + * Special API call for PI-futex support. This merrily cleans up the rtmutex + * (debugging) state. Concurrent operations on this rt_mutex are not + * possible because it belongs to the pi_state which is about to be freed + * and it is not longer visible to other tasks. */ void rt_mutex_proxy_unlock(struct rt_mutex *lock, struct task_struct *proxy_owner) -- cgit v1.2.3