summary refs log tree commit diff stats
path: root/kernel/locking/qspinlock_paravirt.h
diff options
context:
space:
mode:
Diffstat (limited to 'kernel/locking/qspinlock_paravirt.h')
-rw-r--r-- kernel/locking/qspinlock_paravirt.h 102
1 files changed, 73 insertions, 29 deletions
diff --git a/kernel/locking/qspinlock_paravirt.h b/kernel/locking/qspinlock_paravirt.h
index df19ae4debd0..c8e6e9a596f5 100644
--- a/kernel/locking/qspinlock_paravirt.h
+++ b/kernel/locking/qspinlock_paravirt.h
@@ -22,9 +22,14 @@
#define _Q_SLOW_VAL (3U << _Q_LOCKED_OFFSET)
+/*
+ * Queue node uses: vcpu_running & vcpu_halted.
+ * Queue head uses: vcpu_running & vcpu_hashed.
+ */
enum vcpu_state {
vcpu_running = 0,
- vcpu_halted,
+ vcpu_halted, /* Used only in pv_wait_node */
+ vcpu_hashed, /* = pv_hash'ed + vcpu_halted */
};
struct pv_node {
@@ -153,7 +158,8 @@ static void pv_init_node(struct mcs_spinlock *node)
/*
* Wait for node->locked to become true, halt the vcpu after a short spin.
- * pv_kick_node() is used to wake the vcpu again.
+ * pv_kick_node() is used to set _Q_SLOW_VAL and fill in the hash table on
+ * behalf of the halted vCPU.
*/
static void pv_wait_node(struct mcs_spinlock *node)
{
@@ -172,9 +178,9 @@ static void pv_wait_node(struct mcs_spinlock *node)
*
* [S] pn->state = vcpu_halted [S] next->locked = 1
* MB MB
- * [L] pn->locked [RmW] pn->state = vcpu_running
+ * [L] pn->locked [RmW] pn->state = vcpu_hashed
*
- * Matches the xchg() from pv_kick_node().
+ * Matches the cmpxchg() from pv_kick_node().
*/
smp_store_mb(pn->state, vcpu_halted);
@@ -182,9 +188,10 @@ static void pv_wait_node(struct mcs_spinlock *node)
pv_wait(&pn->state, vcpu_halted);
/*
- * Reset the vCPU state to avoid unncessary CPU kicking
+ * If pv_kick_node() changed us to vcpu_hashed, retain that value
+ * so that pv_wait_head() knows to not also try to hash this lock.
*/
- WRITE_ONCE(pn->state, vcpu_running);
+ cmpxchg(&pn->state, vcpu_halted, vcpu_running);
/*
* If the locked flag is still not set after wakeup, it is a
@@ -194,6 +201,7 @@ static void pv_wait_node(struct mcs_spinlock *node)
* MCS lock will be released soon.
*/
}
+
/*
* By now our node->locked should be 1 and our caller will not actually
* spin-wait for it. We do however rely on our caller to do a
@@ -202,24 +210,35 @@ static void pv_wait_node(struct mcs_spinlock *node)
}
/*
- * Called after setting next->locked = 1, used to wake those stuck in
- * pv_wait_node().
+ * Called after setting next->locked = 1 when we're the lock owner.
+ *
+ * Instead of waking the waiters stuck in pv_wait_node(), advance their state
+ * such that they're waiting in pv_wait_head(); this avoids a wake/sleep cycle.
*/
-static void pv_kick_node(struct mcs_spinlock *node)
+static void pv_kick_node(struct qspinlock *lock, struct mcs_spinlock *node)
{
struct pv_node *pn = (struct pv_node *)node;
+ struct __qspinlock *l = (void *)lock;
/*
- * Note that because node->locked is already set, this actual
- * mcs_spinlock entry could be re-used already.
+ * If the vCPU is indeed halted, advance its state to match that of
+ * pv_wait_head(). If OTOH this fails, the vCPU was running and will
+ * observe its next->locked value and advance itself.
*
- * This should be fine however, kicking people for no reason is
- * harmless.
+ * Matches with smp_store_mb() and cmpxchg() in pv_wait_node()
+ */
+ if (cmpxchg(&pn->state, vcpu_halted, vcpu_hashed) != vcpu_halted)
+ return;
+
+ /*
+ * Put the lock into the hash table and set the _Q_SLOW_VAL.
*
- * See the comment in pv_wait_node().
+ * As this is the same vCPU that will check the _Q_SLOW_VAL value and
+ * the hash table later on at unlock time, no atomic instruction is
+ * needed.
*/
- if (xchg(&pn->state, vcpu_running) == vcpu_halted)
- pv_kick(pn->cpu);
+ WRITE_ONCE(l->locked, _Q_SLOW_VAL);
+ (void)pv_hash(lock, pn);
}
/*
@@ -233,6 +252,13 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
struct qspinlock **lp = NULL;
int loop;
+ /*
+ * If pv_kick_node() already advanced our state, we don't need to
+ * insert ourselves into the hash table anymore.
+ */
+ if (READ_ONCE(pn->state) == vcpu_hashed)
+ lp = (struct qspinlock **)1;
+
for (;;) {
for (loop = SPIN_THRESHOLD; loop; loop--) {
if (!READ_ONCE(l->locked))
@@ -240,17 +266,22 @@ static void pv_wait_head(struct qspinlock *lock, struct mcs_spinlock *node)
cpu_relax();
}
- WRITE_ONCE(pn->state, vcpu_halted);
if (!lp) { /* ONCE */
+ WRITE_ONCE(pn->state, vcpu_hashed);
lp = pv_hash(lock, pn);
+
/*
- * lp must be set before setting _Q_SLOW_VAL
+ * We must hash before setting _Q_SLOW_VAL, such that
+ * when we observe _Q_SLOW_VAL in __pv_queued_spin_unlock()
+ * we'll be sure to be able to observe our hash entry.
*
- * [S] lp = lock [RmW] l = l->locked = 0
- * MB MB
- * [S] l->locked = _Q_SLOW_VAL [L] lp
+ * [S] pn->state
+ * [S] <hash> [RmW] l->locked == _Q_SLOW_VAL
+ * MB RMB
+ * [RmW] l->locked = _Q_SLOW_VAL [L] <unhash>
+ * [L] pn->state
*
- * Matches the cmpxchg() in __pv_queued_spin_unlock().
+ * Matches the smp_rmb() in __pv_queued_spin_unlock().
*/
if (!cmpxchg(&l->locked, _Q_LOCKED_VAL, _Q_SLOW_VAL)) {
/*
@@ -287,24 +318,34 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
{
struct __qspinlock *l = (void *)lock;
struct pv_node *node;
- u8 lockval = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
+ u8 locked;
/*
* We must not unlock if SLOW, because in that case we must first
* unhash. Otherwise it would be possible to have multiple @lock
* entries, which would be BAD.
*/
- if (likely(lockval == _Q_LOCKED_VAL))
+ locked = cmpxchg(&l->locked, _Q_LOCKED_VAL, 0);
+ if (likely(locked == _Q_LOCKED_VAL))
return;
- if (unlikely(lockval != _Q_SLOW_VAL)) {
- if (debug_locks_silent)
- return;
- WARN(1, "pvqspinlock: lock %p has corrupted value 0x%x!\n", lock, atomic_read(&lock->val));
+ if (unlikely(locked != _Q_SLOW_VAL)) {
+ WARN(!debug_locks_silent,
+ "pvqspinlock: lock 0x%lx has corrupted value 0x%x!\n",
+ (unsigned long)lock, atomic_read(&lock->val));
return;
}
/*
+ * A failed cmpxchg doesn't provide any memory-ordering guarantees,
+ * so we need a barrier to order the read of the node data in
+ * pv_unhash *after* we've read the lock being _Q_SLOW_VAL.
+ *
+ * Matches the cmpxchg() in pv_wait_head() setting _Q_SLOW_VAL.
+ */
+ smp_rmb();
+
+ /*
* Since the above failed to release, this must be the SLOW path.
* Therefore start by looking up the blocked node and unhashing it.
*/
@@ -319,8 +360,11 @@ __visible void __pv_queued_spin_unlock(struct qspinlock *lock)
/*
* At this point the memory pointed at by lock can be freed/reused,
* however we can still use the pv_node to kick the CPU.
+ * The other vCPU may not really be halted, but kicking an active
+ * vCPU is harmless other than the additional latency in completing
+ * the unlock.
*/
- if (READ_ONCE(node->state) == vcpu_halted)
+ if (READ_ONCE(node->state) == vcpu_hashed)
pv_kick(node->cpu);
}
/*