Diffstat (limited to 'kernel/futex.c')
 -rw-r--r--  kernel/futex.c  293
 1 file changed, 192 insertions(+), 101 deletions(-)
diff --git a/kernel/futex.c b/kernel/futex.c
index 684d7549825a..a5d2e74c89e0 100644
--- a/kernel/futex.c
+++ b/kernel/futex.c
@@ -124,16 +124,16 @@
* futex_wait(futex, val);
*
* waiters++; (a)
- * mb(); (A) <-- paired with -.
- * |
- * lock(hash_bucket(futex)); |
- * |
- * uval = *futex; |
- * | *futex = newval;
- * | sys_futex(WAKE, futex);
- * | futex_wake(futex);
- * |
- * `-------> mb(); (B)
+ * smp_mb(); (A) <-- paired with -.
+ * |
+ * lock(hash_bucket(futex)); |
+ * |
+ * uval = *futex; |
+ * | *futex = newval;
+ * | sys_futex(WAKE, futex);
+ * | futex_wake(futex);
+ * |
+ * `--------> smp_mb(); (B)
* if (uval == val)
* queue();
* unlock(hash_bucket(futex));
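The pairing of (A) and (B) can be illustrated with a small userspace analogue built on C11 atomics. This is only a sketch of the ordering argument, not kernel code: the names futex_val and waiters are illustrative, and atomic_thread_fence(memory_order_seq_cst) stands in for smp_mb().

	#include <stdatomic.h>
	#include <stdbool.h>

	static atomic_int futex_val;	/* the user space futex word   */
	static atomic_int waiters;	/* per-bucket waiter count     */

	/* Waiter side: waiters++; smp_mb(); (A); then re-check the futex word. */
	static bool waiter_should_queue(int expected)
	{
		atomic_fetch_add_explicit(&waiters, 1, memory_order_relaxed);
		atomic_thread_fence(memory_order_seq_cst);		/* (A) */
		return atomic_load_explicit(&futex_val, memory_order_relaxed) == expected;
	}

	/* Waker side: *futex = newval; smp_mb(); (B); then check for waiters. */
	static bool waker_must_take_bucket_lock(int newval)
	{
		atomic_store_explicit(&futex_val, newval, memory_order_relaxed);
		atomic_thread_fence(memory_order_seq_cst);		/* (B) */
		return atomic_load_explicit(&waiters, memory_order_relaxed) != 0;
	}

With both fences in place at least one side observes the other's store: either the waiter sees newval and does not queue, or the waker sees a non-zero waiter count and takes the hash bucket lock.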
@@ -334,7 +334,7 @@ static inline void futex_get_mm(union futex_key *key)
/*
* Ensure futex_get_mm() implies a full barrier such that
* get_futex_key() implies a full barrier. This is relied upon
- * as full barrier (B), see the ordering comment above.
+ * as smp_mb(); (B), see the ordering comment above.
*/
smp_mb__after_atomic();
}
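For reference, the body of futex_get_mm() around this comment is, roughly, a single atomic increment of the mm reference count followed by the barrier; a sketch under that assumption:

	static inline void futex_get_mm(union futex_key *key)
	{
		atomic_inc(&key->private.mm->mm_count);
		/*
		 * atomic_inc() does not imply a full barrier on all
		 * architectures, so smp_mb__after_atomic() upgrades it to
		 * one; this is what lets get_futex_key() stand in for
		 * smp_mb(); (B) on the private-futex path.
		 */
		smp_mb__after_atomic();
	}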
@@ -407,10 +407,10 @@ static void get_futex_key_refs(union futex_key *key)
switch (key->both.offset & (FUT_OFF_INODE|FUT_OFF_MMSHARED)) {
case FUT_OFF_INODE:
- ihold(key->shared.inode); /* implies MB (B) */
+ ihold(key->shared.inode); /* implies smp_mb(); (B) */
break;
case FUT_OFF_MMSHARED:
- futex_get_mm(key); /* implies MB (B) */
+ futex_get_mm(key); /* implies smp_mb(); (B) */
break;
default:
/*
@@ -418,7 +418,7 @@ static void get_futex_key_refs(union futex_key *key)
* mm, therefore the only purpose of calling get_futex_key_refs
* is because we need the barrier for the lockless waiter check.
*/
- smp_mb(); /* explicit MB (B) */
+ smp_mb(); /* explicit smp_mb(); (B) */
}
}
@@ -469,7 +469,8 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
{
unsigned long address = (unsigned long)uaddr;
struct mm_struct *mm = current->mm;
- struct page *page, *page_head;
+ struct page *page;
+ struct address_space *mapping;
int err, ro = 0;
/*
@@ -496,7 +497,7 @@ get_futex_key(u32 __user *uaddr, int fshared, union futex_key *key, int rw)
if (!fshared) {
key->private.mm = mm;
key->private.address = address;
- get_futex_key_refs(key); /* implies MB (B) */
+ get_futex_key_refs(key); /* implies smp_mb(); (B) */
return 0;
}
@@ -519,46 +520,22 @@ again:
else
err = 0;
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
- page_head = page;
- if (unlikely(PageTail(page))) {
- put_page(page);
- /* serialize against __split_huge_page_splitting() */
- local_irq_disable();
- if (likely(__get_user_pages_fast(address, 1, !ro, &page) == 1)) {
- page_head = compound_head(page);
- /*
- * page_head is valid pointer but we must pin
- * it before taking the PG_lock and/or
- * PG_compound_lock. The moment we re-enable
- * irqs __split_huge_page_splitting() can
- * return and the head page can be freed from
- * under us. We can't take the PG_lock and/or
- * PG_compound_lock on a page that could be
- * freed from under us.
- */
- if (page != page_head) {
- get_page(page_head);
- put_page(page);
- }
- local_irq_enable();
- } else {
- local_irq_enable();
- goto again;
- }
- }
-#else
- page_head = compound_head(page);
- if (page != page_head) {
- get_page(page_head);
- put_page(page);
- }
-#endif
-
- lock_page(page_head);
+ /*
+ * The treatment of mapping from this point on is critical. The page
+ * lock protects many things but in this context the page lock
+ * stabilizes mapping, prevents inode freeing in the shared
+ * file-backed region case and guards against movement to swap cache.
+ *
+ * Strictly speaking the page lock is not needed in all cases being
+ * considered here and the page lock forces unnecessary serialization.
+ * From this point on, mapping will be re-verified if necessary and
+ * the page lock will be acquired only if it is unavoidable.
+ */
+ page = compound_head(page);
+ mapping = READ_ONCE(page->mapping);
/*
- * If page_head->mapping is NULL, then it cannot be a PageAnon
+ * If page->mapping is NULL, then it cannot be a PageAnon
* page; but it might be the ZERO_PAGE or in the gate area or
* in a special mapping (all cases which we are happy to fail);
* or it may have been a good file page when get_user_pages_fast
@@ -570,25 +547,38 @@ again:
*
* The case we do have to guard against is when memory pressure made
* shmem_writepage move it from filecache to swapcache beneath us:
- * an unlikely race, but we do need to retry for page_head->mapping.
+ * an unlikely race, but we do need to retry for page->mapping.
*/
- if (!page_head->mapping) {
- int shmem_swizzled = PageSwapCache(page_head);
- unlock_page(page_head);
- put_page(page_head);
+ if (unlikely(!mapping)) {
+ int shmem_swizzled;
+
+ /*
+ * Page lock is required to identify which special case above
+ * applies. If this is really a shmem page then the page lock
+ * will prevent unexpected transitions.
+ */
+ lock_page(page);
+ shmem_swizzled = PageSwapCache(page) || page->mapping;
+ unlock_page(page);
+ put_page(page);
+
if (shmem_swizzled)
goto again;
+
return -EFAULT;
}
/*
* Private mappings are handled in a simple way.
*
+ * If the futex key is stored on an anonymous page, then the associated
+ * object is the mm which is implicitly pinned by the calling process.
+ *
* NOTE: When userspace waits on a MAP_SHARED mapping, even if
* it's a read-only handle, it's expected that futexes attach to
* the object not the particular process.
*/
- if (PageAnon(page_head)) {
+ if (PageAnon(page)) {
/*
* A RO anonymous page will never change and thus doesn't make
* sense for futex operations.
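Taken together with the previous hunk, the new key derivation amounts to: snapshot the mapping without the page lock, then pick the key flavour from the page type. A condensed sketch of that decision (the NULL-mapping/shmem-swizzle retry and the RCU re-validation are shown in the surrounding hunks, not here):

	page = compound_head(page);
	mapping = READ_ONCE(page->mapping);	/* snapshot, no lock_page() yet */

	if (PageAnon(page)) {
		/* Private futex: the key is (mm, address) and the calling
		 * process implicitly pins the mm. */
		key->both.offset |= FUT_OFF_MMSHARED;
		key->private.mm = mm;
		key->private.address = address;
	} else {
		/* Shared futex: the key is (inode, pgoff); the inode must be
		 * pinned explicitly, which the next hunk does under RCU. */
		key->both.offset |= FUT_OFF_INODE;
		key->shared.inode = inode;
		key->shared.pgoff = basepage_index(page);
	}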
@@ -601,17 +591,75 @@ again:
key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */
key->private.mm = mm;
key->private.address = address;
+
+ get_futex_key_refs(key); /* implies smp_mb(); (B) */
+
} else {
+ struct inode *inode;
+
+ /*
+ * The associated futex object in this case is the inode and
+ * the page->mapping must be traversed. Ordinarily this should
+ * be stabilised under page lock but it's not strictly
+ * necessary in this case as we just want to pin the inode, not
+ * update the radix tree or anything like that.
+ *
+ * The RCU read lock is taken as the inode is finally freed
+ * under RCU. If the mapping still matches expectations then the
+ * mapping->host can be safely accessed as being a valid inode.
+ */
+ rcu_read_lock();
+
+ if (READ_ONCE(page->mapping) != mapping) {
+ rcu_read_unlock();
+ put_page(page);
+
+ goto again;
+ }
+
+ inode = READ_ONCE(mapping->host);
+ if (!inode) {
+ rcu_read_unlock();
+ put_page(page);
+
+ goto again;
+ }
+
+ /*
+ * Take a reference unless it is about to be freed. Previously
+ * this reference was taken by ihold under the page lock
+ * pinning the inode in place so i_lock was unnecessary. The
+ * only way for this check to fail is if the inode was
+ * truncated in parallel so warn for now if this happens.
+ *
+ * We are not calling into get_futex_key_refs() in file-backed
+ * cases, therefore a successful atomic_inc return below will
+ * guarantee that get_futex_key() will still imply smp_mb(); (B).
+ */
+ if (WARN_ON_ONCE(!atomic_inc_not_zero(&inode->i_count))) {
+ rcu_read_unlock();
+ put_page(page);
+
+ goto again;
+ }
+
+ /* Should be impossible but let's be paranoid for now */
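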
+ if (WARN_ON_ONCE(inode->i_mapping != mapping)) {
+ err = -EFAULT;
+ rcu_read_unlock();
+ iput(inode);
+
+ goto out;
+ }
+
key->both.offset |= FUT_OFF_INODE; /* inode-based key */
- key->shared.inode = page_head->mapping->host;
+ key->shared.inode = inode;
key->shared.pgoff = basepage_index(page);
+ rcu_read_unlock();
}
- get_futex_key_refs(key); /* implies MB (B) */
-
out:
- unlock_page(page_head);
- put_page(page_head);
+ put_page(page);
return err;
}
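The inode pinning above is an instance of the usual RCU pattern: the inode is freed under RCU, so it may be dereferenced inside rcu_read_lock(), and a real reference is then taken with an inc-not-zero that fails if the object is already being torn down. Condensed from the hunk (the put_page() and WARN_ON_ONCE details are omitted):

	rcu_read_lock();
	if (READ_ONCE(page->mapping) != mapping) {	/* snapshot went stale */
		rcu_read_unlock();
		goto again;
	}
	inode = READ_ONCE(mapping->host);
	if (!inode || !atomic_inc_not_zero(&inode->i_count)) {
		rcu_read_unlock();
		goto again;		/* inode gone or about to be freed */
	}
	rcu_read_unlock();
	/* The inode is now pinned and outlives the RCU read-side section. */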
@@ -639,7 +687,7 @@ static int fault_in_user_writeable(u32 __user *uaddr)
down_read(&mm->mmap_sem);
ret = fixup_user_fault(current, mm, (unsigned long)uaddr,
- FAULT_FLAG_WRITE);
+ FAULT_FLAG_WRITE, NULL);
up_read(&mm->mmap_sem);
return ret < 0 ? ret : 0;
@@ -725,9 +773,12 @@ static struct futex_pi_state * alloc_pi_state(void)
}
/*
+ * Drops a reference to the pi_state object and frees or caches it
+ * when the last reference is gone.
+ *
* Must be called with the hb lock held.
*/
-static void free_pi_state(struct futex_pi_state *pi_state)
+static void put_pi_state(struct futex_pi_state *pi_state)
{
if (!pi_state)
return;
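The hunk shows only the prologue of the renamed function; its essential shape is the usual "drop a reference, act only on the last one" put. A sketch of that shape, assuming the refcount field is the atomic_t refcount also used elsewhere in this patch; the owner-list and rt_mutex cleanup done on the final reference are abridged:

	static void put_pi_state(struct futex_pi_state *pi_state)
	{
		if (!pi_state)
			return;

		if (!atomic_dec_and_test(&pi_state->refcount))
			return;		/* other references remain */

		/* Last reference: detach pi_state from its owner, release
		 * the rt_mutex proxy, then free it or park it in the
		 * per-task cache for reuse. */
	}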
@@ -1223,7 +1274,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
if (pi_state->owner != current)
return -EINVAL;
- raw_spin_lock(&pi_state->pi_mutex.wait_lock);
+ raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock);
new_owner = rt_mutex_next_owner(&pi_state->pi_mutex);
/*
@@ -1249,22 +1300,22 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this,
else if (curval != uval)
ret = -EINVAL;
if (ret) {
- raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
return ret;
}
- raw_spin_lock_irq(&pi_state->owner->pi_lock);
+ raw_spin_lock(&pi_state->owner->pi_lock);
WARN_ON(list_empty(&pi_state->list));
list_del_init(&pi_state->list);
- raw_spin_unlock_irq(&pi_state->owner->pi_lock);
+ raw_spin_unlock(&pi_state->owner->pi_lock);
- raw_spin_lock_irq(&new_owner->pi_lock);
+ raw_spin_lock(&new_owner->pi_lock);
WARN_ON(!list_empty(&pi_state->list));
list_add(&pi_state->list, &new_owner->pi_state_list);
pi_state->owner = new_owner;
- raw_spin_unlock_irq(&new_owner->pi_lock);
+ raw_spin_unlock(&new_owner->pi_lock);
- raw_spin_unlock(&pi_state->pi_mutex.wait_lock);
+ raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock);
deboost = rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q);
@@ -1706,31 +1757,35 @@ retry_private:
* exist yet, look it up one more time to ensure we have a
* reference to it. If the lock was taken, ret contains the
* vpid of the top waiter task.
+ * If the lock was not taken, we have pi_state and an initial
+ * refcount on it. In case of an error we have nothing.
*/
if (ret > 0) {
WARN_ON(pi_state);
drop_count++;
task_count++;
/*
- * If we acquired the lock, then the user
- * space value of uaddr2 should be vpid. It
- * cannot be changed by the top waiter as it
- * is blocked on hb2 lock if it tries to do
- * so. If something fiddled with it behind our
- * back the pi state lookup might unearth
- * it. So we rather use the known value than
- * rereading and handing potential crap to
- * lookup_pi_state.
+ * If we acquired the lock, then the user space value
+ * of uaddr2 should be vpid. It cannot be changed by
+ * the top waiter as it is blocked on hb2 lock if it
+ * tries to do so. If something fiddled with it behind
+ * our back the pi state lookup might unearth it. So
+ * we rather use the known value than rereading and
+ * handing potential crap to lookup_pi_state.
+ *
+ * If that call succeeds then we have pi_state and an
+ * initial refcount on it.
*/
ret = lookup_pi_state(ret, hb2, &key2, &pi_state);
}
switch (ret) {
case 0:
+ /* We hold a reference on the pi state. */
break;
+
+ /* If the above failed, then pi_state is NULL */
case -EFAULT:
- free_pi_state(pi_state);
- pi_state = NULL;
double_unlock_hb(hb1, hb2);
hb_waiters_dec(hb2);
put_futex_key(&key2);
@@ -1746,8 +1801,6 @@ retry_private:
* exit to complete.
* - The user space value changed.
*/
- free_pi_state(pi_state);
- pi_state = NULL;
double_unlock_hb(hb1, hb2);
hb_waiters_dec(hb2);
put_futex_key(&key2);
@@ -1801,30 +1854,58 @@ retry_private:
* of requeue_pi if we couldn't acquire the lock atomically.
*/
if (requeue_pi) {
- /* Prepare the waiter to take the rt_mutex. */
+ /*
+ * Prepare the waiter to take the rt_mutex. Take a
+ * refcount on the pi_state and store the pointer in
+ * the futex_q object of the waiter.
+ */
atomic_inc(&pi_state->refcount);
this->pi_state = pi_state;
ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex,
this->rt_waiter,
this->task);
if (ret == 1) {
- /* We got the lock. */
+ /*
+ * We got the lock. We do neither drop the
+ * refcount on pi_state nor clear
+ * this->pi_state because the waiter needs the
+ * pi_state for cleaning up the user space
+ * value. It will drop the refcount after
+ * doing so.
+ */
requeue_pi_wake_futex(this, &key2, hb2);
drop_count++;
continue;
} else if (ret) {
- /* -EDEADLK */
+ /*
+ * rt_mutex_start_proxy_lock() detected a
+ * potential deadlock when we tried to queue
+ * that waiter. Drop the pi_state reference
+ * which we took above and remove the pointer
+ * to the state from the waiters futex_q
+ * object.
+ */
this->pi_state = NULL;
- free_pi_state(pi_state);
- goto out_unlock;
+ put_pi_state(pi_state);
+ /*
+ * We stop queueing more waiters and let user
+ * space deal with the mess.
+ */
+ break;
}
}
requeue_futex(this, hb1, hb2, &key2);
drop_count++;
}
+ /*
+ * We took an extra initial reference to the pi_state either
+ * in futex_proxy_trylock_atomic() or in lookup_pi_state(). We
+ * need to drop it here again.
+ */
+ put_pi_state(pi_state);
+
out_unlock:
- free_pi_state(pi_state);
double_unlock_hb(hb1, hb2);
wake_up_q(&wake_q);
hb_waiters_dec(hb2);
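The comments added in this hunk describe a reference-count discipline for pi_state across the requeue loop: one initial reference from futex_proxy_trylock_atomic() or lookup_pi_state(), one extra reference per requeued waiter stored in this->pi_state, and the initial reference dropped once the loop is done. A sketch of that flow only; locking, requeue work and error paths are elided:

	ret = lookup_pi_state(ret, hb2, &key2, &pi_state);	/* +1: initial reference */

	plist_for_each_entry_safe(this, next, &hb1->chain, list) {
		if (requeue_pi) {
			atomic_inc(&pi_state->refcount);	/* +1: held via this->pi_state */
			this->pi_state = pi_state;
			/* dropped later by the waiter itself, or immediately by
			 * put_pi_state() on the rt_mutex_start_proxy_lock()
			 * error path above */
		}
	}

	put_pi_state(pi_state);					/* -1: drop the initial reference */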
@@ -1866,7 +1947,7 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q)
q->lock_ptr = &hb->lock;
- spin_lock(&hb->lock); /* implies MB (A) */
+ spin_lock(&hb->lock); /* implies smp_mb(); (A) */
return hb;
}
@@ -1929,8 +2010,12 @@ static int unqueue_me(struct futex_q *q)
/* In the common case we don't take the spinlock, which is nice. */
retry:
- lock_ptr = q->lock_ptr;
- barrier();
+ /*
+ * q->lock_ptr can change between this read and the following spin_lock.
+ * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and
+ * optimizing lock_ptr out of the logic below.
+ */
+ lock_ptr = READ_ONCE(q->lock_ptr);
if (lock_ptr != NULL) {
spin_lock(lock_ptr);
/*
@@ -1973,7 +2058,7 @@ static void unqueue_me_pi(struct futex_q *q)
__unqueue_futex(q);
BUG_ON(!q->pi_state);
- free_pi_state(q->pi_state);
+ put_pi_state(q->pi_state);
q->pi_state = NULL;
spin_unlock(q->lock_ptr);
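The READ_ONCE() change in unqueue_me() above prevents the compiler from reloading q->lock_ptr after the NULL check; a concurrent requeue can switch the pointer between buckets, and the wakeup path can clear it, so the pointer must be read once and then re-validated under the lock. A sketch of that shape, matching the retry loop in unqueue_me():

	retry:
		lock_ptr = READ_ONCE(q->lock_ptr);	/* single, stable read */
		if (lock_ptr != NULL) {
			spin_lock(lock_ptr);
			/* The futex_q may have been requeued to another bucket
			 * between the read and the lock: re-check and restart. */
			if (unlikely(lock_ptr != q->lock_ptr)) {
				spin_unlock(lock_ptr);
				goto retry;
			}
			/* lock_ptr is now stable: safe to unqueue q here. */
			spin_unlock(lock_ptr);
		}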
@@ -2129,11 +2214,11 @@ static int fixup_owner(u32 __user *uaddr, struct futex_q *q, int locked)
* we returned due to timeout or signal without taking the
* rt_mutex. Too late.
*/
- raw_spin_lock(&q->pi_state->pi_mutex.wait_lock);
+ raw_spin_lock_irq(&q->pi_state->pi_mutex.wait_lock);
owner = rt_mutex_owner(&q->pi_state->pi_mutex);
if (!owner)
owner = rt_mutex_next_owner(&q->pi_state->pi_mutex);
- raw_spin_unlock(&q->pi_state->pi_mutex.wait_lock);
+ raw_spin_unlock_irq(&q->pi_state->pi_mutex.wait_lock);
ret = fixup_pi_state_owner(uaddr, q, owner);
goto out;
}
@@ -2755,6 +2840,11 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags,
if (q.pi_state && (q.pi_state->owner != current)) {
spin_lock(q.lock_ptr);
ret = fixup_pi_state_owner(uaddr2, &q, current);
+ /*
+ * Drop the reference to the pi state which
+ * the requeue_pi() code acquired for us.
+ */
+ put_pi_state(q.pi_state);
spin_unlock(q.lock_ptr);
}
} else {
@@ -2881,7 +2971,7 @@ SYSCALL_DEFINE3(get_robust_list, int, pid,
}
ret = -EPERM;
- if (!ptrace_may_access(p, PTRACE_MODE_READ))
+ if (!ptrace_may_access(p, PTRACE_MODE_READ_REALCREDS))
goto err_unlock;
head = p->robust_list;
@@ -3046,7 +3136,8 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout,
if (op & FUTEX_CLOCK_REALTIME) {
flags |= FLAGS_CLOCKRT;
- if (cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI)
+ if (cmd != FUTEX_WAIT && cmd != FUTEX_WAIT_BITSET &&
+ cmd != FUTEX_WAIT_REQUEUE_PI)
return -ENOSYS;
}