diff options
Diffstat (limited to 'kernel/futex/core.c')
-rw-r--r-- | kernel/futex/core.c | 1176 |
1 files changed, 1176 insertions, 0 deletions
diff --git a/kernel/futex/core.c b/kernel/futex/core.c new file mode 100644 index 000000000000..25d8a88b32e5 --- /dev/null +++ b/kernel/futex/core.c @@ -0,0 +1,1176 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Fast Userspace Mutexes (which I call "Futexes!"). + * (C) Rusty Russell, IBM 2002 + * + * Generalized futexes, futex requeueing, misc fixes by Ingo Molnar + * (C) Copyright 2003 Red Hat Inc, All Rights Reserved + * + * Removed page pinning, fix privately mapped COW pages and other cleanups + * (C) Copyright 2003, 2004 Jamie Lokier + * + * Robust futex support started by Ingo Molnar + * (C) Copyright 2006 Red Hat Inc, All Rights Reserved + * Thanks to Thomas Gleixner for suggestions, analysis and fixes. + * + * PI-futex support started by Ingo Molnar and Thomas Gleixner + * Copyright (C) 2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + * Copyright (C) 2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> + * + * PRIVATE futexes by Eric Dumazet + * Copyright (C) 2007 Eric Dumazet <dada1@cosmosbay.com> + * + * Requeue-PI support by Darren Hart <dvhltc@us.ibm.com> + * Copyright (C) IBM Corporation, 2009 + * Thanks to Thomas Gleixner for conceptual design and careful reviews. + * + * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly + * enough at me, Linus for the original (flawed) idea, Matthew + * Kirkwood for proof-of-concept implementation. + * + * "The futexes are also cursed." + * "But they come in a choice of three flavours!" + */ +#include <linux/compat.h> +#include <linux/jhash.h> +#include <linux/pagemap.h> +#include <linux/memblock.h> +#include <linux/fault-inject.h> +#include <linux/slab.h> + +#include "futex.h" +#include "../locking/rtmutex_common.h" + +#ifndef CONFIG_HAVE_FUTEX_CMPXCHG +int __read_mostly futex_cmpxchg_enabled; +#endif + + +/* + * The base of the bucket array and its size are always used together + * (after initialization only in futex_hash()), so ensure that they + * reside in the same cacheline. + */ +static struct { + struct futex_hash_bucket *queues; + unsigned long hashsize; +} __futex_data __read_mostly __aligned(2*sizeof(long)); +#define futex_queues (__futex_data.queues) +#define futex_hashsize (__futex_data.hashsize) + + +/* + * Fault injections for futexes. + */ +#ifdef CONFIG_FAIL_FUTEX + +static struct { + struct fault_attr attr; + + bool ignore_private; +} fail_futex = { + .attr = FAULT_ATTR_INITIALIZER, + .ignore_private = false, +}; + +static int __init setup_fail_futex(char *str) +{ + return setup_fault_attr(&fail_futex.attr, str); +} +__setup("fail_futex=", setup_fail_futex); + +bool should_fail_futex(bool fshared) +{ + if (fail_futex.ignore_private && !fshared) + return false; + + return should_fail(&fail_futex.attr, 1); +} + +#ifdef CONFIG_FAULT_INJECTION_DEBUG_FS + +static int __init fail_futex_debugfs(void) +{ + umode_t mode = S_IFREG | S_IRUSR | S_IWUSR; + struct dentry *dir; + + dir = fault_create_debugfs_attr("fail_futex", NULL, + &fail_futex.attr); + if (IS_ERR(dir)) + return PTR_ERR(dir); + + debugfs_create_bool("ignore-private", mode, dir, + &fail_futex.ignore_private); + return 0; +} + +late_initcall(fail_futex_debugfs); + +#endif /* CONFIG_FAULT_INJECTION_DEBUG_FS */ + +#endif /* CONFIG_FAIL_FUTEX */ + +/** + * futex_hash - Return the hash bucket in the global hash + * @key: Pointer to the futex key for which the hash is calculated + * + * We hash on the keys returned from get_futex_key (see below) and return the + * corresponding hash bucket in the global hash. + */ +struct futex_hash_bucket *futex_hash(union futex_key *key) +{ + u32 hash = jhash2((u32 *)key, offsetof(typeof(*key), both.offset) / 4, + key->both.offset); + + return &futex_queues[hash & (futex_hashsize - 1)]; +} + + +/** + * futex_setup_timer - set up the sleeping hrtimer. + * @time: ptr to the given timeout value + * @timeout: the hrtimer_sleeper structure to be set up + * @flags: futex flags + * @range_ns: optional range in ns + * + * Return: Initialized hrtimer_sleeper structure or NULL if no timeout + * value given + */ +struct hrtimer_sleeper * +futex_setup_timer(ktime_t *time, struct hrtimer_sleeper *timeout, + int flags, u64 range_ns) +{ + if (!time) + return NULL; + + hrtimer_init_sleeper_on_stack(timeout, (flags & FLAGS_CLOCKRT) ? + CLOCK_REALTIME : CLOCK_MONOTONIC, + HRTIMER_MODE_ABS); + /* + * If range_ns is 0, calling hrtimer_set_expires_range_ns() is + * effectively the same as calling hrtimer_set_expires(). + */ + hrtimer_set_expires_range_ns(&timeout->timer, *time, range_ns); + + return timeout; +} + +/* + * Generate a machine wide unique identifier for this inode. + * + * This relies on u64 not wrapping in the life-time of the machine; which with + * 1ns resolution means almost 585 years. + * + * This further relies on the fact that a well formed program will not unmap + * the file while it has a (shared) futex waiting on it. This mapping will have + * a file reference which pins the mount and inode. + * + * If for some reason an inode gets evicted and read back in again, it will get + * a new sequence number and will _NOT_ match, even though it is the exact same + * file. + * + * It is important that futex_match() will never have a false-positive, esp. + * for PI futexes that can mess up the state. The above argues that false-negatives + * are only possible for malformed programs. + */ +static u64 get_inode_sequence_number(struct inode *inode) +{ + static atomic64_t i_seq; + u64 old; + + /* Does the inode already have a sequence number? */ + old = atomic64_read(&inode->i_sequence); + if (likely(old)) + return old; + + for (;;) { + u64 new = atomic64_add_return(1, &i_seq); + if (WARN_ON_ONCE(!new)) + continue; + + old = atomic64_cmpxchg_relaxed(&inode->i_sequence, 0, new); + if (old) + return old; + return new; + } +} + +/** + * get_futex_key() - Get parameters which are the keys for a futex + * @uaddr: virtual address of the futex + * @fshared: false for a PROCESS_PRIVATE futex, true for PROCESS_SHARED + * @key: address where result is stored. + * @rw: mapping needs to be read/write (values: FUTEX_READ, + * FUTEX_WRITE) + * + * Return: a negative error code or 0 + * + * The key words are stored in @key on success. + * + * For shared mappings (when @fshared), the key is: + * + * ( inode->i_sequence, page->index, offset_within_page ) + * + * [ also see get_inode_sequence_number() ] + * + * For private mappings (or when !@fshared), the key is: + * + * ( current->mm, address, 0 ) + * + * This allows (cross process, where applicable) identification of the futex + * without keeping the page pinned for the duration of the FUTEX_WAIT. + * + * lock_page() might sleep, the caller should not hold a spinlock. + */ +int get_futex_key(u32 __user *uaddr, bool fshared, union futex_key *key, + enum futex_access rw) +{ + unsigned long address = (unsigned long)uaddr; + struct mm_struct *mm = current->mm; + struct page *page, *tail; + struct address_space *mapping; + int err, ro = 0; + + /* + * The futex address must be "naturally" aligned. + */ + key->both.offset = address % PAGE_SIZE; + if (unlikely((address % sizeof(u32)) != 0)) + return -EINVAL; + address -= key->both.offset; + + if (unlikely(!access_ok(uaddr, sizeof(u32)))) + return -EFAULT; + + if (unlikely(should_fail_futex(fshared))) + return -EFAULT; + + /* + * PROCESS_PRIVATE futexes are fast. + * As the mm cannot disappear under us and the 'key' only needs + * virtual address, we dont even have to find the underlying vma. + * Note : We do have to check 'uaddr' is a valid user address, + * but access_ok() should be faster than find_vma() + */ + if (!fshared) { + key->private.mm = mm; + key->private.address = address; + return 0; + } + +again: + /* Ignore any VERIFY_READ mapping (futex common case) */ + if (unlikely(should_fail_futex(true))) + return -EFAULT; + + err = get_user_pages_fast(address, 1, FOLL_WRITE, &page); + /* + * If write access is not required (eg. FUTEX_WAIT), try + * and get read-only access. + */ + if (err == -EFAULT && rw == FUTEX_READ) { + err = get_user_pages_fast(address, 1, 0, &page); + ro = 1; + } + if (err < 0) + return err; + else + err = 0; + + /* + * The treatment of mapping from this point on is critical. The page + * lock protects many things but in this context the page lock + * stabilizes mapping, prevents inode freeing in the shared + * file-backed region case and guards against movement to swap cache. + * + * Strictly speaking the page lock is not needed in all cases being + * considered here and page lock forces unnecessarily serialization + * From this point on, mapping will be re-verified if necessary and + * page lock will be acquired only if it is unavoidable + * + * Mapping checks require the head page for any compound page so the + * head page and mapping is looked up now. For anonymous pages, it + * does not matter if the page splits in the future as the key is + * based on the address. For filesystem-backed pages, the tail is + * required as the index of the page determines the key. For + * base pages, there is no tail page and tail == page. + */ + tail = page; + page = compound_head(page); + mapping = READ_ONCE(page->mapping); + + /* + * If page->mapping is NULL, then it cannot be a PageAnon + * page; but it might be the ZERO_PAGE or in the gate area or + * in a special mapping (all cases which we are happy to fail); + * or it may have been a good file page when get_user_pages_fast + * found it, but truncated or holepunched or subjected to + * invalidate_complete_page2 before we got the page lock (also + * cases which we are happy to fail). And we hold a reference, + * so refcount care in invalidate_complete_page's remove_mapping + * prevents drop_caches from setting mapping to NULL beneath us. + * + * The case we do have to guard against is when memory pressure made + * shmem_writepage move it from filecache to swapcache beneath us: + * an unlikely race, but we do need to retry for page->mapping. + */ + if (unlikely(!mapping)) { + int shmem_swizzled; + + /* + * Page lock is required to identify which special case above + * applies. If this is really a shmem page then the page lock + * will prevent unexpected transitions. + */ + lock_page(page); + shmem_swizzled = PageSwapCache(page) || page->mapping; + unlock_page(page); + put_page(page); + + if (shmem_swizzled) + goto again; + + return -EFAULT; + } + + /* + * Private mappings are handled in a simple way. + * + * If the futex key is stored on an anonymous page, then the associated + * object is the mm which is implicitly pinned by the calling process. + * + * NOTE: When userspace waits on a MAP_SHARED mapping, even if + * it's a read-only handle, it's expected that futexes attach to + * the object not the particular process. + */ + if (PageAnon(page)) { + /* + * A RO anonymous page will never change and thus doesn't make + * sense for futex operations. + */ + if (unlikely(should_fail_futex(true)) || ro) { + err = -EFAULT; + goto out; + } + + key->both.offset |= FUT_OFF_MMSHARED; /* ref taken on mm */ + key->private.mm = mm; + key->private.address = address; + + } else { + struct inode *inode; + + /* + * The associated futex object in this case is the inode and + * the page->mapping must be traversed. Ordinarily this should + * be stabilised under page lock but it's not strictly + * necessary in this case as we just want to pin the inode, not + * update the radix tree or anything like that. + * + * The RCU read lock is taken as the inode is finally freed + * under RCU. If the mapping still matches expectations then the + * mapping->host can be safely accessed as being a valid inode. + */ + rcu_read_lock(); + + if (READ_ONCE(page->mapping) != mapping) { + rcu_read_unlock(); + put_page(page); + + goto again; + } + + inode = READ_ONCE(mapping->host); + if (!inode) { + rcu_read_unlock(); + put_page(page); + + goto again; + } + + key->both.offset |= FUT_OFF_INODE; /* inode-based key */ + key->shared.i_seq = get_inode_sequence_number(inode); + key->shared.pgoff = page_to_pgoff(tail); + rcu_read_unlock(); + } + +out: + put_page(page); + return err; +} + +/** + * fault_in_user_writeable() - Fault in user address and verify RW access + * @uaddr: pointer to faulting user space address + * + * Slow path to fixup the fault we just took in the atomic write + * access to @uaddr. + * + * We have no generic implementation of a non-destructive write to the + * user address. We know that we faulted in the atomic pagefault + * disabled section so we can as well avoid the #PF overhead by + * calling get_user_pages() right away. + */ +int fault_in_user_writeable(u32 __user *uaddr) +{ + struct mm_struct *mm = current->mm; + int ret; + + mmap_read_lock(mm); + ret = fixup_user_fault(mm, (unsigned long)uaddr, + FAULT_FLAG_WRITE, NULL); + mmap_read_unlock(mm); + + return ret < 0 ? ret : 0; +} + +/** + * futex_top_waiter() - Return the highest priority waiter on a futex + * @hb: the hash bucket the futex_q's reside in + * @key: the futex key (to distinguish it from other futex futex_q's) + * + * Must be called with the hb lock held. + */ +struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *key) +{ + struct futex_q *this; + + plist_for_each_entry(this, &hb->chain, list) { + if (futex_match(&this->key, key)) + return this; + } + return NULL; +} + +int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval) +{ + int ret; + + pagefault_disable(); + ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval); + pagefault_enable(); + + return ret; +} + +int futex_get_value_locked(u32 *dest, u32 __user *from) +{ + int ret; + + pagefault_disable(); + ret = __get_user(*dest, from); + pagefault_enable(); + + return ret ? -EFAULT : 0; +} + +/** + * wait_for_owner_exiting - Block until the owner has exited + * @ret: owner's current futex lock status + * @exiting: Pointer to the exiting task + * + * Caller must hold a refcount on @exiting. + */ +void wait_for_owner_exiting(int ret, struct task_struct *exiting) +{ + if (ret != -EBUSY) { + WARN_ON_ONCE(exiting); + return; + } + + if (WARN_ON_ONCE(ret == -EBUSY && !exiting)) + return; + + mutex_lock(&exiting->futex_exit_mutex); + /* + * No point in doing state checking here. If the waiter got here + * while the task was in exec()->exec_futex_release() then it can + * have any FUTEX_STATE_* value when the waiter has acquired the + * mutex. OK, if running, EXITING or DEAD if it reached exit() + * already. Highly unlikely and not a problem. Just one more round + * through the futex maze. + */ + mutex_unlock(&exiting->futex_exit_mutex); + + put_task_struct(exiting); +} + +/** + * __futex_unqueue() - Remove the futex_q from its futex_hash_bucket + * @q: The futex_q to unqueue + * + * The q->lock_ptr must not be NULL and must be held by the caller. + */ +void __futex_unqueue(struct futex_q *q) +{ + struct futex_hash_bucket *hb; + + if (WARN_ON_SMP(!q->lock_ptr) || WARN_ON(plist_node_empty(&q->list))) + return; + lockdep_assert_held(q->lock_ptr); + + hb = container_of(q->lock_ptr, struct futex_hash_bucket, lock); + plist_del(&q->list, &hb->chain); + futex_hb_waiters_dec(hb); +} + +/* The key must be already stored in q->key. */ +struct futex_hash_bucket *futex_q_lock(struct futex_q *q) + __acquires(&hb->lock) +{ + struct futex_hash_bucket *hb; + + hb = futex_hash(&q->key); + + /* + * Increment the counter before taking the lock so that + * a potential waker won't miss a to-be-slept task that is + * waiting for the spinlock. This is safe as all futex_q_lock() + * users end up calling futex_queue(). Similarly, for housekeeping, + * decrement the counter at futex_q_unlock() when some error has + * occurred and we don't end up adding the task to the list. + */ + futex_hb_waiters_inc(hb); /* implies smp_mb(); (A) */ + + q->lock_ptr = &hb->lock; + + spin_lock(&hb->lock); + return hb; +} + +void futex_q_unlock(struct futex_hash_bucket *hb) + __releases(&hb->lock) +{ + spin_unlock(&hb->lock); + futex_hb_waiters_dec(hb); +} + +void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb) +{ + int prio; + + /* + * The priority used to register this element is + * - either the real thread-priority for the real-time threads + * (i.e. threads with a priority lower than MAX_RT_PRIO) + * - or MAX_RT_PRIO for non-RT threads. + * Thus, all RT-threads are woken first in priority order, and + * the others are woken last, in FIFO order. + */ + prio = min(current->normal_prio, MAX_RT_PRIO); + + plist_node_init(&q->list, prio); + plist_add(&q->list, &hb->chain); + q->task = current; +} + +/** + * futex_unqueue() - Remove the futex_q from its futex_hash_bucket + * @q: The futex_q to unqueue + * + * The q->lock_ptr must not be held by the caller. A call to futex_unqueue() must + * be paired with exactly one earlier call to futex_queue(). + * + * Return: + * - 1 - if the futex_q was still queued (and we removed unqueued it); + * - 0 - if the futex_q was already removed by the waking thread + */ +int futex_unqueue(struct futex_q *q) +{ + spinlock_t *lock_ptr; + int ret = 0; + + /* In the common case we don't take the spinlock, which is nice. */ +retry: + /* + * q->lock_ptr can change between this read and the following spin_lock. + * Use READ_ONCE to forbid the compiler from reloading q->lock_ptr and + * optimizing lock_ptr out of the logic below. + */ + lock_ptr = READ_ONCE(q->lock_ptr); + if (lock_ptr != NULL) { + spin_lock(lock_ptr); + /* + * q->lock_ptr can change between reading it and + * spin_lock(), causing us to take the wrong lock. This + * corrects the race condition. + * + * Reasoning goes like this: if we have the wrong lock, + * q->lock_ptr must have changed (maybe several times) + * between reading it and the spin_lock(). It can + * change again after the spin_lock() but only if it was + * already changed before the spin_lock(). It cannot, + * however, change back to the original value. Therefore + * we can detect whether we acquired the correct lock. + */ + if (unlikely(lock_ptr != q->lock_ptr)) { + spin_unlock(lock_ptr); + goto retry; + } + __futex_unqueue(q); + + BUG_ON(q->pi_state); + + spin_unlock(lock_ptr); + ret = 1; + } + + return ret; +} + +/* + * PI futexes can not be requeued and must remove themselves from the + * hash bucket. The hash bucket lock (i.e. lock_ptr) is held. + */ +void futex_unqueue_pi(struct futex_q *q) +{ + __futex_unqueue(q); + + BUG_ON(!q->pi_state); + put_pi_state(q->pi_state); + q->pi_state = NULL; +} + +/* Constants for the pending_op argument of handle_futex_death */ +#define HANDLE_DEATH_PENDING true +#define HANDLE_DEATH_LIST false + +/* + * Process a futex-list entry, check whether it's owned by the + * dying task, and do notification if so: + */ +static int handle_futex_death(u32 __user *uaddr, struct task_struct *curr, + bool pi, bool pending_op) +{ + u32 uval, nval, mval; + int err; + + /* Futex address must be 32bit aligned */ + if ((((unsigned long)uaddr) % sizeof(*uaddr)) != 0) + return -1; + +retry: + if (get_user(uval, uaddr)) + return -1; + + /* + * Special case for regular (non PI) futexes. The unlock path in + * user space has two race scenarios: + * + * 1. The unlock path releases the user space futex value and + * before it can execute the futex() syscall to wake up + * waiters it is killed. + * + * 2. A woken up waiter is killed before it can acquire the + * futex in user space. + * + * In both cases the TID validation below prevents a wakeup of + * potential waiters which can cause these waiters to block + * forever. + * + * In both cases the following conditions are met: + * + * 1) task->robust_list->list_op_pending != NULL + * @pending_op == true + * 2) User space futex value == 0 + * 3) Regular futex: @pi == false + * + * If these conditions are met, it is safe to attempt waking up a + * potential waiter without touching the user space futex value and + * trying to set the OWNER_DIED bit. The user space futex value is + * uncontended and the rest of the user space mutex state is + * consistent, so a woken waiter will just take over the + * uncontended futex. Setting the OWNER_DIED bit would create + * inconsistent state and malfunction of the user space owner died + * handling. + */ + if (pending_op && !pi && !uval) { + futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); + return 0; + } + + if ((uval & FUTEX_TID_MASK) != task_pid_vnr(curr)) + return 0; + + /* + * Ok, this dying thread is truly holding a futex + * of interest. Set the OWNER_DIED bit atomically + * via cmpxchg, and if the value had FUTEX_WAITERS + * set, wake up a waiter (if any). (We have to do a + * futex_wake() even if OWNER_DIED is already set - + * to handle the rare but possible case of recursive + * thread-death.) The rest of the cleanup is done in + * userspace. + */ + mval = (uval & FUTEX_WAITERS) | FUTEX_OWNER_DIED; + + /* + * We are not holding a lock here, but we want to have + * the pagefault_disable/enable() protection because + * we want to handle the fault gracefully. If the + * access fails we try to fault in the futex with R/W + * verification via get_user_pages. get_user() above + * does not guarantee R/W access. If that fails we + * give up and leave the futex locked. + */ + if ((err = futex_cmpxchg_value_locked(&nval, uaddr, uval, mval))) { + switch (err) { + case -EFAULT: + if (fault_in_user_writeable(uaddr)) + return -1; + goto retry; + + case -EAGAIN: + cond_resched(); + goto retry; + + default: + WARN_ON_ONCE(1); + return err; + } + } + + if (nval != uval) + goto retry; + + /* + * Wake robust non-PI futexes here. The wakeup of + * PI futexes happens in exit_pi_state(): + */ + if (!pi && (uval & FUTEX_WAITERS)) + futex_wake(uaddr, 1, 1, FUTEX_BITSET_MATCH_ANY); + + return 0; +} + +/* + * Fetch a robust-list pointer. Bit 0 signals PI futexes: + */ +static inline int fetch_robust_entry(struct robust_list __user **entry, + struct robust_list __user * __user *head, + unsigned int *pi) +{ + unsigned long uentry; + + if (get_user(uentry, (unsigned long __user *)head)) + return -EFAULT; + + *entry = (void __user *)(uentry & ~1UL); + *pi = uentry & 1; + + return 0; +} + +/* + * Walk curr->robust_list (very carefully, it's a userspace list!) + * and mark any locks found there dead, and notify any waiters. + * + * We silently return on any sign of list-walking problem. + */ +static void exit_robust_list(struct task_struct *curr) +{ + struct robust_list_head __user *head = curr->robust_list; + struct robust_list __user *entry, *next_entry, *pending; + unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; + unsigned int next_pi; + unsigned long futex_offset; + int rc; + + if (!futex_cmpxchg_enabled) + return; + + /* + * Fetch the list head (which was registered earlier, via + * sys_set_robust_list()): + */ + if (fetch_robust_entry(&entry, &head->list.next, &pi)) + return; + /* + * Fetch the relative futex offset: + */ + if (get_user(futex_offset, &head->futex_offset)) + return; + /* + * Fetch any possibly pending lock-add first, and handle it + * if it exists: + */ + if (fetch_robust_entry(&pending, &head->list_op_pending, &pip)) + return; + + next_entry = NULL; /* avoid warning with gcc */ + while (entry != &head->list) { + /* + * Fetch the next entry in the list before calling + * handle_futex_death: + */ + rc = fetch_robust_entry(&next_entry, &entry->next, &next_pi); + /* + * A pending lock might already be on the list, so + * don't process it twice: + */ + if (entry != pending) { + if (handle_futex_death((void __user *)entry + futex_offset, + curr, pi, HANDLE_DEATH_LIST)) + return; + } + if (rc) + return; + entry = next_entry; + pi = next_pi; + /* + * Avoid excessively long or circular lists: + */ + if (!--limit) + break; + + cond_resched(); + } + + if (pending) { + handle_futex_death((void __user *)pending + futex_offset, + curr, pip, HANDLE_DEATH_PENDING); + } +} + +#ifdef CONFIG_COMPAT +static void __user *futex_uaddr(struct robust_list __user *entry, + compat_long_t futex_offset) +{ + compat_uptr_t base = ptr_to_compat(entry); + void __user *uaddr = compat_ptr(base + futex_offset); + + return uaddr; +} + +/* + * Fetch a robust-list pointer. Bit 0 signals PI futexes: + */ +static inline int +compat_fetch_robust_entry(compat_uptr_t *uentry, struct robust_list __user **entry, + compat_uptr_t __user *head, unsigned int *pi) +{ + if (get_user(*uentry, head)) + return -EFAULT; + + *entry = compat_ptr((*uentry) & ~1); + *pi = (unsigned int)(*uentry) & 1; + + return 0; +} + +/* + * Walk curr->robust_list (very carefully, it's a userspace list!) + * and mark any locks found there dead, and notify any waiters. + * + * We silently return on any sign of list-walking problem. + */ +static void compat_exit_robust_list(struct task_struct *curr) +{ + struct compat_robust_list_head __user *head = curr->compat_robust_list; + struct robust_list __user *entry, *next_entry, *pending; + unsigned int limit = ROBUST_LIST_LIMIT, pi, pip; + unsigned int next_pi; + compat_uptr_t uentry, next_uentry, upending; + compat_long_t futex_offset; + int rc; + + if (!futex_cmpxchg_enabled) + return; + + /* + * Fetch the list head (which was registered earlier, via + * sys_set_robust_list()): + */ + if (compat_fetch_robust_entry(&uentry, &entry, &head->list.next, &pi)) + return; + /* + * Fetch the relative futex offset: + */ + if (get_user(futex_offset, &head->futex_offset)) + return; + /* + * Fetch any possibly pending lock-add first, and handle it + * if it exists: + */ + if (compat_fetch_robust_entry(&upending, &pending, + &head->list_op_pending, &pip)) + return; + + next_entry = NULL; /* avoid warning with gcc */ + while (entry != (struct robust_list __user *) &head->list) { + /* + * Fetch the next entry in the list before calling + * handle_futex_death: + */ + rc = compat_fetch_robust_entry(&next_uentry, &next_entry, + (compat_uptr_t __user *)&entry->next, &next_pi); + /* + * A pending lock might already be on the list, so + * dont process it twice: + */ + if (entry != pending) { + void __user *uaddr = futex_uaddr(entry, futex_offset); + + if (handle_futex_death(uaddr, curr, pi, + HANDLE_DEATH_LIST)) + return; + } + if (rc) + return; + uentry = next_uentry; + entry = next_entry; + pi = next_pi; + /* + * Avoid excessively long or circular lists: + */ + if (!--limit) + break; + + cond_resched(); + } + if (pending) { + void __user *uaddr = futex_uaddr(pending, futex_offset); + + handle_futex_death(uaddr, curr, pip, HANDLE_DEATH_PENDING); + } +} +#endif + +#ifdef CONFIG_FUTEX_PI + +/* + * This task is holding PI mutexes at exit time => bad. + * Kernel cleans up PI-state, but userspace is likely hosed. + * (Robust-futex cleanup is separate and might save the day for userspace.) + */ +static void exit_pi_state_list(struct task_struct *curr) +{ + struct list_head *next, *head = &curr->pi_state_list; + struct futex_pi_state *pi_state; + struct futex_hash_bucket *hb; + union futex_key key = FUTEX_KEY_INIT; + + if (!futex_cmpxchg_enabled) + return; + /* + * We are a ZOMBIE and nobody can enqueue itself on + * pi_state_list anymore, but we have to be careful + * versus waiters unqueueing themselves: + */ + raw_spin_lock_irq(&curr->pi_lock); + while (!list_empty(head)) { + next = head->next; + pi_state = list_entry(next, struct futex_pi_state, list); + key = pi_state->key; + hb = futex_hash(&key); + + /* + * We can race against put_pi_state() removing itself from the + * list (a waiter going away). put_pi_state() will first + * decrement the reference count and then modify the list, so + * its possible to see the list entry but fail this reference + * acquire. + * + * In that case; drop the locks to let put_pi_state() make + * progress and retry the loop. + */ + if (!refcount_inc_not_zero(&pi_state->refcount)) { + raw_spin_unlock_irq(&curr->pi_lock); + cpu_relax(); + raw_spin_lock_irq(&curr->pi_lock); + continue; + } + raw_spin_unlock_irq(&curr->pi_lock); + + spin_lock(&hb->lock); + raw_spin_lock_irq(&pi_state->pi_mutex.wait_lock); + raw_spin_lock(&curr->pi_lock); + /* + * We dropped the pi-lock, so re-check whether this + * task still owns the PI-state: + */ + if (head->next != next) { + /* retain curr->pi_lock for the loop invariant */ + raw_spin_unlock(&pi_state->pi_mutex.wait_lock); + spin_unlock(&hb->lock); + put_pi_state(pi_state); + continue; + } + + WARN_ON(pi_state->owner != curr); + WARN_ON(list_empty(&pi_state->list)); + list_del_init(&pi_state->list); + pi_state->owner = NULL; + + raw_spin_unlock(&curr->pi_lock); + raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); + spin_unlock(&hb->lock); + + rt_mutex_futex_unlock(&pi_state->pi_mutex); + put_pi_state(pi_state); + + raw_spin_lock_irq(&curr->pi_lock); + } + raw_spin_unlock_irq(&curr->pi_lock); +} +#else +static inline void exit_pi_state_list(struct task_struct *curr) { } +#endif + +static void futex_cleanup(struct task_struct *tsk) +{ + if (unlikely(tsk->robust_list)) { + exit_robust_list(tsk); + tsk->robust_list = NULL; + } + +#ifdef CONFIG_COMPAT + if (unlikely(tsk->compat_robust_list)) { + compat_exit_robust_list(tsk); + tsk->compat_robust_list = NULL; + } +#endif + + if (unlikely(!list_empty(&tsk->pi_state_list))) + exit_pi_state_list(tsk); +} + +/** + * futex_exit_recursive - Set the tasks futex state to FUTEX_STATE_DEAD + * @tsk: task to set the state on + * + * Set the futex exit state of the task lockless. The futex waiter code + * observes that state when a task is exiting and loops until the task has + * actually finished the futex cleanup. The worst case for this is that the + * waiter runs through the wait loop until the state becomes visible. + * + * This is called from the recursive fault handling path in do_exit(). + * + * This is best effort. Either the futex exit code has run already or + * not. If the OWNER_DIED bit has been set on the futex then the waiter can + * take it over. If not, the problem is pushed back to user space. If the + * futex exit code did not run yet, then an already queued waiter might + * block forever, but there is nothing which can be done about that. + */ +void futex_exit_recursive(struct task_struct *tsk) +{ + /* If the state is FUTEX_STATE_EXITING then futex_exit_mutex is held */ + if (tsk->futex_state == FUTEX_STATE_EXITING) + mutex_unlock(&tsk->futex_exit_mutex); + tsk->futex_state = FUTEX_STATE_DEAD; +} + +static void futex_cleanup_begin(struct task_struct *tsk) +{ + /* + * Prevent various race issues against a concurrent incoming waiter + * including live locks by forcing the waiter to block on + * tsk->futex_exit_mutex when it observes FUTEX_STATE_EXITING in + * attach_to_pi_owner(). + */ + mutex_lock(&tsk->futex_exit_mutex); + + /* + * Switch the state to FUTEX_STATE_EXITING under tsk->pi_lock. + * + * This ensures that all subsequent checks of tsk->futex_state in + * attach_to_pi_owner() must observe FUTEX_STATE_EXITING with + * tsk->pi_lock held. + * + * It guarantees also that a pi_state which was queued right before + * the state change under tsk->pi_lock by a concurrent waiter must + * be observed in exit_pi_state_list(). + */ + raw_spin_lock_irq(&tsk->pi_lock); + tsk->futex_state = FUTEX_STATE_EXITING; + raw_spin_unlock_irq(&tsk->pi_lock); +} + +static void futex_cleanup_end(struct task_struct *tsk, int state) +{ + /* + * Lockless store. The only side effect is that an observer might + * take another loop until it becomes visible. + */ + tsk->futex_state = state; + /* + * Drop the exit protection. This unblocks waiters which observed + * FUTEX_STATE_EXITING to reevaluate the state. + */ + mutex_unlock(&tsk->futex_exit_mutex); +} + +void futex_exec_release(struct task_struct *tsk) +{ + /* + * The state handling is done for consistency, but in the case of + * exec() there is no way to prevent further damage as the PID stays + * the same. But for the unlikely and arguably buggy case that a + * futex is held on exec(), this provides at least as much state + * consistency protection which is possible. + */ + futex_cleanup_begin(tsk); + futex_cleanup(tsk); + /* + * Reset the state to FUTEX_STATE_OK. The task is alive and about + * exec a new binary. + */ + futex_cleanup_end(tsk, FUTEX_STATE_OK); +} + +void futex_exit_release(struct task_struct *tsk) +{ + futex_cleanup_begin(tsk); + futex_cleanup(tsk); + futex_cleanup_end(tsk, FUTEX_STATE_DEAD); +} + +static void __init futex_detect_cmpxchg(void) +{ +#ifndef CONFIG_HAVE_FUTEX_CMPXCHG + u32 curval; + + /* + * This will fail and we want it. Some arch implementations do + * runtime detection of the futex_atomic_cmpxchg_inatomic() + * functionality. We want to know that before we call in any + * of the complex code paths. Also we want to prevent + * registration of robust lists in that case. NULL is + * guaranteed to fault and we get -EFAULT on functional + * implementation, the non-functional ones will return + * -ENOSYS. + */ + if (futex_cmpxchg_value_locked(&curval, NULL, 0, 0) == -EFAULT) + futex_cmpxchg_enabled = 1; +#endif +} + +static int __init futex_init(void) +{ + unsigned int futex_shift; + unsigned long i; + +#if CONFIG_BASE_SMALL + futex_hashsize = 16; +#else + futex_hashsize = roundup_pow_of_two(256 * num_possible_cpus()); +#endif + + futex_queues = alloc_large_system_hash("futex", sizeof(*futex_queues), + futex_hashsize, 0, + futex_hashsize < 256 ? HASH_SMALL : 0, + &futex_shift, NULL, + futex_hashsize, futex_hashsize); + futex_hashsize = 1UL << futex_shift; + + futex_detect_cmpxchg(); + + for (i = 0; i < futex_hashsize; i++) { + atomic_set(&futex_queues[i].waiters, 0); + plist_head_init(&futex_queues[i].chain); + spin_lock_init(&futex_queues[i].lock); + } + + return 0; +} +core_initcall(futex_init); |